From 0d4ae7961fd29d86369f5fb088f6804195e68441 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Sep 2025 05:12:05 +0000
Subject: [PATCH 001/194] [Frontend] Use ops instead of raw assembly code

---
 .../mlir/mlir_codegen_backend.py              | 522 ++++++++----------
 PyTorchSimFrontend/mlir/mlir_common.py        |  81 +--
 PyTorchSimFrontend/mlir/mlir_template.py      | 145 ++---
 Simulator/simulator.py                        |   2 +-
 4 files changed, 351 insertions(+), 399 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 6650f429..d4c2fdd6 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -36,19 +36,9 @@ def reduction_init(reduction_type, dtype):
     if reduction_type == "prod":
         return float(1) if dtype.is_floating_point else int(1)
     if reduction_type in {"max", "argmax"}:
-        if dtype == torch.float32:
-            return f"0x{mlir_common.MLIR_INF['-inf']['f32']:x}"
-        elif dtype == torch.float64:
-            return f"0x{mlir_common.MLIR_INF['-inf']['f64']:x}"
-        else:
-            return "0.0"
+        return "-inf"
     if reduction_type in {"min", "argmin"}:
-        if dtype == torch.float32:
-            return f"0x{mlir_common.MLIR_INF['inf']['f32']:x}"
-        elif dtype == torch.float64:
-            return f"0x{mlir_common.MLIR_INF['inf']['f64']:x}"
-        else:
-            return "0.0"
+        return "inf"
     if reduction_type in {"welford_reduce"}:
         return f"0.0"
     raise AssertionError(reduction_type)
@@ -221,9 +211,9 @@ class ExtensionOverrides(common.OpOverrides):
     def custom_cast(operand, target_type, *args, var_info=None, **kwargs):
         dtype = var_info[operand][1]
         if dtype == "index":
-            ret = ops.index_cast(operand, target_type, var_info=var_info)
+            ret = ops.index_cast(operand, target_type)
         else:
-            ret = ops.to_dtype(operand, target_type, var_info=var_info)
+            ret = ops.to_dtype(operand, target_type)
         return ret, var_info[ret]
 
     @staticmethod
@@ -238,26 +228,26 @@ def binary_elementwise_common(operand1, operand2, var_info):
             lhs_tile_size, lhs_dtype = op_type1
             rhs_tile_size, rhs_dtype = op_type2
             if lhs_tile_size > rhs_tile_size:
-                operand2 = ops.broadcast(operand2, operand1, var_info=var_info)
+                operand2 = ops.broadcast(operand2, lhs_tile_size)
                 op_type2 = var_info[operand2]
             elif lhs_tile_size < rhs_tile_size:
-                operand1 = ops.broadcast(operand1, operand2, var_info=var_info)
+                operand1 = ops.broadcast(operand1, rhs_tile_size)
                 op_type1 = var_info[operand1]
 
         # Data type check
         if op_type1[1] != op_type2[1]:
             if op_type1[1] == "index" or op_type1 == "index":
                 if op_type1[1] == "index":
-                    operand1 = ops.index_cast(operand1, op_type2[1], var_info)
+                    operand1 = ops.index_cast(operand1, op_type2[1])
                     op_type1 = var_info[operand1]
                 if op_type2[1] == "index":
-                    operand2 = ops.index_cast(operand2, op_type1[1], var_info)
+                    operand2 = ops.index_cast(operand2, op_type1[1])
                     op_type2 = var_info[operand2]
             elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
-                operand1 = ops.to_dtype(operand1, op_type2[1], var_info)
+                operand1 = ops.to_dtype(operand1, op_type2[1])
                 op_type1 = var_info[operand1]
             elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
-                operand2 = ops.to_dtype(operand2, op_type1[1], var_info)
+                operand2 = ops.to_dtype(operand2, op_type1[1])
                 op_type2 = var_info[operand2]
             elif op_type1[1][0] == op_type2[1][0]:
                 if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]:
@@ -332,7 +322,7 @@ def minimum(operand1, operand2, *args, var_info=None, **kwargs):
         if ret_type[0] == "f":
             opcode = f'arith.minimumf'
         else:
-            opcode = f'arith.minimumui'
+            opcode = f'arith.minui'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
@@ -342,14 +332,14 @@ def maximum(operand1, operand2, *args, var_info=None, **kwargs):
         if ret_type[0] == "f":
             opcode = f'arith.maximumf'
         else:
-            opcode = f'arith.maximumui'
+            opcode = f'arith.maxui'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
     def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
         src_mlir_dtype = var_info[operand][1]
         if src_mlir_dtype == "index":
-            operand = ops.index_cast(operand, "i64", var_info=var_info)
+            operand = ops.index_cast(operand, "i64")
             src_mlir_dtype = var_info[operand][1]
 
         tile_size = var_info[operand][0]
@@ -368,7 +358,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
                 return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
             elif dst_bits < src_bits:
                 return f"arith.trunc %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-            return f"arith.maximumi %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
+            return f"arith.maxui %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
         elif dst_mlir_dtype[0] == "f":
             if dst_bits > src_bits:
                 return f"arith.extf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
@@ -389,7 +379,7 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
         elif "e" in str(value):
             value = format(float(value), ".20f")
         elif src_type[0] == "f":
-            value = format(value, ".20f")
+            value = format(float(value), ".20f")
         elif src_type[0] == "i":
             value = int(value)
         return f'arith.constant {value} : {src_type}', [1, src_type]
@@ -412,9 +402,7 @@ def exp(operand, *args, var_info=None, **kwargs):
         # Check scalar
         op_type = var_info[operand]
         if op_type[0] == 1:
-            val = ops.constant(0, op_type[1])
-            var_info[val][0] = 4
-            operand = ops.broadcast(operand, val)
+            operand = ops.broadcast(operand, 4)
             val = ops.exp(operand)
             result = ops.extractelement(val, 0)
             return result, var_info[result]
@@ -440,9 +428,7 @@ def erf(operand, *args, var_info=None, **kwargs):
         # Check scalar
         op_type = var_info[operand]
         if op_type[0] == 1:
-            val = ops.constant(0, op_type[1])
-            var_info[val][0] = 4
-            operand = ops.broadcast(operand, val)
+            operand = ops.broadcast(operand, 4)
             val = ops.erf(operand)
             result = ops.extractelement(val, 0)
             return result, var_info[result]
@@ -459,9 +445,7 @@ def tanh(operand, *args, var_info=None, **kwargs):
         # Check scalar
         op_type = var_info[operand]
         if op_type[0] == 1:
-            val = ops.constant(0, op_type[1])
-            var_info[val][0] = 4
-            operand = ops.broadcast(operand, val)
+            operand = ops.broadcast(operand, 4)
             val = ops.tanh(operand)
             result = ops.extractelement(val, 0)
             return result, var_info[result]
@@ -471,7 +455,7 @@ def tanh(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
@@ -483,9 +467,7 @@ def sin(operand, *args, var_info=None, **kwargs):
         # Check scalar
         op_type = var_info[operand]
         if op_type[0] == 1:
-            val = ops.constant(0, op_type[1])
-            var_info[val][0] = 4
-            operand = ops.broadcast(operand, val)
+            operand = ops.broadcast(operand, 4)
             val = ops.sin(operand)
             result = ops.extractelement(val, 0)
             return result, var_info[result]
@@ -495,7 +477,7 @@ def sin(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.sin %{operand} : {shape}', [tile_size, dtype]
@@ -507,9 +489,7 @@ def cos(operand, *args, var_info=None, **kwargs):
         # Check scalar
         op_type = var_info[operand]
         if op_type[0] == 1:
-            val = ops.constant(0, op_type[1])
-            var_info[val][0] = 4
-            operand = ops.broadcast(operand, val)
+            operand = ops.broadcast(operand, 4)
             val = ops.cos(operand)
             result = ops.extractelement(val, 0)
             return result, var_info[result]
@@ -519,7 +499,7 @@ def cos(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.cos %{operand} : {shape}', [tile_size, dtype]
@@ -532,7 +512,7 @@ def sqrt(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
@@ -546,7 +526,7 @@ def rsqrt(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
@@ -557,12 +537,12 @@ def pow(operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         # Type check & auto cast
         if ret_type[0] != "f":
-            operand1, ret_type = ops.to_dtype(operand1, "f32", var_info=var_info)
+            operand1, ret_type = ops.to_dtype(operand1, "f32")
             var_info[operand1] = ret_type
 
         # Type check & auto cast
         if ret_type[0] != "f":
-            operand2, ret_type = ops.to_dtype(operand2, "f32", var_info=var_info)
+            operand2, ret_type = ops.to_dtype(operand2, "f32")
             var_info[operand2] = ret_type
 
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
@@ -576,7 +556,7 @@ def log(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
@@ -590,7 +570,7 @@ def reciprocal(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
 
         return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype]
@@ -615,7 +595,7 @@ def neg(operand, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32", var_info=var_info)
+            operand, dtype = ops.to_dtype(operand, "f32")
             var_info[operand] = dtype
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
@@ -718,12 +698,12 @@ def and_(operand1, operand2, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand1] = dtype
 
         # Type check & auto cast
         if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand2] = dtype
 
         ret_type = op_type1[1]
@@ -739,12 +719,12 @@ def or_(operand1, operand2, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand1] = dtype
 
         # Type check & auto cast
         if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand2] = dtype
 
         ret_type = op_type1[1]
@@ -760,12 +740,12 @@ def xor(operand1, operand2, *args, var_info=None, **kwargs):
 
         # Type check & auto cast
         if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand1] = dtype
 
         # Type check & auto cast
         if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32", var_info=var_info)
+            operand1, dtype = ops.to_dtype(operand1, "i32")
             var_info[operand2] = dtype
 
         ret_type = op_type1[1]
@@ -791,7 +771,7 @@ def logical_not(operand, *args, var_info=None, **kwargs):
         tile_size = op_type[0]
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         const_one = ops.constant(0, ret_type)
-        const_one = ops.broadcast(const_one, operand, var_info=var_info)
+        const_one = ops.broadcast(const_one, tile_size)
         ret = ops.eq(operand,const_one)
         return ret, [tile_size, var_info[ret]]
 
@@ -831,17 +811,22 @@ def sigmoid(operand, *args, var_info=None, **kwargs):
     def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         cond_type = var_info[condition]
+        operand_type = var_info[operand1]
         if cond_type[0] < tile_size:
-            condition = ops.broadcast(condition, operand1, var_info=var_info)
+            condition = ops.broadcast(condition, operand_type[0])
         elif cond_type[0] > tile_size:
-            operand1 = ops.broadcast(operand1, condition, var_info=var_info)
-            operand2 = ops.broadcast(operand2, condition, var_info=var_info)
+            operand1 = ops.broadcast(operand1, operand_type[0])
+            operand2 = ops.broadcast(operand2, operand_type[0])
         tile_size, ret_type = var_info[operand1]
 
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
         return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type]
 
+    @staticmethod
+    def step(size, dtype, *args, **kwargs):
+        index_shape = f"vector<{size}x{dtype}>"
+        return f"vector.step : {index_shape}", [size, dtype]
 
     @staticmethod
     def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
@@ -858,32 +843,77 @@ def index_cast(operand, target_type, *args, var_info=None, **kwrags):
         return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
 
     @staticmethod
-    def broadcast_unflat(operand1, operand2, *args, var_info=None, **kwargs):
+    def broadcast_unflat(operand1, target_size, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
         src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
+        des_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
 
         expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
-        return expand, [op_type2[0], op_type1[1]]
+        return expand, [target_size, op_type1[1]]
 
     @staticmethod
-    def broadcast(operand1, operand2, *args, var_info=None, **kwargs):
+    def broadcast(operand1, target_size, *args, var_info=None, **kwargs):
         op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
         src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{op_type2[0]}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only
+        des_shape = f"vector<{target_size}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only
 
         # Special case for length 2 vector. We used this vector to avoid scalar operations...
-        if op_type1[0] != 1 and op_type2[0] % op_type1[0] == 0:
-            unflat_operand = ops.broadcast_unflat(operand1, operand2)
-            unflat_shape = f"vector<{op_type2[0]//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"
+        if op_type1[0] != 1 and target_size % op_type1[0] == 0:
+            unflat_operand = ops.broadcast_unflat(operand1, target_size)
+            unflat_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"
             expand = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {des_shape}"
         elif op_type1[0] == 1:
             expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
         else:
             raise NotImplementedError("Not supporting broadcast type...")
-        return expand, [op_type2[0], op_type1[1]]
+        return expand, [target_size, op_type1[1]]
+
+    @staticmethod
+    def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs):
+        operand_type = var_info[operand]
+        return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type
+
+    @staticmethod
+    def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs):
+        if red_size == 1:
+            final_reduced_shape = f"{type_name}"
+            line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape)
+        else:
+            final_reduced_shape = f"vector<{red_size}x{type_name}>"
+            new_vshape= f"vector<{vec_size//red_size}x{red_size}x{type_name}>"
+            value = ops.shape_cast(acc, red_shape, new_vshape)
+            line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape)
+        return line, [red_size, type_name]
+
+    @staticmethod
+    def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs):
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+            operation = "affine.load"
+            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
+            operation = "affine.vector_load"
+            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}"
+        return line, [compute_vec_size, mlir_dtype]
+
+    @staticmethod
+    def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs):
+        compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1]
+
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+            operation = "affine.store"
+            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
+            operation = "affine.vector_store"
+            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}"
+
+        if buffer_name is not None:
+            return common.DeferredLine(buffer_name, line), [None, None]
+        else:
+            return line, [None, None]
 
 RTYPE_TO_MLIR = {
     "sum": "add",
@@ -1031,7 +1061,6 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com
     def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) -> common.CSEVariable:
         if buffer is None:
             buffer = self.applys
-        zero_var = self.get_const_cse(0)
         expr_list = [arg for arg in expr_list]
         dim_list = [f"d{i}" for i in range(len(expr_list))]
 
@@ -1102,6 +1131,7 @@ def load(self, name: str, index: sympy.Expr):
 
         # Define scratch pad buffer
         sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
+        compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
 
         # MVIN Encoding
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}"
@@ -1110,24 +1140,15 @@ def load(self, name: str, index: sympy.Expr):
         self.cse.generate(dma_buffer, code, assignment = False) # FIXME: assignment = False does not support caching
 
         if not comptute_depedency:
-            compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
             # Generate vector load instruction
-            if compute_vec_size > 1:
-                operation = "affine.vector_load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-            else:
-                operation = "affine.load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
-
-            out = self.cse.generate(load_buffer, line)
-            self.register_var_info(out, [compute_vec_size, mlir_dtype])
-            self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
-            return out
+            with self.override_buffer_cse(buffer=load_buffer):
+                out = ops._load(compute_vec_size, mlir_dtype, sram_var, compute_index_var, tile_shape)
         else:
+            # FIXME. Any good idea?
             out = sram_var
             self.register_var_info(out, [compute_vec_size, mlir_dtype])
-            self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
-            return out
+        self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
+        return out
 
     def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         index = self.rename_indexing(index)
@@ -1148,30 +1169,25 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
         require_store = True
-        if compute_vec_size < self.var_info[value][0]:
-            value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
-            self.register_var_info(value, [compute_vec_size, mlir_dtype])
 
         if str(value) in self.spad_buffer_dict:
             # Todo. If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily
             require_store = self.spad_buffer_dict[str(value)][1] != tile_size
 
+        if compute_vec_size < self.var_info[value][0]:
+            value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
+            self.register_var_info(value, [compute_vec_size, mlir_dtype])
+
         if require_store:
             # Define scratch pad buffer
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
             compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
             # Generate vector store instruction
-            store_size, operand_type = self.var_info[value]
+            _, operand_type = self.var_info[value]
             if mlir_dtype != operand_type:
-                value = ops.custom_cast(value, mlir_dtype, var_info=self.var_info)
-
-            if compute_vec_size > 1 and store_size > 1:
-                operation = "affine.vector_store"
-                line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-            else:
-                operation = "affine.store"
-                line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
-            self.stores.writeline(common.DeferredLine(name, line)) # TODO: Should be changed to self.compute?
+                value = ops.custom_cast(value, mlir_dtype)
+            with self.override_buffer_cse(buffer=self.stores):
+                ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name)
         else:
             sram_var = self.spad_buffer_dict[str(value)][0]
             sram_index_var = self.spad_buffer_dict[str(value)][3]
@@ -1207,9 +1223,9 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
 
         # Prepare reduction init
-        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-        init_vec = init if vec_len == 1 else self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {reduced_shape}")
-        self.register_var_info(init_vec, [vec_len, type_name])
+        with self.override_buffer_cse(cse=self.const_cse, buffer=self.const_buffer):
+            init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name)
+            init_vec = init if vec_len == 1 else ops.broadcast(init, vec_len)
 
         acc_var_list = []
         iter_var_list = []
@@ -1248,95 +1264,65 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
             self.affine_yield[acc] = reduced_shape, reduction_depth
 
         # Final reduction
-        acc = acc_var_list[0] # Set outermost acc var
         reduction_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_reduction_numel()
+        acc = acc_var_list[0] # Set outermost acc var
+        self.register_var_info(acc, [reduction_size, type_name])
         assert(vec_len % reduction_size==0)
-        if vec_len > reduction_size:
-            init = self.const_cse.generate(self.reductions_suffix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-            if reduction_size == 1:
-                final_reduced_shape = f"{type_name}"
-                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, acc, init, axis=0, shape=reduced_shape, reduced_shape=final_reduced_shape))
-            else:
-                final_reduced_shape = f"vector<{reduction_size}x{type_name}>"
-                init_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{init} : {type_name} to {final_reduced_shape}")
-                new_vshape= f"vector<{vec_len//reduction_size}x{reduction_size}x{type_name}>"
-                value = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{acc} : {reduced_shape} to {new_vshape}")
-                out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(reduction_type, value, init_vec, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape))
-            acc = out
-
-        # reigster reduction output
-        var_info = [reduction_size, mlir_common.DTYPE_TO_MLIR[dtype]]
-        self.register_var_info(acc, var_info)
+
+        # Prepare init value
+        init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name)
+        if reduction_size != 1:
+            with self.override_buffer_cse(buffer=self.reductions_suffix):
+                init = ops.broadcast(init, reduction_size)
+
+        # Final reduction codegen
+        with self.override_buffer_cse(buffer=self.reductions_suffix):
+            if vec_len > reduction_size:
+                acc = ops.multi_reduction(acc, init, vec_len, reduction_size, reduced_shape, reduction_type, type_name)
         return acc
 
     def store_reduction(self, name, index, value):
-        # Note: Change cse temporaily
         # Store reduction can't share cached value stored in cse,
         # since it is not innermost loop body.
-        tmp_cse = self.cse
-        tmp_apply_cse = self.apply_cse
-        self.cse = self.reduction_cse
-        self.apply_cse = self.reduction_cse
-
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
         index = self.rename_indexing(index)
 
-        # Tile is always reuduced in inner loop
-        local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
-        vlane_split_axis = local_tile_desc.vmap.vlane_split_axis
-        vlane_stride = local_tile_desc.vmap.vlane_stride
-
-        dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
-        tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
-        tile_stride = local_tile_desc.get_tile_stride()
-        compute_vec_size = self.kernel_group.tile_desc.get_numel_per_lane() // self.kernel_group.tile_desc.get_reduction_numel()
-        if compute_vec_size == 1:
-            vshape = f"{mlir_dtype}"
-        else:
-            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
-        sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
-        if self.welford_reduce_out is not None:
-            sum, sqr_sum, _ = self.welford_reduce_out
-            # mean
-            reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1)
-            divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(reduction_numel)} : f32")
-            if compute_vec_size > 1:
-                divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to vector<{self.var_info[sum][0]}x{mlir_dtype}>")
-            else:
-                divider_vec = divider
-            mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sum}, %{divider_vec} : {vshape}")
-
-            # m2 = (E(X^2) - E(X)^2) * N
-            sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{sqr_sum}, %{divider_vec} : {vshape}")
-            mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{mean}, %{mean} : {vshape}")
-            variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {vshape}")
-            m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {vshape}")
-            if self.current_node.node.origin_node: # FIXME: This is a temporary solution
-                value = mean
-            else:
-                value = m2
-
-        # Select src type
-        if compute_vec_size == 1:
-            operation = "affine.store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}"
-        else:
-            operation =  "affine.vector_store"
-            line = f"{operation} %{value}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
-        self.reductions_suffix.writeline(common.DeferredLine(name, line))
+        with self.override_buffer_cse(cse=self.reduction_cse):
+            # Tile is always reuduced in inner loop
+            local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index, broadcast=False, store_reduction=True, buffer=self.reductions_suffix)
+            vlane_split_axis = local_tile_desc.vmap.vlane_split_axis
+            vlane_stride = local_tile_desc.vmap.vlane_stride
 
-        # MVOUT Encoding
-        # Generate DMA instruction
-        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
-        code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                 dram_shape, tile_shape, attribute)
-        self.reductions_suffix.writeline(common.DeferredLine(name, code))
+            dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
+            tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
+            tile_stride = local_tile_desc.get_tile_stride()
 
-        # Restore origin cse
-        self.cse = tmp_cse
-        self.apply_cse = tmp_apply_cse
+            sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
+            with self.override_buffer_cse(buffer=self.reductions_suffix):
+                if self.welford_reduce_out is not None:
+                    # Calc var and mean
+                    sum, sqr_sum, _ = self.welford_reduce_out
+                    reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1)
+                    divider = self.get_const_cse(float(reduction_numel), "f32")
+                    mean = ops.div(sum, divider)
+                    sqr_mean = ops.div(sqr_sum, divider)
+                    mean_sqr = ops.mul(mean, mean)
+                    variance = ops.sub(sqr_mean, mean_sqr)
+                    m2 = ops.mul(variance, divider)
+                    if self.current_node.node.origin_node: # FIXME: This is a temporary solution
+                        value = mean
+                    else:
+                        value = m2
+                # Store value to scratch pad
+                ops._store(value, sram_var, sram_index_var, tile_shape, buffer_name=name)
+
+            # Generate DMA instruction
+            attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
+            code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                    dram_shape, tile_shape, attribute)
+            self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
     def indirect_indexing(self, index_var, size, check=True):
         return str(index_var)
@@ -1354,77 +1340,71 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
         strides = tile_desc.get_tile_stride_per_lane()
 
         # Create vector index
-        compute_vec = self.cse.generate(self.compute, f"vector.broadcast %{self.compute_idx} : index to vector<{compute_vec_size}xindex>")
-        self.register_var_info(compute_vec, [compute_vec_size, "index"])
+        compute_vec = ops.broadcast(self.compute_idx, compute_vec_size)
         vector_index = ops.add(base_vector_index, compute_vec)
 
         # Create tile_dim index
         dim_list = []
         for idx in range(len(tile_size)):
-            div_coeff = self.get_const_cse(strides[idx], "index")
-            mod_coeff = self.get_const_cse(tile_size[idx], "index")
-            div_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{div_coeff} : index to vector<{compute_vec_size}xindex>")
-            mod_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{mod_coeff} : index to vector<{compute_vec_size}xindex>")
-            self.register_var_info(div_vec, [compute_vec_size, "index"])
-            self.register_var_info(mod_vec, [compute_vec_size, "index"])
-            dim = ops.modular(ops.div(vector_index, div_vec), mod_vec)
-            if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset
-                offset = tile_desc.vmap.vlane_stride #* strides[idx]
-                outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride
-
+            # Prepare initial values
+            offset = tile_desc.vlane_stride #* strides[idx]
+            outer_sz = tile_size[idx] // tile_desc.vlane_stride
+            with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                div_coeff = self.get_const_cse(strides[idx], "index")
+                mod_coeff = self.get_const_cse(tile_size[idx], "index")
+                vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index")
+                vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
                 nr_vector_lane = self.get_const_cse(self.vector_lane, "index")
-                nr_vector_lane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{nr_vector_lane} : index to vector<{compute_vec_size}xindex>")
-                self.register_var_info(nr_vector_lane_vec, [compute_vec_size, "index"])
+                vlane_coeff = self.get_const_cse(0, "i64")
 
-                vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index")
-                vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
-                vlane_stride_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_stride_coeff} : index to vector<{compute_vec_size}xindex>")
-                vlane_outer_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_outer_coeff} : index to vector<{compute_vec_size}xindex>")
-                self.register_var_info(vlane_stride_vec, [compute_vec_size, "index"])
-                self.register_var_info(vlane_outer_vec, [compute_vec_size, "index"])
+                div_vec = ops.broadcast(div_coeff, compute_vec_size)
+                mod_vec = ops.broadcast(mod_coeff, compute_vec_size)
+                nr_vector_lane_vec = ops.broadcast(nr_vector_lane, compute_vec_size)
+                vlane_stride_vec = ops.broadcast(vlane_stride_coeff, compute_vec_size)
+                vlane_outer_vec = ops.broadcast(vlane_outer_coeff, compute_vec_size)
+
+                # Prepare vlane offset (vidx)
+                vlane_vec_size = 4
+                vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size)
+
+            dim = ops.modular(ops.div(vector_index, div_vec), mod_vec)
+            if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset
                 stride_dim = ops.modular(dim, vlane_stride_vec)
                 outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec)
-
                 dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec))
 
-                # Prepare vlane offset (vidx)
-                vlane_coeff = self.get_const_cse(0, "i64")
-                vlane_vec_size = 4
-                vlane_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{vlane_coeff} : i64 to vector<{vlane_vec_size}xi64>")
                 vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset")
                 self.register_var_info(vlane_offset, [vlane_vec_size, "i64"])
                 vlane_offset = ops.index_cast(vlane_offset, "index")
-                self.register_var_info(vlane_offset, [vlane_vec_size, "index"])
-
                 dim = ops.add(dim, vlane_offset)
             dim_list.append(dim)
 
         indices = [str(i) for i in index.free_symbols]
         for idx in indices:
             i = int(idx[5:])
-            index_vec = self.cse.generate(self.compute, f"vector.broadcast %{idx} : index to vector<{compute_vec_size}xindex>")
-            self.register_var_info(index_vec, [compute_vec_size, "index"])
+            idx = self.itervar_cses[idx]
+            index_vec = ops.broadcast(idx, compute_vec_size)
             offset = ops.add(index_vec, dim_list[i])
             dim_list[i] = offset
 
         arg_lists = []
         for arg in renamed_expression.args:
             if isinstance(arg, sympy.Integer):
-                offset = self.get_const_cse(int(arg))
-                offset_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{offset} : index to vector<{compute_vec_size}xindex>")
-                self.register_var_info(offset_vec, [compute_vec_size, "index"])
+                with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                    offset = self.get_const_cse(int(arg), "index")
+                    offset_vec = ops.broadcast(offset, compute_vec_size)
                 arg_lists.append(offset_vec)
             elif isinstance(arg, sympy.Mul):
                 if isinstance(arg.args[0], sympy.Integer) and isinstance(arg.args[1], sympy.Symbol):
-                    coeff = self.get_const_cse(int(arg.args[0]))
-                    coeff_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>")
-                    self.register_var_info(coeff_vec, [compute_vec_size, "index"])
+                    with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                        coeff = self.get_const_cse(int(arg.args[0]), "index")
+                        coeff_vec = ops.broadcast(coeff, compute_vec_size)
                     result = ops.mul(dim_list[int(str(arg.args[1])[1:])], coeff_vec)
                     arg_lists.append(result)
                 elif isinstance(arg.args[1], sympy.Integer) and isinstance(arg.args[0], sympy.Symbol):
-                    coeff = self.get_const_cse(int(arg.args[1]))
-                    coeff_vec = self.cse.generate(self.compute, f"vector.broadcast %{coeff} : index to vector<{compute_vec_size}xindex>")
-                    self.register_var_info(coeff_vec, [compute_vec_size, "index"])
+                    with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                        coeff = self.get_const_cse(int(arg.args[1]), "index")
+                        coeff_vec = ops.broadcast(coeff, compute_vec_size)
                     result = ops.mul(dim_list[int(str(arg.args[0])[1:])], coeff_vec)
                     arg_lists.append(result)
                 else:
@@ -1474,18 +1454,16 @@ def index_expr(self, index, dtype):
 
         # Initialize base vector
         if not self.base_vector_initialized:
-            init_iter = "iter"
+            init_iter = self.register_var_cse("init_iter", 1, "index")
             parallel_map = f"affine.parallel (%{init_iter}) = ({0}) to ({compute_vec_size}) {{ // Base vector initializer"
             self.spad_buffer.writeline(parallel_map)
             with self.spad_buffer.indent():
-                self.spad_buffer.writeline(f"%init_vec = vector.broadcast %{init_iter} : index to vector<2xindex>")
-                self.spad_buffer.writeline(f"affine.vector_store %init_vec, %{sram_var}[%{init_iter}] : {tile_shape}, vector<2xindex>")
+                with self.override_buffer_cse(buffer=self.spad_buffer, cse=self.init_vec_cse):
+                    init_vec = ops.broadcast(init_iter, 2)
+                    ops._store(init_vec, sram_var, f"%{init_iter}", tile_shape)
             self.spad_buffer.writeline("}")
             self.base_vector_initialized = True
-
-        line = f"affine.vector_load %{sram_var}[0] : {tile_shape}, {vshape}"
-        base_vector_index = self.cse.generate(self.compute, line)
-        self.register_var_info(base_vector_index, [compute_vec_size, "index"])
+        base_vector_index = ops._load(compute_vec_size, "index", sram_var, "0", tile_shape)
 
         renamed_symbols = {symbol: "d"+str(symbol)[5:] for symbol in index.free_symbols}
         renamed_expression = index.subs(renamed_symbols)
@@ -1744,7 +1722,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
             local_dims = total_dims # Brodatcast tile shape
 
-        index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims)
+        index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}")
 
         if kg_tile_desc.vmap.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vmap.vlane_split_axis)
@@ -1957,14 +1935,18 @@ def get_scratchpad_buffer(self, dtype, dram_name, tile_desc, raw_index, buffer=N
         return sram_var, sram_index_var
 
     def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
+        # Why not use ops.constant? Because there are some cases that can't use ops (e.g., def_dma_op)
         # Type convert
-        if dtype[0] == "f":
+        if value in ["inf", "-inf", "nan"]:
+            value = f"0x{mlir_common.MLIR_INF[value][dtype]:x}"
+        elif dtype[0] == "f":
             value = float(value)
         else:
             value = int(value)
 
         if value not in self.consts:
             self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}")
+            self.register_var_info(self.consts[str(value)+dtype], [1, dtype])
         return self.consts[str(value)+dtype]
 
     def get_tag_cse(self, value=None, shape="memref<1xi32>"):
@@ -1979,16 +1961,16 @@ def get_mask(self):
         if self.compute_body_loop.size % self.compute_body_loop.step == 0:
             return None, None
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
-        index_shape = f"vector<{self.compute_body_loop.step}xindex>"
         mask_shape = f"vector<{compute_vec_size}xi1>"
 
-        upper_bound = self.get_const_cse(self.compute_body_loop.size)
-        step_vec = self.const_cse.generate(self.const_buffer, f"vector.step : {index_shape}")
+        with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+            upper_bound = ops.constant(self.compute_body_loop.size, "index")
+            step_vec = ops.step(self.compute_body_loop.step, "index")
 
-        gap = self.mask_cse.generate(self.masks, f"arith.subi %{upper_bound}, %{self.compute_idx} : index")
-        gap_vec = self.mask_cse.generate(self.masks, f"vector.broadcast %{gap} : index to {index_shape}")
-        mask_var = self.mask_cse.generate(self.masks, f"arith.cmpi ult, %{step_vec}, %{gap_vec} : {index_shape}")
-        self.register_var_info(mask_var, [compute_vec_size, "i1"])
+        with self.override_buffer_cse(buffer=self.masks, cse=self.mask_cse):
+            gap = ops.sub(upper_bound, self.compute_idx)
+            gap_vec = ops.broadcast(gap, self.compute_body_loop.step)
+            mask_var = ops.lt(step_vec, gap_vec)
         return mask_shape, mask_var
 
     def convert_indirect_indexing(self, index :sympy.Expr):
@@ -2007,14 +1989,8 @@ def convert_indirect_indexing(self, index :sympy.Expr):
         indirect_dims.sort()
         first_dim = indirect_dims[0]
         spad_vars = dict()
-        old_compute, old_dma_lods, old_dma_stores = self.compute, self.dma_loads, self.dma_stores
         compute_dependecy = any([target_dim not in self.spad_buffer_dict for target_dim in indirect_dims])
-        if compute_dependecy:
-            self.compute = old_dma_stores
-            target_dma_buffers = self.dma_stores
-        else:
-            self.compute = old_dma_lods
-            target_dma_buffers = self.dma_loads
+        target_dma_buffers = self.dma_stores if compute_dependecy else self.dma_loads
 
         # Load indirect operands
         for target_dim in indirect_dims:
@@ -2028,6 +2004,7 @@ def convert_indirect_indexing(self, index :sympy.Expr):
                 local_tile_desc = self.kernel_group.tile_desc
                 tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
                 tile_shape = local_tile_desc.get_mlir_shape(var_info[1])
+                tile_vec = local_tile_desc.get_compute_vec_size()
                 vshape = f"vector<{var_info[0]}x{var_info[1]}>"
                 sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, target_dim, local_tile_desc, target_dim)
                 self.spad_buffer_dict[target_dim] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
@@ -2038,52 +2015,37 @@ def convert_indirect_indexing(self, index :sympy.Expr):
                 line = f"{opeartion} %{target_dim}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
                 self.stores.writeline(line)
             mlir_dtype = vshape.split("x")[1][:-1]
-            vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute...
-            if tile_numel_per_lane > 1:
-                operation = "affine.vector_load"
-                line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape} // For indirect access"
-            else:
-                operation = "affine.load"
-                line = f"{operation} %{sram_var}[{sram_index_var}] : {tile_shape} // For indirect access"
-            out = self.cse.generate(target_dma_buffers, line)
-            self.register_var_info(out, [tile_numel_per_lane, mlir_dtype])
-            spad_vars[target_dim] = out
-
-        # Apply stride
-        for arg in index.args:
-            if "tmp" not in str(arg):
-                continue
-            if arg.is_Mul and arg.args[0].is_number:
-                coeff_dtype = self.var_info[spad_vars[str(arg.args[1])]][1]
-                coeff = ops.constant(int(arg.args[0]), coeff_dtype)
-                spad_vars[str(arg.args[1])] = ops.mul(spad_vars[str(arg.args[1])], coeff)
-            index = index.replace(arg, 0)
-
-        # Sum
-        for dim, var in spad_vars.items():
-            if dim == first_dim:
-                continue
-            spad_vars[first_dim] = ops.add(spad_vars[first_dim], var)
+            with self.override_buffer_cse(buffer=target_dma_buffers):
+                out = ops._load(tile_numel_per_lane, mlir_dtype, sram_var, sram_index_var, tile_shape)
+                spad_vars[target_dim] = out
+
+        with self.override_buffer_cse(buffer=target_dma_buffers):
+            # Apply stride
+            for arg in index.args:
+                if "tmp" not in str(arg):
+                    continue
+                if arg.is_Mul and arg.args[0].is_number:
+                    coeff_dtype = self.var_info[spad_vars[str(arg.args[1])]][1]
+                    coeff = self.get_const_cse(int(arg.args[0]), coeff_dtype)
+                    spad_vars[str(arg.args[1])] = ops.mul(spad_vars[str(arg.args[1])], coeff)
+                index = index.replace(arg, 0)
+
+            # Sum
+            for dim, var in spad_vars.items():
+                if dim == first_dim:
+                    continue
+                spad_vars[first_dim] = ops.add(spad_vars[first_dim], var)
 
         # Store index var
         sram_var, _, tile_numel_per_lane, sram_index_var, tile_shape, vshape = self.spad_buffer_dict[first_dim]
         mlir_dtype = vshape.split("x")[1][:-1]
-        vshape = f"vector<{tile_numel_per_lane}x{mlir_dtype}>" # FIXME. Maybe require fine grain compute...
-        if tile_numel_per_lane > 1:
-            operation = "affine.vector_store"
-            line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}, {vshape}"
-        else:
-            operation = "affine.store"
-            line = f"{operation} %{spad_vars[first_dim]}, %{sram_var}[{sram_index_var}] : {tile_shape}"
-        out = self.cse.generate(target_dma_buffers, line, assignment=False)
+        with self.override_buffer_cse(buffer=target_dma_buffers):
+            ops._store(spad_vars[first_dim], sram_var, sram_index_var, tile_shape) # FIXME. Maybe require fine grain compute...
 
         # Conversion
         mlir_dtype = self.var_info[spad_vars[first_dim]][1]
-        line = f"affine.load %{sram_var}[{sram_index_var}] : {tile_shape}"
-        out = self.cse.generate(target_dma_buffers, line)
-        if mlir_dtype != "index":
-            line = f"arith.index_cast %{out} : {mlir_dtype} to {'index'}"
-            out = self.cse.generate(target_dma_buffers, line)
-        self.register_var_info(out, [1, "index", [1]])
-        self.compute, self.dma_loads, self.dma_stores = old_compute, old_dma_lods, old_dma_stores
+        with self.override_buffer_cse(buffer=target_dma_buffers):
+            out = ops._load(1, mlir_dtype, sram_var, sram_index_var, tile_shape)
+            if mlir_dtype != "index":
+                out = ops.index_cast(out, "index")
         return index + sympy.Symbol(str(out)), compute_dependecy
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 4d33eea4..f4dbe678 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1,5 +1,7 @@
 import dataclasses
 import math
+import contextvars
+from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import Dict
 from typing import List
@@ -68,7 +70,7 @@
     torch.int8: "int8_t",
     torch.uint8: "uint8_t",
     torch.bool: "uint8_t",
-    torch.bfloat16: "bfloat16",
+    torch.bfloat16: "uint16_t",
 }
 
 MLIR_TO_BIT = {
@@ -588,6 +590,7 @@ def __init__(self, kernel_group, reason=None):
         self.ranges = None
         self.reduction_depth = None
         self.itervars = None
+        self.itervar_cses = None
         # Code buffer
         self.vector_compute = IndentedBuffer()
         self.reductions_suffix = IndentedBuffer()
@@ -595,12 +598,17 @@ def __init__(self, kernel_group, reason=None):
         # MLIR SSA tracker
         self.var_info = {} # MLIR variable info
         self.buffer_types : dict = None # format: dtype, numel, size, stride
-        self.compute_idx = "compute_idx"
+        # Create compute idx
+        self.compute_idx = self.register_var_cse("compute_idx", 1, "index")
         self.compute_body_loop = LoopLevel(self.compute_idx, 1)
         self.prologue_compute_body_loop = LoopLevel(self.compute_idx, 1)
         self.recodegen = reason # spad overflow, tile size, vlane stride
         self.stop_autotune = False
 
+        # Context var for codegen
+        self.target_buffer_override = contextvars.ContextVar("Handler_compute_override", default=self.compute)
+        self.target_cse_override = contextvars.ContextVar("Handler_cse_override", default=self.cse)
+
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
             assert self.call_ranges == tuple(lengths) + tuple(
@@ -611,6 +619,7 @@ def set_ranges(self, lengths, reduction_lengths):
             self.call_ranges = tuple(lengths) + tuple(reduction_lengths)
             self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
             self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
+            self.itervar_cses = {str(index) : self.register_var_cse(str(index), 1, "index") for index in self.itervars}
             self.reduction_depth = len(lengths)
         return (
             self.itervars[: self.reduction_depth],
@@ -801,28 +810,6 @@ def get_constant_vector(self, expr):
         constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
         return constant_vector
 
-    def get_constant_vector2(self, expr):
-        # Case 0. symbol ex) index 0
-        # Case 1. inner product form ex) 16 * index0 + 1 * index1
-        # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4)
-        constant_vector = []
-        if expr.is_symbol:
-            constant_vector.append(tuple([1, expr]))
-            return constant_vector
-
-        for arg in expr.args:
-            if arg.is_symbol:
-                constant_vector.append(tuple([1,arg]))
-                continue
-            if len(arg.args) == 0: #TODO: check this
-                continue
-            if arg.args[0].is_number:
-                constant_vector.append(arg.args)
-            else:
-                constant_vector.append([1, arg])
-
-        return constant_vector
-
     def find_node_by_name(self, name):
         if name in V.graph.graph_inputs:
             return V.graph.graph_inputs[name]
@@ -837,6 +824,11 @@ def is_scalar(self, name):
     def roundup_vectorlane(self, size, amp=1):
         return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp
 
+    def register_var_cse(self, name, size, dtype):
+        var = self.create_cse_var(name, ValueRanges.unknown())
+        self.register_var_info(var, [size, dtype])
+        return var
+
     def register_var_info(self, var, var_info):
         self.var_info[var] = var_info
 
@@ -854,6 +846,21 @@ def rename_indexing(self, index) -> sympy.Expr:
         }
         return sympy_subs(index, replacements)
 
+    @contextmanager
+    def override_buffer_cse(self, *, buffer=None, cse=None):
+        target_buffer = target_cse = None
+        try:
+            if buffer is not None:
+                target_buffer = self.target_buffer_override.set(buffer)
+            if cse is not None:
+                target_cse = self.target_cse_override.set(cse)
+            yield self
+        finally:
+            if target_cse is not None:
+                self.target_cse_override.reset(target_cse)
+            if target_buffer is not None:
+                self.target_buffer_override.reset(target_buffer)
+
     def __enter__(self):
         class CSEProxy:
             self.name = "CSEProxy"
@@ -861,16 +868,22 @@ class CSEProxy:
             @staticmethod
             def __getattr__(name: str) -> Callable[..., common.CSEVariable]:  # type: ignore[misc]
                 def inner(*args, **kwargs):
-                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info)
-                    csevar = self.cse.generate(
-                        self.compute,
-                        code,
-                        bounds=ValueRanges.unknown(),
-                        assignment=(ret_info[0] is not None)
-                    )
-                    if ret_info[0] is not None:
-                        self.register_var_info(csevar, ret_info)
-                        csevar.update_on_args(name, args, kwargs)
+                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, **kwargs)
+                    target_buffer = self.target_buffer_override.get()
+                    target_cse = self.target_cse_override.get()
+                    if isinstance(code, common.DeferredLine):
+                        target_buffer.writeline(code)
+                        return None
+                    else:
+                        csevar = target_cse.generate(
+                            target_buffer,
+                            code,
+                            bounds=ValueRanges.unknown(),
+                            assignment=(ret_info[0] is not None)
+                        )
+                        if ret_info[0] is not None:
+                            self.register_var_info(csevar, ret_info)
+                            csevar.update_on_args(name, args, kwargs)
                     return csevar
 
                 return inner
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index e493464a..12782ce8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -85,7 +85,8 @@ def as_local(self):
         }
         try:
             self.set_buffers()
-            yield self
+            with self.kernel.override_buffer_cse(buffer=self.compute, cse=self.cse):
+                yield self
         finally:
             self.restore_buffers()
 
@@ -822,7 +823,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
                 attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
             attribute = "  {" + ", ".join(attribute_parts) + "}"
             code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                     dram_shape, tile_shape, "")
+                                    dram_shape, tile_shape, "")
             local_code.writeline(code)
             local_code.writeline(attribute)
         return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
@@ -885,28 +886,18 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         zero_var = self.get_const_cse(0)
         if not self.reduction_fusion:
             compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
-            if compute_vec_size > 1:
-                operation = "affine.vector_load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-            else:
-                operation = "affine.load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
-            out = self.cse.generate(self.loads, line)
-            self.register_var_info(out, [compute_vec_size, mlir_dtype])
+            with self.override_buffer_cse(buffer=self.loads):
+                out = ops._load(compute_vec_size, mlir_dtype, sram_var, compute_index_var, tile_shape)
         else: # For reduction case
             reduce_size = self.reduction_nr_outer_loop
             vsize = compute_vec_size//reduce_size
-            vshape = f"vector<{vsize}x{mlir_dtype}>"
 
             if compute_vec_size > 1:
                 offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})")
                 compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
-                operation = "affine.vector_load"
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-                out = self.cse.generate(self.loads, line)
-            else:
-                line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}"
-                out = self.cse.generate(self.loads, line)
+
+            with self.override_buffer_cse(buffer=self.loads):
+                out = ops._load(vsize, mlir_dtype, sram_var, compute_index_var, tile_shape)
             self.register_var_info(out, [self.compute_body_loop.step, mlir_dtype])
         return out
 
@@ -924,10 +915,6 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.kernel_group.tile_desc.get_tile_stride()
 
-        # Compute vector unit size
-        vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
-        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
-
         if name not in self.buffer_names:
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
             self.buffer_names[name] = sram_var
@@ -945,14 +932,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
         compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
         # Generate vector load instruction
-        if compute_vec_size > 1:
-            operation = "affine.vector_store"
-            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-        else:
-            operation = "affine.store"
-            line = f"{operation} %{value}, %{sram_var}[{compute_index_var}] : {tile_shape}"
-        line = line if store_force else DeferredLine(name, line)
-        self.stores.writeline(line)
+        buffer_name = name if not store_force else None
+        with self.override_buffer_cse(buffer=self.stores):
+            ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=buffer_name)
 
         # Generate DMA instruction
         attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
@@ -991,6 +973,7 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
 
         tile_shape = local_tile_desc.get_mlir_shape(type_name)
         vshape = local_tile_desc.get_mlir_vshape(type_name)
+        compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
 
         name = f"{reduction_type}_buffer{self.reduction_buffer_idx}"
         self.reduction_buffer_idx += 1
@@ -1002,24 +985,21 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         zero_var_list = [f"%{self.get_const_cse(0)}"] * local_tile_desc.get_nr_dim()
         zero_var_list[-2] = f"%{self.reduction_loop_idx}"
         compute_index_var = ", ".join(zero_var_list)
-        operation = "affine.vector_load"
-        line = f"{operation} %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-        out = self.cse.generate(self.loads, line)
-        self.register_var_info(out, [self.compute_body_loop.step, type_name])
+        with self.override_buffer_cse(buffer=self.loads):
+            out = ops._load(vec_size, type_name, sram_var, compute_index_var, tile_shape)
 
         # Reduction body codegen
-        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}")
-        init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {type_name} to {vshape}")
-        self.register_var_info(init_vec, [local_tile_desc.get_compute_vec_size(), type_name])
+        with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+            init = ops.constant(reduction_init(reduction_type, dtype), type_name)
+            init_vec = ops.broadcast(init, compute_vec_size)
+
         mask_shape, mask_var = self.get_mask()
         if mask_var is not None:
             value = ops.where(mask_var, value, init_vec)
         result = reduction_partial_combine_vec(reduction_type, value, out)
 
         # Store partial result
-        operation = "affine.vector_store"
-        line = f"{operation} %{result}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-        self.compute.writeline(line) # Need to be placed after partial reduction
+        ops._store(result, sram_var, compute_index_var, tile_shape) # Need to be placed after partial reduction
         self.reduction_info[sram_var] = [reduction_type, local_tile_desc]
         return sram_var
 
@@ -1050,63 +1030,60 @@ def store_reduction_epilogue(self, name, index, value):
         partial_tile_shape = partial_tile_desc.get_mlir_shape(mlir_dtype)
 
         # Prepare constant
-        init = self.const_cse.generate(self.const_buffer, f"arith.constant {reduction_init(self.reduction_info[value][0], dtype)} : {mlir_dtype}")
+        with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+            init = ops.constant(reduction_init(self.reduction_info[value][0], dtype), mlir_dtype)
+            init_vec = ops.broadcast(init, partial_vec_size)
+            init_vec2 = ops.broadcast(init, 2)
+
         partial_zero_var_list = [f"%{self.get_const_cse(0)}"] * partial_tile_desc.get_nr_dim()
         final_zero_var_list = [f"%{self.get_const_cse(0)}"] * final_tile_desc.get_nr_dim()
         for i in range(self.reduction_body_loop.size):
             # Load partial result
-            body_index_var = self.const_cse.generate(self.const_buffer, f"arith.constant {i} : index")
-            partial_zero_var_list[-2] = f"%{body_index_var}"
-            compute_index_var = ",".join(partial_zero_var_list)
-
-            operation = "affine.vector_load"
-            line = f"{operation} %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}"
-            out = self.cse.generate(self.reductions_suffix, line)
-            operation = "affine.vector_store"
-            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {partial_vshape}")
-            line = f"{operation} %{init_vec}, %{value}[{compute_index_var}] : {partial_tile_shape}, {partial_vshape}"
-            self.reductions_suffix.writeline(line)
-
-            # 2 step reduction
-            new_vec_size = 2
-            new_vshape = f"vector<{partial_vec_size//new_vec_size}x{new_vec_size}x{mlir_dtype}>"
-            new_reduced_shape = f"vector<{new_vec_size}x{mlir_dtype}>"
-            out = self.cse.generate(self.reductions_suffix, f"vector.shape_cast %{out} : {partial_vshape} to {new_vshape}")
-            init_vec = self.const_cse.generate(self.const_buffer, f"vector.broadcast %{init} : {mlir_dtype} to {new_reduced_shape}")
-            out = self.cse.generate(self.reductions_suffix, reduction_combine_vec(self.reduction_info[value][0], out, init_vec, axis=0, shape=new_vshape, reduced_shape=new_reduced_shape))
-            out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}")
+            with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                body_index_var = ops.constant(i, "index")
+                partial_zero_var_list[-2] = f"%{body_index_var}"
+                compute_index_var = ",".join(partial_zero_var_list)
+
+            with self.override_buffer_cse(buffer=self.reductions_suffix):
+                out = ops._load(partial_vec_size, mlir_dtype, sram_var, compute_index_var, partial_tile_shape)
+                ops._store(init_vec, value, compute_index_var, partial_tile_shape) # Clear the partial buffer to zero
+
+                # 2 step reduction
+                new_vec_size = 2
+                new_reduced_shape = f"<{new_vec_size}x{mlir_dtype}>"
+                reduction_type = self.reduction_info[value][0]
+                out = ops.multi_reduction(out, init_vec, partial_vec_size, new_vec_size, reduction_type, partial_vshape, self.reduction_info[value][0], mlir_dtype)
 
-            self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
-            self.register_var_info(out, [new_vec_size, mlir_dtype])
+            out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}")
             self.register_var_info(out2, [new_vec_size, mlir_dtype])
-            out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2)
-            self.compute, self.reductions_suffix = self.reductions_suffix, self.compute
+
+            with self.override_buffer_cse(buffer=self.reductions_suffix):
+                out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2)
 
             if self.welford_reduce_out is not None:
                 # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2
-                divider = self.cse.generate(self.reductions_suffix, f"arith.constant {float(self.r_dim_size)} : f32")
-                if self.buffer_types[name][1] > 1:
-                    divider_vec = self.cse.generate(self.reductions_suffix, f"vector.broadcast %{divider} : f32 to {new_reduced_shape}")
-                else:
-                    divider_vec = divider
-
-                if self.current_node.node.origin_node: # FIXME: This is a temporary solution
-                    # mean = SUM(X) / N
-                    self.reduction_mean.append(self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}"))
-                    out = self.reduction_mean[i]
-                else:
-                    # m2 = (E(X^2) - E(X)^2) * N
-                    sqr_mean = self.cse.generate(self.reductions_suffix, f"arith.divf %{out}, %{divider_vec} : {new_reduced_shape}")
-                    mean_sqr = self.cse.generate(self.reductions_suffix, f"arith.mulf %{self.reduction_mean[i]}, %{self.reduction_mean[i]} : {new_reduced_shape}")
-                    variance = self.cse.generate(self.reductions_suffix, f"arith.subf %{sqr_mean}, %{mean_sqr} : {new_reduced_shape}")
-                    m2 = self.cse.generate(self.reductions_suffix, f"arith.mulf %{variance}, %{divider_vec} : {new_reduced_shape}")
-                    out = m2
+                with self.override_buffer_cse(buffer=self.reductions_suffix):
+                    divider = ops.constant(float(self.reduction_axis_size), "f32")
+                    if self.buffer_types[name][1] > 1:
+                        divider_vec = ops.broadcast(divider, new_vec_size)
+                    else:
+                        divider_vec = divider
+
+                    if self.current_node.node.origin_node: # FIXME: This is a temporary solution
+                        # mean = SUM(X) / N
+                        self.reduction_mean.append(ops.div(out, divider_vec))
+                        out = self.reduction_mean[i]
+                    else:
+                        # m2 = (E(X^2) - E(X)^2) * N
+                        sqr_mean = ops.div(out, divider_vec)
+                        mean_sqr = ops.mul(self.reduction_mean[i], self.reduction_mean[i])
+                        variance = ops.sub(sqr_mean, mean_sqr)
+                        m2 = ops.mul(variance, divider_vec)
+                        out = m2
 
             final_zero_var_list[-1] = f"%{body_index_var}"
             final_compute_index_var = ",".join(final_zero_var_list)
-            operation = "affine.vector_store"
-            line = f"{operation} %{out}, %{sram_var}[{final_compute_index_var}] : {final_tile_shape}, {new_reduced_shape}"
-            self.reductions_suffix.writeline(DeferredLine(name, line))
+            ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name)
 
         # MVOUT Encoding
         # Generate DMA instruction
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 322d9b12..91d53b09 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -53,7 +53,7 @@ def write_arg(self, arg, path, name):
             tensor = arg.cpu().detach()
             buffer_size = tensor.untyped_storage().size()
             buffer = (ctypes.c_char * buffer_size).from_address(tensor.data_ptr())
-            t_arr = np.frombuffer(buffer, dtype=tensor.numpy().dtype, count=buffer_size // tensor.element_size())
+            t_arr = np.frombuffer(buffer, dtype=TORCH_TO_NUMPY[tensor.dtype], count=buffer_size // tensor.element_size())
             t_arr.tofile(data_path)
         else:
             assert(0)

From bea9bd2f6c7575b1a456a91d844493defc296f6b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Sep 2025 12:12:51 +0000
Subject: [PATCH 002/194] [Test] Add matmul vector fusion case

---
 tests/Fusion/test_matmul_vector.py | 52 ++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 tests/Fusion/test_matmul_vector.py

diff --git a/tests/Fusion/test_matmul_vector.py b/tests/Fusion/test_matmul_vector.py
new file mode 100644
index 00000000..bf1bd513
--- /dev/null
+++ b/tests/Fusion/test_matmul_vector.py
@@ -0,0 +1,52 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_matmul_vector(device, size=[56, 78, 239], dim=0):
+    def matmul_fused(a, b, c, d):
+        return torch.matmul(a, b) + c + d
+    torch.manual_seed(0)
+    input = torch.randn(size[:2])
+    weight = torch.randn(size[1:])
+    output_sz = [size[0], size[2]]
+    output_sz[dim]=1
+    bias = torch.zeros(output_sz)
+    add = torch.zeros(output_sz)
+    x1 = input.to(device=device)
+    w1 = weight.to(device=device)
+    b1 = bias.to(device=device)
+    a1 = add.to(device=device)
+    x2 = input.to("cpu")
+    w2 = weight.to("cpu")
+    b2 = bias.to("cpu")
+    a2 = add.to("cpu")
+    opt_fn = torch.compile(dynamic=False)(matmul_fused)
+    res = opt_fn(x1, w1, a1, b1)
+    y = matmul_fused(x2, w2, a2, b2)
+    test_result("Matmul Vector Fusion Forward", res, y)
+
+if __name__ == "__main__":
+    import os
+    import sys
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+
+    from Scheduler.scheduler import ExecutionEngine
+    module = ExecutionEngine.setup_device()
+    device = module.custom_device()
+    test_matmul_vector(device, size=[253, 123, 47], dim=0)
+    test_matmul_vector(device, size=[253, 123, 47], dim=1)
\ No newline at end of file

From 837b0627df4a5c3134a493a4996c0847d7d123ba Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Sep 2025 14:14:09 +0000
Subject: [PATCH 003/194] [Frontend] Fix ops conversion

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 16 ++++++++--------
 PyTorchSimFrontend/mlir/mlir_template.py        | 17 ++++++++---------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d4c2fdd6..13d75c94 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -813,12 +813,11 @@ def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
         cond_type = var_info[condition]
         operand_type = var_info[operand1]
         if cond_type[0] < tile_size:
-            condition = ops.broadcast(condition, operand_type[0])
+            condition = ops.broadcast(condition, tile_size)
         elif cond_type[0] > tile_size:
-            operand1 = ops.broadcast(operand1, operand_type[0])
-            operand2 = ops.broadcast(operand2, operand_type[0])
+            operand1 = ops.broadcast(operand1, cond_type[0])
+            operand2 = ops.broadcast(operand2, cond_type[0])
         tile_size, ret_type = var_info[operand1]
-
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
         return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type]
@@ -1174,10 +1173,6 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             # Todo. If tile_size is not same (i.e., view operation), we can't apply peephole optimization easily
             require_store = self.spad_buffer_dict[str(value)][1] != tile_size
 
-        if compute_vec_size < self.var_info[value][0]:
-            value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
-            self.register_var_info(value, [compute_vec_size, mlir_dtype])
-
         if require_store:
             # Define scratch pad buffer
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, local_tile_desc, index)
@@ -1186,6 +1181,11 @@ def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             _, operand_type = self.var_info[value]
             if mlir_dtype != operand_type:
                 value = ops.custom_cast(value, mlir_dtype)
+
+            if compute_vec_size < self.var_info[value][0]:
+                value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
+                self.register_var_info(value, [compute_vec_size, mlir_dtype])
+
             with self.override_buffer_cse(buffer=self.stores):
                 ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name)
         else:
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 12782ce8..b51c2794 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -1045,14 +1045,14 @@ def store_reduction_epilogue(self, name, index, value):
                 compute_index_var = ",".join(partial_zero_var_list)
 
             with self.override_buffer_cse(buffer=self.reductions_suffix):
-                out = ops._load(partial_vec_size, mlir_dtype, sram_var, compute_index_var, partial_tile_shape)
+                out = ops._load(partial_vec_size, mlir_dtype, value, compute_index_var, partial_tile_shape)
                 ops._store(init_vec, value, compute_index_var, partial_tile_shape) # Clear the partial buffer to zero
 
                 # 2 step reduction
                 new_vec_size = 2
-                new_reduced_shape = f"<{new_vec_size}x{mlir_dtype}>"
+                new_reduced_shape = f"vector<{new_vec_size}x{mlir_dtype}>"
                 reduction_type = self.reduction_info[value][0]
-                out = ops.multi_reduction(out, init_vec, partial_vec_size, new_vec_size, reduction_type, partial_vshape, self.reduction_info[value][0], mlir_dtype)
+                out = ops.multi_reduction(out, init_vec2, partial_vec_size, new_vec_size, partial_vshape, reduction_type, mlir_dtype)
 
             out2 = self.cse.generate(self.reductions_suffix, f"vector.shuffle %{out}, %{out} [1, 0] : {new_reduced_shape}, {new_reduced_shape}")
             self.register_var_info(out2, [new_vec_size, mlir_dtype])
@@ -1060,9 +1060,8 @@ def store_reduction_epilogue(self, name, index, value):
             with self.override_buffer_cse(buffer=self.reductions_suffix):
                 out = reduction_partial_combine_vec(self.reduction_info[value][0], out, out2)
 
-            if self.welford_reduce_out is not None:
-                # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2
-                with self.override_buffer_cse(buffer=self.reductions_suffix):
+                if self.welford_reduce_out is not None:
+                    # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2
                     divider = ops.constant(float(self.reduction_axis_size), "f32")
                     if self.buffer_types[name][1] > 1:
                         divider_vec = ops.broadcast(divider, new_vec_size)
@@ -1081,9 +1080,9 @@ def store_reduction_epilogue(self, name, index, value):
                         m2 = ops.mul(variance, divider_vec)
                         out = m2
 
-            final_zero_var_list[-1] = f"%{body_index_var}"
-            final_compute_index_var = ",".join(final_zero_var_list)
-            ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name)
+                final_zero_var_list[-1] = f"%{body_index_var}"
+                final_compute_index_var = ",".join(final_zero_var_list)
+                ops._store(out, sram_var, final_compute_index_var, final_tile_shape, buffer_name=name)
 
         # MVOUT Encoding
         # Generate DMA instruction

From a33659af759442cacf4c6e85b0582c0c30b964f1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 10 Sep 2025 06:05:59 +0000
Subject: [PATCH 004/194] [Frontend] Use custom malloc in the validation
 wrapper code

---
 .../mlir/mlir_caller_codegen.py               | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index dff6b0fd..38a1f7a9 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -58,7 +58,11 @@ def load_arg(self):
             if self.is_in_arg(arg_attribute[0]):
                 argv_idx = self.get_argv_idx() if arg_name not in self.load_args else self.load_args[arg_name]
                 self.load_args[arg_name] = argv_idx
-                self.writeline(f'if(load_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
+                ctype = DTYPE_TO_C[arg_attribute[1]]
+                elem_count = arg_attribute[2]
+                size_expr = f'({elem_count}ULL * sizeof({ctype}))'
+
+                self.writeline(f'if(load_arg(c_{arg_name}, {size_expr}, argv[{argv_idx}]) == -1){self.open_bracket}')
                 with self.code.indent():
                     self.writeline(f'return -1{self.ending}')
                 self.writeline(self.closed_bracket)
@@ -67,7 +71,10 @@ def dump_arg(self):
         for arg_name, arg_attribute in self.arg_attributes:
             if self.is_out_arg(arg_attribute[0]):
                 argv_idx = self.get_argv_idx() if not self.is_inout_arg(arg_attribute[0]) else self.load_args[arg_name]
-                self.writeline(f'if(dump_arg(c_{arg_name}, sizeof(c_{arg_name}), argv[{argv_idx}]) == -1){self.open_bracket}')
+                ctype = DTYPE_TO_C[arg_attribute[1]]
+                elem_count = arg_attribute[2]
+                size_expr = f'({elem_count}ULL * sizeof({ctype}))'
+                self.writeline(f'if(dump_arg(c_{arg_name}, {size_expr}, argv[{argv_idx}]) == -1){self.open_bracket}')
                 with self.code.indent():
                     self.writeline(f'return -1{self.ending}')
                 self.writeline(self.closed_bracket)
@@ -84,29 +91,24 @@ def generate_kernel_declare(self):
     def generate_args_define(self):
         name_set = set()
         if self.validation:
-            self.writeline(f'int padding[0x100000]{self.ending}') # FIXME. For pooling operation... Some pooling layer use negative offset
+            self.writeline(f"int* padding = malloc(0x100000ULL * sizeof(int)){self.ending}")
         for arg_name, (_, arg_type, arg_size, arg_sizes, arg_stride) in self.arg_attributes:
             if not arg_name in name_set:
-                if self.validation:
-                    self.writeline(f'{DTYPE_TO_C[arg_type]} c_{arg_name}[{arg_size}ULL]{self.ending}')
+                if torch.is_floating_point(torch.tensor([], dtype=arg_type)):
+                    bits = torch.finfo(arg_type).bits
+                elif arg_type == torch.bool:
+                    bits = 8
                 else:
-                    if torch.is_floating_point(torch.tensor([], dtype=arg_type)):
-                        bits = torch.finfo(arg_type).bits
-                    elif arg_type == torch.bool:
-                        bits = 8
-                    else:
-                        bits = torch.iinfo(arg_type).bits
-                    self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}')
+                    bits = torch.iinfo(arg_type).bits
+                self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}')
                 name_set.add(arg_name)
         self.writeline(self.newline)
 
     def generate_main(self):
-        if self.validation:
-            self.generate_args_define()
-
         self.writeline(f'{self.newline}int main(int argc, char *argv[]) {self.open_bracket}{self.newline}')
         with self.code.indent():
             if self.validation:
+                self.generate_args_define()
                 self.load_arg()
                 self.writeline(self.newline)
             else:

From 4e2d0a022bc01b8551a73f1dbe690afffaadb63d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 10 Sep 2025 13:55:27 +0000
Subject: [PATCH 005/194] [Device] Add missing operations

---
 PyTorchSimFrontend/extension_device.cpp | 301 ++++++++++++++++++------
 1 file changed, 225 insertions(+), 76 deletions(-)

diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp
index 1a02bfe3..b728a852 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimFrontend/extension_device.cpp
@@ -16,6 +16,34 @@
 #include <ATen/core/GeneratorForPrivateuseone.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/native/CPUFallback.h>
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+
+namespace {
+  bool g_amp_enabled = false;
+  at::ScalarType g_amp_dtype = at::kFloat;
+}
+
+static at::ScalarType to_scalar_type(const py::object& dtype_obj) {
+  py::module torch_mod = py::module::import("torch");
+  if (dtype_obj.is(torch_mod.attr("bfloat16"))) return at::kBFloat16;
+  if (dtype_obj.is(torch_mod.attr("float16")))  return at::kHalf;
+  if (dtype_obj.is(torch_mod.attr("float32")))  return at::kFloat;
+  if (dtype_obj.is(torch_mod.attr("float64")))  return at::kDouble;
+  throw std::runtime_error("Unsupported dtype for extension_device AMP");
+}
+
+static py::object to_torch_dtype(at::ScalarType st) {
+  py::module torch_mod = py::module::import("torch");
+  switch (st) {
+    case at::kBFloat16: return torch_mod.attr("bfloat16");
+    case at::kHalf:     return torch_mod.attr("float16");
+    case at::kFloat:    return torch_mod.attr("float32");
+    case at::kDouble:   return torch_mod.attr("float64");
+    default:
+      throw std::runtime_error("Unsupported scalar type in get_autocast_dtype");
+  }
+}
 
 static uint64_t op_counter = 0;
 static uint64_t last_saved_value = 0;
@@ -99,8 +127,16 @@ at::Tensor custom_to_device(
   TORCH_CHECK(self.is_contiguous());
 
   op_counter += 1;
-  if (device != at::DeviceType::CPU) {
-    return at::empty(self.sizes(), self.options());
+  if (device.type() == at::DeviceType::CPU) {
+    auto out = at::empty(self.sizes(), dtype, self.options().layout(),
+                         device, false, memory_format);
+    std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes());
+    return out;
+  } else {
+    auto opts = self.options().device(device).dtype(dtype);
+    auto out = at::empty(self.sizes(), opts);
+    std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes());
+    return out;
   }
 
   auto out = at::empty(self.sizes(), dtype, self.options().layout(), device, false, memory_format);
@@ -135,33 +171,86 @@ static DummyCustomAllocator global_custom_alloc;
 REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc);
 
 at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) {
-  TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows dummy device.");
+  TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1,
+              "Dummy test only allows dummy device.");
   TORCH_CHECK(self.is_contiguous());
-  // TORCH_CHECK(self.scalar_type() == c10::ScalarType::Float);
 
   op_counter += 1;
-  if (self.scalar_type() == c10::ScalarType::Float) {
-    auto _data = static_cast<float*>(self.mutable_data_ptr());
-    for (size_t idx = 0; idx < self.numel(); idx++) {
-      _data[idx] = value.toFloat();
+
+  switch (self.scalar_type()) {
+    case c10::ScalarType::Float: {
+      auto* data = self.mutable_data_ptr<float>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = value.toFloat();
+      }
+      break;
     }
-    return self;
-  } else if (self.scalar_type() == c10::ScalarType::Int) {
-    auto _data = static_cast<int*>(self.mutable_data_ptr());
-    for (size_t idx = 0; idx < self.numel(); idx++) {
-      _data[idx] = value.toInt();
+    case c10::ScalarType::Double: {
+      auto* data = self.mutable_data_ptr<double>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = value.toDouble();
+      }
+      break;
     }
-    return self;
-  } else if (self.scalar_type() == c10::ScalarType::Long) {
-    auto _data = static_cast<int64_t*>(self.mutable_data_ptr());
-    for (size_t idx = 0; idx < self.numel(); idx++) {
-      _data[idx] = value.toLong();
+    case c10::ScalarType::Half: {
+      auto* data = self.mutable_data_ptr<at::Half>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = at::Half(value.toHalf());
+      }
+      break;
     }
-    return self;
-  } else {
-    TORCH_CHECK(false, "Unsupported scalar type.");
+    case c10::ScalarType::BFloat16: {
+      auto* data = self.mutable_data_ptr<at::BFloat16>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = at::BFloat16(value.toBFloat16());
+      }
+      break;
+    }
+    case c10::ScalarType::Int: {
+      auto* data = self.mutable_data_ptr<int>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = value.toInt();
+      }
+      break;
+    }
+    case c10::ScalarType::Long: {
+      auto* data = self.mutable_data_ptr<int64_t>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = value.toLong();
+      }
+      break;
+    }
+    case c10::ScalarType::Short: {
+      auto* data = self.mutable_data_ptr<int16_t>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = static_cast<int16_t>(value.toShort());
+      }
+      break;
+    }
+    case c10::ScalarType::Char: {
+      auto* data = self.mutable_data_ptr<int8_t>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = static_cast<int8_t>(value.toChar());
+      }
+      break;
+    }
+    case c10::ScalarType::Byte: {
+      auto* data = self.mutable_data_ptr<uint8_t>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = static_cast<uint8_t>(value.toByte());
+      }
+      break;
+    }
+    case c10::ScalarType::Bool: {
+      auto* data = self.mutable_data_ptr<bool>();
+      for (int64_t i = 0; i < self.numel(); i++) {
+        data[i] = value.toBool();
+      }
+      break;
+    }
+    default:
+      TORCH_CHECK(false, "Unsupported scalar type: ", self.scalar_type());
   }
-
   return self;
 }
 
@@ -204,6 +293,9 @@ at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool
       "Dummy test only allows copy from cpu -> dummy device.");
 
   // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
+  if (self.numel() != dst.numel()) {
+    custom_resize_(dst, self.sizes(), c10::nullopt);
+  }
   TORCH_CHECK(self.sizes() == dst.sizes());
 
   const bool same_dtype = (self.scalar_type() == dst.scalar_type());
@@ -255,9 +347,36 @@ at::Tensor& custom_arange_start_out_impl(
     const c10::Scalar& end,
     const c10::Scalar& step,
     at::Tensor& out) {
-    //const int64_t n = arange_len(start.toDouble(), end.toDouble(), step.toDouble());
-    //at::native::resize_output(out, {n});
-    return out;
+  double s = start.toDouble();
+  double e = end.toDouble();
+  double st = step.toDouble();
+  TORCH_CHECK(st != 0.0, "step must be nonzero");
+
+  int64_t length = 0;
+  if (st > 0) {
+    if (e > s) length = static_cast<int64_t>(std::ceil((e - s) / st));
+  } else {
+    if (e < s) length = static_cast<int64_t>(std::ceil((e - s) / st));
+  }
+
+  // Resize out tensor
+  custom_resize_(out, {length}, c10::nullopt);
+
+  if (out.scalar_type() == at::kFloat || out.scalar_type() == at::kDouble) {
+    double* data = out.mutable_data_ptr<double>();
+    for (int64_t i = 0; i < length; i++) {
+      data[i] = s + i * st;
+    }
+  } else if (out.scalar_type() == at::kLong) {
+    int64_t* data = out.mutable_data_ptr<int64_t>();
+    for (int64_t i = 0; i < length; i++) {
+      data[i] = static_cast<int64_t>(s + i * st);
+    }
+  } else {
+    TORCH_CHECK(false, "Unsupported dtype for arange on dummy device");
+  }
+
+  return out;
 }
 
 static at::Tensor custom_to_dtype_impl(const at::Tensor& self,
@@ -276,16 +395,16 @@ static at::Tensor custom_to_dtype_impl(const at::Tensor& self,
 // This macro registers your kernels to the PyTorch Dispatcher.
 // More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/.
 TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl("to.Device", &custom_to_device);
-  m.impl("to.dtype", &custom_to_dtype_impl);
-  m.impl("fill_.Scalar", &custom_fill__scalar);
-  m.impl("_copy_from", &custom__copy_from);
+  m.impl("to.Device",             &custom_to_device);
+  m.impl("to.dtype",              &custom_to_dtype_impl);
+  m.impl("fill_.Scalar",          &custom_fill__scalar);
+  m.impl("_copy_from",            &custom__copy_from);
   m.impl("_copy_from_and_resize", &custom__copy_from_and_resize);
-  m.impl("empty_strided", &custom_empty_strided);
-  m.impl("empty.memory_format", &custom_empty);
-  m.impl("as_strided", at::native::as_strided_tensorimpl);
-  m.impl("view", at::native::view);
-  m.impl("arange.start_out", &custom_arange_start_out_impl);
+  m.impl("empty_strided",         &custom_empty_strided);
+  m.impl("empty.memory_format",   &custom_empty);
+  m.impl("as_strided",            at::native::as_strided_tensorimpl);
+  m.impl("view",                  at::native::view);
+  m.impl("arange.start_out",      &custom_arange_start_out_impl);
 }
 
 TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
@@ -293,11 +412,11 @@ TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
 }
 
 TORCH_LIBRARY_FRAGMENT(aten, m) {
-m.def(
-  "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor",
-  torch::dispatch(
-      c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor),
-  {at::Tag::pt2_compliant_tag});
+  m.def(
+    "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor",
+    torch::dispatch(c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor),
+    {at::Tag::pt2_compliant_tag}
+  );
 }
 
 void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
@@ -305,39 +424,56 @@ void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack
 }
 
 TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("all.all_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_local_scalar_dense", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_log_softmax", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_log_softmax_backward_data", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mse_loss.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_lerp_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_mul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_addcmul_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_sqrt", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_div_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_addcdiv_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_native_multi_head_attention", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("abs.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sub.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("mul.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("div.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Tensor_Scalar",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("zero_",                         torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add.List",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index.Tensor",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("triu_indices",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("neg.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sum.IntList_out",               torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("eq.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("all.all_out",                   torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_local_scalar_dense",           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_log_softmax",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_log_softmax_backward_data",    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("mse_loss.out",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nll_loss_forward",              torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nll_loss_backward",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_lerp_.Scalar",         torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_mul_.Scalar",          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_addcmul_.Scalar",      torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_sqrt",                 torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_div_.ScalarList",      torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add_.Scalar",          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_addcdiv_.ScalarList",  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add_.List",            torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("cat.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_native_multi_head_attention",  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("resize_",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("exp.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("where.self",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ge.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ge.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("triu",                          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("tril",                          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_and.out",               torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_and.Tensor",            torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_or.out",                torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_or.Tensor",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_not.out",                torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_not.Tensor",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
 }
 
 // This basic implementation doesn't bother dealing with different device indices
@@ -360,7 +496,6 @@ bool custom_op_called() {
 
 class PrivateGeneratorImpl : public at::CPUGeneratorImpl {
 public:
-  // Constructors
   PrivateGeneratorImpl(c10::DeviceIndex device_index) {
     device_ = c10::Device(c10::DeviceType::PrivateUse1, device_index);
     key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1);
@@ -382,7 +517,21 @@ void register_generator() {
 // that's implemented in C++.
 // The implementation in this file maps directly to the `PrivateUse1` device type.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.def("custom_device", &get_custom_device, "get custom device object");
-    m.def("custom_op_called", &custom_op_called, "check if our custom function was called");
-    m.def("register_generator", &register_generator, "register generator for custom device");
+  m.def("custom_device", &get_custom_device, "get custom device object");
+  m.def("custom_op_called", &custom_op_called, "check if our custom function was called");
+  m.def("register_generator", &register_generator, "register generator for custom device");
+  m.def("is_autocast_enabled", []() -> bool { return g_amp_enabled;});
+  m.def("set_autocast_enabled", [](bool flag) -> void {g_amp_enabled = flag;});
+  m.def("get_autocast_dtype", []() -> py::object { return to_torch_dtype(g_amp_dtype); });
+  m.def("set_autocast_dtype", [](py::object dtype_obj) -> void {
+    auto st = to_scalar_type(dtype_obj);
+    g_amp_dtype = st;
+  });
+  m.def("get_amp_supported_dtype", []() -> py::list {
+    py::module torch_mod = py::module::import("torch");
+    py::list lst;
+    lst.append(torch_mod.attr("float16"));
+    lst.append(torch_mod.attr("float32"));
+    return lst;
+  });
 }
\ No newline at end of file

From 6e70edccfb515b85551933f20be22bd4e2f1fd35 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 10 Sep 2025 13:56:00 +0000
Subject: [PATCH 006/194] [Frontend] Add typecasting for logical operation

---
 .../mlir/mlir_codegen_backend.py              | 48 ++++++++++++++-----
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 13d75c94..f8195b58 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -624,7 +624,7 @@ def ne(operand1, operand2, *args, var_info=None, **kwargs):
             attribute = "one"
         elif ret_type[0] == "i":
             op_type = "arith.cmpi"
-            attribute = "sne"
+            attribute = "ne"
         else:
             raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
 
@@ -754,13 +754,25 @@ def xor(operand1, operand2, *args, var_info=None, **kwargs):
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
+    @staticmethod
+    def to_bool(operand, *args, var_info=None, **kwargs):
+        tile_size, ret_type = var_info[operand]
+        const_one = ops.constant(0, ret_type)
+        if tile_size > 1:
+            const_one = ops.broadcast(const_one, tile_size)
+        ret = ops.ne(operand, const_one)
+        return ret, [tile_size, "i1"]
 
     @staticmethod
     def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type = var_info[operand1]
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
         # Type check & auto cast
-        if op_type[1] != "i1":
-            raise NotImplementedError("Logical operation with not bool data type")
+        if op_type1[1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        if op_type2[1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         return ExtensionOverrides.and_(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
@@ -773,22 +785,30 @@ def logical_not(operand, *args, var_info=None, **kwargs):
         const_one = ops.constant(0, ret_type)
         const_one = ops.broadcast(const_one, tile_size)
         ret = ops.eq(operand,const_one)
-        return ret, [tile_size, var_info[ret]]
+        return ret, [tile_size, "i1"]
 
     @staticmethod
     def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type = var_info[operand1]
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
         # Type check & auto cast
-        if op_type[1] != "i1":
-            raise NotImplementedError("Logical operation with not bool data type")
+        if op_type1[1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        if op_type2[1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         return ExtensionOverrides.or_(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
     def logical_xor(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type = var_info[operand1]
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
         # Type check & auto cast
-        if op_type[1] != "i1":
-            raise NotImplementedError("Logical operation with not bool data type")
+        if op_type1[1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        if op_type2[1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         return ExtensionOverrides.xor(operand1, operand2, *args, var_info=var_info, **kwargs)
 
     @staticmethod
@@ -1006,8 +1026,10 @@ def convert_index(self, expr, buffer):
             expr_str = expr_str.replace("//", " floordiv ")
         else:
             raise NotImplementedError("What is this case?")
-
-        indices = [expr.args[0]]
+        first_arg = expr.args[0]
+        if len(first_arg.free_symbols) != 1:
+            raise NotImplementedError("What is this case?")
+        indices = [list(first_arg.free_symbols)[0]]
         args = ", ".join(map(str, indices))
         map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
         args = ", ".join([f"%{i}" for i in indices])

From 54f450a44101425fcc3dfa30d15e761ce1b53c33 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 10 Sep 2025 13:56:28 +0000
Subject: [PATCH 007/194] [Device] register amp

---
 Scheduler/scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index ffe8e4fc..d10df556 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -179,6 +179,7 @@ def setup_device():
         )
 
         torch.utils.rename_privateuse1_backend("npu")
+        torch._register_device_module("extension_device", module)
         from torch._inductor.codegen.common import (
             get_scheduling_for_device,
             get_wrapper_codegen_for_device,

From 8985ab8f47779be62b0efffafbcc24a33f7da134 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 12 Sep 2025 11:52:03 +0000
Subject: [PATCH 008/194] [Frontend+Test] Support scatter pattern with a test
 case

---
 PyTorchSimFrontend/extension_device.cpp       | 195 ++++++++++++++----
 .../mlir/mlir_codegen_backend.py              |  22 +-
 PyTorchSimFrontend/mlir/mlir_common.py        |   2 -
 tests/test_indirect_access.py                 |  36 ++++
 4 files changed, 205 insertions(+), 50 deletions(-)

diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp
index b728a852..f1351fab 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimFrontend/extension_device.cpp
@@ -424,56 +424,165 @@ void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack
 }
 
 TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl("add.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("abs.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mul.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div.Tensor",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Scalar",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("zero_",                         torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add.List",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index.Tensor",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu_indices",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("neg.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.IntList_out",               torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("eq.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("abs", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("abs_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("absolute", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("absolute.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("absolute_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("add_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("cat", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("cat.names", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("cat.names_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("div.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("div.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("div_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("div_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("eq.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("eq.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("eq.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("equal", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("erf", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("erf.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("erf_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("erfc", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("erfc.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("erfc_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("exp", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ge.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ge.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ge.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("le.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("lt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ne.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ne.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ne.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("ne.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("logical_and", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_and_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_not", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_not_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_or", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_or_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_xor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_xor.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("logical_xor_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("neg", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("neg_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("mul.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("mul_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("pow.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Tensor_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Tensor_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow.Tensor_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("pow_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("sub.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sub.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sub_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sub_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("sum", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sum.DimnameList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sum.dim_DimnameList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sum.dim_IntList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("resize_as_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  // Foreach ops
+  m.impl("_foreach_add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  // Indexed
+  m.impl("index_add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_add_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_copy.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_copy_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_fill.int_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_fill.int_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_fill.int_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_fill.int_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_fill_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("tril_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("triu_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("nll_loss2d_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nll_loss2d_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("scatter.src_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("scatter.value_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("index_put.Default", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("mm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("sigmoid.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("gather.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("silu.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
   m.impl("all.all_out",                   torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("_local_scalar_dense",           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("_log_softmax",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("_log_softmax_backward_data",    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("mse_loss.out",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_forward",              torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_backward",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_lerp_.Scalar",         torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_mul_.Scalar",          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_addcmul_.Scalar",      torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_sqrt",                 torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_div_.ScalarList",      torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.Scalar",          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_addcdiv_.ScalarList",  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.List",            torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("cat.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("_native_multi_head_attention",  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("resize_",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("exp.out",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("where.self",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ge.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ge.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Scalar",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Tensor",                     torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu",                          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("tril",                          torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_and.out",               torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_and.Tensor",            torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_or.out",                torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_or.Tensor",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_not.out",                torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_not.Tensor",             torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("min",                           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("max",                           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("index_select",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("nonzero",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+
+  m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
+  m.impl("zeros_like", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
 }
 
 // This basic implementation doesn't bother dealing with different device indices
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index f8195b58..382825f5 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1171,12 +1171,24 @@ def load(self, name: str, index: sympy.Expr):
         self.spad_buffer_dict[str(out)] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
         return out
 
-    def store(self, name: str, index: sympy.Expr, value, *args, **kwargs):
+    def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs):
         index = self.rename_indexing(index)
-        dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
+        # Handle scatter store
+        if "tmp" in str(index):
+            if mode == "atomic_add":
+                # Convert the output buffer type to the inplace buffer
+                arg_name =  V.graph.scheduler.mutation_real_name.get(name, name)
+                if arg_name not in self.kernel_group.args.inplace_buffers:
+                    self.kernel_group.args.make_inplace(arg_name, arg_name)
+
+                loaded_value = ops.load(name, index)
+                value = ops.add(loaded_value, value)
+            index, _ = self.convert_indirect_indexing(index)
+        dram_var = self.kernel_group.args.output(name)
+
         # Prepare dma instruction
         local_tile_desc, index_var, dram_stride = self.get_dma_info(name, index)
         vlane_split_axis = local_tile_desc.vmap.vlane_split_axis
@@ -1736,9 +1748,9 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         total_dims =  [int(str(i)[5:]) for i in self.itervars]
         local_tile_desc = mlir_common.MLIRMultiDimTile([1], self.vector_lane)
         local_dims.sort() # Assume that smaller index is placed in the outer loop
-        indirect_dims = [f"{i}" for i in index.free_symbols if "tmp" in str(i)]
-        for indirect_dim in indirect_dims:
-            index = index.replace(sympy.Symbol(indirect_dim), 0)
+        indirect_syms = [s for s in index.free_symbols if "tmp" in s.name]
+        index = index.subs({s: 0 for s in indirect_syms}, simultaneous=True)
+        indirect_dims = [f"{i}" for i in indirect_syms]
 
         # Reduction can have two type of tile size
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f4dbe678..15408c0d 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -792,8 +792,6 @@ def codegen_kernel(self, kernel_name):
         code.splice(self.codegen_global_init())
         code.writeline(f'func.func @{kernel_decl_name}({arg_defs})')
         with code.indent():
-            for old, new in self.kernel_group.args.aliases():
-                code.writeline(f"auto {old} = {new};")
             # Loop body part
             code.splice(self.codegen_loops())
         return code.getvalue()
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index c6afaf86..6cfa7b58 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -43,6 +43,40 @@ def test_embedding(device, vocab_size, dim):
     cpu_res = cpu_emb(cpu_prompt)
     test_result("Embedding", res, cpu_res)
 
+def test_scatter_add(device, num_tokens=256, hidden_size=256, num_assignments=3, dtype=torch.float32, seed=0):
+    torch.manual_seed(seed)
+
+    def scatter_only(out, token_indices, weighted_output):
+        # token_indices: [N] (long), weighted_output: [N, H]
+        out.index_add_(0, token_indices, weighted_output)
+        return out
+
+    out = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    out_cp = out.clone()
+    token_indices = torch.randint(0, num_tokens, (num_assignments,))
+    weighted_output = torch.randn(num_assignments, hidden_size, dtype=dtype)
+
+    cpu_out = scatter_only(out, token_indices, weighted_output)
+
+    out = out_cp.to(device=device)
+    token_indices = token_indices.to(device=device)
+    weighted_output = weighted_output.to(device=device)
+    opt_fn = torch.compile(dynamic=False)(scatter_only)
+    res = opt_fn(out, token_indices, weighted_output)
+    test_result("ScatterAdd(index_add_)", res, cpu_out)
+
+def test_scatter_full(device, size=(128, 128)):
+    def vectoradd(a, idx, b):
+        a[idx, :] = b
+        return a
+    x = torch.randn(size, dtype=torch.float32).to(device=device)
+    idx = torch.randint(0,128, [128]).to(device=device)
+    y = torch.randn(128, dtype=torch.float32).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(vectoradd)
+    res = opt_fn(x, idx, y)
+    out = vectoradd(x.cpu(), idx.cpu(), y.cpu())
+    test_result("Indirect VectorAdd", res, out)
+
 if __name__ == "__main__":
     import os
     import sys
@@ -51,5 +85,7 @@ def test_embedding(device, vocab_size, dim):
     from Scheduler.scheduler import PyTorchSimRunner
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
+    test_scatter_full(device)
+    test_scatter_add(device)
     test_indirect_vectoradd(device)
     #test_embedding(device, 1024, 2048)
\ No newline at end of file

From 1c2c8bf010661b2695b932288cb6ced19dfd47fa Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 5 Dec 2025 13:12:45 +0000
Subject: [PATCH 009/194] [Fix] minor bugs

---
 PyTorchSimFrontend/extension_config.py          | 2 +-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++---
 Scheduler/scheduler.py                          | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 239bbefe..8d668b58 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -67,7 +67,7 @@ def __getattr__(name):
             "multi_tile_conv",
             "subtile"
         }
-        if opt_level == "all" or opt_level is "none":
+        if opt_level == "all" or opt_level == "none":
             pass
         elif isinstance(opt_level, list):
             # Check if provided list contains only valid options
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 382825f5..5a29bc87 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1381,12 +1381,12 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
         dim_list = []
         for idx in range(len(tile_size)):
             # Prepare initial values
-            offset = tile_desc.vlane_stride #* strides[idx]
-            outer_sz = tile_size[idx] // tile_desc.vlane_stride
+            offset = tile_desc.vmap.vlane_stride #* strides[idx]
+            outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride
             with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
                 div_coeff = self.get_const_cse(strides[idx], "index")
                 mod_coeff = self.get_const_cse(tile_size[idx], "index")
-                vlane_stride_coeff = self.get_const_cse(tile_desc.vlane_stride, "index")
+                vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index")
                 vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
                 nr_vector_lane = self.get_const_cse(self.vector_lane, "index")
                 vlane_coeff = self.get_const_cse(0, "i64")
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index d10df556..31dbf6c0 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -179,7 +179,7 @@ def setup_device():
         )
 
         torch.utils.rename_privateuse1_backend("npu")
-        torch._register_device_module("extension_device", module)
+        torch._register_device_module("npu", module)
         from torch._inductor.codegen.common import (
             get_scheduling_for_device,
             get_wrapper_codegen_for_device,

From 1895958e75c7f094b35927734be2d76d4fda661e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 05:56:29 +0000
Subject: [PATCH 010/194] [Fix] Fix the acceess to wrong variable

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_template.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 5a29bc87..a14dd10b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1402,7 +1402,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
                 vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size)
 
             dim = ops.modular(ops.div(vector_index, div_vec), mod_vec)
-            if idx == tile_desc.vlane_split_axis: # Need to add vector lane offset
+            if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset
                 stride_dim = ops.modular(dim, vlane_stride_vec)
                 outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec)
                 dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec))
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b51c2794..cc17ada1 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -1062,7 +1062,7 @@ def store_reduction_epilogue(self, name, index, value):
 
                 if self.welford_reduce_out is not None:
                     # NOTE: It not a real welford algorithm... We just used E(X^2) - E(X)^2
-                    divider = ops.constant(float(self.reduction_axis_size), "f32")
+                    divider = ops.constant(float(self.r_dim_size), "f32")
                     if self.buffer_types[name][1] > 1:
                         divider_vec = ops.broadcast(divider, new_vec_size)
                     else:

From cd14109e6db330170ebbfe6b2bfb1aa13a4f8867 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 05:56:56 +0000
Subject: [PATCH 011/194] [Log] Add print lock to prevent log crash

---
 Simulator/simulator.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 91d53b09..4786fd32 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -16,6 +16,8 @@
 from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
 from PyTorchSimFrontend import extension_config
 
+print_lock = threading.Lock()
+
 TORCH_TO_NUMPY = {
     torch.float32: np.float32,
     torch.float64: np.float64,
@@ -157,9 +159,12 @@ def show_progress():
             while not finished:
                 i = (i + 1) % 3
                 tail = "." * i + " " * (3-i)
-                sys.stdout.write("\r[Gem5] Gem5 is running." + tail)
+                with print_lock:
+                    sys.stdout.write("\r[Gem5] Gem5 is running." + tail)
+                    sys.stdout.flush()
                 time.sleep(1)
-            print("")
+            with print_lock:
+                print("")
 
         dir_path = os.path.join(os.path.dirname(target_binary), "m5out")
         gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py")

From 5fe87e9a64168c7a1f8640801e1d23fadbeb8c4e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 10:07:22 +0000
Subject: [PATCH 012/194] [Device] Add custom zero_, zeors_like

---
 PyTorchSimFrontend/extension_device.cpp | 73 +++++++++++++++++++++++--
 1 file changed, 69 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp
index f1351fab..cfaecf2b 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimFrontend/extension_device.cpp
@@ -45,6 +45,16 @@ static py::object to_torch_dtype(at::ScalarType st) {
   }
 }
 
+static inline at::MemoryFormat fix_memory_format(c10::optional<at::MemoryFormat> mf_opt) {
+    if (!mf_opt.has_value()) return at::MemoryFormat::Contiguous;
+
+    auto mf = mf_opt.value();
+    if (mf == at::MemoryFormat::Preserve) {
+        return at::MemoryFormat::Contiguous;
+    }
+    return mf;
+}
+
 static uint64_t op_counter = 0;
 static uint64_t last_saved_value = 0;
 
@@ -339,7 +349,7 @@ at::Tensor custom_empty(c10::IntArrayRef size, c10::optional<at::ScalarType> dty
 
   constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
   auto dtype = c10::dtype_or_default(dtype_opt);
-  return  at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, optional_memory_format);
+  return  at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, fix_memory_format(optional_memory_format));
 }
 
 at::Tensor& custom_arange_start_out_impl(
@@ -386,6 +396,62 @@ static at::Tensor custom_to_dtype_impl(const at::Tensor& self,
   return at::native::to(self, dtype, non_blocking, copy, memory_format);
 }
 
+at::Tensor custom_zeros_like(
+    const at::Tensor& input,
+    c10::optional<at::ScalarType> dtype_opt,
+    c10::optional<at::Layout> layout_opt,
+    c10::optional<c10::Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt)
+{
+  // dtype / layout / device fallback
+  auto dtype   = dtype_opt.value_or(input.scalar_type());
+  auto layout  = layout_opt.value_or(input.layout());
+  auto device  = device_opt.value_or(input.device());
+  auto memfmt  = memory_format_opt.value_or(c10::MemoryFormat::Contiguous);
+
+  TORCH_CHECK(
+      device.type() == c10::DeviceType::PrivateUse1,
+      "custom_zeros_like: device must be PrivateUse1");
+
+  at::Tensor out = custom_empty(
+      input.sizes(),
+      dtype,
+      layout,
+      device,
+      pin_memory_opt,
+      memfmt
+  );
+  size_t nbytes = out.numel() * out.element_size();
+  void* ptr = out.mutable_data_ptr();
+
+  TORCH_CHECK(ptr != nullptr,
+      "custom_zeros_like: out.mutable_data_ptr() returned NULL");
+  std::memset(ptr, 0, nbytes);
+  return out;
+}
+
+at::Tensor& custom_zero_impl(at::Tensor& self)
+{
+    TORCH_CHECK(
+        self.device().type() == c10::DeviceType::PrivateUse1,
+        "custom_zero_: expected a PrivateUse1 device tensor");
+
+    if (self.numel() == 0) {
+        return self;
+    }
+
+    void* data = self.mutable_data_ptr();
+    TORCH_CHECK(data != nullptr,
+        "custom_zero_: self.mutable_data_ptr() returned NULL "
+        "(storage was not allocated)");
+
+    size_t nbytes = self.numel() * self.element_size();
+    std::memset(data, 0, nbytes);
+
+    return self;
+}
+
 // With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend.
 // For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key.
 // Later in this file, we map a custom device to the PrivateUse1 device type,
@@ -405,6 +471,8 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("as_strided",            at::native::as_strided_tensorimpl);
   m.impl("view",                  at::native::view);
   m.impl("arange.start_out",      &custom_arange_start_out_impl);
+  m.impl("zeros_like",            &custom_zeros_like);
+  m.impl("zero_",                 &custom_zero_impl);
 }
 
 TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
@@ -580,9 +648,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("max",                           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("index_select",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
   m.impl("nonzero",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("zero_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("zeros_like", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
 }
 
 // This basic implementation doesn't bother dealing with different device indices

From db18cbd3fac47f9a3c88462a6fdc5941aa55af94 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 10:57:01 +0000
Subject: [PATCH 013/194] [Frontend/Spike] Use 64byte aligned buffer size

---
 PyTorchSimFrontend/mlir/mlir_caller_codegen.py | 4 +++-
 tests/Mixtral_8x7B/test_attention.py           | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 38a1f7a9..a539bdb9 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -1,4 +1,5 @@
 import os
+import math
 import subprocess
 import shlex
 import re
@@ -100,7 +101,8 @@ def generate_args_define(self):
                     bits = 8
                 else:
                     bits = torch.iinfo(arg_type).bits
-                self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({arg_size * bits // 8}ULL){self.ending}')
+                buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes
+                self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}')
                 name_set.add(arg_name)
         self.writeline(self.newline)
 
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index 6a7747f7..58955928 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -166,8 +166,8 @@ def test_rmsnorm(device, seq=32):
     from Scheduler.scheduler import PyTorchSimRunner
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
-    test_rmsnorm(device, seq=1)
-    test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2)
+    #test_rmsnorm(device, seq=1)
+    #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2)
     test_decode(device, 32, 3)
     #test_attention(device)
     #test_ffn(device)

From 11524280328b41bcecb728b80cb730cd5835f3b5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 16:06:32 +0000
Subject: [PATCH 014/194] [Refactor] Seperate OpOverrides

---
 .../mlir/mlir_codegen_backend.py              |  760 +-----------
 PyTorchSimFrontend/mlir/mlir_ops.py           | 1034 +++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_template.py      |    6 +-
 3 files changed, 1049 insertions(+), 751 deletions(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_ops.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a14dd10b..cda996ab 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -24,6 +24,7 @@
 from PyTorchSimFrontend import extension_config
 from . import mlir_common
 from .mlir_common import LoopLevel, LoopNest
+from .mlir_ops import ExtensionOverrides
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 
 def reduction_init(reduction_type, dtype):
@@ -56,19 +57,6 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value):
         return ops.logical_and(vector_value, init_value)
     raise AssertionError(reduction_type)
 
-def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape):
-    if reduction_type == "sum":
-        return f"vector.multi_reduction <add>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    if reduction_type == "prod":
-        return f"vector.multi_reduction <mul>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    if reduction_type == "max":
-        return f"vector.multi_reduction <maximumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    if reduction_type == "min":
-        return f"vector.multi_reduction <minimumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    if reduction_type == "any":
-        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
-    raise AssertionError(reduction_type)
-
 class ExtensionWrapperCodegen(wrapper.WrapperCodeGen):
     def __init__(self):
         super().__init__()
@@ -205,734 +193,6 @@ def generate(self, is_inference):
 
     def memory_plan(self):
         self.lines = memory_planning.MemoryPlanner(self).plan(self.lines)
-class ExtensionOverrides(common.OpOverrides):
-    # Binary element wise operations
-    @staticmethod
-    def custom_cast(operand, target_type, *args, var_info=None, **kwargs):
-        dtype = var_info[operand][1]
-        if dtype == "index":
-            ret = ops.index_cast(operand, target_type)
-        else:
-            ret = ops.to_dtype(operand, target_type)
-        return ret, var_info[ret]
-
-    @staticmethod
-    def binary_elementwise_common(operand1, operand2, var_info):
-        operand1.bounds = operand1.bounds.unknown()
-        operand2.bounds = operand2.bounds.unknown()
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-        # Tile size check
-        if op_type1[0] != op_type2[0]:
-            # Try to broad cast
-            lhs_tile_size, lhs_dtype = op_type1
-            rhs_tile_size, rhs_dtype = op_type2
-            if lhs_tile_size > rhs_tile_size:
-                operand2 = ops.broadcast(operand2, lhs_tile_size)
-                op_type2 = var_info[operand2]
-            elif lhs_tile_size < rhs_tile_size:
-                operand1 = ops.broadcast(operand1, rhs_tile_size)
-                op_type1 = var_info[operand1]
-
-        # Data type check
-        if op_type1[1] != op_type2[1]:
-            if op_type1[1] == "index" or op_type1 == "index":
-                if op_type1[1] == "index":
-                    operand1 = ops.index_cast(operand1, op_type2[1])
-                    op_type1 = var_info[operand1]
-                if op_type2[1] == "index":
-                    operand2 = ops.index_cast(operand2, op_type1[1])
-                    op_type2 = var_info[operand2]
-            elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
-                operand1 = ops.to_dtype(operand1, op_type2[1])
-                op_type1 = var_info[operand1]
-            elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
-                operand2 = ops.to_dtype(operand2, op_type1[1])
-                op_type2 = var_info[operand2]
-            elif op_type1[1][0] == op_type2[1][0]:
-                if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]:
-                   operand2 = ops.ext(operand2, op_type1[1])
-                   op_type2 = var_info[operand2]
-                elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]:
-                   operand1 = ops.ext(operand1, op_type2[1])
-                   op_type1 = var_info[operand1]
-            else:
-                raise NotImplementedError("Unsupported type converting")
-
-        # Updated var info
-        tile_size = op_type1[0]
-        ret_type = op_type1[1]
-        return tile_size, ret_type, operand1, operand2
-
-    @staticmethod
-    def add(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        opcode = f'arith.add{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def sub(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        opcode = f'arith.sub{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def mul(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        opcode = f'arith.mul{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def div(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        if ret_type[0] == "f":
-            opcode = f'arith.divf'
-        else:
-            opcode = f'arith.divui'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def truediv(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        if ret_type[0] == "f":
-            opcode = f'arith.divf'
-        else:
-            opcode = f'arith.divui'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def modular(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        if ret_type[0] == "f":
-            raise NotImplementedError("Not support remainder operation for floating point")
-        else:
-            opcode = f'arith.remui'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def minimum(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        if ret_type[0] == "f":
-            opcode = f'arith.minimumf'
-        else:
-            opcode = f'arith.minui'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def maximum(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        if ret_type[0] == "f":
-            opcode = f'arith.maximumf'
-        else:
-            opcode = f'arith.maxui'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
-        src_mlir_dtype = var_info[operand][1]
-        if src_mlir_dtype == "index":
-            operand = ops.index_cast(operand, "i64")
-            src_mlir_dtype = var_info[operand][1]
-
-        tile_size = var_info[operand][0]
-        if isinstance(dst_mlir_dtype, torch.dtype):
-            dst_mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_mlir_dtype]
-        dst_bits = mlir_common.MLIR_TO_BIT[dst_mlir_dtype]
-        src_bits = mlir_common.MLIR_TO_BIT[src_mlir_dtype]
-        shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype
-        src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype
-        if dst_mlir_dtype[0] == "i" and src_mlir_dtype[0] == "f":
-            return f"arith.fptoui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-        if dst_mlir_dtype[0] == "f" and src_mlir_dtype[0] == "i":
-            return f"arith.uitofp %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-        if dst_mlir_dtype[0] == "i":
-            if dst_bits > src_bits:
-                return f"arith.extui %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-            elif dst_bits < src_bits:
-                return f"arith.trunc %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-            return f"arith.maxui %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
-        elif dst_mlir_dtype[0] == "f":
-            if dst_bits > src_bits:
-                return f"arith.extf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-            elif dst_bits < src_bits:
-                return f"arith.trunf %{operand} : {src_shape} to {shape}", [tile_size, dst_mlir_dtype]
-            return f"arith.maximumf %{operand}, %{operand} : {shape}", [tile_size, dst_mlir_dtype]
-        else:
-            raise NotImplementedError("Unsupported type for to_dtype ops")
-
-    @staticmethod
-    def constant(value, src_type, *args, var_info=None, **kwargs):
-        if isinstance(src_type, torch.dtype):
-            src_type = mlir_common.DTYPE_TO_MLIR[src_type]
-
-        if "inf" == str(value) or "-inf" == str(value) or "nan" == str(value):
-            value = f"0x{mlir_common.MLIR_INF[str(value)][src_type]:x}"
-        # if value represented by e notation, convert to float (ex 1e-3 -> 1.0e-3)
-        elif "e" in str(value):
-            value = format(float(value), ".20f")
-        elif src_type[0] == "f":
-            value = format(float(value), ".20f")
-        elif src_type[0] == "i":
-            value = int(value)
-        return f'arith.constant {value} : {src_type}', [1, src_type]
-
-    @staticmethod
-    def alloc(size, src_type, *args, var_info=None, **kwargs):
-        return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
-
-    @staticmethod
-    def extractelement(operand, idx, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype]
-
-    # transcendental functions
-    @staticmethod
-    def exp(operand, *args, var_info=None, **kwargs):
-        # Check scalar
-        op_type = var_info[operand]
-        if op_type[0] == 1:
-            operand = ops.broadcast(operand, 4)
-            val = ops.exp(operand)
-            result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.exp %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def exp2(operand, *args, var_info=None, **kwargs):
-        # Hands-on part: implement exp2 using math.exp2
-        # var_info = {operand: [tile_size, dtype]}
-        # Ex) var_info[operand] = [8, "f32"]
-
-        ln2 = math.log(2)
-        coeff = ops.constant(ln2, "f32")
-        operand = ops.mul(operand, coeff)
-        return ops.exp(operand), var_info[operand]
-
-    @staticmethod
-    def erf(operand, *args, var_info=None, **kwargs):
-        # Check scalar
-        op_type = var_info[operand]
-        if op_type[0] == 1:
-            operand = ops.broadcast(operand, 4)
-            val = ops.erf(operand)
-            result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.erf %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def tanh(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-
-        # Check scalar
-        op_type = var_info[operand]
-        if op_type[0] == 1:
-            operand = ops.broadcast(operand, 4)
-            val = ops.tanh(operand)
-            result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def sin(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-
-        # Check scalar
-        op_type = var_info[operand]
-        if op_type[0] == 1:
-            operand = ops.broadcast(operand, 4)
-            val = ops.sin(operand)
-            result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.sin %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def cos(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-
-        # Check scalar
-        op_type = var_info[operand]
-        if op_type[0] == 1:
-            operand = ops.broadcast(operand, 4)
-            val = ops.cos(operand)
-            result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.cos %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def sqrt(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def rsqrt(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def pow(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        # Type check & auto cast
-        if ret_type[0] != "f":
-            operand1, ret_type = ops.to_dtype(operand1, "f32")
-            var_info[operand1] = ret_type
-
-        # Type check & auto cast
-        if ret_type[0] != "f":
-            operand2, ret_type = ops.to_dtype(operand2, "f32")
-            var_info[operand2] = ret_type
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type]
-
-    @staticmethod
-    def log(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.log %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def reciprocal(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-
-        return ops.div(ops.constant(1.0, dtype), operand), [tile_size, dtype]
-
-    @staticmethod
-    def ext(operand, dtype, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}"
-        target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}"
-        if op_type[0] == "f":
-            opcode = f'arith.extf'
-        else:
-            opcode = f'arith.extui'
-        return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype]
-
-    # Logical operations
-    @staticmethod
-    def neg(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype[0] != "f":
-            operand, dtype = ops.to_dtype(operand, "f32")
-            var_info[operand] = dtype
-
-        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
-
-    @staticmethod
-    def eq(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "oeq"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "eq"
-        else:
-            raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def ne(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "one"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "ne"
-        else:
-            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def lt(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "olt"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "slt"
-        else:
-            raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def gt(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "ogt"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "sgt"
-        else:
-            raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def le(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "ole"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "sle"
-        else:
-            raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def ge(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        if ret_type[0] == "f":
-            op_type = "arith.cmpf"
-            attribute = "oge"
-        elif ret_type[0] == "i":
-            op_type = "arith.cmpi"
-            attribute = "sge"
-        else:
-            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
-
-    @staticmethod
-    def and_(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-
-        # Type check & auto cast
-        if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand1] = dtype
-
-        # Type check & auto cast
-        if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand2] = dtype
-
-        ret_type = op_type1[1]
-        tile_size = op_type1[0]
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def or_(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-
-        # Type check & auto cast
-        if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand1] = dtype
-
-        # Type check & auto cast
-        if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand2] = dtype
-
-        ret_type = op_type1[1]
-        tile_size = op_type1[0]
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def xor(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-
-        # Type check & auto cast
-        if op_type1[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand1] = dtype
-
-        # Type check & auto cast
-        if op_type2[1][0] != "i":
-            operand1, dtype = ops.to_dtype(operand1, "i32")
-            var_info[operand2] = dtype
-
-        ret_type = op_type1[1]
-        tile_size = op_type1[0]
-
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
-
-    @staticmethod
-    def to_bool(operand, *args, var_info=None, **kwargs):
-        tile_size, ret_type = var_info[operand]
-        const_one = ops.constant(0, ret_type)
-        if tile_size > 1:
-            const_one = ops.broadcast(const_one, tile_size)
-        ret = ops.ne(operand, const_one)
-        return ret, [tile_size, "i1"]
-
-    @staticmethod
-    def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-        # Type check & auto cast
-        if op_type1[1] != "i1":
-            operand1 = ops.to_bool(operand1)
-        if op_type2[1] != "i1":
-            operand2 = ops.to_bool(operand2)
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        return ExtensionOverrides.and_(operand1, operand2, *args, var_info=var_info, **kwargs)
-
-    @staticmethod
-    def logical_not(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-
-        ret_type = op_type[1]
-        tile_size = op_type[0]
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        const_one = ops.constant(0, ret_type)
-        const_one = ops.broadcast(const_one, tile_size)
-        ret = ops.eq(operand,const_one)
-        return ret, [tile_size, "i1"]
-
-    @staticmethod
-    def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-        # Type check & auto cast
-        if op_type1[1] != "i1":
-            operand1 = ops.to_bool(operand1)
-        if op_type2[1] != "i1":
-            operand2 = ops.to_bool(operand2)
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        return ExtensionOverrides.or_(operand1, operand2, *args, var_info=var_info, **kwargs)
-
-    @staticmethod
-    def logical_xor(operand1, operand2, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
-        # Type check & auto cast
-        if op_type1[1] != "i1":
-            operand1 = ops.to_bool(operand1)
-        if op_type2[1] != "i1":
-            operand2 = ops.to_bool(operand2)
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        return ExtensionOverrides.xor(operand1, operand2, *args, var_info=var_info, **kwargs)
-
-    @staticmethod
-    def relu(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        ret_type = "f32"
-        return ops.maximum(operand, ops.constant(0.0, "f32")), [tile_size, ret_type]
-
-    @staticmethod
-    def sigmoid(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
-        tile_size = op_type[0]
-        ret_type = "f32"
-        one = ops.constant(1, "f32")
-        return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, ret_type]
-
-    # Special operaitons
-    @staticmethod
-    def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        cond_type = var_info[condition]
-        operand_type = var_info[operand1]
-        if cond_type[0] < tile_size:
-            condition = ops.broadcast(condition, tile_size)
-        elif cond_type[0] > tile_size:
-            operand1 = ops.broadcast(operand1, cond_type[0])
-            operand2 = ops.broadcast(operand2, cond_type[0])
-        tile_size, ret_type = var_info[operand1]
-        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        cond_shape = f"vector<{tile_size}xi1>," if tile_size > 1 else ""
-        return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape} {shape}", [tile_size, ret_type]
-
-    @staticmethod
-    def step(size, dtype, *args, **kwargs):
-        index_shape = f"vector<{size}x{dtype}>"
-        return f"vector.step : {index_shape}", [size, dtype]
-
-    @staticmethod
-    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
-        result = body()
-        val = ops.constant(other, dtype, *args, **kwargs)
-        result = ops.where(mask, result, val)
-        return result, var_info[result]
-
-    @staticmethod
-    def index_cast(operand, target_type, *args, var_info=None, **kwrags):
-        op_type = var_info[operand]
-        src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1]
-        des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
-        return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
-
-    @staticmethod
-    def broadcast_unflat(operand1, target_size, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>"# if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"# if op_type2[0] > 1 else op_type1[1] # Use tile size only
-
-        expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
-        return expand, [target_size, op_type1[1]]
-
-    @staticmethod
-    def broadcast(operand1, target_size, *args, var_info=None, **kwargs):
-        op_type1 = var_info[operand1]
-        src_shape = f"vector<{op_type1[0]}x{op_type1[1]}>" if op_type1[0] > 1 else op_type1[1]
-        des_shape = f"vector<{target_size}x{op_type1[1]}>" # if op_type2[0] > 1 else op_type1[1] # Use tile size only
-
-        # Special case for length 2 vector. We used this vector to avoid scalar operations...
-        if op_type1[0] != 1 and target_size % op_type1[0] == 0:
-            unflat_operand = ops.broadcast_unflat(operand1, target_size)
-            unflat_shape = f"vector<{target_size//op_type1[0]}x{op_type1[0]}x{op_type1[1]}>"
-            expand = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {des_shape}"
-        elif op_type1[0] == 1:
-            expand = f"vector.broadcast %{operand1} : {src_shape} to {des_shape}"
-        else:
-            raise NotImplementedError("Not supporting broadcast type...")
-        return expand, [target_size, op_type1[1]]
-
-    @staticmethod
-    def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs):
-        operand_type = var_info[operand]
-        return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type
-
-    @staticmethod
-    def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs):
-        if red_size == 1:
-            final_reduced_shape = f"{type_name}"
-            line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape)
-        else:
-            final_reduced_shape = f"vector<{red_size}x{type_name}>"
-            new_vshape= f"vector<{vec_size//red_size}x{red_size}x{type_name}>"
-            value = ops.shape_cast(acc, red_shape, new_vshape)
-            line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape)
-        return line, [red_size, type_name]
-
-    @staticmethod
-    def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs):
-        if compute_vec_size == 1:
-            vshape = f"{mlir_dtype}"
-            operation = "affine.load"
-            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}"
-        else:
-            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
-            operation = "affine.vector_load"
-            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}"
-        return line, [compute_vec_size, mlir_dtype]
-
-    @staticmethod
-    def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs):
-        compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1]
-
-        if compute_vec_size == 1:
-            vshape = f"{mlir_dtype}"
-            operation = "affine.store"
-            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}"
-        else:
-            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
-            operation = "affine.vector_store"
-            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}"
-
-        if buffer_name is not None:
-            return common.DeferredLine(buffer_name, line), [None, None]
-        else:
-            return line, [None, None]
 
 RTYPE_TO_MLIR = {
     "sum": "add",
@@ -1214,7 +474,7 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs)
             # Generate vector store instruction
             _, operand_type = self.var_info[value]
             if mlir_dtype != operand_type:
-                value = ops.custom_cast(value, mlir_dtype)
+                value = ops.to_dtype(value, mlir_dtype)
 
             if compute_vec_size < self.var_info[value][0]:
                 value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
@@ -1256,6 +516,8 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         vec_len = self.kernel_group.tile_desc.get_compute_vec_size()
         reduced_shape = self.kernel_group.tile_desc.get_mlir_vshape(type_name)
 
+
+
         # Prepare reduction init
         with self.override_buffer_cse(cse=self.const_cse, buffer=self.const_buffer):
             init = self.get_const_cse(reduction_init(reduction_type, dtype), type_name)
@@ -1289,10 +551,12 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
         _, mask_var = self.get_mask()
         if mask_var is not None:
             value = ops.where(mask_var, value, init_vec)
+
         result = reduction_partial_combine_vec(reduction_type, value, body_iter_arg)
+        result = ops.to_dtype(result, type_name)
+
         self.compute_body_loop.reduction_vars[body_acc] = (reduction_type, body_iter_arg, iter_var_list[-1], reduced_shape)
         self.compute_body_loop.affine_yield[result] = reduced_shape
-
         # Register affine yield var
         for reduction_depth, acc in enumerate(acc_var_list[1:]):
             self.affine_yield[acc] = reduced_shape, reduction_depth
@@ -1340,8 +604,8 @@ def store_reduction(self, name, index, value):
                     sum, sqr_sum, _ = self.welford_reduce_out
                     reduction_numel = reduce(mul, self.ranges[self.reduction_depth:], 1)
                     divider = self.get_const_cse(float(reduction_numel), "f32")
-                    mean = ops.div(sum, divider)
-                    sqr_mean = ops.div(sqr_sum, divider)
+                    mean = ops.truediv(sum, divider)
+                    sqr_mean = ops.truediv(sqr_sum, divider)
                     mean_sqr = ops.mul(mean, mean)
                     variance = ops.sub(sqr_mean, mean_sqr)
                     m2 = ops.mul(variance, divider)
@@ -1401,10 +665,10 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
                 vlane_vec_size = 4
                 vlane_vec = ops.broadcast(vlane_coeff, vlane_vec_size)
 
-            dim = ops.modular(ops.div(vector_index, div_vec), mod_vec)
+            dim = ops.remainder(ops.truncdiv(vector_index, div_vec), mod_vec)
             if idx == tile_desc.vmap.vlane_split_axis: # Need to add vector lane offset
-                stride_dim = ops.modular(dim, vlane_stride_vec)
-                outer_dim = ops.modular(ops.div(dim, vlane_stride_vec), vlane_outer_vec)
+                stride_dim = ops.remainder(dim, vlane_stride_vec)
+                outer_dim = ops.remainder(ops.truncdiv(dim, vlane_stride_vec), vlane_outer_vec)
                 dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec))
 
                 vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset")
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
new file mode 100644
index 00000000..ebf0c111
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -0,0 +1,1034 @@
+import math
+import torch
+
+from torch._inductor.codegen import common
+from torch._inductor.virtualized import V, _ops as ops
+from . import mlir_common
+
+def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape):
+    if reduction_type == "sum":
+        return f"vector.multi_reduction <add>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "prod":
+        return f"vector.multi_reduction <mul>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "max":
+        return f"vector.multi_reduction <maximumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "min":
+        return f"vector.multi_reduction <minimumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    if reduction_type == "any":
+        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+    raise AssertionError(reduction_type)
+
+class ExtensionOverrides(common.OpOverrides):
+    @staticmethod
+    def constant(value, src_type, *args, var_info=None, **kwargs):
+        if isinstance(src_type, torch.dtype):
+            src_type = mlir_common.DTYPE_TO_MLIR[src_type]
+
+        str_val = str(value)
+        if "inf" == str_val or "-inf" == str_val or "nan" == str_val:
+            value = f"0x{mlir_common.MLIR_INF[str_val][src_type]:x}"
+        # scientific notation check
+        elif "e" in str_val:
+            value = format(float(value), ".20f")
+        elif src_type[0] == "f":
+            value = format(float(value), ".20f")
+        elif src_type[0] == "i":
+            value = int(float(value)) 
+        return f'arith.constant {value} : {src_type}', [1, src_type]
+
+    @staticmethod
+    def broadcast(operand, target_size, *args, var_info=None, **kwargs):
+        src_size, dtype = var_info[operand]
+
+        src_shape = f"vector<{src_size}x{dtype}>" if src_size > 1 else dtype
+        dst_shape = f"vector<{target_size}x{dtype}>"
+
+        op_str = ""
+        # Special case for length 2 vector. We used this vector to avoid scalar operations...
+        if src_size > 1:
+            if target_size % src_size == 0:
+                unflat_operand = ops.broadcast_unflat(operand, target_size)
+                outer_dim = target_size // src_size
+                unflat_shape = f"vector<{outer_dim}x{src_size}x{dtype}>"
+                # Flatten back to 1D
+                op_str = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {dst_shape}"
+            else:
+                raise NotImplementedError(
+                    f"Vector broadcast size mismatch: src={src_size} cannot broadcast to target={target_size}"
+                )
+        elif src_size == 1:
+            op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}"
+        else:
+            raise ValueError(f"Invalid source size: {src_size}")
+        return op_str, [target_size, dtype]
+
+    @staticmethod
+    def broadcast_unflat(operand, target_size, *args, var_info=None, **kwargs):
+        src_size, dtype = var_info[operand]
+
+        outer_dim = target_size // src_size
+        src_shape = f"vector<{src_size}x{dtype}>"
+        dst_shape = f"vector<{outer_dim}x{src_size}x{dtype}>"
+
+        op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}"
+        return op_str, [target_size, dtype]
+
+    def load_seed(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def rand(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def randn(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def randint64(self, *args, **kwargs):
+        raise NotImplementedError
+
+    # Special operaitons
+    @staticmethod
+    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
+        result = body()
+        val = ops.constant(other, dtype, *args, **kwargs)
+        result = ops.where(mask, result, val)
+        return result, var_info[result]
+
+    @staticmethod
+    def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        cond_type = var_info[condition]
+        operand_type = var_info[operand1]
+        if cond_type[0] < tile_size:
+            condition = ops.broadcast(condition, tile_size)
+        elif cond_type[0] > tile_size:
+            operand1 = ops.broadcast(operand1, cond_type[0])
+            operand2 = ops.broadcast(operand2, cond_type[0])
+        tile_size, ret_type = var_info[operand1]
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else ""
+        return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape}, {shape}", [tile_size, ret_type]
+
+    @staticmethod
+    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
+        # Extract source information
+        src_mlir_dtype = var_info[operand][1]
+        tile_size = var_info[operand][0]
+
+        # Normalize destination type (Torch dtype -> MLIR string)
+        if isinstance(dst_mlir_dtype, torch.dtype):
+            dst_mlir_dtype = mlir_common.DTYPE_TO_MLIR[dst_mlir_dtype]
+
+        if src_mlir_dtype == "index" and dst_mlir_dtype != "index":
+            operand = ops.index_cast(operand, "i64")
+            src_mlir_dtype = "i64" # Update explicitly
+
+        if dst_mlir_dtype == "index":
+            # If source is already index, return as is; otherwise cast
+            if src_mlir_dtype == "index":
+                return operand, [tile_size, "index"]
+            return ops.index_cast(operand, "index"), [tile_size, "index"]
+
+        # Early return if types are identical
+        if src_mlir_dtype == dst_mlir_dtype:
+            return operand, [tile_size, dst_mlir_dtype]
+
+        dst_bits = mlir_common.MLIR_TO_BIT[dst_mlir_dtype]
+        src_bits = mlir_common.MLIR_TO_BIT[src_mlir_dtype]
+        shape = f"vector<{tile_size}x{dst_mlir_dtype}>" if tile_size > 1 else dst_mlir_dtype
+        src_shape = f"vector<{tile_size}x{src_mlir_dtype}>" if tile_size > 1 else src_mlir_dtype
+        src_type_char = src_mlir_dtype[0] # 'i' or 'f'
+        dst_type_char = dst_mlir_dtype[0] # 'i' or 'f'o
+
+        op_str = ""
+
+        # Case A: Integer -> Float
+        if src_type_char == "i" and dst_type_char == "f":
+            op_str = f"arith.sitofp %{operand} : {src_shape} to {shape}"
+        # Case B: Float -> Integer
+        elif src_type_char == "f" and dst_type_char == "i":
+            op_str = f"arith.fptosi %{operand} : {src_shape} to {shape}"
+        # Case C: Integer -> Integer (Extension / Truncation)
+        elif src_type_char == "i" and dst_type_char == "i":
+            if dst_bits > src_bits:
+                op_str = f"arith.extsi %{operand} : {src_shape} to {shape}"
+            elif dst_bits < src_bits:
+                # Use arith.trunci for integer truncation
+                op_str = f"arith.trunci %{operand} : {src_shape} to {shape}" 
+            else:
+                return operand, [tile_size, dst_mlir_dtype]
+        # Case D: Float -> Float (Extension / Truncation)
+        elif src_type_char == "f" and dst_type_char == "f":
+            if dst_bits > src_bits:
+                op_str = f"arith.extf %{operand} : {src_shape} to {shape}"
+            elif dst_bits < src_bits:
+                # Corrected 'trunf' to 'truncf'
+                op_str = f"arith.truncf %{operand} : {src_shape} to {shape}" 
+            else:
+                return operand, [tile_size, dst_mlir_dtype]
+        else:
+            raise NotImplementedError(f"Unsupported conversion: {src_mlir_dtype} -> {dst_mlir_dtype}")
+
+        return op_str, [tile_size, dst_mlir_dtype]
+
+    @staticmethod
+    def identity(operand, *args, var_info=None, **kwargs):
+        operand_info = var_info[operand]
+        return operand, operand_info
+
+    @staticmethod
+    def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs):
+        tile_size, current_src_type = var_info[operand]
+
+        if isinstance(dtype, torch.dtype):
+            dst_mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
+        else:
+            dst_mlir_type = dtype
+
+        src_bits = mlir_common.MLIR_TO_BIT[current_src_type]
+        dst_bits = mlir_common.MLIR_TO_BIT[dst_mlir_type]
+
+        if src_bits != dst_bits:
+            raise ValueError(
+                f"Bitcast failed: Bit width mismatch. "
+                f"Src: {current_src_type}({src_bits}b) != Dst: {dst_mlir_type}({dst_bits}b)"
+            )
+
+        src_shape = f"vector<{tile_size}x{current_src_type}>" if tile_size > 1 else current_src_type
+        dst_shape = f"vector<{tile_size}x{dst_mlir_type}>" if tile_size > 1 else dst_mlir_type
+
+        return f"arith.bitcast %{operand} : {src_shape} to {dst_shape}", [tile_size, dst_mlir_type]
+
+    # Binary element wise operations
+    @staticmethod
+    def binary_elementwise_common(operand1, operand2, var_info):
+        operand1.bounds = operand1.bounds.unknown()
+        operand2.bounds = operand2.bounds.unknown()
+        op_type1 = var_info[operand1]
+        op_type2 = var_info[operand2]
+        # Tile size check
+        if op_type1[0] != op_type2[0]:
+            # Try to broad cast
+            lhs_tile_size, lhs_dtype = op_type1
+            rhs_tile_size, rhs_dtype = op_type2
+            if lhs_tile_size > rhs_tile_size:
+                operand2 = ops.broadcast(operand2, lhs_tile_size)
+                op_type2 = var_info[operand2]
+            elif lhs_tile_size < rhs_tile_size:
+                operand1 = ops.broadcast(operand1, rhs_tile_size)
+                op_type1 = var_info[operand1]
+
+        # Data type check
+        if op_type1[1] != op_type2[1]:
+            if op_type1[1] == "index" or op_type1 == "index":
+                if op_type1[1] == "index":
+                    operand1 = ops.index_cast(operand1, op_type2[1])
+                    op_type1 = var_info[operand1]
+                if op_type2[1] == "index":
+                    operand2 = ops.index_cast(operand2, op_type1[1])
+                    op_type2 = var_info[operand2]
+            elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
+                operand1 = ops.to_dtype(operand1, op_type2[1])
+                op_type1 = var_info[operand1]
+            elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
+                operand2 = ops.to_dtype(operand2, op_type1[1])
+                op_type2 = var_info[operand2]
+            elif op_type1[1][0] == op_type2[1][0]:
+                if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]:
+                   operand2 = ops.ext(operand2, op_type1[1])
+                   op_type2 = var_info[operand2]
+                elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]:
+                   operand1 = ops.ext(operand1, op_type2[1])
+                   op_type1 = var_info[operand1]
+            else:
+                raise NotImplementedError("Unsupported type converting")
+
+        # Updated var info
+        tile_size = op_type1[0]
+        ret_type = op_type1[1]
+        return tile_size, ret_type, operand1, operand2
+
+    @staticmethod
+    def abs(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def exp(operand, *args, var_info=None, **kwargs):
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            operand = ops.broadcast(operand, 4)
+            val = ops.exp(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.exp %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def exp2(operand, *args, var_info=None, **kwargs):
+        # Hands-on part: implement exp2 using math.exp2
+        # var_info = {operand: [tile_size, dtype]}
+        # Ex) var_info[operand] = [8, "f32"]
+
+        ln2 = math.log(2)
+        coeff = ops.constant(ln2, "f32")
+        operand = ops.mul(operand, coeff)
+        return ops.exp(operand), var_info[operand]
+
+    @staticmethod
+    def expm1(operand, *args, var_info=None, **kwargs):
+        coeff = ops.constant(1.0, "f32")
+        operand = ops.exp(operand)
+        operand = ops.sub(operand, coeff)
+        return operand, var_info[operand]
+
+    @staticmethod
+    def sqrt(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def relu(operand, *args, var_info=None, **kwargs):
+        src_mlir_dtype = var_info[operand][1]
+        tile_size = var_info[operand][0]
+        return ops.maximum(operand, ops.constant(0, src_mlir_dtype)), [tile_size, src_mlir_dtype]
+
+    @staticmethod
+    def minimum(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.minimumf'
+        else:
+            opcode = f'arith.minsi'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def maximum(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            opcode = f'arith.maximumf'
+        else:
+            opcode = f'arith.maxsi'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def cos(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            operand = ops.broadcast(operand, 4)
+            val = ops.cos(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.cos %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def sin(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            operand = ops.broadcast(operand, 4)
+            val = ops.sin(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.sin %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def tan(operand, *args, var_info=None, **kwargs):
+        sin_res = ops.sin(operand)
+        cos_res = ops.cos(operand)
+        operand = ops.truediv(sin_res, cos_res)
+        return operand, var_info[operand]
+
+    @staticmethod
+    def lgamma(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def erf(operand, *args, var_info=None, **kwargs):
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            operand = ops.broadcast(operand, 4)
+            val = ops.erf(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.erf %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def cosh(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def sinh(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def tanh(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+
+        # Check scalar
+        op_type = var_info[operand]
+        if op_type[0] == 1:
+            operand = ops.broadcast(operand, 4)
+            val = ops.tanh(operand)
+            result = ops.extractelement(val, 0)
+            return result, var_info[result]
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def acos(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def acosh(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def asin(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def asinh(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def atan2(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def atan(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def atanh(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def copysign(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def erfc(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def erfinv(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def frexp(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def hypot(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def log10(operand, *args, var_info=None, **kwargs):
+        val_ln = ops.log(operand)
+        
+        tile_size, dtype = var_info[val_ln]
+        inv_ln10 = 1/math.log(10)
+        const_op = ops.constant(inv_ln10, dtype)
+        
+        # Multiply: ln(x) * (1/ln(10))
+        result = ops.mul(val_ln, const_op)
+        return result, var_info[result]
+
+    @staticmethod
+    def log2(operand, *args, var_info=None, **kwargs):
+        val_ln = ops.log(operand)
+        
+        tile_size, dtype = var_info[val_ln]
+        inv_ln10 = 1/math.log(2)
+        const_op = ops.constant(inv_ln10, dtype)
+        
+        # Multiply: ln(x) * (1/ln(10))
+        result = ops.mul(val_ln, const_op)
+        return result, var_info[result]
+
+    @staticmethod
+    def log(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.log %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def log1p(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        const_one = ops.constant(1, dtype)
+
+        # 3. 덧셈 연산: (x + 1)
+        # ops.add가 (result_ssa, result_info)를 반환한다고 가정
+        val_add = ops.add(operand, const_one)
+        result = ops.log(val_add)
+        return result, var_info[result]
+
+    @staticmethod
+    def nextafter(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
+        if var_info[operand1][1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        
+        if var_info[operand2][1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        result = ops.and_(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
+        if var_info[operand1][1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        
+        if var_info[operand2][1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        result = ops.or_(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def logical_xor(operand1, operand2, *args, var_info=None, **kwargs):
+        if var_info[operand1][1] != "i1":
+            operand1 = ops.to_bool(operand1)
+        
+        if var_info[operand2][1] != "i1":
+            operand2 = ops.to_bool(operand2)
+        result = ops.xor(operand1, operand2)
+        return result, var_info[result]
+    
+    @staticmethod
+    def logical_not(operand, *args, var_info=None, **kwargs):
+        op_info = var_info[operand]
+        tile_size = op_info[0]
+        dtype = op_info[1]
+        
+        zero_const = ops.constant(0, dtype)
+        result = ops.eq(operand, zero_const)
+        return result, var_info[result]
+
+    @staticmethod
+    def bitwise_and(operand1, operand2, *args, var_info=None, **kwargs):
+        # Float check
+        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+            raise ValueError("Bitwise AND not supported for floats")
+            
+        result = ops.and_(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def bitwise_not(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        # Float check
+        if var_info[operand][1].startswith("f"):
+            raise ValueError("Bitwise NOT not supported for floats")
+        
+        neg_one = ops.constant(-1, dtype)
+        result = ops.xor(operand, neg_one) 
+        return result, var_info[result]
+
+    @staticmethod
+    def bitwise_or(operand1, operand2, *args, var_info=None, **kwargs):
+        # Float check
+        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+            raise ValueError("Bitwise AND not supported for floats")
+            
+        result = ops.or_(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def bitwise_xor(operand1, operand2, *args, var_info=None, **kwargs):
+                # Float check
+        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+            raise ValueError("Bitwise AND not supported for floats")
+            
+        result = ops.xor(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def bitwise_left_shift(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def bitwise_right_shift(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def rsqrt(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def sigmoid(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        one = ops.constant(1, dtype)
+        return ops.truediv(one, ops.expm1(operand)), [tile_size, dtype]
+
+    @staticmethod
+    def fmod(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def isinf(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def isnan(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def round(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+
+        if dtype.startswith("f"):
+            return f"math.roundeven %{operand} : {shape}", [tile_size, dtype]
+        else:
+            return operand, [tile_size, dtype]
+
+    @staticmethod
+    def floor(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+
+        if dtype.startswith("f"):
+            return f"math.floor %{operand} : {shape}", [tile_size, dtype]
+        else:
+            return operand, [tile_size, dtype]
+
+    @staticmethod
+    def sign(operand, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def trunc(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+
+        if dtype.startswith("f"):
+            return f"math.trunc %{operand} : {shape}", [tile_size, dtype]
+        else:
+            return operand, [tile_size, dtype]
+
+    @staticmethod
+    def ceil(operand, *args, var_info=None, **kwargs):
+        tile_size, dtype = var_info[operand]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+
+        if dtype.startswith("f"):
+            return f"math.ceil %{operand} : {shape}", [tile_size, dtype]
+        else:
+            return operand, [tile_size, dtype]
+
+    # Logical operations
+    @staticmethod
+    def neg(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
+
+    @staticmethod
+    def reciprocal(operand, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+
+        # Type check & auto cast
+        if dtype.startswith("f"):
+            operand = ops.to_dtype(operand, "f32")
+
+        return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype]
+
+    @staticmethod
+    def eq(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "oeq"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "eq"
+        else:
+            raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def ne(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "one"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "ne"
+        else:
+            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def lt(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "olt"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "slt"
+        else:
+            raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def gt(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "ogt"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sgt"
+        else:
+            raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def le(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "ole"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sle"
+        else:
+            raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def ge(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if ret_type[0] == "f":
+            op_type = "arith.cmpf"
+            attribute = "oge"
+        elif ret_type[0] == "i":
+            op_type = "arith.cmpi"
+            attribute = "sge"
+        else:
+            raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+
+    @staticmethod
+    def add(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.add{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def sub(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.sub{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def mul(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.mul{ret_type[0]}'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def pow(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        # Type check & auto cast
+        if ret_type.startswith("f"):
+            operand1 = ops.to_dtype(operand1, "f32")
+
+        # Type check & auto cast
+        if ret_type.startswith("f"):
+            operand2 = ops.to_dtype(operand2, "f32")
+
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type]
+
+    @staticmethod
+    def and_(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def or_(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def xor(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def lshift(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def rshift(operand1, operand2, *args, var_info=None, **kwargs):
+        raise NotImplementedError
+
+    @staticmethod
+    def truncdiv(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if ret_type.startswith("f"):
+            raise ValueError("truncdiv is strictly for integers. Use truediv for floats.")
+        
+        # arith.divsi: Signed Integer Division (Result is truncated)
+        return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def floordiv(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if ret_type.startswith("f"):
+             # Float의 floor division은 보통 divf 후 floor를 하므로 여기선 정수만 처리
+             raise ValueError("floordiv implementation expects integers based on definition.")
+
+        # arith.floordivsi: Floor Division for Signed Integers
+        return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def truediv(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if not ret_type.startswith("f"):
+            raise ValueError(f"truediv expects float inputs, but got {ret_type}. Use int_truediv for integers.")
+
+        return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def int_truediv(operand1, operand2, *args, var_info=None, **kwargs):
+        """
+        True division for Integers (Int -> Float).
+        Promotes integers to floats, then performs floating-point division.
+        """
+        tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        if not src_type.startswith("f"):
+            target_float_type = "f32"
+            operand1 = ops.to_dtype(operand1, target_float_type)
+            operand2 = ops.to_dtype(operand2, target_float_type)
+            src_type = target_float_type
+
+        result = ops.truediv(operand1, operand2)
+        return result, var_info[result]
+
+    @staticmethod
+    def mod(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        if ret_type[0] == "f":
+            raise NotImplementedError("Not support remainder operation for floating point")
+        else:
+            opcode = f'arith.remsi'
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def remainder(operand1, operand2, *args, var_info=None, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+
+        if ret_type.startswith("f"):
+            opcode = 'arith.remf'
+        else:
+            opcode = 'arith.remsi' # Signed Integer Remainder (LHS sign)
+
+        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+
+    @staticmethod
+    def square(operand, *args, var_info=None, **kwargs):
+        result = ops.mul(operand, operand)
+        return result, var_info[result]
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # PyTorchSim specific operations 
+
+    @staticmethod
+    def alloc(size, src_type, *args, var_info=None, **kwargs):
+        return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
+
+    @staticmethod
+    def extractelement(operand, idx, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        tile_size = op_type[0]
+        dtype = op_type[1]
+        shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
+        return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype]
+
+    @staticmethod
+    def ext(operand, dtype, *args, var_info=None, **kwargs):
+        op_type = var_info[operand]
+        shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}"
+        target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}"
+        if op_type[0] == "f":
+            opcode = f'arith.extf'
+        else:
+            opcode = f'arith.extui'
+        return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype]
+
+    @staticmethod
+    def to_bool(operand, *args, var_info=None, **kwargs):
+        tile_size, ret_type = var_info[operand]
+        const_one = ops.constant(0, ret_type)
+        if tile_size > 1:
+            const_one = ops.broadcast(const_one, tile_size)
+        ret = ops.ne(operand, const_one)
+        return ret, [tile_size, "i1"]
+    @staticmethod
+    def step(size, dtype, *args, **kwargs):
+        index_shape = f"vector<{size}x{dtype}>"
+        return f"vector.step : {index_shape}", [size, dtype]
+
+    @staticmethod
+    def index_cast(operand, target_type, *args, var_info=None, **kwrags):
+        op_type = var_info[operand]
+        src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1]
+        des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
+        return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
+
+    @staticmethod
+    def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs):
+        operand_type = var_info[operand]
+        return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type
+
+    @staticmethod
+    def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs):
+        if red_size == 1:
+            final_reduced_shape = f"{type_name}"
+            line = reduction_combine_vec(red_type, acc, init, axis=0, shape=red_shape, reduced_shape=final_reduced_shape)
+        else:
+            final_reduced_shape = f"vector<{red_size}x{type_name}>"
+            new_vshape= f"vector<{vec_size//red_size}x{red_size}x{type_name}>"
+            value = ops.shape_cast(acc, red_shape, new_vshape)
+            line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape)
+        return line, [red_size, type_name]
+
+    @staticmethod
+    def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs):
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+            operation = "affine.load"
+            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
+            operation = "affine.vector_load"
+            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}"
+        return line, [compute_vec_size, mlir_dtype]
+
+    @staticmethod
+    def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs):
+        compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1]
+
+        if compute_vec_size == 1:
+            vshape = f"{mlir_dtype}"
+            operation = "affine.store"
+            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}"
+        else:
+            vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
+            operation = "affine.vector_store"
+            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}"
+
+        if buffer_name is not None:
+            return common.DeferredLine(buffer_name, line), [None, None]
+        else:
+            return line, [None, None]
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index cc17ada1..a36bc907 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -25,7 +25,7 @@
 import PyTorchSimFrontend.extension_codecache as extension_codecache
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 from PyTorchSimFrontend.mlir.mlir_common import BaseMLIRHardwareInfo
-from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, reduction_combine_vec, is_welford_reduction
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel, reduction_init, reduction_partial_combine_vec, is_welford_reduction
 from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
 from torch._inductor.codegen import common
 
@@ -1070,11 +1070,11 @@ def store_reduction_epilogue(self, name, index, value):
 
                     if self.current_node.node.origin_node: # FIXME: This is a temporary solution
                         # mean = SUM(X) / N
-                        self.reduction_mean.append(ops.div(out, divider_vec))
+                        self.reduction_mean.append(ops.truediv(out, divider_vec))
                         out = self.reduction_mean[i]
                     else:
                         # m2 = (E(X^2) - E(X)^2) * N
-                        sqr_mean = ops.div(out, divider_vec)
+                        sqr_mean = ops.truediv(out, divider_vec)
                         mean_sqr = ops.mul(self.reduction_mean[i], self.reduction_mean[i])
                         variance = ops.sub(sqr_mean, mean_sqr)
                         m2 = ops.mul(variance, divider_vec)

From 8452f5c67f0f88d42d1d5918343b1f9d365bc4e9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 16:29:36 +0000
Subject: [PATCH 015/194] [Test] Add Llama1&2 test cases

---
 .github/workflows/pytorchsim_test.yml |  21 +++++
 tests/Llama/test_llama.py             | 113 ++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 tests/Llama/test_llama.py

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index fe8a4a7d..8444f318 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -663,6 +663,27 @@ jobs:
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py
 
+  test_llama:
+    name: Run test_llama1&2
+    runs-on: self-hosted
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run test_llama.py
+        run: |
+          echo "Running test_llama.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
+            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py
+
   test_accuracy:
     name: Run test_accuracy
     runs-on: self-hosted
diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py
new file mode 100644
index 00000000..17672563
--- /dev/null
+++ b/tests/Llama/test_llama.py
@@ -0,0 +1,113 @@
+import os
+import sys
+import argparse
+import copy
+import torch
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+
+def test_result(name, out, ref, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), ref.cpu(), rtol=rtol, atol=atol):
+        msg = f"|{name} Test Passed|"
+        print("-" * len(msg)); print(msg); print("-" * len(msg))
+    else:
+        msg = f"|{name} Test Failed|"
+        print("-" * len(msg)); print(msg); print("-" * len(msg))
+        diff = (out.cpu() - ref.cpu()).abs().max().item()
+        print("device out:", out.detach().cpu())
+        print("cpu ref  :", ref.detach().cpu())
+        print(f"Max abs diff: {diff}")
+        sys.exit(1)
+
+@torch.no_grad()
+def run_custom_llama_test(
+    device,
+    batch=1,
+    seq_len=32,
+    dtype="float32",
+    rtol=1e-3,
+    atol=1e-3,
+    max_new_tokens=16,
+):
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    torch_dtype = dtype_map.get(dtype, torch.float32)
+
+    cfg = LlamaConfig(
+        _name_or_path="custom-llama",
+        architectures=["LlamaForCausalLM"],
+        attention_bias=False,
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_act="silu",
+        hidden_size=4096,
+        initializer_range=0.02,
+        intermediate_size=11008,
+        max_position_embeddings=4096,
+        mlp_bias=False,
+        model_type="llama",
+        num_attention_heads=32,
+        num_hidden_layers=1,
+        num_key_value_heads=32,
+        pretraining_tp=1,
+        rms_norm_eps=1e-06,
+        rope_scaling=None,
+        rope_theta=10000.0,
+        tie_word_embeddings=True,
+        torch_dtype=dtype,
+        transformers_version="4.43.4",
+        use_cache=True,
+        vocab_size=8192,
+    )
+
+    print("Building LlamaForCausalLM from custom config (random init).")
+    base_model = LlamaForCausalLM(cfg).eval()
+    cpu_model  = copy.deepcopy(base_model).eval()
+
+    # dtype & device 세팅
+    cpu_model.to(dtype=torch_dtype, device="cpu")
+    model = base_model.to(dtype=torch_dtype, device=device)
+
+    # ---- 입력 텐서 (랜덤 ids) ----
+    g = torch.Generator().manual_seed(0)
+    vocab = cfg.vocab_size
+    input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long)
+    attn_mask_cpu = torch.ones_like(input_ids_cpu, dtype=torch.long)
+
+    input_ids_dev = input_ids_cpu.to(device)
+    attn_mask_dev = attn_mask_cpu.to(device)
+
+    # ---- forward comparison (compile vs CPU baseline) ----
+    print("Compiling model with torch.compile(...)")
+    compiled = torch.compile(model, dynamic=False)
+
+    logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu).logits
+    logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev).logits
+
+    test_result("Custom Llama forward(logits)", logits_dev, logits_cpu, rtol=rtol, atol=atol)
+    print("Max diff >", (logits_dev.detach().cpu() - logits_cpu.detach().cpu()).abs().max().item())
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Test Custom Llama (random weights, no tokenizer)")
+    parser.add_argument("--batch", type=int, default=1)
+    parser.add_argument("--seq_len", type=int, default=32)
+    parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"])
+    parser.add_argument("--rtol", type=float, default=1e-3)
+    parser.add_argument("--atol", type=float, default=1e-3)
+    parser.add_argument("--max_new_tokens", type=int, default=16)
+    args = parser.parse_args()
+
+    sys.path.append(os.environ.get("PYTORCHSIM_ROOT_PATH", "/workspace/PyTorchSim"))
+    from Scheduler.scheduler import PyTorchSimRunner
+    module = PyTorchSimRunner.setup_device()
+    device = module.custom_device()
+    #test_triu(device, size=(32, 128), diagonal=1)
+    torch.compiler.is_compiling = lambda: True # FIXME. How to fix this?
+    run_custom_llama_test(
+        device=device,
+        batch=args.batch,
+        seq_len=args.seq_len,
+        dtype=args.dtype,
+        rtol=args.rtol,
+        atol=args.atol,
+    )

From 00cd8c7cc9f0577e4ec4e974ec9e5f1467f86c67 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 16:29:57 +0000
Subject: [PATCH 016/194] [TOGSim] Add error handling

---
 TOGSim/src/TileGraphParser.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 42776a51..761530ab 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -696,6 +696,9 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   loadConfig(config_path, _config_json);
   _attribute_path = attribute_path;
 
+  if (!std::filesystem::exists(onnx_path)) {
+    throw std::runtime_error("Error: ONNX file not found at path: " + onnx_path);
+  }
   /* Note: this parsing algorithm assume that all node are sorted in topological-order */
   std::ifstream model_istream(onnx_path);
   google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream);

From a8d96cda4a8ebf1f281a4f27778d9df649cbc35c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 8 Dec 2025 16:30:51 +0000
Subject: [PATCH 017/194] [Scheduler] Use given config file for compilations

---
 Scheduler/scheduler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 31dbf6c0..98ebb1d5 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -358,6 +358,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
 
         togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
         self.tog_simulator = TOGSimulator(togsim_path, togsim_config)
+        os.environ['TOGSIM_CONFIG'] = togsim_config
         self.tog_simulator.interactive_simulation()
         if engine_select == Scheduler.FIFO_ENGINE:
             self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue)

From 8aac3ab08fb63bc1ba3b2bb13c0de8c2b298e4e2 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 05:10:24 +0000
Subject: [PATCH 018/194] [Fix/ops] Fix wrong implementation of sigmoid

---
 PyTorchSimFrontend/mlir/mlir_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index ebf0c111..af323c1e 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -626,7 +626,7 @@ def sigmoid(operand, *args, var_info=None, **kwargs):
         tile_size = op_type[0]
         dtype = op_type[1]
         one = ops.constant(1, dtype)
-        return ops.truediv(one, ops.expm1(operand)), [tile_size, dtype]
+        return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, dtype]
 
     @staticmethod
     def fmod(operand1, operand2, *args, var_info=None, **kwargs):

From fd6a846094df2ee73d2f7c1dcaa21d2c218411db Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 06:29:26 +0000
Subject: [PATCH 019/194] [Tests] Use manual mask for Llama

---
 tests/Llama/test_llama.py | 301 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 286 insertions(+), 15 deletions(-)

diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py
index 17672563..98820fd9 100644
--- a/tests/Llama/test_llama.py
+++ b/tests/Llama/test_llama.py
@@ -4,7 +4,7 @@
 import copy
 import torch
 from transformers.models.llama.configuration_llama import LlamaConfig
-from transformers.models.llama.modeling_llama import LlamaForCausalLM
+from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding, LlamaModel
 
 def test_result(name, out, ref, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), ref.cpu(), rtol=rtol, atol=atol):
@@ -13,12 +13,216 @@ def test_result(name, out, ref, rtol=1e-4, atol=1e-4):
     else:
         msg = f"|{name} Test Failed|"
         print("-" * len(msg)); print(msg); print("-" * len(msg))
-        diff = (out.cpu() - ref.cpu()).abs().max().item()
+        diff = (out.cpu().int() - ref.cpu().int()).abs().max().item()
         print("device out:", out.detach().cpu())
         print("cpu ref  :", ref.detach().cpu())
         print(f"Max abs diff: {diff}")
         sys.exit(1)
 
+@torch.no_grad()
+def run_rmsnorm_test(
+    device,
+    batch=1,
+    seq_len=32,
+    dtype="float32",
+    rtol=1e-3,
+    atol=1e-3,
+):
+    print("\n[Running LlamaRMSNorm Test]")
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    torch_dtype = dtype_map.get(dtype, torch.float32)
+
+    hidden_size = 4096
+    eps = 1e-6
+
+    print(f"Building LlamaRMSNorm (hidden_size={hidden_size}, eps={eps})")
+    base_norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps).eval()
+    cpu_norm = copy.deepcopy(base_norm).eval()
+
+    cpu_norm.to(dtype=torch_dtype, device="cpu")
+    model = base_norm.to(dtype=torch_dtype, device=device)
+
+    g = torch.Generator().manual_seed(0)
+    hidden_states = torch.randn(batch, seq_len, hidden_size, generator=g, dtype=torch_dtype)
+    hs_dev = hidden_states.to(device)
+
+    print("Compiling LlamaRMSNorm with torch.compile(...)")
+    compiled_norm = torch.compile(model, dynamic=False)
+
+    out_cpu = cpu_norm(hidden_states)
+    out_dev = compiled_norm(hs_dev)
+
+    test_result("LlamaRMSNorm forward", out_dev, out_cpu, rtol=rtol, atol=atol)
+    print("Max diff >", (out_dev.detach().cpu() - out_cpu.detach().cpu()).abs().max().item())
+
+
+@torch.no_grad()
+def run_rotary_embedding_test(
+    device,
+    batch=1,
+    seq_len=32,
+    dtype="float32",
+    rtol=1e-3,
+    atol=1e-3,
+):
+    print("\n[Running LlamaRotaryEmbedding Test]")
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    torch_dtype = dtype_map.get(dtype, torch.float32)
+
+    hidden_size = 4096
+    num_heads = 32
+    head_dim = hidden_size // num_heads
+
+    cfg = LlamaConfig(
+        _name_or_path="custom-llama",
+        architectures=["LlamaForCausalLM"],
+        attention_bias=False,
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_act="silu",
+        hidden_size=4096,
+        initializer_range=0.02,
+        intermediate_size=11008,
+        max_position_embeddings=4096,
+        mlp_bias=False,
+        model_type="llama",
+        num_attention_heads=32,
+        num_hidden_layers=1,
+        num_key_value_heads=32,
+        pretraining_tp=1,
+        rms_norm_eps=1e-06,
+        rope_scaling=None,
+        rope_theta=10000.0,
+        tie_word_embeddings=True,
+        torch_dtype=dtype,
+        transformers_version="4.43.4",
+        use_cache=True,
+        vocab_size=8192,
+        _attn_implementation = "sdpa"
+    )
+    base_rope = LlamaRotaryEmbedding(cfg)
+
+    cpu_rope = copy.deepcopy(base_rope)
+
+    cpu_rope.to(device="cpu")
+    model = base_rope.to(device=device)
+
+    g = torch.Generator().manual_seed(0)
+    value = torch.randn(batch, num_heads, seq_len, head_dim, generator=g, dtype=torch_dtype)
+    position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)
+
+    val_dev = value.to(device)
+    pos_dev = position_ids.to(device)
+
+    print("Compiling LlamaRotaryEmbedding with torch.compile(...)")
+    compiled_rope = torch.compile(model, dynamic=False)
+
+    cos_cpu, sin_cpu = cpu_rope(value, position_ids)
+    cos_dev, sin_dev = compiled_rope(val_dev, pos_dev)
+
+    print(f"Output dtype check - CPU: {cos_cpu.dtype}, Device: {cos_dev.dtype}")
+
+    test_result("LlamaRotaryEmbedding (Cos)", cos_dev, cos_cpu, rtol=rtol, atol=atol)
+    test_result("LlamaRotaryEmbedding (Sin)", sin_dev, sin_cpu, rtol=rtol, atol=atol)
+
+    diff_cos = (cos_dev.detach().cpu() - cos_cpu.detach().cpu()).abs().max().item()
+    diff_sin = (sin_dev.detach().cpu() - sin_cpu.detach().cpu()).abs().max().item()
+    print(f"Max diff (Cos) > {diff_cos}")
+    print(f"Max diff (Sin) > {diff_sin}")
+
+@torch.no_grad()
+def run_decoder_layer_test(
+    device,
+    batch=1,
+    seq_len=32,
+    dtype="float32",
+    rtol=1e-3,
+    atol=1e-3,
+):
+    print("\n[Running LlamaDecoderLayer Test]")
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    torch_dtype = dtype_map.get(dtype, torch.float32)
+
+    cfg = LlamaConfig(
+        _name_or_path="custom-llama",
+        architectures=["LlamaForCausalLM"],
+        attention_bias=False,
+        attention_dropout=0.0,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_act="silu",
+        hidden_size=4096,
+        initializer_range=0.02,
+        intermediate_size=11008,
+        max_position_embeddings=4096,
+        mlp_bias=False,
+        model_type="llama",
+        num_attention_heads=32,
+        num_hidden_layers=1,
+        num_key_value_heads=32,
+        pretraining_tp=1,
+        rms_norm_eps=1e-06,
+        rope_scaling=None,
+        rope_theta=10000.0,
+        tie_word_embeddings=True,
+        torch_dtype=dtype,
+        transformers_version="4.43.4",
+        use_cache=True,
+        vocab_size=8192,
+        _attn_implementation = "sdpa"
+    )
+
+    print("Building LlamaDecoderLayer from custom config.")
+    base_layer = LlamaDecoderLayer(cfg, layer_idx=0).eval()
+    cpu_layer = copy.deepcopy(base_layer).eval()
+
+    cpu_layer.to(dtype=torch_dtype, device="cpu")
+    model = base_layer.to(dtype=torch_dtype, device=device)
+
+    g = torch.Generator().manual_seed(0)
+    hidden_states = torch.randn(batch, seq_len, cfg.hidden_size, generator=g, dtype=torch_dtype)
+    position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(batch, -1)
+
+    attention_mask = torch.zeros(batch, 1, seq_len, seq_len, dtype=torch_dtype)
+    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
+    attention_mask.masked_fill_(mask, torch.finfo(torch_dtype).min)
+
+    # Shape: (1, seq_len, head_dim) or (batch, seq_len, head_dim)
+    head_dim = cfg.hidden_size // cfg.num_attention_heads
+    cos = torch.randn(1, seq_len, head_dim, generator=g, dtype=torch_dtype)
+    sin = torch.randn(1, seq_len, head_dim, generator=g, dtype=torch_dtype)
+    position_embeddings = (cos, sin)
+
+    hs_dev = hidden_states.to(device)
+    pos_dev = position_ids.to(device)
+    att_dev = attention_mask.to(device)
+    pos_emb_dev = (cos.to(device), sin.to(device))
+
+    print("Compiling LlamaDecoderLayer with torch.compile(...)")
+    compiled_layer = torch.compile(model, dynamic=False)
+
+    out_cpu = cpu_layer(
+        hidden_states=hidden_states,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        position_embeddings=position_embeddings
+    )
+    if isinstance(out_cpu, tuple):
+        out_cpu = out_cpu[0]
+
+    out_dev = compiled_layer(
+        hidden_states=hs_dev,
+        attention_mask=att_dev,
+        position_ids=pos_dev,
+        position_embeddings=pos_emb_dev
+    )
+    if isinstance(out_dev, tuple):
+        out_dev = out_dev[0]
+
+    test_result("LlamaDecoderLayer forward", out_dev, out_cpu, rtol=rtol, atol=atol)
+    print("Max diff >", (out_dev.detach().cpu() - out_cpu.detach().cpu()).abs().max().item())
+
 @torch.no_grad()
 def run_custom_llama_test(
     device,
@@ -40,7 +244,7 @@ def run_custom_llama_test(
         bos_token_id=1,
         eos_token_id=2,
         hidden_act="silu",
-        hidden_size=4096,
+        hidden_size=1024,
         initializer_range=0.02,
         intermediate_size=11008,
         max_position_embeddings=4096,
@@ -64,11 +268,9 @@ def run_custom_llama_test(
     base_model = LlamaForCausalLM(cfg).eval()
     cpu_model  = copy.deepcopy(base_model).eval()
 
-    # dtype & device 세팅
     cpu_model.to(dtype=torch_dtype, device="cpu")
     model = base_model.to(dtype=torch_dtype, device=device)
 
-    # ---- 입력 텐서 (랜덤 ids) ----
     g = torch.Generator().manual_seed(0)
     vocab = cfg.vocab_size
     input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long)
@@ -81,12 +283,70 @@ def run_custom_llama_test(
     print("Compiling model with torch.compile(...)")
     compiled = torch.compile(model, dynamic=False)
 
-    logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu).logits
-    logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev).logits
+    logits_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu)#.logits
+    logits_dev = compiled(input_ids=input_ids_dev, attention_mask=attn_mask_dev)#.logits
 
     test_result("Custom Llama forward(logits)", logits_dev, logits_cpu, rtol=rtol, atol=atol)
     print("Max diff >", (logits_dev.detach().cpu() - logits_cpu.detach().cpu()).abs().max().item())
 
+@torch.no_grad()
+def run_llama_model_test(
+    device,
+    batch=1,
+    seq_len=32,
+    dtype="float32",
+    rtol=1e-3,
+    atol=1e-3,
+):
+    print("\n[Running LlamaModel Test]")
+    dtype_map = {"float32": torch.float32, "float16": torch.float16, "bfloat16": torch.bfloat16}
+    torch_dtype = dtype_map.get(dtype, torch.float32)
+
+    cfg = LlamaConfig(
+        vocab_size=8192,
+        hidden_size=1024,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        intermediate_size=11008 // 4,
+        num_hidden_layers=1,
+        max_position_embeddings=4096,
+        hidden_act="silu",
+        use_cache=False,
+        torch_dtype=dtype,
+    )
+
+    print("Building LlamaModel from custom config (random init).")
+    base_model = LlamaModel(cfg).eval()
+    cpu_model = copy.deepcopy(base_model).eval()
+
+    cpu_model.to(dtype=torch_dtype, device="cpu")
+    model = base_model.to(dtype=torch_dtype, device=device)
+
+    g = torch.Generator().manual_seed(0)
+    input_ids_cpu = torch.randint(low=0, high=cfg.vocab_size, size=(batch, seq_len), generator=g, dtype=torch.long)
+
+    # FIXME: Currently, the user must provide the mask manually.
+    # There is a functionality issue with the model generating the mask internally,
+    # so we explicitly create and inject a Causal Mask (lower triangular matrix) from the outside.
+    causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))
+    attn_mask_cpu = causal_mask.unsqueeze(0).unsqueeze(0).expand(batch, 1, -1, -1).bool()
+
+    input_ids_dev = input_ids_cpu.to(device)
+    attn_mask_dev = attn_mask_cpu.to(device)
+
+    print("Compiling LlamaModel with torch.compile(...)")
+    compiled_model = torch.compile(model, dynamic=False)
+
+    out_cpu = cpu_model(input_ids=input_ids_cpu, attention_mask=attn_mask_cpu)
+    out_dev = compiled_model(input_ids=input_ids_dev, attention_mask=attn_mask_dev)
+
+    last_hidden_state_cpu = out_cpu.last_hidden_state
+    last_hidden_state_dev = out_dev.last_hidden_state
+
+    test_result("LlamaModel (last_hidden_state)", last_hidden_state_dev, last_hidden_state_cpu, rtol=rtol, atol=atol)
+    diff = (last_hidden_state_dev.detach().cpu() - last_hidden_state_cpu.detach().cpu()).abs().max().item()
+    print(f"Max diff > {diff}")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Test Custom Llama (random weights, no tokenizer)")
     parser.add_argument("--batch", type=int, default=1)
@@ -103,11 +363,22 @@ def run_custom_llama_test(
     device = module.custom_device()
     #test_triu(device, size=(32, 128), diagonal=1)
     torch.compiler.is_compiling = lambda: True # FIXME. How to fix this?
-    run_custom_llama_test(
-        device=device,
-        batch=args.batch,
-        seq_len=args.seq_len,
-        dtype=args.dtype,
-        rtol=args.rtol,
-        atol=args.atol,
-    )
+    #run_rmsnorm_test(device)
+    #run_rotary_embedding_test(device)
+    #run_decoder_layer_test(
+    #    device=device,
+    #    batch=args.batch,
+    #    seq_len=args.seq_len,
+    #    dtype=args.dtype,
+    #    rtol=args.rtol,
+    #    atol=args.atol,
+    #)
+    run_llama_model_test(device)
+    #run_custom_llama_test(
+    #    device=device,
+    #    batch=args.batch,
+    #    seq_len=args.seq_len,
+    #    dtype=args.dtype,
+    #    rtol=args.rtol,
+    #    atol=args.atol,
+    #)

From dea7f47f943302c3ab6104433ea29a6607a1bbf4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 07:40:25 +0000
Subject: [PATCH 020/194] [TOGSim] Use YAML instead of json

---
 TOGSim/conanfile.txt              |   2 +-
 TOGSim/include/Common.h           |   7 +-
 TOGSim/include/SimulationConfig.h |   6 +-
 TOGSim/include/SparseCore.h       |   1 +
 TOGSim/include/TileGraphParser.h  |  14 ++-
 TOGSim/src/Common.cc              | 165 +++++++++++++++---------------
 TOGSim/src/TileGraphParser.cc     | 117 ++++++++++++---------
 TOGSim/src/main.cc                |   8 +-
 8 files changed, 170 insertions(+), 150 deletions(-)

diff --git a/TOGSim/conanfile.txt b/TOGSim/conanfile.txt
index 7a57f52f..ce5268c7 100644
--- a/TOGSim/conanfile.txt
+++ b/TOGSim/conanfile.txt
@@ -2,6 +2,6 @@
 boost/1.79.0
 robin-hood-hashing/3.11.5
 spdlog/1.11.0
-nlohmann_json/3.11.2
+yaml-cpp/0.8.0
 [generators]
 cmake
diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h
index 640cba0c..c62c3e0b 100644
--- a/TOGSim/include/Common.h
+++ b/TOGSim/include/Common.h
@@ -3,6 +3,7 @@
 #include <robin_hood.h>
 #include <spdlog/fmt/ranges.h>
 #include <spdlog/spdlog.h>
+#include <yaml-cpp/yaml.h>
 
 #include <cassert>
 #include <cstdint>
@@ -14,7 +15,6 @@
 
 #include "SimulationConfig.h"
 #include "Instruction.h"
-#include "nlohmann/json.hpp"
 
 #define MIN(x, y) (((x) > (y)) ? (y) : (x))
 #define MIN3(x, y, z) MIN(MIN(x, y), z)
@@ -24,10 +24,7 @@
 
 #define PAGE_SIZE 4096
 
-using json = nlohmann::json;
-
 typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
 
-uint32_t generate_id();
-SimulationConfig initialize_config(json config);
\ No newline at end of file
+SimulationConfig initialize_config(YAML::Node config);
\ No newline at end of file
diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h
index 64cfa223..090f5520 100644
--- a/TOGSim/include/SimulationConfig.h
+++ b/TOGSim/include/SimulationConfig.h
@@ -1,13 +1,11 @@
 #pragma once
 
-#include <nlohmann/json.hpp>
 #include <string>
-
-using json = nlohmann::json;
+#include <yaml-cpp/yaml.h>
 
 enum class CoreType { WS_MESH, STONNE };
 
-enum class DramType { SIMPLE, RAMULATOR1, RAMULATOR2 };
+enum class DramType { SIMPLE, RAMULATOR2 };
 
 enum class IcntType { SIMPLE, BOOKSIM2 };
 
diff --git a/TOGSim/include/SparseCore.h b/TOGSim/include/SparseCore.h
index 9188b21d..02781ab3 100644
--- a/TOGSim/include/SparseCore.h
+++ b/TOGSim/include/SparseCore.h
@@ -1,5 +1,6 @@
 #include <map>
 #include <vector>
+#include <iomanip>
 #include "Core.h"
 #include "sstStonne.h"
 #include "SimpleMem.h"
diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h
index 9cc61d4a..07e5b212 100644
--- a/TOGSim/include/TileGraphParser.h
+++ b/TOGSim/include/TileGraphParser.h
@@ -2,7 +2,7 @@
 #include <fstream>
 #include <algorithm>
 #include <filesystem>
-#include <nlohmann/json.hpp>
+#include <yaml-cpp/yaml.h>
 #include <fmt/ranges.h>
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include "TileGraph.h"
@@ -13,8 +13,6 @@
 #include "onnx/onnx-operators_pb.h"
 #include "onnx/onnx_pb.h"
 
-using json = nlohmann::json;
-
 enum class TileType{
   LOOP_INDEX_NODE,
   LOOP_END_NODE,
@@ -35,7 +33,7 @@ enum class LoopType {
   INNER_LOOP
 };
 
-bool loadConfig(const std::string& config_path, json& config_json);
+bool loadConfig(const std::string& config_path, YAML::Node& config_yaml);
 
 class TileNode {
  public:
@@ -80,9 +78,9 @@ class TileGraphParser {
   LoopType get_loop_type(std::string key) { return std::get<2>(_loop_size_map[key]); }
   const std::map<std::string, std::tuple<int, int, LoopType>> & get_loop_map() { return _loop_size_map; }
   const std::vector<uint32_t> &lookupNumaInfo(std::string key);
-  int getCoreIdFromJson(const json& attribute_json, int subgraph_id);
+  int getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id);
   std::string getMetaByName(std::string key) { return _tog_meta[key]; }
-  const json& get_attribute_file() { return _attribute_json; }
+  const YAML::Node& get_attribute_file() { return _attribute_config; }
   std::vector<int> calc_tag(std::vector<int>& accum_tag, std::vector<int>& tag_idx, std::vector<int>& tag_stride);
   void register_memory_tag(std::string name, std::vector<int>& tag_key);
   bool check_memory_tag(std::string name, std::vector<int>& tag_key);
@@ -135,8 +133,8 @@ class TileGraphParser {
   void _tile_index_generate() {}
   int _loop_stack_pointer = 0;
 
-  json _attribute_json;
-  json _config_json;
+  YAML::Node _attribute_config; 
+  YAML::Node _config_yaml;
   std::string _tog_path;
   std::string _attribute_path;
   uint64_t indirect_counter = 0;
diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index 9a6b7798..63d360c6 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -1,28 +1,24 @@
 #include "Common.h"
 
-uint32_t generate_id() {
-  static uint32_t id_counter{0};
-  return id_counter++;
-}
-
 template <typename T>
-T get_config_value(json config, std::string key) {
-  if (config.contains(key)) {
-    return config[key];
+T get_config_value(const YAML::Node& config, std::string key) {
+  if (config[key]) {
+    return config[key].as<T>();
   } else {
     throw std::runtime_error(fmt::format("Config key {} not found", key));
   }
 }
 
-SimulationConfig initialize_config(json config) {
+SimulationConfig initialize_config(YAML::Node config) {
   SimulationConfig parsed_config;
-  // print json
-  spdlog::info("TOGSim Config: {}", config.dump(2));
+  YAML::Emitter emitter;
+  emitter << config;
+  spdlog::info("PyTorchSim config:\n{}", emitter.c_str());
 
   /* Core configs */
-  parsed_config.num_cores = config["num_cores"];
-  if (config.contains("core_type")) {
-    std::vector<std::string> core_types = config["core_type"].get<std::vector<std::string>>();
+  parsed_config.num_cores = get_config_value<uint32_t>(config, "num_cores");
+  if (config["core_type"]) {
+    std::vector<std::string> core_types = config["core_type"].as<std::vector<std::string>>();
 
     if (core_types.size() != parsed_config.num_cores)
       throw std::runtime_error("Mismatch between num_cores and core_type list size");
@@ -41,100 +37,105 @@ SimulationConfig initialize_config(json config) {
     for (int i=0; i<parsed_config.num_cores; i++)
       parsed_config.core_type.push_back(CoreType::WS_MESH);
   }
-  parsed_config.core_freq_mhz = config["core_freq_mhz"];
-  if (config.contains("num_systolic_array_per_core"))
-    parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"];
-  if (config.contains("num_stonne_per_core"))
-    parsed_config.num_stonne_per_core = config["num_stonne_per_core"];
-   if (config.contains("num_stonne_port"))
-    parsed_config.num_stonne_port = config["num_stonne_port"];
+
+  parsed_config.core_freq_mhz = get_config_value<uint32_t>(config, "core_freq_mhz");
+  if (config["num_systolic_array_per_core"])
+    parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as<uint32_t>();
+  if (config["num_stonne_per_core"])
+    parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as<uint32_t>();
+  if (config["num_stonne_port"])
+    parsed_config.num_stonne_port = config["num_stonne_port"].as<uint32_t>();
   parsed_config.core_print_interval = get_config_value<uint32_t>(config, "core_stats_print_period_cycles");
 
-  /* Stonne config */ 
-  if (config.contains("stonne_config_path"))
-    parsed_config.stonne_config_path = config["stonne_config_path"];
+  /* Stonne config */
+  if (config["stonne_config_path"])
+    parsed_config.stonne_config_path = config["stonne_config_path"].as<std::string>();
 
   /* DRAM config */
-  if ((std::string)config["dram_type"] == "simple")
+  std::string dram_type_str = get_config_value<std::string>(config, "dram_type");
+
+  if (dram_type_str == "simple") {
     parsed_config.dram_type = DramType::SIMPLE;
-  else if ((std::string)config["dram_type"] == "ramulator")
-    parsed_config.dram_type = DramType::RAMULATOR1;
-  else if ((std::string)config["dram_type"] == "ramulator2")
+    parsed_config.dram_latency = get_config_value<uint32_t>(config, "dram_latency");
+  } else if (dram_type_str == "ramulator2") {
     parsed_config.dram_type = DramType::RAMULATOR2;
-  else
-    throw std::runtime_error(fmt::format("Not implemented dram type {} ",
-                                         (std::string)config["dram_type"]));
-  parsed_config.dram_freq_mhz = config["dram_freq_mhz"];
-  if (config.contains("dram_latency"))
-    parsed_config.dram_latency = config["dram_latency"];
-  if (config.contains("ramulator_config_path"))
-    parsed_config.dram_config_path = config["ramulator_config_path"];
-  parsed_config.dram_channels = config["dram_channels"];
-  if (config.contains("dram_req_size_byte"))
-    parsed_config.dram_req_size = config["dram_req_size_byte"];
-  if (config.contains("dram_stats_print_period_cycles"))
-    parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"];
-  if(config.contains("dram_num_burst_length"))
-    parsed_config.dram_nbl = config["dram_num_burst_length"];
-  if (config.contains("dram_num_partitions")) {
-    parsed_config.dram_num_partitions = config["dram_num_partitions"];
+    parsed_config.dram_config_path = get_config_value<std::string>(config, "ramulator_config_path");
+  } else {
+    throw std::runtime_error(fmt::format("Not implemented dram type {} ", dram_type_str));
+  }
+
+  parsed_config.dram_freq_mhz = get_config_value<uint32_t>(config, "dram_freq_mhz");
+  parsed_config.dram_channels = get_config_value<uint32_t>(config, "dram_channels");
+  parsed_config.dram_req_size = get_config_value<uint32_t>(config, "dram_req_size_byte");
+  parsed_config.dram_nbl = get_config_value<uint32_t>(config, "dram_num_burst_length");
+
+  if (config["dram_stats_print_period_cycles"])
+    parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"].as<uint32_t>();
+  if (config["dram_num_partitions"]) {
+    parsed_config.dram_num_partitions = config["dram_num_partitions"].as<uint32_t>();
     if (parsed_config.dram_channels % parsed_config.dram_num_partitions != 0) {
       throw std::runtime_error("[Config] DRAM channels must be divisible by dram_num_partitions");
     }
   }
-  parsed_config.dram_channels_per_partitions =
-    parsed_config.dram_channels / parsed_config.dram_num_partitions;
 
+  if (parsed_config.dram_num_partitions != 0) {
+      parsed_config.dram_channels_per_partitions =
+        parsed_config.dram_channels / parsed_config.dram_num_partitions;
+  } else {
+      parsed_config.dram_channels_per_partitions = parsed_config.dram_channels;
+  }
 
    /* L2D config */
-  if (config.contains("l2d_type")) {
-    if ((std::string)config["l2d_type"] == "nocache")
+  if (config["l2d_type"]) {
+    std::string l2d_type_str = config["l2d_type"].as<std::string>();
+    if (l2d_type_str == "nocache")
       parsed_config.l2d_type = L2CacheType::NOCACHE;
-    else if ((std::string)config["l2d_type"] == "datacache")
+    else if (l2d_type_str == "datacache") {
       parsed_config.l2d_type = L2CacheType::DATACACHE;
-    else
-      throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ",
-                                          (std::string)config["l2d_type"]));
+      parsed_config.l2d_config_str = get_config_value<std::string>(config, "l2d_config");
+      if (config["l2d_hit_latency"])
+        parsed_config.l2d_hit_latency = config["l2d_hit_latency"].as<uint32_t>();
+    } else
+      throw std::runtime_error(fmt::format("Not implemented l2 cache type {} ", l2d_type_str));
   } else {
     parsed_config.l2d_type = L2CacheType::NOCACHE;
   }
 
-  if (config.contains("l2d_config"))
-    parsed_config.l2d_config_str = config["l2d_config"];
-  if (config.contains("l2d_hit_latency"))
-    parsed_config.l2d_config_str = config["l2d_hit_latency"];
-
   /* Icnt config */
-  if ((std::string)config["icnt_type"] == "simple")
+  std::string icnt_type_str = config["icnt_type"].as<std::string>();
+  if (icnt_type_str == "simple") {
     parsed_config.icnt_type = IcntType::SIMPLE;
-  else if ((std::string)config["icnt_type"] == "booksim2")
+    if (config["icnt_latency_cycles"])
+      parsed_config.icnt_latency = config["icnt_latency_cycles"].as<uint32_t>();
+  } else if (icnt_type_str == "booksim2") {
     parsed_config.icnt_type = IcntType::BOOKSIM2;
-  else
-    throw std::runtime_error(fmt::format("Not implemented icnt type {} ",
-                                         (std::string)config["icnt_type"]));
-  parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"];
-  if (config.contains("icnt_latency_cycles"))
-    parsed_config.icnt_latency = config["icnt_latency_cycles"];
-  if (config.contains("booksim_config_path"))
-    parsed_config.icnt_config_path = config["booksim_config_path"];
-  if (config.contains("icnt_stats_print_period_cycles"))
-    parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"];
-  if (config.contains("icnt_injection_ports_per_core"))
-    parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"];
-
-  if (config.contains("scheduler"))
-    parsed_config.scheduler_type = config["scheduler"];
-  if (config.contains("num_partition"))
-    parsed_config.num_partition = config["num_partition"];
-  if (config.contains("partition")) {
+    parsed_config.icnt_config_path = get_config_value<std::string>(config, "booksim_config_path");
+  } else
+    throw std::runtime_error(fmt::format("Not implemented icnt type {} ", icnt_type_str));
+
+  parsed_config.icnt_freq_mhz = config["icnt_freq_mhz"].as<double>();
+  if (config["icnt_stats_print_period_cycles"])
+    parsed_config.icnt_stats_print_period_cycles = config["icnt_stats_print_period_cycles"].as<uint32_t>();
+  if (config["icnt_injection_ports_per_core"])
+    parsed_config.icnt_injection_ports_per_core = config["icnt_injection_ports_per_core"].as<uint32_t>();
+
+  if (config["scheduler"])
+    parsed_config.scheduler_type = config["scheduler"].as<std::string>();
+  if (config["num_partition"])
+    parsed_config.num_partition = config["num_partition"].as<uint32_t>();
+  if (config["partition"]) {
     for (int i=0; i<parsed_config.num_cores; i++) {
       std::string core_partition = "core_" + std::to_string(i);
-      uint32_t partition_id = uint32_t(config["partition"][core_partition]);
-      parsed_config.partiton_map[i] = partition_id;
-      spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id);
+      if (config["partition"][core_partition]) {
+          uint32_t partition_id = config["partition"][core_partition].as<uint32_t>();
+          parsed_config.partiton_map[i] = partition_id;
+          spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id);
+      } else {
+          spdlog::warn("[Config/Core] CPU {}: Partition key not found, defaulting to 0", i);
+          parsed_config.partiton_map[i] = 0;
+      }
     }
   } else {
-    /* Default: all partition 0 */
     for (int i=0; i<parsed_config.num_cores; i++) {
       parsed_config.partiton_map[i] = 0;
       spdlog::info("[Config/Core] CPU {}: Partition {}", i, 0);
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 761530ab..ae8954d9 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -1,14 +1,18 @@
 #include "TileGraphParser.h"
 
-bool loadConfig(const std::string& config_path, json& config_json) {
-  std::ifstream config_file(config_path);
-  if (config_file.is_open()) {
-      config_file >> config_json;
-      config_file.close();
-      spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
-      return true;
-  } else {
-    spdlog::error("[LoadConfig] Failed to open \"{}\"", config_path);
+bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) {
+  try {
+    config_yaml = YAML::LoadFile(config_path);
+    spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
+    return true;
+  } catch (const YAML::BadFile& e) {
+    spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path);
+    return false;
+  } catch (const YAML::ParserException& e) {
+    spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", config_path, e.what());
+    return false;
+  } catch (const std::exception& e) {
+    spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what());
     return false;
   }
 }
@@ -87,26 +91,33 @@ bool find_output_idx(TileGraphParser* tog_parser, std::vector<uint32_t>& output_
   m = output_idx.at(0);
   n = output_idx.at(1);
   k = output_idx.at(2);
+  auto attr_file = tog_parser->get_attribute_file();
 
-  auto attr_json = tog_parser->get_attribute_file();
+  if (!attr_file["zero_skip"]) {
+      return false;
+  }
 
-  // Check arg0: m -> k
+  YAML::Node zero_skip = attr_file["zero_skip"];
   bool found_arg0 = false;
-  if (attr_json["zero_skip"].contains("arg0")) {
-    auto& arg0 = attr_json["zero_skip"]["arg0"];
-    if (arg0.contains(std::to_string(m)) && arg0[std::to_string(m)].contains(std::to_string(k))) {
+  if (zero_skip["arg0"]) {
+    YAML::Node arg0 = zero_skip["arg0"];
+    std::string m_str = std::to_string(m);
+    std::string k_str = std::to_string(k);
+    if (arg0[m_str] && arg0[m_str][k_str]) {
       found_arg0 = true;
     }
   }
 
-  // Check arg1: n -> k
   bool found_arg1 = false;
-  if (attr_json["zero_skip"].contains("arg1")) {
-    auto& arg1 = attr_json["zero_skip"]["arg1"];
-    if (arg1.contains(std::to_string(k)) && arg1[std::to_string(k)].contains(std::to_string(n))) {
+  if (zero_skip["arg1"]) {
+    YAML::Node arg1 = zero_skip["arg1"];
+    std::string k_str = std::to_string(k);
+    std::string n_str = std::to_string(n);
+    if (arg1[k_str] && arg1[k_str][n_str]) {
       found_arg1 = true;
     }
   }
+
   return found_arg0 || found_arg1;
 }
 
@@ -692,8 +703,8 @@ void TileLoopNode::print_node() {
 }
 
 TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) {
-  loadConfig(attribute_path, _attribute_json);
-  loadConfig(config_path, _config_json);
+  loadConfig(attribute_path, _attribute_config);
+  loadConfig(config_path, _config_yaml);
   _attribute_path = attribute_path;
 
   if (!std::filesystem::exists(onnx_path)) {
@@ -705,32 +716,45 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   onnx::ModelProto model_proto;
 
   /* Attribute parsing */
-  if (_attribute_json.contains("address_info")) {
-    auto address_info = _attribute_json["address_info"];
-    for (auto it = address_info.begin(); it != address_info.end(); ++it) {
-      uint64_t value = it.value();
-      _arg_to_address[it.key()] = value;
-      spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", it.key(), value);
+  if (_attribute_config["address_info"]) {
+    const auto& address_info = _attribute_config["address_info"];
+    for (YAML::const_iterator it = address_info.begin(); it != address_info.end(); ++it) {
+      std::string key = it->first.as<std::string>();
+      uint64_t value = it->second.as<uint64_t>();
+
+      _arg_to_address[key] = value;
+      spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value);
     }
   }
-  if (_attribute_json.contains("address_numa_stride")) {
-    auto address_numa_stride = _attribute_json["address_numa_stride"];
-    for (auto it = address_numa_stride.begin(); it != address_numa_stride.end(); ++it) {
-      auto value_list = it.value();
-      for (auto value : value_list) {
-        _arg_numa_stride[it.key()].push_back(value);
+
+  if (_attribute_config["address_numa_stride"]) {
+    const auto& address_numa_stride = _attribute_config["address_numa_stride"];
+    for (YAML::const_iterator it = address_numa_stride.begin(); it != address_numa_stride.end(); ++it) {
+      std::string key = it->first.as<std::string>();
+      const auto& value_list = it->second; // YAML Sequence Node
+
+      for (const auto& val : value_list) {
+        _arg_numa_stride[key].push_back(val.as<uint32_t>());
       }
-      spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", it.key(), fmt::join(_arg_numa_stride[it.key()], ", "));
+      spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", "));
     }
   }
-  if (_attribute_json.contains("sram_alloc") and _config_json.contains("l2d_type") and _config_json["l2d_type"] == "datacache") {
-    auto sram_alloc_list = _attribute_json["sram_alloc"];
+
+  if (_attribute_config["sram_alloc"] &&
+      _config_yaml["l2d_type"] &&
+      _config_yaml["l2d_type"].as<std::string>() == "datacache") {
+
+    auto sram_alloc_list = _attribute_config["sram_alloc"];
     spdlog::info("[TOGParser/Attribute] ================= SRAM Alloc Plan ================");
-    for (auto it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {
-      auto value_list = it.value();
-      unsigned long long start = value_list.at(0);
-      unsigned long long end = value_list.at(1);
-      spdlog::info("[TOGParser/Attribute] {:16s}: 0x{:016x} ~ 0x{:016x}", it.key(), start, end);
+
+    for (YAML::const_iterator it = sram_alloc_list.begin(); it != sram_alloc_list.end(); ++it) {
+      std::string key = it->first.as<std::string>();
+      const auto& value_list = it->second; // List [start, end]
+
+      unsigned long long start = value_list[0].as<unsigned long long>();
+      unsigned long long end = value_list[1].as<unsigned long long>();
+
+      spdlog::info("[TOGParser/Attribute] {:16s}: 0x{:016x} ~ 0x{:016x}", key, start, end);
       Interval<unsigned long long, int> entry = {start, end, 0};
       _cache_plan.push_back(entry);
     }
@@ -838,7 +862,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   /* Iterate outer loop and initialize inner loop */
   for (auto iter=_tile_graph->begin(); iter!=_tile_graph->end(); ++iter) {
     std::shared_ptr<TileSubGraph> subgraph = std::make_shared<TileSubGraph>();
-    subgraph->set_core_id(getCoreIdFromJson(_attribute_json, subgraph->get_id()));
+    subgraph->set_core_id(getCoreIdFromConfig(_attribute_config, subgraph->get_id()));
     auto indices = iter.get_indices();
     for (auto loop : _loop_nodes.at(last_outer_idx)) {
       std::shared_ptr<TileLoopNode> outer_loop = std::static_pointer_cast<TileLoopNode>(loop);
@@ -941,11 +965,12 @@ const std::vector<uint32_t>& TileGraphParser::lookupNumaInfo(std::string key) {
   return _arg_numa_stride.at(key);
 }
 
-int TileGraphParser::getCoreIdFromJson(const json& attribute_json, int subgraph_id) {
-  if (attribute_json.contains("subgraph_map")) {
-    const auto& subgraph_map = attribute_json["subgraph_map"];
-    if (subgraph_map.contains(std::to_string(subgraph_id)) && subgraph_map[std::to_string(subgraph_id)].is_number_integer()) {
-        return subgraph_map[std::to_string(subgraph_id)];
+int TileGraphParser::getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id) {
+  std::string key = std::to_string(subgraph_id);
+  if (attribute_config["subgraph_map"]) {
+    const auto& subgraph_map = attribute_config["subgraph_map"];
+    if (subgraph_map[key]) {
+      return subgraph_map[key].as<int>();
     }
   }
   return -1;
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 77c1bae7..bee1b45f 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -22,11 +22,11 @@ void launchKernel(Simulator* simulator, std::string onnx_path, std::string attri
 }
 
 Simulator* create_simulator(std::string config_path) {
-  json config_json;
-  if(!loadConfig(config_path, config_json)) {
+  YAML::Node config_yaml;
+  if (!loadConfig(config_path, config_yaml))
     exit(1);
-  }
-  SimulationConfig config = initialize_config(config_json);
+  SimulationConfig config = initialize_config(config_yaml);
+
   auto simulator = new Simulator(config);
   return simulator;
 }

From d66df91d973b3a01dc2be81d763b0305569ad9e5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 09:06:58 +0000
Subject: [PATCH 021/194] [Frontend] Use YAML config file instead of json

---
 PyTorchSimFrontend/extension_config.py        | 30 +++++++------
 PyTorchSimFrontend/extension_op.py            |  2 +-
 README.md                                     |  8 ++--
 Simulator/simulator.py                        | 29 +++++++------
 TOGSim/include/Common.h                       |  1 +
 TOGSim/include/TileGraphParser.h              |  3 +-
 TOGSim/src/Common.cc                          | 17 ++++++++
 TOGSim/src/TileGraphParser.cc                 | 17 --------
 configs/heterogeneous_c2_simple_noc.json      | 40 -----------------
 configs/heterogeneous_c2_simple_noc.yml       | 37 ++++++++++++++++
 configs/stonne_big_c1_simple_noc.json         | 22 ----------
 configs/stonne_big_c1_simple_noc.yml          | 21 +++++++++
 configs/stonne_single_c1_simple_noc.json      | 22 ----------
 configs/stonne_single_c1_simple_noc.yml       | 21 +++++++++
 configs/stonne_validation_c1_simple_noc.json  | 23 ----------
 configs/stonne_validation_c1_simple_noc.yml   | 22 ++++++++++
 .../systolic_ws_128x128_c1_booksim_tpuv2.json | 29 -------------
 .../systolic_ws_128x128_c1_booksim_tpuv2.yml  | 26 +++++++++++
 .../systolic_ws_128x128_c1_booksim_tpuv3.json | 32 --------------
 .../systolic_ws_128x128_c1_booksim_tpuv3.yml  | 30 +++++++++++++
 ...stolic_ws_128x128_c1_simple_noc_tpuv2.json | 31 -------------
 ...ystolic_ws_128x128_c1_simple_noc_tpuv2.yml | 29 +++++++++++++
 ...stolic_ws_128x128_c1_simple_noc_tpuv3.json | 32 --------------
 ...ystolic_ws_128x128_c1_simple_noc_tpuv3.yml | 30 +++++++++++++
 ...c_ws_128x128_c1_simple_noc_tpuv3_half.json | 32 --------------
 ...ic_ws_128x128_c1_simple_noc_tpuv3_half.yml | 30 +++++++++++++
 ...stolic_ws_128x128_c1_simple_noc_tpuv4.json | 34 ---------------
 ...ystolic_ws_128x128_c1_simple_noc_tpuv4.yml | 32 ++++++++++++++
 .../systolic_ws_128x128_c2_booksim_tpuv3.json | 32 --------------
 .../systolic_ws_128x128_c2_booksim_tpuv3.yml  | 30 +++++++++++++
 ...s_128x128_c2_booksim_tpuv3_bw_quarter.json | 43 -------------------
 ...ws_128x128_c2_booksim_tpuv3_bw_quarter.yml | 39 +++++++++++++++++
 .../systolic_ws_128x128_c2_chiplet_tpuv3.json | 34 ---------------
 .../systolic_ws_128x128_c2_chiplet_tpuv3.yml  | 32 ++++++++++++++
 ...lic_ws_128x128_c2_chiplet_tpuv3_xnuma.json | 33 --------------
 ...olic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml | 31 +++++++++++++
 ...stolic_ws_128x128_c2_simple_noc_tpuv2.json | 31 -------------
 ...ystolic_ws_128x128_c2_simple_noc_tpuv2.yml | 29 +++++++++++++
 ...stolic_ws_128x128_c2_simple_noc_tpuv3.json | 32 --------------
 ...ystolic_ws_128x128_c2_simple_noc_tpuv3.yml | 30 +++++++++++++
 ...128x128_c2_simple_noc_tpuv3_partition.json | 38 ----------------
 ..._128x128_c2_simple_noc_tpuv3_partition.yml | 34 +++++++++++++++
 ...stolic_ws_128x128_c2_simple_noc_tpuv4.json | 34 ---------------
 ...ystolic_ws_128x128_c2_simple_noc_tpuv4.yml | 32 ++++++++++++++
 configs/systolic_ws_8x8_c1_booksim.json       | 29 -------------
 configs/systolic_ws_8x8_c1_booksim.yml        | 27 ++++++++++++
 configs/systolic_ws_8x8_c1_simple_noc.json    | 30 -------------
 configs/systolic_ws_8x8_c1_simple_noc.yml     | 28 ++++++++++++
 experiments/BERT.py                           |  2 +-
 .../artifact/cycle_validation/run_cycle.sh    |  2 +-
 experiments/artifact/speedup/run_speedup.sh   |  4 +-
 .../speedup/scripts/run_speed_ils_bert.sh     |  8 ++--
 .../speedup/scripts/run_speed_ils_conv.sh     |  8 ++--
 .../speedup/scripts/run_speed_ils_matmul.sh   |  8 ++--
 .../speedup/scripts/run_speed_ils_resnet.sh   |  8 ++--
 experiments/attention.py                      |  2 +-
 experiments/conv.py                           |  2 +-
 experiments/gemm.py                           |  2 +-
 experiments/layernorm.py                      |  2 +-
 experiments/resnet18.py                       |  2 +-
 experiments/resnet50.py                       |  2 +-
 experiments/softmax.py                        |  2 +-
 scripts/CompilerOpt_experiment/DMAopt.sh      |  2 +-
 scripts/chiplet.sh                            | 10 ++---
 scripts/chiplet_prep.py                       | 11 +++--
 scripts/sparsity_experiment/run.sh            | 12 +++---
 scripts/stonne_experiment/run.sh              |  6 +--
 scripts/stonne_experiment2/tog_gen.py         |  2 +-
 tests/test_compile_overhead.py                |  2 +-
 tests/test_hetro.py                           |  2 +-
 tests/test_scheduler.py                       |  2 +-
 tests/test_scheduler_batching.py              |  2 +-
 tutorial/session1/CompilerOptimization.ipynb  |  4 +-
 tutorial/session1/ExecutionMode.ipynb         |  8 ++--
 tutorial/session1/LogAnalysis.ipynb           |  2 +-
 tutorial/session1/Mapping.ipynb               |  4 +-
 76 files changed, 708 insertions(+), 745 deletions(-)
 delete mode 100644 configs/heterogeneous_c2_simple_noc.json
 create mode 100644 configs/heterogeneous_c2_simple_noc.yml
 delete mode 100644 configs/stonne_big_c1_simple_noc.json
 create mode 100644 configs/stonne_big_c1_simple_noc.yml
 delete mode 100644 configs/stonne_single_c1_simple_noc.json
 create mode 100644 configs/stonne_single_c1_simple_noc.yml
 delete mode 100644 configs/stonne_validation_c1_simple_noc.json
 create mode 100644 configs/stonne_validation_c1_simple_noc.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv2.json
 create mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv3.json
 create mode 100644 configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
 create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
 create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
 create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
 delete mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
 create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3.json
 create mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json
 create mode 100644 configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
 create mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
 create mode 100644 configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
 create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
 create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json
 create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
 delete mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
 create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
 delete mode 100644 configs/systolic_ws_8x8_c1_booksim.json
 create mode 100644 configs/systolic_ws_8x8_c1_booksim.yml
 delete mode 100644 configs/systolic_ws_8x8_c1_simple_noc.json
 create mode 100644 configs/systolic_ws_8x8_c1_simple_noc.yml

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 8d668b58..ab8aea69 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -1,7 +1,7 @@
 import os
 import sys
 import importlib
-import json
+import yaml
 
 CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt")
@@ -13,51 +13,53 @@
 def __getattr__(name):
     # TOGSim config
     config_path = os.environ.get('TOGSIM_CONFIG',
-                default=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json")
+                default=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml")
     if name == "CONFIG_TOGSIM_CONFIG":
         return config_path
-    config_json = json.load(open(config_path, 'r'))
+
+    with open(config_path, 'r') as f:
+        config_yaml = yaml.safe_load(f)
 
     # Hardware info config
     if name == "vpu_num_lanes":
-        return config_json["vpu_num_lanes"]
+        return config_yaml["vpu_num_lanes"]
     if name == "CONFIG_SPAD_INFO":
         return {
           "spad_vaddr" : 0xD0000000,
           "spad_paddr" : 0x2000000000,
-          "spad_size" : config_json["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane
+          "spad_size" : config_yaml["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane
         }
 
     if name == "CONFIG_PRECISION":
         return 4 # 32bit
     if name == "CONFIG_NUM_CORES":
-        return config_json["num_cores"]
+        return config_yaml["num_cores"]
     if name == "vpu_vector_length_bits":
-        return config_json["vpu_vector_length_bits"]
+        return config_yaml["vpu_vector_length_bits"]
 
     if name == "pytorchsim_functional_mode":
-        return config_json['pytorchsim_functional_mode']
+        return config_yaml['pytorchsim_functional_mode']
     if name == "pytorchsim_timing_mode":
-        return config_json['pytorchsim_timing_mode']
+        return config_yaml['pytorchsim_timing_mode']
 
     # Mapping strategy
     if name == "codegen_mapping_strategy":
-        codegen_mapping_strategy = config_json["codegen_mapping_strategy"]
+        codegen_mapping_strategy = config_yaml["codegen_mapping_strategy"]
         assert(codegen_mapping_strategy in ["heuristic", "autotune", "external-then-heuristic", "external-then-autotune"]), "Invalid mapping strategy!"
         return codegen_mapping_strategy
 
     if name == "codegen_external_mapping_file":
-        return config_json["codegen_external_mapping_file"]
+        return config_yaml["codegen_external_mapping_file"]
 
     # Autotune config
     if name == "codegen_autotune_max_retry":
-        return config_json["codegen_autotune_max_retry"]
+        return config_yaml["codegen_autotune_max_retry"]
     if name == "codegen_autotune_template_topk":
-        return config_json["codegen_autotune_template_topk"]
+        return config_yaml["codegen_autotune_template_topk"]
 
     # Compiler Optimization
     if name == "codegen_compiler_optimization":
-        opt_level = config_json["codegen_compiler_optimization"]
+        opt_level = config_yaml["codegen_compiler_optimization"]
         valid_opts = {
             "fusion",
             "reduction_epilogue",
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 786e7398..18bf65c3 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -276,7 +276,7 @@ def sparse_mm_stonne_outer(a, b, out):
     onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out)
 
     togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
-    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.json'
+    stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.yml'
     TOGSim = TOGSimulator(togsim_path, stonne_config_path)
     result_path = TOGSim.simulation(onnx_path)
     TOGSimulator.get_result_from_file(result_path)
diff --git a/README.md b/README.md
index 103131c1..4d98baa4 100644
--- a/README.md
+++ b/README.md
@@ -220,7 +220,7 @@ Our load generator supports multi-tenancy experiments. You can run a simple exam
 python tests/test_scheduler.py
 ```
 Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`.
-In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.json`). The compiled PyTorch models are then registered with a unique model id.
+In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.yml`). The compiled PyTorch models are then registered with a unique model id.
 
 ```python3
 import os
@@ -228,7 +228,7 @@ import sys
 import torch
 from torchvision.models import resnet18
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json'
+config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
 
 sys.path.append(base_path)
 from tests.test_transformer import EncoderBlock
@@ -244,7 +244,7 @@ SchedulerDNNModel.register_model("model0", opt_model0)
 SchedulerDNNModel.register_model("model1", opt_model1)
 ```
 
-The config file(`.json`) specifies two key items:
+The config file(`.yml`) specifies two key items:
 - `num_partition`: The total number of independent request queues to create.
 - `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core.
 For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`:
@@ -415,7 +415,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 ```
 You can set TOGSim config path as below.
 ```bash
-export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
 ```
 ## Future Works
 Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon.
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 4786fd32..a46243f0 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -4,7 +4,7 @@
 import subprocess
 import re
 import sys
-import json
+import yaml
 import time
 import datetime
 import threading
@@ -204,7 +204,7 @@ class TOGSimulator():
     def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None:
         self.base_dir = togsim_path
         self.config_path = config_path
-        self.config_json = self.load_json(self.config_path)
+        self.config_yaml = self.load_yaml(self.config_path)
         self.process = None
         self.vectorlane_size = vectorlane_size
 
@@ -347,40 +347,41 @@ def sram_dealloc(cls, buf_name, addr_range):
     def create_attribute_file(self, attribute_path, inputs, **kwargs):
         address_info = {}
         sram_buffer = {}
-        json_content = {}
+        yaml_content = {}
+
         os.makedirs(attribute_path, exist_ok=True)
         index = str(len(os.listdir(attribute_path)))
         attribute_path = os.path.join(attribute_path, index)
 
         for idx, tensor in enumerate(inputs):
             address_info[f"arg{idx}"] = tensor.data_ptr()
-        json_content["address_info"] = address_info
+        yaml_content["address_info"] = address_info
 
         for buf_name, range in self.ALLOC_POOL.items():
             sram_buffer[buf_name] = range
-        json_content["sram_alloc"] = sram_buffer
+        yaml_content["sram_alloc"] = sram_buffer
 
         with open(attribute_path, "w") as f:
-            json.dump(json_content, f, indent=4)
+            yaml.dump(yaml_content, f, default_flow_style=False)
             f.flush()
             os.fsync(f.fileno()) # There could be a race condition.
         return attribute_path
 
-    def load_json(self, config_path):
+    def load_yaml(self, config_path):
         config_path = Path(config_path)
         if not config_path.is_file():
-            raise FileNotFoundError(f"JSON file not found: {config_path}")
+            raise FileNotFoundError(f"YAML file not found: {config_path}")
 
         try:
             with open(config_path, "r") as file:
-                data = json.load(file)
+                data = yaml.safe_load(file)
                 return data
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Invalid JSON format: {e}")
+        except yaml.YAMLError as e:
+            raise ValueError(f"Invalid YAML format: {e}")
 
     def get_core_freq(self):
-        if "core_freq_mhz" in self.config_json:
-            return self.config_json["core_freq_mhz"] * 1000 * 1000 # MHz
+        if "core_freq_mhz" in self.config_yaml:
+            return self.config_yaml["core_freq_mhz"] * 1000 * 1000 # MHz
         else:
             raise KeyError("Key 'core_freq' not found in JSON.")
 
@@ -462,6 +463,6 @@ def get_result_from_file(result_path):
         return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle
 
 if __name__ == "__main__":
-    sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json")
+    sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml")
     sim.interactive_simulation()
     sim.until(4000)
\ No newline at end of file
diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h
index c62c3e0b..2fd62681 100644
--- a/TOGSim/include/Common.h
+++ b/TOGSim/include/Common.h
@@ -27,4 +27,5 @@
 typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
 
+bool loadConfig(const std::string& config_path, YAML::Node& config_yaml);
 SimulationConfig initialize_config(YAML::Node config);
\ No newline at end of file
diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h
index 07e5b212..9c176966 100644
--- a/TOGSim/include/TileGraphParser.h
+++ b/TOGSim/include/TileGraphParser.h
@@ -9,6 +9,7 @@
 #include "Instruction.h"
 #include "sstStonne.h"
 #include "IntervalTree.h"
+#include "Common.h"
 #include "onnx/defs/schema.h"
 #include "onnx/onnx-operators_pb.h"
 #include "onnx/onnx_pb.h"
@@ -33,8 +34,6 @@ enum class LoopType {
   INNER_LOOP
 };
 
-bool loadConfig(const std::string& config_path, YAML::Node& config_yaml);
-
 class TileNode {
  public:
   TileNode(onnx::NodeProto& node);
diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index 63d360c6..b15381a6 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -1,5 +1,22 @@
 #include "Common.h"
 
+bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) {
+  try {
+    config_yaml = YAML::LoadFile(config_path);
+    spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
+    return true;
+  } catch (const YAML::BadFile& e) {
+    spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path);
+    return false;
+  } catch (const YAML::ParserException& e) {
+    spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", config_path, e.what());
+    return false;
+  } catch (const std::exception& e) {
+    spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what());
+    return false;
+  }
+}
+
 template <typename T>
 T get_config_value(const YAML::Node& config, std::string key) {
   if (config[key]) {
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index ae8954d9..515f6247 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -1,22 +1,5 @@
 #include "TileGraphParser.h"
 
-bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) {
-  try {
-    config_yaml = YAML::LoadFile(config_path);
-    spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
-    return true;
-  } catch (const YAML::BadFile& e) {
-    spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path);
-    return false;
-  } catch (const YAML::ParserException& e) {
-    spdlog::error("[LoadConfig] Failed to parse YAML file \"{}\": {}", config_path, e.what());
-    return false;
-  } catch (const std::exception& e) {
-    spdlog::error("[LoadConfig] Unknown error loading \"{}\": {}", config_path, e.what());
-    return false;
-  }
-}
-
 void printIndexMap(std::string prefix, const std::map<std::string, int>& indexMap) {
     std::ostringstream oss;
     for (const auto& [key, value] : indexMap) {
diff --git a/configs/heterogeneous_c2_simple_noc.json b/configs/heterogeneous_c2_simple_noc.json
deleted file mode 100644
index a68f38c2..00000000
--- a/configs/heterogeneous_c2_simple_noc.json
+++ /dev/null
@@ -1,40 +0,0 @@
-{
-  "core_type" : ["stonne", "ws_mesh"],
-  "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-
-  "num_stonne_per_core" : 8,
-  "num_stonne_port" : 64,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":1
-  },
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/heterogeneous_c2_simple_noc.yml b/configs/heterogeneous_c2_simple_noc.yml
new file mode 100644
index 00000000..9c596d85
--- /dev/null
+++ b/configs/heterogeneous_c2_simple_noc.yml
@@ -0,0 +1,37 @@
+core_type:
+- stonne
+- ws_mesh
+stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_stonne_per_core: 8
+num_stonne_port: 64
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+num_partition: 2
+partition:
+  core_0: 0
+  core_1: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/stonne_big_c1_simple_noc.json b/configs/stonne_big_c1_simple_noc.json
deleted file mode 100644
index 0a8ca3c2..00000000
--- a/configs/stonne_big_c1_simple_noc.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "core_type" : ["stonne"],
-  "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_stonne_per_core" : 8,
-  "num_stonne_port" : 64,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 8,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycless": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16
-}
\ No newline at end of file
diff --git a/configs/stonne_big_c1_simple_noc.yml b/configs/stonne_big_c1_simple_noc.yml
new file mode 100644
index 00000000..b14838c8
--- /dev/null
+++ b/configs/stonne_big_c1_simple_noc.yml
@@ -0,0 +1,21 @@
+core_type:
+- stonne
+stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_stonne_per_core: 8
+num_stonne_port: 64
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 8
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycless: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
diff --git a/configs/stonne_single_c1_simple_noc.json b/configs/stonne_single_c1_simple_noc.json
deleted file mode 100644
index 3421d4f1..00000000
--- a/configs/stonne_single_c1_simple_noc.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "core_type" : ["stonne"],
-  "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "num_cores" : 1,
-  "core_freq_mhz" : 700,
-  "core_stats_print_period_cycles" : 10000,
-  "num_stonne_per_core" : 1,
-  "num_stonne_port" : 8,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 700,
-  "dram_channels": 8,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 700,
-  "icnt_injection_ports_per_core" : 8
-}
\ No newline at end of file
diff --git a/configs/stonne_single_c1_simple_noc.yml b/configs/stonne_single_c1_simple_noc.yml
new file mode 100644
index 00000000..0ed7962c
--- /dev/null
+++ b/configs/stonne_single_c1_simple_noc.yml
@@ -0,0 +1,21 @@
+core_type:
+- stonne
+stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg
+num_cores: 1
+core_freq_mhz: 700
+core_stats_print_period_cycles: 10000
+num_stonne_per_core: 1
+num_stonne_port: 8
+
+dram_type: ramulator2
+dram_freq_mhz: 700
+dram_channels: 8
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 700
+icnt_injection_ports_per_core: 8
diff --git a/configs/stonne_validation_c1_simple_noc.json b/configs/stonne_validation_c1_simple_noc.json
deleted file mode 100644
index fb196dfb..00000000
--- a/configs/stonne_validation_c1_simple_noc.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "core_type" : ["stonne"],
-  "stonne_config_path" : "/workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg",
-  "num_cores" : 1,
-  "core_freq_mhz" : 1000,
-  "core_stats_print_period_cycles" : 10000,
-  "num_stonne_per_core" : 1,
-  "num_stonne_port" : 32,
-
-  "dram_type" : "simple",
-  "dram_freq_mhz" : 1000,
-  "dram_channels": 1,
-  "dram_req_size_byte": 32,
-  "dram_latency" : 100,
-  "dram_stats_print_period_cycles": 10000,
-  "l2d_type" : "datacache",
-  "l2d_config" : "S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 1000,
-  "icnt_injection_ports_per_core" : 8
-}
\ No newline at end of file
diff --git a/configs/stonne_validation_c1_simple_noc.yml b/configs/stonne_validation_c1_simple_noc.yml
new file mode 100644
index 00000000..f86dcce1
--- /dev/null
+++ b/configs/stonne_validation_c1_simple_noc.yml
@@ -0,0 +1,22 @@
+core_type:
+- stonne
+stonne_config_path: /workspace/PyTorchSim/TOGSim/extern/stonneCore/tests/sparseflex_op_128mses_128_bw.cfg
+num_cores: 1
+core_freq_mhz: 1000
+core_stats_print_period_cycles: 10000
+num_stonne_per_core: 1
+num_stonne_port: 32
+
+dram_type: simple
+dram_freq_mhz: 1000
+dram_channels: 1
+dram_req_size_byte: 32
+dram_latency: 100
+dram_stats_print_period_cycles: 10000
+l2d_type: datacache
+l2d_config: S:128:128:64,32,L:T:m:W:L,A:192:4,32:0,32
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 1000
+icnt_injection_ports_per_core: 8
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.json b/configs/systolic_ws_128x128_c1_booksim_tpuv2.json
deleted file mode 100644
index 686827dc..00000000
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 700,
-  "core_stats_print_period_cycles" : 10000,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :700,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 700,
-  "icnt_injection_ports_per_core" : 16,
-  "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt",
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
new file mode 100644
index 00000000..08149005
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
@@ -0,0 +1,26 @@
+num_cores: 1
+core_freq_mhz: 700
+core_stats_print_period_cycles: 10000
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 700
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 700
+icnt_injection_ports_per_core: 16
+booksim_config_path: ../configs/booksim2_configs/fly_c16_m16.icnt
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.json b/configs/systolic_ws_128x128_c1_booksim_tpuv3.json
deleted file mode 100644
index 1109dc0f..00000000
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-  "booksim_config_path" : "../configs/booksim2_configs/fly_c16_m16.icnt",
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
new file mode 100644
index 00000000..12304ce2
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+booksim_config_path: ../configs/booksim2_configs/fly_c16_m16.icnt
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
deleted file mode 100644
index 22aedcf8..00000000
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 700,
-  "core_stats_print_period_cycles" : 10000,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 700,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycless": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 700,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
new file mode 100644
index 00000000..aec29ff8
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
@@ -0,0 +1,29 @@
+num_cores: 1
+core_freq_mhz: 700
+core_stats_print_period_cycles: 10000
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 700
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycless: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 700
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
deleted file mode 100644
index e8e489d9..00000000
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
new file mode 100644
index 00000000..72873f1c
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
deleted file mode 100644
index 980bfc73..00000000
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 8,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
new file mode 100644
index 00000000..c2e962e3
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 8
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
deleted file mode 100644
index 02bfd75c..00000000
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 1050,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 4,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :1200,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-  "l2d_type" : "datacache",
-  "l2d_config" : "S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 1050,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
new file mode 100644
index 00000000..0415876d
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
@@ -0,0 +1,32 @@
+num_cores: 1
+core_freq_mhz: 1050
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 4
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 1200
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+l2d_type: datacache
+l2d_config: S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 1050
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
deleted file mode 100644
index 66566324..00000000
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-  "booksim_config_path" : "../configs/booksim2_configs/fly_c32_m32.icnt",
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
new file mode 100644
index 00000000..e411c0f3
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
@@ -0,0 +1,30 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+booksim_config_path: ../configs/booksim2_configs/fly_c32_m32.icnt
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json
deleted file mode 100644
index 8ef47e87..00000000
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "sram_size" : 65536,
-  "core_print_interval" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq" : 940,
-  "dram_channels": 8,
-  "dram_req_size": 32,
-  "dram_latency" : 10,
-  "dram_nbl" : 2,
-  "dram_print_interval": 10000,
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
- 
-  "icnt_type" : "booksim2",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq" : 940,
-  "icnt_injection_ports_per_core" : 16,
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c32_m8.icnt",
- 
-  "precision" : 4,
-  "scheduler" : "simple",
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":0
-  },
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
new file mode 100644
index 00000000..f164b108
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
@@ -0,0 +1,39 @@
+num_cores: 2
+core_freq_mhz: 940
+sram_size: 65536
+core_print_interval: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq: 940
+dram_channels: 8
+dram_req_size: 32
+dram_latency: 10
+dram_nbl: 2
+dram_print_interval: 10000
+dram_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: booksim2
+icnt_latency_cycles: 10
+icnt_freq: 940
+icnt_injection_ports_per_core: 16
+icnt_config_path: ../configs/booksim2_configs/fly_c32_m8.icnt
+precision: 4
+scheduler: simple
+num_partition: 2
+partition:
+  core_0: 0
+  core_1: 0
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
deleted file mode 100644
index ecd671bf..00000000
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "dram_num_partitions" : 2,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 1000,
-  "icnt_injection_ports_per_core" : 16,
-  "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
-  "icnt_stats_print_period_cycles" : 10000,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
new file mode 100644
index 00000000..e38f091f
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
@@ -0,0 +1,32 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+dram_num_partitions: 2
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 1000
+icnt_injection_ports_per_core: 16
+booksim_config_path: ../configs/booksim2_configs/chiplet_32_32_2.icnt
+icnt_stats_print_period_cycles: 10000
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
deleted file mode 100644
index 168fbe3a..00000000
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "dram_num_partitions" : 1,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 1000,
-  "icnt_injection_ports_per_core" : 16,
-  "booksim_config_path" : "../configs/booksim2_configs/chiplet_32_32_2.icnt",
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
new file mode 100644
index 00000000..57696243
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
@@ -0,0 +1,31 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+dram_num_partitions: 1
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 1000
+icnt_injection_ports_per_core: 16
+booksim_config_path: ../configs/booksim2_configs/chiplet_32_32_2.icnt
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
deleted file mode 100644
index 0a5f15b2..00000000
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 700,
-  "core_stats_print_period_cycles" : 10000,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :700,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 700,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
new file mode 100644
index 00000000..f0686055
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
@@ -0,0 +1,29 @@
+num_cores: 2
+core_freq_mhz: 700
+core_stats_print_period_cycles: 10000
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 700
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 700
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
deleted file mode 100644
index f099b93d..00000000
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
new file mode 100644
index 00000000..511a5a09
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
@@ -0,0 +1,30 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json
deleted file mode 100644
index 681ef884..00000000
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "num_partition" : 2,
-  "partition": {
-    "core_0":0,
-    "core_1":1
-  },
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
new file mode 100644
index 00000000..499ad823
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
@@ -0,0 +1,34 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+num_partition: 2
+partition:
+  core_0: 0
+  core_1: 1
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
deleted file mode 100644
index d09228a1..00000000
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 1050,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 4,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :1200,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2.yaml",
-  "l2d_type" : "datacache",
-  "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 1050,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
new file mode 100644
index 00000000..da40f01e
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
@@ -0,0 +1,32 @@
+num_cores: 2
+core_freq_mhz: 1050
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 4
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 1200
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+l2d_type: datacache
+l2d_config: S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 1050
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_8x8_c1_booksim.json b/configs/systolic_ws_8x8_c1_booksim.json
deleted file mode 100644
index 851664e6..00000000
--- a/configs/systolic_ws_8x8_c1_booksim.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 800,
-  "core_stats_print_period_cycles" : 100000,
-
-  "vpu_num_lanes" : 8,
-  "vpu_spad_size_kb_per_lane" : 32,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :800,
-  "dram_channels": 1,
-  "dram_req_size_byte": 64,
-  "dram_num_burst_length" : 4,
-  "dram_stats_print_period_cycles": 100000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
-
-  "icnt_type" : "booksim2",
-  "icnt_freq_mhz" : 800,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml
new file mode 100644
index 00000000..6fd305f9
--- /dev/null
+++ b/configs/systolic_ws_8x8_c1_booksim.yml
@@ -0,0 +1,27 @@
+num_cores: 1
+core_freq_mhz: 800
+core_stats_print_period_cycles: 100000
+
+vpu_num_lanes: 8
+vpu_spad_size_kb_per_lane: 32
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 800
+dram_channels: 1
+dram_req_size_byte: 64
+dram_num_burst_length: 4
+dram_stats_print_period_cycles: 100000
+ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml
+
+icnt_type: booksim2
+icnt_freq_mhz: 800
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/configs/systolic_ws_8x8_c1_simple_noc.json b/configs/systolic_ws_8x8_c1_simple_noc.json
deleted file mode 100644
index 2eb7e183..00000000
--- a/configs/systolic_ws_8x8_c1_simple_noc.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 800,
-  "core_stats_print_period_cycles" : 100000,
-
-  "vpu_num_lanes" : 8,
-  "vpu_spad_size_kb_per_lane" : 32,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" :800,
-  "dram_channels": 1,
-  "dram_req_size_byte": 64,
-  "dram_num_burst_length" : 4,
-  "dram_stats_print_period_cycles": 100000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/DDR4.yaml",
- 
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 800,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml
new file mode 100644
index 00000000..274f633c
--- /dev/null
+++ b/configs/systolic_ws_8x8_c1_simple_noc.yml
@@ -0,0 +1,28 @@
+num_cores: 1
+core_freq_mhz: 800
+core_stats_print_period_cycles: 100000
+
+vpu_num_lanes: 8
+vpu_spad_size_kb_per_lane: 32
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 800
+dram_channels: 1
+dram_req_size_byte: 64
+dram_num_burst_length: 4
+dram_stats_print_period_cycles: 100000
+ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 800
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/experiments/BERT.py b/experiments/BERT.py
index 3311682c..5ccd3084 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -36,7 +36,7 @@ def run_BERT(size, input_seq, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
index 99eed4ed..9cfd1e98 100755
--- a/experiments/artifact/cycle_validation/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json
+export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 mkdir -p $LOG_DIR
 
diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh
index 9a19e9af..e84ab1a9 100755
--- a/experiments/artifact/speedup/run_speedup.sh
+++ b/experiments/artifact/speedup/run_speedup.sh
@@ -4,8 +4,8 @@ CONFIG_DIR="$TORCHSIM_DIR/configs"
 SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator"
 
 configs=(
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
-    "systolic_ws_128x128_c2_booksim_tpuv3.json"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
+    "systolic_ws_128x128_c2_booksim_tpuv3.yml"
 )
 
 target_list=(
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
index fe872e02..467949af 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
@@ -2,10 +2,10 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.json"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+    # "systolic_ws_8x8_c1_simple_noc.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SIZE_LIST=(
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
index 19613a34..fb681c74 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
@@ -2,10 +2,10 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.json"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+    # "systolic_ws_8x8_c1_simple_noc.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SHAPE_LIST=(
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
index 6f3385f1..dc0fdd20 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
@@ -2,10 +2,10 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.json"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+    # "systolic_ws_8x8_c1_simple_noc.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SHAPE_LIST=(
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
index ca4cfa39..2346ab3c 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
@@ -2,10 +2,10 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.json"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.json"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.json"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.json"
+    # "systolic_ws_8x8_c1_simple_noc.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
+    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
+    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SIZE_LIST=(
diff --git a/experiments/attention.py b/experiments/attention.py
index bbd2734e..842f105a 100644
--- a/experiments/attention.py
+++ b/experiments/attention.py
@@ -36,7 +36,7 @@ def attention(query, key, value):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/conv.py b/experiments/conv.py
index f439c5e3..25952fb0 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -37,7 +37,7 @@ def custom_conv2d(a, b, bias):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/gemm.py b/experiments/gemm.py
index e92200d1..3090e331 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -31,7 +31,7 @@ def custom_matmul(a, b):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml)
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
index 74b6d286..9c9934a1 100644
--- a/experiments/layernorm.py
+++ b/experiments/layernorm.py
@@ -27,7 +27,7 @@ def run_layernorm(size, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 45311d59..5451e0f5 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -29,7 +29,7 @@ def run_resnet(batch, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 4f03ea15..83d82db4 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -29,7 +29,7 @@ def run_resnet(batch, config):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/experiments/softmax.py b/experiments/softmax.py
index b47bd685..580d56ca 100644
--- a/experiments/softmax.py
+++ b/experiments/softmax.py
@@ -27,7 +27,7 @@ def run_softmax(size, config, dim=1):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.json')
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()
diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh
index 5c2dc65c..9e494d9b 100644
--- a/scripts/CompilerOpt_experiment/DMAopt.sh
+++ b/scripts/CompilerOpt_experiment/DMAopt.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json"
+export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml"
 
 # None FG DMA
 export TORCHSIM_SUBTILE=0
diff --git a/scripts/chiplet.sh b/scripts/chiplet.sh
index 0d56ecae..e622874b 100755
--- a/scripts/chiplet.sh
+++ b/scripts/chiplet.sh
@@ -19,11 +19,11 @@ GEMM_DIR_NAME=$(basename "$GEMM_PATH")
 echo "GEMM Directory Name: $GEMM_DIR_NAME"
 
 CONFIG_LIST=(
-    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3.json"
+    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml"
 )
 CONFIG_LIST2=(
-    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_booksim_tpuv3.json"
-    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.json"
+    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml"
+    "$TORCHSIM_DIR/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml"
 )
 shift
 shift
@@ -39,7 +39,7 @@ MODELS_LIST="$GEMM_PATH/tile_graph.onnx"
 ATTRIBUTE_PATH="$GEMM_PATH/runtime_0000/attribute"
 
 for CONFIG in "${CONFIG_LIST[@]}"; do
-    CONFIG_NAME=$(basename "$CONFIG" .json)
+    CONFIG_NAME=$(basename "$CONFIG" .yml)
 
     for ATTRIBUTE_FILE in "${ATTRIBUTE_FILES[@]}"; do
         ATTRIBUTE_NAME=$(basename "$ATTRIBUTE_FILE")
@@ -56,7 +56,7 @@ for CONFIG in "${CONFIG_LIST[@]}"; do
 done
 
 for CONFIG in "${CONFIG_LIST2[@]}"; do
-    CONFIG_NAME=$(basename "$CONFIG" .json)
+    CONFIG_NAME=$(basename "$CONFIG" .yml)
     ATTRIBUTE_NAME=0
     RESULTS_DIR="./chiplet_results$INDEX_NAME/$GEMM_DIR_NAME/$ATTRIBUTE_NAME"
     mkdir -p "$RESULTS_DIR"
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
index 32f7ad50..4f8b7f7c 100644
--- a/scripts/chiplet_prep.py
+++ b/scripts/chiplet_prep.py
@@ -1,5 +1,5 @@
 import os
-import json
+import yaml
 import shutil
 import argparse
 import torch
@@ -41,9 +41,11 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
     if not os.path.exists(file_path):
         print(f"File {file_path} does not exist.")
         return
+
     with open(file_path, 'r') as f:
-        data = json.load(f)
-    # address_numa_stride와 subgraph_map 추가
+        data = yaml.safe_load(f)
+
+    # address_numa_stride, subgraph_map
     if address_numa_stride:
         data['address_numa_stride'] = address_numa_stride
     if subgraph_map:
@@ -52,8 +54,9 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
     output_path = file_path = os.path.join(dump_path, 'runtime_0000', 'attribute')
     os.makedirs(output_path, exist_ok=True)
     output_file = os.path.join(output_path, name)
+
     with open(output_file, 'w') as f:
-        json.dump(data, f, indent=4)
+        yaml.dump(data, f, default_flow_style=False, sort_keys=False)
     print(f"Modified file saved to {output_file}")
 
 if __name__ == "__main__":
diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh
index 4f5dd3a6..84c818ac 100755
--- a/scripts/sparsity_experiment/run.sh
+++ b/scripts/sparsity_experiment/run.sh
@@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8
 export TORCHSIM_FORCE_TIME_N=8
 
 OUTPUT_DIR="12GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="12GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.json"
+export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
diff --git a/scripts/stonne_experiment/run.sh b/scripts/stonne_experiment/run.sh
index 1825817f..2e386d9c 100755
--- a/scripts/stonne_experiment/run.sh
+++ b/scripts/stonne_experiment/run.sh
@@ -2,8 +2,8 @@
 export TORCHSIM_FORCE_TIME_M=1024
 export TORCHSIM_FORCE_TIME_K=1024
 export TORCHSIM_FORCE_TIME_N=1024
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.json --mode 0 > hetero/big_sparse.log
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.json --mode 1 > hetero/big.log
-python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.json --mode 2 > hetero/hetero.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config stonne_big_c1_simple_noc.yml --mode 0 > hetero/big_sparse.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml --mode 1 > hetero/big.log
+python3 ../../tests/test_hetro.py --M 1024 --N 1024 --K 1024 --sparsity 0.9 --config heterogeneous_c2_simple_noc.yml --mode 2 > hetero/hetero.log
 
 echo "All processes completed!"
diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py
index d4f93d4d..e8013da7 100644
--- a/scripts/stonne_experiment2/tog_gen.py
+++ b/scripts/stonne_experiment2/tog_gen.py
@@ -72,7 +72,7 @@ def extract_simulation_stats(result_path):
             continue
         tog_path = os.path.join(path, "tile_graph.onnx")
         togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
-        stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.json'
+        stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.yml'
         backsim = TOGSimulator(togsim_path, stonne_config_path)
         result_path = backsim.simulation(tog_path)
         nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path)
diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py
index 030f548e..449707a5 100644
--- a/tests/test_compile_overhead.py
+++ b/tests/test_compile_overhead.py
@@ -21,7 +21,7 @@
         #    shutil.rmtree("/tmp/torchinductor")
         #except FileNotFoundError:
         #    print("no cache")
-        scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
+        scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml")
         # Register compiled model
         opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
         SchedulerDNNModel.register_model("resnet18", opt_model1)
diff --git a/tests/test_hetro.py b/tests/test_hetro.py
index a0716e2d..9fac8c65 100644
--- a/tests/test_hetro.py
+++ b/tests/test_hetro.py
@@ -17,7 +17,7 @@ def custom_matmul(a, b):
     parser.add_argument("--N", type=int, default=128, help="Input layer size")
     parser.add_argument("--K", type=int, default=128, help="Hidden layer size")
     parser.add_argument("--sparsity", type=float, default=0.9, help="Output layer size")
-    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.json", help="Output layer size")
+    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="Output layer size")
     parser.add_argument("--mode", type=int, default=0, help="Output layer size")
     args = parser.parse_args()
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 4860de56..9c7ca255 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -7,7 +7,7 @@
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 sys.path.append(base_path)
 from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.json'
+config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
 
 target_model1 = model1().eval()
 target_model2 = model2(768, 12).eval()
diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
index 53f9256d..65213ef0 100644
--- a/tests/test_scheduler_batching.py
+++ b/tests/test_scheduler_batching.py
@@ -17,7 +17,7 @@
     target_model1 = model1().eval()
 
     # Init scheduler
-    scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.json")
+    scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml")
     # Register compiled model
     opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
     SchedulerDNNModel.register_model("resnet18", opt_model1)
diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb
index 178974c1..ead695c0 100644
--- a/tutorial/session1/CompilerOptimization.ipynb
+++ b/tutorial/session1/CompilerOptimization.ipynb
@@ -18,7 +18,7 @@
     "import sys\n",
     "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
     "sys.path.append(base_dir)\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\""
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\""
    ]
   },
   {
@@ -71,7 +71,7 @@
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb
index 22e00bed..b6f0e048 100644
--- a/tutorial/session1/ExecutionMode.ipynb
+++ b/tutorial/session1/ExecutionMode.ipynb
@@ -56,7 +56,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -78,7 +78,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -101,7 +101,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "\n",
     "input = torch.randn(2048, 2048).to(device=device)\n",
     "weight = torch.randn(2048, 2048).to(device=device)\n",
@@ -132,7 +132,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n",
     "\n",
     "input = torch.randn(2048, 2048).to(device=device)\n",
     "weight = torch.randn(2048, 2048).to(device=device)\n",
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index 4f1e17cb..d3207af1 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -18,7 +18,7 @@
     "import sys\n",
     "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
     "sys.path.append(base_dir)\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "os.environ['TORCHSIM_DUMP_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")"
    ]
   },
diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb
index b02c98fe..684b69c0 100644
--- a/tutorial/session1/Mapping.ipynb
+++ b/tutorial/session1/Mapping.ipynb
@@ -68,7 +68,7 @@
    "source": [
     "torch._dynamo.reset()\n",
     "\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -101,7 +101,7 @@
    "source": [
     "torch._dynamo.reset()\n",
     "\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.json\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",

From dce58d080d8bd044e8f59197f223532725e727b0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 09:31:12 +0000
Subject: [PATCH 022/194] [Test] Change attention masek for Llama

---
 tests/Llama/test_llama.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py
index 98820fd9..443f3fc2 100644
--- a/tests/Llama/test_llama.py
+++ b/tests/Llama/test_llama.py
@@ -274,7 +274,17 @@ def run_custom_llama_test(
     g = torch.Generator().manual_seed(0)
     vocab = cfg.vocab_size
     input_ids_cpu = torch.randint(low=0, high=vocab, size=(batch, seq_len), generator=g, dtype=torch.long)
-    attn_mask_cpu = torch.ones_like(input_ids_cpu, dtype=torch.long)
+
+    min_dtype = torch.finfo(torch_dtype).min
+    causal_mask = torch.zeros((seq_len, seq_len), dtype=torch_dtype, device="cpu")
+
+    if seq_len > 1:
+        causal_mask = torch.triu(torch.full_like(causal_mask, min_dtype), diagonal=1)
+
+    cache_position = torch.arange(seq_len, device="cpu")
+    mask_condition = torch.arange(seq_len, device="cpu") > cache_position.reshape(-1, 1)
+    causal_mask.masked_fill_(mask_condition, min_dtype)
+    attn_mask_cpu = causal_mask[None, None, :, :].expand(batch, 1, -1, -1)
 
     input_ids_dev = input_ids_cpu.to(device)
     attn_mask_dev = attn_mask_cpu.to(device)
@@ -325,11 +335,11 @@ def run_llama_model_test(
     g = torch.Generator().manual_seed(0)
     input_ids_cpu = torch.randint(low=0, high=cfg.vocab_size, size=(batch, seq_len), generator=g, dtype=torch.long)
 
-    # FIXME: Currently, the user must provide the mask manually.
-    # There is a functionality issue with the model generating the mask internally,
-    # so we explicitly create and inject a Causal Mask (lower triangular matrix) from the outside.
-    causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.long))
-    attn_mask_cpu = causal_mask.unsqueeze(0).unsqueeze(0).expand(batch, 1, -1, -1).bool()
+    min_dtype = torch.finfo(torch_dtype).min
+    causal_mask = torch.full((seq_len, seq_len), fill_value=min_dtype, dtype=torch_dtype, device="cpu")
+    if seq_len > 1:
+        causal_mask = torch.triu(causal_mask, diagonal=1)
+    attn_mask_cpu = causal_mask[None, None, :, :].expand(batch, 1, -1, -1)
 
     input_ids_dev = input_ids_cpu.to(device)
     attn_mask_dev = attn_mask_cpu.to(device)

From 1c2ab36117f90ff67f0c579220ad54568654ab91 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 13:54:30 +0000
Subject: [PATCH 023/194] [Autotune] Fix autotune log path

---
 PyTorchSimFrontend/extension_codecache.py     |  4 +-
 .../mlir/mlir_codegen_backend.py              |  2 +-
 Simulator/simulator.py                        | 43 +++++++++++++------
 3 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 4d57b987..2e35220c 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -278,7 +278,7 @@ def dummy_simulator(*args, **kwargs):
                                     vectorlane_size=vectorlane_size, spad_info=spad_info,
                                     silent_mode=silent_mode)
                 if not extension_config.pytorchsim_timing_mode:
-                    return
+                    return [float("inf")]
 
                 onnx_path = os.path.join(result_path, "tile_graph.onnx")
                 attribute_path = os.path.join(runtime_path, "attribute")
@@ -286,7 +286,7 @@ def dummy_simulator(*args, **kwargs):
                 TOGSim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG)
                 TOGSim.vectorlane_size = vectorlane_size
                 attribute_path = TOGSim.create_attribute_file(attribute_path, args, loop_size=loop_size)
-                result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode)
+                result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode, autotune_mode=autotune)
                 result = TOGSimulator.get_result_from_file(result_path)
                 return result
 
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index cda996ab..266d884b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -919,7 +919,7 @@ def get_cycle(choice):
             return float("inf") # Exceeded maximum number of autotuning attempts
         choices = self.make_choices(*args)
 
-        if len(choices) == 0: # can't autotune
+        if len(choices) == 0: # Can't autotune
             return [None, None]
         with ThreadPoolExecutor(max_workers=8) as executor:
             results = list(executor.map(get_cycle, choices))
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index a46243f0..672ae6ec 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -9,6 +9,7 @@
 import datetime
 import threading
 from pathlib import Path
+import uuid
 
 import torch
 import numpy as np
@@ -214,7 +215,7 @@ def get_togsim_command(self):
         cmd = f"{bin} --config {config}"
         return cmd
 
-    def simulation(self, model_path, attribute_path="", silent_mode=False):
+    def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False):
         def show_progress():
             i = 0
             while not finished:
@@ -245,19 +246,35 @@ def show_progress():
             if not silent_mode:
                 finished = True
                 progress_thread.join()
-                print("[TOGSim] Command failed with exit code", e.returncode)
-                print("[TOGSim] Error output:", e.output)
+                with print_lock:
+                    print("[TOGSim] Command failed with exit code", e.returncode)
+                    print("[TOGSim] Error output:", e.output)
             assert 0
-        # Save result to result_path
-        result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH
-        os.makedirs(result_path, exist_ok=True)
-        file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')+".log"
-        result_path = os.path.join(result_path, file_name)
+
+        # Separate Autotune logs
+        if autotune_mode:
+            base_dir = Path(model_path).parent / "togsim_result"
+            base_dir.mkdir(parents=True, exist_ok=True)
+            file_name = f"{len(list(base_dir.iterdir()))}.log"
+        else:
+            base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH)
+            unique_id = uuid.uuid4().hex[:8]
+            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+            file_name = f"{unique_id}_{timestamp}.log"
+
+        base_dir.mkdir(parents=True, exist_ok=True)
+        result_path = base_dir / file_name
+
+        # Prevent race condition
         with open(result_path, "w") as f:
             f.write(result.decode())
+            f.flush()
+            os.fsync(f.fileno())
+
         if not silent_mode or extension_config.CONFIG_DEBUG_MODE:
             model_path_log = f' of "{model_path}" ' if extension_config.CONFIG_DEBUG_MODE else " "
-            print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
+            with print_lock:
+                print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
         return result_path
 
     def interactive_simulation(self):
@@ -406,9 +423,9 @@ def find_zero_sub_tensors(self, tensor):
     def get_result_from_file(result_path):
         core_metrics = {}
         dram_channel_bw = {}
-        avg_dram_bw = None
-        simulation_time = None
-        total_cycle = None
+        avg_dram_bw = 0.0
+        simulation_time = float("inf")
+        total_cycle = float("inf")
 
         # Read and find total stat position
         with open(result_path, "r") as f:
@@ -423,7 +440,7 @@ def get_result_from_file(result_path):
                 break
 
         if simulation_finished_idx == -1:
-            print("[TOGSim] Tried to parsing wrong formated output file!")
+            print(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). The file may be improperly formatted.")
             return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time
 
         total_stat_lines = lines[simulation_finished_idx:]

From 20af55066a6e1a73e99149ac6d3b23b903031264 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 9 Dec 2025 14:39:58 +0000
Subject: [PATCH 024/194] [Fix] Fix codegen error in ops.select

---
 PyTorchSimFrontend/mlir/mlir_ops.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index af323c1e..21995512 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -98,6 +98,7 @@ def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
         cond_type = var_info[condition]
         operand_type = var_info[operand1]
+        condition = ops.to_bool(condition)
         if cond_type[0] < tile_size:
             condition = ops.broadcast(condition, tile_size)
         elif cond_type[0] > tile_size:
@@ -969,6 +970,9 @@ def ext(operand, dtype, *args, var_info=None, **kwargs):
     @staticmethod
     def to_bool(operand, *args, var_info=None, **kwargs):
         tile_size, ret_type = var_info[operand]
+        if ret_type == "i1":
+            return operand, [tile_size, ret_type]
+
         const_one = ops.constant(0, ret_type)
         if tile_size > 1:
             const_one = ops.broadcast(const_one, tile_size)

From c39c3a3c8e661db989c7d87bbd5bba9c981e5075 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 11 Dec 2025 08:14:27 +0000
Subject: [PATCH 025/194] [Tutorial] Update environment setting for the
 tutorial

---
 Dockerfile.base             |  2 +-
 Dockerfile.ksc2025          |  2 +-
 tutorial/session2/Warmup.py | 13 +++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Dockerfile.base b/Dockerfile.base
index 1ac5e175..6a21760b 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -33,7 +33,7 @@ RUN apt -y update && \
     python3-dev python-is-python3 libboost-all-dev \
     libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \
     python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \
-    pip install onnx matplotlib scikit-learn && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/*
+    pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/* 
 
 # Download RISC-V tool chain
 RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \
diff --git a/Dockerfile.ksc2025 b/Dockerfile.ksc2025
index 2ac210e0..b70b2b77 100644
--- a/Dockerfile.ksc2025
+++ b/Dockerfile.ksc2025
@@ -33,7 +33,7 @@ RUN apt -y update && apt -y upgrade && \
     python3-dev python-is-python3 doxygen libboost-all-dev \
     libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \
     python3-venv black libssl-dev libasan5 libubsan1
-RUN pip install mypy pre-commit jupyter
+RUN pip install mypy pre-commit jupyter pydot tabulate jupyterlab_execute_time
 
 # Pass Access Token securely
 ENV PATH=$PATH:/root/.local/bin
diff --git a/tutorial/session2/Warmup.py b/tutorial/session2/Warmup.py
index ce215cf5..a45734ad 100644
--- a/tutorial/session2/Warmup.py
+++ b/tutorial/session2/Warmup.py
@@ -1,13 +1,19 @@
 from typing import List
 import os
 from torch.fx.passes.graph_drawer import FxGraphDrawer
-os.environ['TORCH_LOGS'] = 'bytecode'
 import torch
+import inspect
 
 def dummy_compiler(gm: torch.fx.GraphModule, _):
-    gm.graph.print_tabular()
+    sep = "-" * 80
     drawer = FxGraphDrawer(gm, "my_model")
     drawer.get_dot_graph().write_svg("fx_graph.svg")
+
+    print(f"\n{sep}\n[1] FX Graph Tabular View\n{sep}")
+    gm.graph.print_tabular()
+
+    print(f"\n{sep}\n[2] Generated Forward Source Code\n{sep}")
+    print(inspect.getsource(gm.forward))
     return gm.forward # Return a callable object
 
 class MyModel(torch.nn.Module):
@@ -23,5 +29,4 @@ def f(x, y):
 if __name__ == "__main__":
     x = torch.randn(7, 5,requires_grad=False)
     y = torch.randn(5, 3,requires_grad=False)
-    k = f(x, y)
-    print(k)
+    k = f(x, y)
\ No newline at end of file

From 8678fe631db988c70c35f3e428553692af835d0d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 12 Dec 2025 17:59:19 +0900
Subject: [PATCH 026/194] [Tutorial] Add tutorail env setting scripts

---
 .github/workflows/docker-tutorial-image.yml   |  2 +-
 tutorial/jupyterhub/Dockerfile                |  7 +++++
 .../jupyterhub/Dockerfile.ksc2025             |  8 ++++--
 tutorial/jupyterhub/docker-compose.yml        | 25 +++++++++++++++++
 tutorial/jupyterhub/jupyterhub_config.py      | 28 +++++++++++++++++++
 tutorial/jupyterhub/setting.sh                |  5 ++++
 6 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 tutorial/jupyterhub/Dockerfile
 rename Dockerfile.ksc2025 => tutorial/jupyterhub/Dockerfile.ksc2025 (96%)
 create mode 100644 tutorial/jupyterhub/docker-compose.yml
 create mode 100644 tutorial/jupyterhub/jupyterhub_config.py
 create mode 100755 tutorial/jupyterhub/setting.sh

diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml
index c7d3a2ca..c0d8267d 100644
--- a/.github/workflows/docker-tutorial-image.yml
+++ b/.github/workflows/docker-tutorial-image.yml
@@ -30,6 +30,6 @@ jobs:
         uses: docker/build-push-action@v4
         with:
           context: .
-          file: ./Dockerfile.ksc2025
+          file: ./tutorial/jupyterhub/Dockerfile.ksc2025
           push: true
           tags: ghcr.io/psal-postech/torchsim_ksc2025:latest
diff --git a/tutorial/jupyterhub/Dockerfile b/tutorial/jupyterhub/Dockerfile
new file mode 100644
index 00000000..f98b2294
--- /dev/null
+++ b/tutorial/jupyterhub/Dockerfile
@@ -0,0 +1,7 @@
+FROM jupyterhub/jupyterhub:latest
+
+RUN pip install --no-cache-dir \
+    dockerspawner \
+    jupyterhub-nativeauthenticator
+
+WORKDIR /srv/jupyterhub
diff --git a/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
similarity index 96%
rename from Dockerfile.ksc2025
rename to tutorial/jupyterhub/Dockerfile.ksc2025
index b70b2b77..5ff5d40d 100644
--- a/Dockerfile.ksc2025
+++ b/tutorial/jupyterhub/Dockerfile.ksc2025
@@ -40,8 +40,8 @@ ENV PATH=$PATH:/root/.local/bin
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
 # Build Gem5
-RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch TorchSim
-RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc)
+RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial
+RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) && git checkout TorchSim
 ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt
 
 # Build LLVM RISC-V
@@ -87,4 +87,6 @@ RUN cd PyTorchSim/TOGSim && \
     cd build && \
     conan install .. --build=missing && \
     cmake .. && \
-    make -j$(nproc)
\ No newline at end of file
+    make -j$(nproc)
+
+RUN pip install jupyterhub jupyterlab
diff --git a/tutorial/jupyterhub/docker-compose.yml b/tutorial/jupyterhub/docker-compose.yml
new file mode 100644
index 00000000..62c07ff1
--- /dev/null
+++ b/tutorial/jupyterhub/docker-compose.yml
@@ -0,0 +1,25 @@
+version: '3'
+
+services:
+  jupyterhub:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: jupyterhub
+    image: my-jupyterhub-image
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ./jupyterhub_config.py:/srv/jupyterhub/jupyterhub_config.py
+    environment:
+      # DockerSpawner가 사용할 네트워크 이름
+      DOCKER_NETWORK_NAME: jupyterhub-network
+      # Hub가 내부적으로 사용할 IP
+      HUB_IP: jupyterhub
+    ports:
+      - "8888:8000"
+    networks:
+      - jupyterhub-network
+
+networks:
+  jupyterhub-network:
+    external: true
diff --git a/tutorial/jupyterhub/jupyterhub_config.py b/tutorial/jupyterhub/jupyterhub_config.py
new file mode 100644
index 00000000..a43c0543
--- /dev/null
+++ b/tutorial/jupyterhub/jupyterhub_config.py
@@ -0,0 +1,28 @@
+import os
+
+c = get_config()
+
+# ------------------------------------------------------------------------------
+# Spawner config
+# ------------------------------------------------------------------------------
+c.JupyterHub.spawner_class = 'dockerspawner.DockerSpawner'
+c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim_ksc2025:latest"
+
+# Resource limit
+c.DockerSpawner.mem_limit = '16G'
+c.DockerSpawner.cpu_limit = 4.0
+
+c.DockerSpawner.network_name = 'jupyterhub-network'
+c.Spawner.default_url = '/lab'
+c.Spawner.ip = '0.0.0.0'
+c.DockerSpawner.remove = False
+c.DockerSpawner.cmd = ["jupyterhub-singleuser", "--allow-root"]
+
+c.JupyterHub.authenticator_class = 'nativeauthenticator.NativeAuthenticator'
+c.Authenticator.admin_users = {'admin'}
+
+c.JupyterHub.hub_ip = 'jupyterhub'
+c.JupyterHub.hub_port = 8081
+
+c.NativeAuthenticator.open_signup = True
+c.NativeAuthenticator.allow_all = True
diff --git a/tutorial/jupyterhub/setting.sh b/tutorial/jupyterhub/setting.sh
new file mode 100755
index 00000000..3e544839
--- /dev/null
+++ b/tutorial/jupyterhub/setting.sh
@@ -0,0 +1,5 @@
+if [ -z "$(docker network ls | grep jupyterhub-network)" ]; then
+    docker network create jupyterhub-network
+fi
+
+docker compose up -d --build
\ No newline at end of file

From 0a5d0e70dcd212880eab04b33b0c21b2e915fe15 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 15 Dec 2025 15:39:22 +0900
Subject: [PATCH 027/194] [Tutorial] Change format of config files to yml

---
 .../togsim_configs/togsim_config.json         | 32 -------------------
 .../session1/togsim_configs/togsim_config.yml | 30 +++++++++++++++++
 .../togsim_configs/togsim_config_2_cores.json | 32 -------------------
 .../togsim_configs/togsim_config_2_cores.yml  | 30 +++++++++++++++++
 .../togsim_config_autotune.json               | 32 -------------------
 .../togsim_configs/togsim_config_autotune.yml | 30 +++++++++++++++++
 .../togsim_config_external_mapping.json       | 32 -------------------
 .../togsim_config_external_mapping.yml        | 30 +++++++++++++++++
 .../togsim_config_functional_only.json        | 32 -------------------
 .../togsim_config_functional_only.yml         | 30 +++++++++++++++++
 ...ogsim_config_no_compiler_optimization.json | 32 -------------------
 ...togsim_config_no_compiler_optimization.yml | 30 +++++++++++++++++
 .../togsim_config_timing_only.json            | 32 -------------------
 .../togsim_config_timing_only.yml             | 30 +++++++++++++++++
 14 files changed, 210 insertions(+), 224 deletions(-)
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_2_cores.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_2_cores.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_autotune.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_autotune.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_external_mapping.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_functional_only.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_functional_only.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
 delete mode 100644 tutorial/session1/togsim_configs/togsim_config_timing_only.json
 create mode 100644 tutorial/session1/togsim_configs/togsim_config_timing_only.yml

diff --git a/tutorial/session1/togsim_configs/togsim_config.json b/tutorial/session1/togsim_configs/togsim_config.json
deleted file mode 100644
index e8e489d9..00000000
--- a/tutorial/session1/togsim_configs/togsim_config.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml
new file mode 100644
index 00000000..72873f1c
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.json b/tutorial/session1/togsim_configs/togsim_config_2_cores.json
deleted file mode 100644
index c50edaa9..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_2_cores.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 2,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 32,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 0,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
new file mode 100644
index 00000000..3b9b8fc8
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
@@ -0,0 +1,30 @@
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 0
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.json b/tutorial/session1/togsim_configs/togsim_config_autotune.json
deleted file mode 100644
index c9763e92..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_autotune.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "autotune",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
new file mode 100644
index 00000000..2726736a
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.json b/tutorial/session1/togsim_configs/togsim_config_external_mapping.json
deleted file mode 100644
index c8ddb0f3..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "external-then-heuristic",
-  "codegen_external_mapping_file" : "/workspace/PyTorchSim/tutorial/session1/tutorial_external_mapping.json",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
new file mode 100644
index 00000000..468a0b44
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: external-then-heuristic
+codegen_external_mapping_file: /workspace/PyTorchSim/tutorial/session1/tutorial_external_mapping.json
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.json b/tutorial/session1/togsim_configs/togsim_config_functional_only.json
deleted file mode 100644
index 53072307..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_functional_only.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 1,
-  "pytorchsim_timing_mode" : 0,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
new file mode 100644
index 00000000..a1f1b432
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 1
+pytorchsim_timing_mode: 0
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json
deleted file mode 100644
index e2b9c8c8..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 0,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "none"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
new file mode 100644
index 00000000..62d627a6
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 0
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: none
diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.json b/tutorial/session1/togsim_configs/togsim_config_timing_only.json
deleted file mode 100644
index 0b846bbd..00000000
--- a/tutorial/session1/togsim_configs/togsim_config_timing_only.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-  "num_cores" : 1,
-  "core_freq_mhz" : 940,
-  "core_stats_print_period_cycles" : 10000,
-  "num_systolic_array_per_core" : 2,
-
-  "vpu_num_lanes" : 128,
-  "vpu_spad_size_kb_per_lane" : 128,
-  "vpu_vector_length_bits" : 256,
-
-  "dram_type" : "ramulator2",
-  "dram_freq_mhz" : 940,
-  "dram_channels": 16,
-  "dram_req_size_byte": 32,
-  "dram_num_burst_length" : 2,
-  "dram_stats_print_period_cycles": 10000,
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-
-  "icnt_type" : "simple",
-  "icnt_latency_cycles" : 10,
-  "icnt_freq_mhz" : 940,
-  "icnt_injection_ports_per_core" : 16,
-
-  "pytorchsim_functional_mode" : 0,
-  "pytorchsim_timing_mode" : 1,
-
-  "codegen_mapping_strategy" : "heuristic",
-  "codegen_external_mapping_file" : "",
-  "codegen_autotune_max_retry": 10,
-  "codegen_autotune_template_topk": 4,
-  "codegen_compiler_optimization" : "all"
-}
\ No newline at end of file
diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
new file mode 100644
index 00000000..0024c073
--- /dev/null
+++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 0
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all

From 008cf4c2fe92b03a2c76c325febc0171e6c0acc6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 15 Dec 2025 16:20:40 +0900
Subject: [PATCH 028/194] [Tutorial] Fix typo dockerfile

---
 tutorial/jupyterhub/Dockerfile.ksc2025 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
index 5ff5d40d..9eaec15a 100644
--- a/tutorial/jupyterhub/Dockerfile.ksc2025
+++ b/tutorial/jupyterhub/Dockerfile.ksc2025
@@ -79,7 +79,7 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
 # Install torchsim dependency
 RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0
 
-# Prepare ONNXim project
+# Prepare PyTorchSim project
 RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial
 RUN cd PyTorchSim/TOGSim && \
     git submodule update --recursive --init && \

From 18d7babf7cf7c4ed15fd68f418bf5bf8d31d233a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 15 Dec 2025 21:54:43 +0900
Subject: [PATCH 029/194] [Tutorial] Fix wrong config name

---
 PyTorchSimFrontend/extension_config.py | 2 +-
 tutorial/session1/LogAnalysis.ipynb    | 2 +-
 tutorial/session2/Hands_on.ipynb       | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index ab8aea69..2b1b3102 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -103,7 +103,7 @@ def __getattr__(name):
     if name == "CONFIG_TORCHSIM_DUMP_PATH":
         return os.environ.get('TORCHSIM_DUMP_PATH', default = CONFIG_TORCHSIM_DIR)
     if name == "CONFIG_TORCHSIM_LOG_PATH":
-        return os.environ.get('TORCHSIM_DUMP_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
+        return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
 
     if name == "CONFIG_TOGSIM_EAGER_MODE":
         return int(os.environ.get("TOGSIM_EAGER_MODE", default=False))
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index d3207af1..a82737db 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -19,7 +19,7 @@
     "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
     "sys.path.append(base_dir)\n",
     "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
-    "os.environ['TORCHSIM_DUMP_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")"
+    "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")"
    ]
   },
   {
diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb
index 33ec1a28..2d5a5cdc 100644
--- a/tutorial/session2/Hands_on.ipynb
+++ b/tutorial/session2/Hands_on.ipynb
@@ -32,6 +32,7 @@
     "import torch._dynamo\n",
     "import torch.utils.cpp_extension\n",
     "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
+    "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n",
     "sys.path.append(base_dir)\n",
     "\n",
     "from Scheduler.scheduler import PyTorchSimRunner\n",

From 1e4d72a0fea00c80ef681b9da6bec783f4b2bd93 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 17 Dec 2025 22:01:36 +0900
Subject: [PATCH 030/194] [Fix] configuration reference in DNNServing.ipynb

---
 tutorial/session1/DNNServing.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb
index b38bfe6a..56ad5ab6 100644
--- a/tutorial/session1/DNNServing.ipynb
+++ b/tutorial/session1/DNNServing.ipynb
@@ -38,7 +38,7 @@
     "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n",
     "from PyTorchSimFrontend import extension_config\n",
     "\n",
-    "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n",
+    "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n",
     "device = scheduler.execution_engine.module.custom_device()\n",
     "\n",
     "model = resnet18().eval()\n",

From 232c4a69053f0082e254e536cfcd04dddbc9c2c0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 17 Dec 2025 22:02:27 +0900
Subject: [PATCH 031/194] Change log level from warn to debug for unused tags

---
 TOGSim/include/DMA.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/TOGSim/include/DMA.h b/TOGSim/include/DMA.h
index 2f41c6f3..3056c626 100644
--- a/TOGSim/include/DMA.h
+++ b/TOGSim/include/DMA.h
@@ -62,7 +62,7 @@ class DMA {
         const std::vector<int>& tag_key = tag_entry.first;
         uint32_t value = tag_entry.second;
         if (value == 1) {
-          spdlog::warn("[Tag Table][{}] Unused tag found: (key={}, val={})",
+          spdlog::debug("[Tag Table][{}] Unused tag found: (key={}, val={})",
             subgraph_id, fmt::format("[{}]", fmt::join(tag_key, ", ")), value);
         }
       }
@@ -134,4 +134,4 @@ class DMA {
   std::queue<mem_fetch*> _pending_accesses;
   bool _generated_once = false;
 };
-#endif
\ No newline at end of file
+#endif

From 8b0f5354bcfce195d7a48498d0dde824be278a94 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 17 Dec 2025 22:08:26 +0900
Subject: [PATCH 032/194] Add placeholder echo command in Dockerfile

Added a placeholder echo command for future removal.
---
 tutorial/jupyterhub/Dockerfile.ksc2025 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
index 9eaec15a..4993538a 100644
--- a/tutorial/jupyterhub/Dockerfile.ksc2025
+++ b/tutorial/jupyterhub/Dockerfile.ksc2025
@@ -80,6 +80,8 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
 RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0
 
 # Prepare PyTorchSim project
+RUN echo "Remove me!"
+
 RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial
 RUN cd PyTorchSim/TOGSim && \
     git submodule update --recursive --init && \

From 602131571983a5b752bcea4bd929043aca556023 Mon Sep 17 00:00:00 2001
From: Yunseon <ysshin@postech.ac.kr>
Date: Wed, 17 Dec 2025 22:53:21 +0900
Subject: [PATCH 033/194] [Frontend] prevent reload device

---
 Scheduler/scheduler.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 98ebb1d5..34f0eda4 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -163,6 +163,8 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
 
     @staticmethod
     def setup_device():
+        if cls._npu_module is not None:
+            return cls._npu_module
         source_file_path = os.path.dirname(os.path.abspath(__file__))
         source_file = os.path.join(
             source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp"
@@ -201,6 +203,7 @@ def setup_device():
         get_wrapper_codegen_for_device("npu")
             == ExtensionWrapperCodegen
         )
+        cls._npu_module = module
         return module
 
     def submit(self, batched_req, partition_idx) -> List[RequestReturn]:

From 7c5dcccd539b9174da3ff8e1751117fa72910fd6 Mon Sep 17 00:00:00 2001
From: Yunseon <ysshin@postech.ac.kr>
Date: Wed, 17 Dec 2025 22:59:39 +0900
Subject: [PATCH 034/194] [fix] setup_device to class method

---
 Scheduler/scheduler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 34f0eda4..94092723 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -144,6 +144,7 @@ class PyTorchSimRunner:
     PARTITION_BUSY = 0
     PARTITION_IDLE = 1
     SELECT_NOTHING = 2
+    _npu_module = None
     def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
         self.module = self.setup_device()
         self.num_partion = num_partion
@@ -161,7 +162,7 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
         # Dry run for compile and create generator
         os.environ["TOGSIM_EAGER_MODE"] = "1"
 
-    @staticmethod
+    @classmethod
     def setup_device():
         if cls._npu_module is not None:
             return cls._npu_module

From 88d9eb8f74f03566034e2de95323f6662b3cd183 Mon Sep 17 00:00:00 2001
From: Yunseon <ysshin@postech.ac.kr>
Date: Wed, 17 Dec 2025 23:13:59 +0900
Subject: [PATCH 035/194] [Fix] typo in TOGSIM_CONFIG

---
 tutorial/session1/DNNServing.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb
index 56ad5ab6..741f463f 100644
--- a/tutorial/session1/DNNServing.ipynb
+++ b/tutorial/session1/DNNServing.ipynb
@@ -83,7 +83,7 @@
     "target_model1 = resnet18().eval()\n",
     "\n",
     "# Init scheduler\n",
-    "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.TOGSIM_CONFIG)\n",
+    "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n",
     "# Register compiled model\n",
     "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n",
     "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n",

From d1ffac21708a30d40ff378e315c00c44d840fdc5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 17 Dec 2025 23:15:10 +0900
Subject: [PATCH 036/194] Remove echo command from Dockerfile

Removed unnecessary echo command from Dockerfile.
---
 tutorial/jupyterhub/Dockerfile.ksc2025 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
index 4993538a..9eaec15a 100644
--- a/tutorial/jupyterhub/Dockerfile.ksc2025
+++ b/tutorial/jupyterhub/Dockerfile.ksc2025
@@ -80,8 +80,6 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
 RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0
 
 # Prepare PyTorchSim project
-RUN echo "Remove me!"
-
 RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial
 RUN cd PyTorchSim/TOGSim && \
     git submodule update --recursive --init && \

From 7c45f8015e1a83b2abc177c25e3b2d66ff2ac0a7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 18 Dec 2025 00:21:44 +0900
Subject: [PATCH 037/194] Refactor NPU module variable naming convention

---
 Scheduler/scheduler.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 94092723..8aa849b1 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -144,7 +144,7 @@ class PyTorchSimRunner:
     PARTITION_BUSY = 0
     PARTITION_IDLE = 1
     SELECT_NOTHING = 2
-    _npu_module = None
+    NPU_MODULE = None
     def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
         self.module = self.setup_device()
         self.num_partion = num_partion
@@ -163,9 +163,9 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
         os.environ["TOGSIM_EAGER_MODE"] = "1"
 
     @classmethod
-    def setup_device():
-        if cls._npu_module is not None:
-            return cls._npu_module
+    def setup_device(cls):
+        if cls.NPU_MODULE is not None:
+            return cls.NPU_MODULE
         source_file_path = os.path.dirname(os.path.abspath(__file__))
         source_file = os.path.join(
             source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp"
@@ -204,7 +204,7 @@ def setup_device():
         get_wrapper_codegen_for_device("npu")
             == ExtensionWrapperCodegen
         )
-        cls._npu_module = module
+        cls.NPU_MODULE = module
         return module
 
     def submit(self, batched_req, partition_idx) -> List[RequestReturn]:

From af48bc382958846e064d9ad4d7cdd21daeedcac4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 5 Jan 2026 11:25:22 +0000
Subject: [PATCH 038/194] [Fix] Indirect store & add a test case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++-----
 tests/test_indirect_access.py                   |  6 ++++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 266d884b..297ea162 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -438,12 +438,12 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs)
 
         # Handle scatter store
         if "tmp" in str(index):
-            if mode == "atomic_add":
-                # Convert the output buffer type to the inplace buffer
-                arg_name =  V.graph.scheduler.mutation_real_name.get(name, name)
-                if arg_name not in self.kernel_group.args.inplace_buffers:
-                    self.kernel_group.args.make_inplace(arg_name, arg_name)
+            # Convert the output buffer type to the inplace buffer
+            arg_name =  V.graph.scheduler.mutation_real_name.get(name, name)
+            if arg_name not in self.kernel_group.args.inplace_buffers:
+                self.kernel_group.args.make_inplace(arg_name, arg_name)
 
+            if mode == "atomic_add":
                 loaded_value = ops.load(name, index)
                 value = ops.add(loaded_value, value)
             index, _ = self.convert_indirect_indexing(index)
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index 6cfa7b58..d103ee1b 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -70,11 +70,12 @@ def vectoradd(a, idx, b):
         a[idx, :] = b
         return a
     x = torch.randn(size, dtype=torch.float32).to(device=device)
+    x_cpu = x.clone().cpu()
     idx = torch.randint(0,128, [128]).to(device=device)
-    y = torch.randn(128, dtype=torch.float32).to(device=device)
+    y = torch.randn(size[1], dtype=torch.float32).to(device=device)
     opt_fn = torch.compile(dynamic=False)(vectoradd)
     res = opt_fn(x, idx, y)
-    out = vectoradd(x.cpu(), idx.cpu(), y.cpu())
+    out = vectoradd(x_cpu, idx.cpu(), y.cpu())
     test_result("Indirect VectorAdd", res, out)
 
 if __name__ == "__main__":
@@ -86,6 +87,7 @@ def vectoradd(a, idx, b):
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
     test_scatter_full(device)
+    test_scatter_full(device, size=(2048, 2048))
     test_scatter_add(device)
     test_indirect_vectoradd(device)
     #test_embedding(device, 1024, 2048)
\ No newline at end of file

From 6d043ad4675a4cee2242b1f0f7226f9e47926bf4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 5 Jan 2026 14:09:42 +0000
Subject: [PATCH 039/194] [Fix] relax vlane_stride constraints to resolve tile
 size conflicts #201

---
 PyTorchSimFrontend/mlir/mlir_common.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 15408c0d..b86607ea 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -332,8 +332,8 @@ def _adjust_one(dim_size, tile_size):
         remain = candidate_tile_size[axis] % stride
 
         if remain:
-            candidate_tile_size[axis] += stride - remain
-            self.tile_constraint[axis].must_divide_dim = False
+            # #201: relax vlane_stride constraints
+            self.vmap.vlane_stride = 1
         return candidate_tile_size
 
     def scale_tile_dim(self, axis, dim_sz, scale_factor=2):
@@ -488,7 +488,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
         self.name = ""
         self._tile_size = list(tile_size)
         self._tile_stride = None
-        self.tile_constraint = [TileConstraint(vlane_stride) for _ in tile_size]
+        self.tile_constraint = [TileConstraint(vlane_stride if idx == vlane_split_axis else 1) for idx, _ in enumerate(tile_size)]
         self.tile_axis_order = list(range(len(tile_size)))
         self.update_tile_stride()
 
@@ -718,13 +718,13 @@ def compute_tile_size(self, nodes, vars, reduction_vars):
             init_tile_desc.nr_rdim = len(reduction_vars)
             self.kernel_group.set_tile_info(init_tile_desc)
 
-        # Handle edge case
-        if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2
-            self.kernel_group.tile_desc.vmap.vlane_stride = 1
-            self.kernel_group.tile_desc.vmap.vlane_split_axis = 0
-        elif vlane_split_axis == -1: # Reduction only case
-            self.kernel_group.tile_desc.vmap.vlane_split_axis = 0
-            self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0]
+            # Handle edge case
+            if len(self.ranges)==1 and self.ranges[0] == 1: # Scalar case 2
+                self.kernel_group.tile_desc.vmap.vlane_stride = 1
+                self.kernel_group.tile_desc.vmap.vlane_split_axis = 0
+            elif vlane_split_axis == -1: # Reduction only case
+                self.kernel_group.tile_desc.vmap.vlane_split_axis = 0
+                self.kernel_group.tile_desc.vmap.vlane_stride = self.kernel_group.tile_desc.get_tile_size()[0]
 
         # Handle implict dims. Input operand could be high dimension tensor.
         # Note: https://github.com/PSAL-POSTECH/PyTorchSim/issues/173

From f6ada1f5b1fe4f44e0162c03dfbe12c21633734c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 5 Jan 2026 14:10:35 +0000
Subject: [PATCH 040/194] [Refactor] Remove unused env vars

---
 Dockerfile.base                        | 2 --
 tutorial/jupyterhub/Dockerfile.ksc2025 | 2 --
 2 files changed, 4 deletions(-)

diff --git a/Dockerfile.base b/Dockerfile.base
index 6a21760b..f961859e 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -67,9 +67,7 @@ RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/P
 
 # Store RISC-V LLVM for TorchSim
 ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin
-ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include
 ENV TORCHSIM_DIR=/workspace/PyTorchSim
-ENV LLVM_DIR=/riscv-llvm
 
 # Download Spike simulator
 RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/assets/${SPIKE_ASSET_ID} -o /tmp/spike-release.tar.gz && \
diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
index 9eaec15a..7633c048 100644
--- a/tutorial/jupyterhub/Dockerfile.ksc2025
+++ b/tutorial/jupyterhub/Dockerfile.ksc2025
@@ -52,9 +52,7 @@ RUN cd llvm-project && mkdir build && cd build && \
 
 # Store RISC-V LLVM for TorchSim
 ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin
-ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include
 ENV TORCHSIM_DIR=/workspace/PyTorchSim
-ENV LLVM_DIR=/riscv-llvm
 
 # Download RISC-V tool chain
 RUN apt install -y wget && \

From 3ccfc113940def78366d773c4fe19a3d8bfe7232 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 6 Jan 2026 08:03:38 +0000
Subject: [PATCH 041/194] [CI] Add CI for pytorch2.8

---
 .github/workflows/docker-base-image-2-8.yml | 71 +++++++++++++++++++++
 .github/workflows/docker-base-image.yml     | 10 ++-
 .github/workflows/docker-image-2-8.yml      | 61 ++++++++++++++++++
 Dockerfile                                  |  3 +-
 Dockerfile.base                             |  3 +-
 5 files changed, 143 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/docker-base-image-2-8.yml
 create mode 100644 .github/workflows/docker-image-2-8.yml

diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml
new file mode 100644
index 00000000..f8649303
--- /dev/null
+++ b/.github/workflows/docker-base-image-2-8.yml
@@ -0,0 +1,71 @@
+name: Docker Base Image CI (PyTorch 2.8)
+
+on:
+  push:
+    branches: [ "base" ]
+  workflow_dispatch:
+  repository_dispatch:
+    types: [ build_base ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set environment
+        env:
+          GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [ -n "${{ github.event.pull_request.head.sha }}" ]; then
+            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
+            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}"
+          else
+            echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV
+            echo "GITHUB_SHA=${{ github.sha }}"
+          fi
+
+          gem5_response_file=/tmp/releases-gem5-latest.json
+          curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file}
+          GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file})
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
+          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
+
+          llvm_response_file=/tmp/releases-gem5-latest.json
+          curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file}
+          LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file})
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
+          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
+
+          spike_response_file=/tmp/releases-spike-latest.json
+          curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file}
+          SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file})
+          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID"
+          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV
+
+      - name: Build and Push Docker Image (PyTorch 2.8)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.base
+          push: true
+          build-args: |
+            PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}
+          tags: |
+            ghcr.io/psal-postech/torchsim_base_2_8:latest
diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml
index bb79925c..2c29a11b 100644
--- a/.github/workflows/docker-base-image.yml
+++ b/.github/workflows/docker-base-image.yml
@@ -32,9 +32,13 @@ jobs:
         env:
           GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV
-          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV
-          echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}"
+          if [ -n "${{ github.event.pull_request.head.sha }}" ]; then
+            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
+            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}"
+          else
+            echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV
+            echo "GITHUB_SHA=${{ github.sha }}"
+          fi
 
           gem5_response_file=/tmp/releases-gem5-latest.json
           curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file}
diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml
new file mode 100644
index 00000000..cb5f73d1
--- /dev/null
+++ b/.github/workflows/docker-image-2-8.yml
@@ -0,0 +1,61 @@
+name: Docker image CI (PyTorch 2.8)
+
+on:
+  pull_request:
+    branches: [ "torch_v2.8" ]
+  workflow_dispatch:
+
+jobs:
+  build-and-test:
+    runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          submodules: recursive
+
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and Push Docker Image (PyTorch 2.8)
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          no-cache: true
+          build-args: |
+            BASE_IMAGE=ghcr.io/psal-postech/torchsim_base_2_8:latest
+          tags: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
+
+      - name: Wait for GHCR propagation
+        run: |
+          for i in {1..30}; do
+            echo "Checking if image exists in GHCR (attempt $i)..."
+            if docker manifest inspect ghcr.io/psal-postech/torchsim-test-2-8:${GITHUB_SHA} > /dev/null 2>&1; then
+              echo "Image is now available in GHCR."
+              exit 0
+            fi
+            echo "Image not yet available, retrying in 30 seconds..."
+            sleep 20
+          done
+          echo "Image did not become available in GHCR within expected time."
+          exit 1
+
+  test-pytorchsim-wrapper:
+    needs: build-and-test
+    uses: ./.github/workflows/pytorchsim_test.yml
+    with:
+      image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
+      vector_lane: 128
+      spad_size: 128
diff --git a/Dockerfile b/Dockerfile
index 37721940..088daa43 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,6 @@
 # syntax=docker/dockerfile:1.4
-FROM ghcr.io/psal-postech/torchsim_base:latest
+ARG BASE_IMAGE=ghcr.io/psal-postech/torchsim_base:latest
+FROM ${BASE_IMAGE}
 
 # Prepare PyTorchSim project
 COPY . /workspace/PyTorchSim
diff --git a/Dockerfile.base b/Dockerfile.base
index f961859e..897b8195 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -23,7 +23,8 @@
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
+ARG PYTORCH_IMAGE=pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
+FROM ${PYTORCH_IMAGE}
 
 # Copied from Gem5 Docker file
 ENV DEBIAN_FRONTEND=noninteractive

From 0abfffefcef1cf09ba54be79ba3bc01a881c3d87 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 24 Sep 2025 12:59:04 +0000
Subject: [PATCH 042/194] PyTorch version upgrade: tested on single-operator
 tests

---
 PyTorchSimFrontend/extension_codecache.py     |  3 +-
 PyTorchSimFrontend/extension_device.cpp       |  6 +-
 .../extension_device_interface.py             | 63 +++++++++++++++++++
 .../extension_device_op_overrides.py          | 25 ++++++++
 PyTorchSimFrontend/extension_utils.py         | 26 ++++++++
 PyTorchSimFrontend/mlir/mlir_autotune.py      |  8 ++-
 .../mlir/mlir_codegen_backend.py              | 63 ++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 35 ++++++-----
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 44 ++++++-------
 PyTorchSimFrontend/mlir/mlir_template.py      | 21 +++----
 Scheduler/scheduler.py                        | 17 +++--
 11 files changed, 243 insertions(+), 68 deletions(-)
 create mode 100644 PyTorchSimFrontend/extension_device_interface.py
 create mode 100644 PyTorchSimFrontend/extension_device_op_overrides.py
 create mode 100644 PyTorchSimFrontend/extension_utils.py

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 2e35220c..ef8c63e6 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -3,7 +3,8 @@
 import shlex
 import subprocess
 
-from torch._inductor.codecache import AsyncCompile, get_lock_dir, get_hash, write
+from torch._inductor.codecache import get_lock_dir, get_hash, write
+from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
 from PyTorchSimFrontend import extension_config
diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimFrontend/extension_device.cpp
index cfaecf2b..b8a6e092 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimFrontend/extension_device.cpp
@@ -159,7 +159,7 @@ at::Tensor custom_to_device(
 // A dummy allocator for our custom device, that secretly uses the CPU
 struct DummyCustomAllocator final : at::Allocator {
   DummyCustomAllocator() = default;
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     void* data = c10::alloc_cpu(nbytes);
     return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
   }
@@ -174,6 +174,10 @@ struct DummyCustomAllocator final : at::Allocator {
   at::DeleterFnPtr raw_deleter() const override {
     return &ReportAndDelete;
   }
+
+  void copy_data(void* dest, const void* src, std::size_t count) const override {
+    std::memcpy(dest, src, count);
+  }
 };
 
 // Register our dummy allocator
diff --git a/PyTorchSimFrontend/extension_device_interface.py b/PyTorchSimFrontend/extension_device_interface.py
new file mode 100644
index 00000000..e5875ab7
--- /dev/null
+++ b/PyTorchSimFrontend/extension_device_interface.py
@@ -0,0 +1,63 @@
+import torch
+from torch._dynamo.device_interface import DeviceInterface, caching_worker_current_devices, caching_worker_device_properties
+
+class _ExtensionDeviceProperties:   # FIXME: Dummy property values
+    name: str = "Extension_device"
+    platform_name: str
+    vendor: str
+    driver_version: str
+    version: str
+    max_compute_units: int
+    gpu_eu_count: int
+    max_work_group_size: int
+    max_num_sub_groups: int
+    sub_group_sizes: list[int]
+    has_fp16: bool
+    has_fp64: bool
+    has_atomic64: bool
+    has_bfloat16_conversions: bool
+    has_subgroup_matrix_multiply_accumulate: bool
+    has_subgroup_matrix_multiply_accumulate_tensor_float32: bool
+    has_subgroup_2d_block_io: bool
+    total_memory: int
+    multi_processor_count: int = 128     # gpu_subslice_count, num_sm
+    architecture: int
+    type: str
+
+_ExtensionDeviceProperties = _ExtensionDeviceProperties
+
+class ExtensionDeviceInterface(DeviceInterface):
+    class Worker:
+        @staticmethod
+        def set_device(device: int):
+            caching_worker_current_devices["extension_device"] = device
+
+        @staticmethod
+        def current_device() -> int:
+            if "extension_device" in caching_worker_current_devices:
+                return caching_worker_current_devices["extension_device"]
+            return torch.xpu.current_device()
+
+        @staticmethod
+        def get_device_properties(device: torch.types.Device = None) -> _ExtensionDeviceProperties:
+            if device is not None:
+                if isinstance(device, str):
+                    device = torch.device(device)
+                    assert device.type == "extension_device"
+                if isinstance(device, torch.device):
+                    device = device.index
+            if device is None:
+                device = ExtensionDeviceInterface.Worker.current_device()
+
+            if "extension_device" not in caching_worker_device_properties:
+                device_prop = [
+                    torch.cuda.get_device_properties(i)
+                    for i in range(torch.cuda.device_count())
+                ]
+                caching_worker_device_properties["extension_device"] = device_prop
+
+            return _ExtensionDeviceProperties
+
+    @staticmethod
+    def get_compute_capability(device: torch.types.Device = None):
+        return 36
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_device_op_overrides.py b/PyTorchSimFrontend/extension_device_op_overrides.py
new file mode 100644
index 00000000..b76dae0f
--- /dev/null
+++ b/PyTorchSimFrontend/extension_device_op_overrides.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from textwrap import dedent
+
+from torch._inductor.codegen.common import DeviceOpOverrides, register_device_op_overrides
+
+class ExtensionDeviceOpOverrides(DeviceOpOverrides):
+    def import_get_raw_stream_as(self, name: str) -> str:
+        return dedent(
+            """
+            def get_raw_stream(_):
+                return 0
+            """
+        )
+
+    def set_device(self, device_idx: int) -> str:
+        return "pass"
+
+    def synchronize(self) -> str:
+        return "pass"
+
+    def device_guard(self, device_idx: int) -> str:
+        return "pass"
+
+register_device_op_overrides("extension_device", ExtensionDeviceOpOverrides())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_utils.py b/PyTorchSimFrontend/extension_utils.py
new file mode 100644
index 00000000..0418cacd
--- /dev/null
+++ b/PyTorchSimFrontend/extension_utils.py
@@ -0,0 +1,26 @@
+import sympy
+import torch
+
+"""
+NOTE: Temporary File
+
+This file contains functions that were removed or changed in newer versions
+of PyTorch. It is kept here only to temporarily enable compatibility while
+upgrading to PyTorch 2.8 from PyTorch 2.2.
+
+These functions will eventually be integrated into the appropriate source files
+or removed once no longer needed.
+
+This file is not intended to be permanent and should be deleted in the future.
+"""
+
+def free_symbol_startswith(index: sympy.Expr, prefix: str):
+    return any(v.name.startswith(prefix) for v in index.free_symbols)
+
+def sympy_symbol(name: str) -> sympy.Symbol:
+    # This should never be used for creating shape/stride symbols, as those
+    # should all be allocated before Inductor.
+    assert name[0] != "s"
+    # NOTE: shape symbols are positive (> 0), but index variables are only
+    # non-negative (>= 0).
+    return sympy.Symbol(name, integer=True, nonnegative=True)
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index 988408ea..138bec50 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -49,6 +49,9 @@ def __init__(
         self.extra_args = extra_args
         #self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so")
 
+    def __str__(self) -> str:
+        return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}"
+
     def make_run_fn(
         self, input_tensors: torch.Tensor, output_tensors: torch.Tensor
     ) -> Callable[[], None]:
@@ -84,5 +87,6 @@ def cached_run_fn(*args, **kwargs):
             *args,
         )
 
-    def __str__(self) -> str:
-        return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}"
\ No newline at end of file
+    def update_workspace_size(self) -> None:
+        # FIXME: Not implemented yet. Checkout torch/_inductor/codegen/rocm/rocm_benchmark_request.py
+        return
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 297ea162..9f5c0674 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -6,12 +6,14 @@
 from functools import reduce
 from operator import mul
 import torch
+from typing import Optional
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from torch._dynamo.testing import rand_strided
 from torch._inductor.autotune_process import TensorMeta
 from torch._dynamo.utils import dynamo_timed
 from torch._inductor.codegen import cpp, wrapper, common, memory_planning
+from torch._inductor.ir import GraphPartitionSignature
 from torch._inductor.virtualized import V, _ops as ops
 from torch._inductor.codecache import write_atomic
 from torch._inductor.utils import (
@@ -57,10 +59,25 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value):
         return ops.logical_and(vector_value, init_value)
     raise AssertionError(reduction_type)
 
-class ExtensionWrapperCodegen(wrapper.WrapperCodeGen):
+class ExtensionWrapperCodegen(wrapper.PythonWrapperCodegen):
     def __init__(self):
         super().__init__()
 
+    @classmethod
+    def create(
+        cls,
+        is_subgraph: bool,
+        subgraph_name: Optional[str],
+        parent_wrapper: Optional[wrapper.PythonWrapperCodegen],
+        partition_signatures: Optional[GraphPartitionSignature] = None,
+    ):
+        if is_subgraph:
+            assert subgraph_name is not None and parent_wrapper is not None
+            return wrapper.SubgraphPythonWrapperCodegen(
+                subgraph_name, parent_wrapper, partition_signatures
+            )
+        return cls()
+
     def write_header(self):
         self.header.splice(
             f"""
@@ -89,6 +106,7 @@ def write_header(self):
                 reinterpret_tensor = torch.ops.aten._reinterpret_tensor
                 custom_async_compile = CustomAsyncCompile()
                 os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
+                print(f\'Wrapper Codegen Path = {{__file__}}\')
             """
         )
         self.header.splice(
@@ -132,7 +150,7 @@ def call(args):
                 self.prefix.writeline(f"{lhs} = args")
                 self.prefix.writeline("args.clear()")
 
-            self.codegen_inputs(self.prefix, V.graph.graph_inputs)
+            self.codegen_inputs()
             self.codegen_input_size_asserts()
             self.codegen_sram_plan_prefix()
 
@@ -152,10 +170,27 @@ def codegen_sram_plan_postfix(self, outputs):
                 continue
             self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
 
-    @dynamo_timed
+    def _generate_kernel_call_helper(
+        self,
+        kernel_name: str,
+        call_args,
+        *,
+        device=None,
+        triton=True,
+        arg_types=None,
+        raw_keys=None,
+        raw_args=None,
+        triton_meta=None,
+        graph_name="",
+        original_fxnode_name=None,
+    ):
+        device = device or V.graph.get_current_device_or_throw()
+        self.writeline(self.wrap_kernel_call(kernel_name, call_args))
+        return
+
     def generate(self, is_inference):
         result = IndentedBuffer()
-        result.splice(self.header)
+        # result.splice(self.header)
 
         with contextlib.ExitStack() as stack:
             stack.enter_context(self.wrapper_call.indent())
@@ -170,8 +205,13 @@ def generate(self, is_inference):
 
                 if isinstance(line, wrapper.MemoryPlanningLine):
                     line.codegen(self.wrapper_call)
+                elif isinstance(line, wrapper.KernelCallLine):
+                    self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args))
                 else:
-                    self.wrapper_call.writeline(line)
+                    if isinstance(line, wrapper.WrapperLine):
+                        line.codegen(self.wrapper_call)
+                    else:
+                        self.wrapper_call.writeline(line)
                 # Add buffer plan hook for alloc
                 if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine):
                     self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
@@ -180,7 +220,9 @@ def generate(self, is_inference):
             self.mark_output_type()
             self.generate_return(output_refs)
 
-        self.append_precomputed_sizes_to_prefix()
+        # self.append_precomputed_sizes_to_prefix() # FIXME: Need to replace append_precomputed_sizes_to_prefix()
+        result.splice(self.header)
+
         self.finalize_prefix()
         result.splice(self.prefix)
 
@@ -189,7 +231,10 @@ def generate(self, is_inference):
 
         self.generate_end(result)
         self.add_benchmark_harness(result)
-        return result.getvaluewithlinemap()
+        return (
+            result.getvaluewithlinemap(),
+            self.kernel_declarations.getvaluewithlinemap(),
+        )
 
     def memory_plan(self):
         self.lines = memory_planning.MemoryPlanner(self).plan(self.lines)
@@ -964,13 +1009,13 @@ def _log_autotune_result(self, best_choice, best_cycle):
         )
 
     def codegen_nodes(self, nodes, kernel_name):
-        src_code = super().codegen_nodes(nodes, kernel_name)
+        src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
         self._prepare_simulator_headers(src_code)
         if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
             optimal_src_code = self.autotune(nodes, kernel_name)[0]
             if optimal_src_code is not None:
                 return optimal_src_code
-        return src_code
+        return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
         write_path = extension_codecache.get_write_path(src_code)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index b86607ea..f98a2132 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -14,6 +14,7 @@
 from torch._inductor.virtualized import V
 from torch._inductor.ir import MultiOutputLayout
 from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep
+from torch._inductor.codegen.wrapper import KernelDefinitionLine
 from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod
 import sympy
 import contextlib
@@ -25,15 +26,20 @@
 import torch.fx
 from torch.utils._sympy.value_ranges import ValueRanges
 from torch._inductor.utils import (
-    free_symbol_startswith,
     get_sympy_Expr_dtype,
     IndentedBuffer,
     sympy_subs,
-    sympy_symbol,
     unique,
 )
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend import extension_codecache
+from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
+
+from PyTorchSimFrontend.extension_utils import (
+    free_symbol_startswith,
+    sympy_symbol
+)
+
 schedule_log = torch._logging.getArtifactLogger(__name__, "schedule")
 
 DTYPE_TO_MLIR = {
@@ -654,7 +660,7 @@ def call_kernel(self, kernel_name):
         wrapper = V.graph.wrapper_code
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
        # generate the code to call this
-        wrapper.generate_kernel_call(kernel_name, call_args, cuda=False)
+        wrapper.generate_kernel_call(kernel_name, call_args, triton=False)
 
     def is_modular_indexing(self, expr):
         return "ModularIndexing" in str(expr)
@@ -778,8 +784,8 @@ def codegen_nodes(self, nodes, kernel_name):
             V.graph.removed_buffers |= self.removed_buffers
             # V.graph.inplaced_to_remove |= self.inplaced_to_remove
             src_code = self.codegen_kernel(kernel_name=kernel_name)
-            self.meta_kernel()
-            return src_code
+            meta_code = self.meta_kernel()
+            return src_code, meta_code
 
     def codegen_kernel(self, kernel_name):
         arg_defs, _, _, _ = self.kernel_group.args.mlir_argdefs()
@@ -797,12 +803,9 @@ def codegen_kernel(self, kernel_name):
         return code.getvalue()
 
     def meta_kernel(self):
-        wrapper = V.graph.wrapper_code
         _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
-        wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
-        # Dump loop and load/store information
-        wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
-        return arg_attributes
+        meta_code = arg_attributes
+        return meta_code
 
     def get_constant_vector(self, expr):
         constant_vector = [[int(expr.coeff(var)),None] for var in self.itervars]
@@ -903,10 +906,10 @@ def load(name: str, index: sympy.Expr):
                 if name in store_cache:
                     return store_cache[name]
                 key = name+str(index)
-                if key not in self.cse.cache:
+                if key not in self.cse._cache:
                     result = self.load(name, index)
-                    self.cse.cache[key] = result
-                return self.cse.cache[key]
+                    self.cse._cache[key] = result
+                return self.cse._cache[key]
 
             @staticmethod
             def store(name, index, value, mode=None):
@@ -914,7 +917,7 @@ def store(name, index, value, mode=None):
                 if mode is None:
                     self.cse.store_cache[name] = value
                     if self.current_node:
-                        for other_name in self.current_node.get_mutations():
+                        for other_name in self.current_node.get_output(name).get_mutations():
                             self.cse.store_cache[other_name] = value
                 if name not in V.graph.removed_buffers:
                     return self.store(name, index, value, mode=mode)
@@ -924,7 +927,7 @@ def store_reduction(name, index, value):
                 self.store_buffer_names.add(name)
                 self.cse.store_cache[name] = value
                 if self.current_node:
-                    for other_name in self.current_node.get_mutations():
+                    for other_name in self.current_node.get_output(name).get_mutations():
                         self.cse.store_cache[other_name] = value
 
                 if name not in V.graph.removed_buffers:
@@ -970,7 +973,7 @@ def bucketize(
 
         super().__enter__()
         assert self.overrides
-        parent_handler = self.overrides(V.get_ops_handler())
+        parent_handler = self.overrides()
         self.exit_stack.enter_context(V.set_ops_handler(CSEProxy()))
         self.exit_stack.enter_context(V.set_kernel_handler(self))
         return self
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 23be941c..66155e9c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -22,8 +22,6 @@ class MLIRScheduling(BaseScheduling):
     target_kernel = MLIRKernel
     def __init__(self, scheduler):
         self.scheduler = scheduler
-        self.scheduler.can_fuse_origin = self.scheduler.can_fuse
-        self.scheduler.can_fuse = self.can_fuse_with_exceptions
         #self.scheduler.enter_context = self.enter_context_fixed # FIXME. Monkey patch: For fixing the inductor bug
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
@@ -90,6 +88,9 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
+    def reset_kernel_group(self):
+        self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
+
     def can_fuse_vertical(self, node1, node2):
         return self.can_fuse_horizontal(node1, node2)
 
@@ -103,7 +104,7 @@ def can_fuse_horizontal(self, node1, node2):
 
         # Reduction is currently not supported
         if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION:
-            return vars1 == vars2 and reduce1 == reduce2 and node1.inverse_users == node2.inverse_users
+            return vars1 == vars2 and reduce1 == reduce2 # and node1.inverse_users == node2.inverse_users
         if node1.is_reduction() or node2.is_reduction():
             return False
 
@@ -180,7 +181,8 @@ def revert_group(self, act_nodes, args=None, var_ranges=None):
     def group_fn(self, sizes):
         return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes)
 
-    def codegen_nodes(self, nodes):
+    def codegen_node(self, _node):
+        nodes = _node.get_nodes()
         _, (group, reduction_group) = max(
             nodes, key=lambda x: int(x.is_reduction())
         ).group
@@ -210,8 +212,8 @@ def codegen_nodes(self, nodes):
 
         kernel_name_candidate = f"extension_kernel_{MLIRScheduling.count}"
         MLIRScheduling.count += 1
-        src_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate)
-        kernel_name = self.define_kernel(src_code, kernel_name_candidate, ex_kernel.vector_lane,
+        src_code, meta_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate)
+        kernel_name = self.define_kernel(src_code, meta_code, kernel_name_candidate, ex_kernel.vector_lane,
                            ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins})
         ex_kernel.call_kernel(kernel_name)
         _, args, _, _ = ex_kernel.args.mlir_argdefs()
@@ -230,26 +232,30 @@ def codegen_sync(self):
         pass
 
     def flush(self):
-        self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
-        self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
+        src_code = self.kernel_group.codegen_group()
+        if src_code:
+            kernel_name = self.define_kernel(
+                src_code, self.kernel_group.scheduled_nodes
+            )
+            self.kernel_group.call_kernel(V.graph.wrapper_code, kernel_name)
+        self.reset_kernel_group()
         self._set_flush_status(False)
 
     def define_function(self, kernel):
         partial_code, function_name = kernel.def_function()
         if partial_code is not None and function_name not in self.outer_function:
             with V.set_kernel_handler(kernel):
-                code = partial_code.finalize()
+                code = partial_code.finalize_all()
                 wrapper = V.graph.wrapper_code
                 wrapper.header.writeline(code)
                 self.outer_function.add(function_name)
 
-    def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}):
+    def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info, loop_size=None, origins={}):
         wrapper = V.graph.wrapper_code
         if src_code in wrapper.src_to_kernel:
             kernel_name = wrapper.src_to_kernel[src_code]
         else:
             wrapper.src_to_kernel[src_code] = kernel_name
-
             codecache_def = IndentedBuffer()
             codecache_def.writeline(f"custom_async_compile.mlir('''{src_code}''', ")
             codecache_def.writeline(f"vectorlane_size={vector_lane},")
@@ -261,26 +267,16 @@ def define_kernel(self, src_code, kernel_name, vector_lane, spad_info, loop_size
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
         return kernel_name
 
-    def codegen_template(self, template_node, epilogue_nodes):
-        # Handle prologue pattern
-        prologue_nodes = []
-        if not template_node.is_template():
-            epilogue_nodes = [template_node] + epilogue_nodes
-            for i, node in enumerate(epilogue_nodes):
-                if node.is_template():
-                    template_node = node
-                    prologue_nodes = epilogue_nodes[:i]
-                    epilogue_nodes = epilogue_nodes[i+1:]
-                    break
-
+    def codegen_template(self, template_node, prologue_nodes, epilogue_nodes):
         # Generate template code
         template_buffer = template_node.node
         kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
         _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
         src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
+        meta_code = kernel.meta_kernel()
 
         with V.set_kernel_handler(kernel):
-            kernel_name = self.define_kernel(src_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
+            kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
                                              kernel.loop_size, origins={str(i) for i in template_node.node.origins})
             self.define_function(kernel)
 
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index a36bc907..4cfe71bf 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -13,8 +13,8 @@
 from typing import List, Optional
 from unittest.mock import patch
 
-from torch._inductor.codegen.common import KernelTemplate, ChoiceCaller, CSE, DeferredLine
-from torch._inductor.ir import Buffer, IRNode, TemplateBuffer
+from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine
+from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
@@ -394,18 +394,14 @@ def meta_kernel(self):
                 for idx in range(len(arg_attributes)):
                     if arg_attributes[idx][0] == name:
                         arg_attributes[idx][1] = attr
-        wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')')
-        # Dump loop and load/store information
-        wrapper.add_import_once(f"loop_info = {self.loop_info}")
-        wrapper.add_import_once(f"arg_attributes = {arg_attributes}")
+        return arg_attributes
 
     def call_kernel(self, kernel_name):
         wrapper = V.graph.wrapper_code
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
-            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}",
-            call_args, cuda=False)
+            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
 
     def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
         with self as kernel:
@@ -479,7 +475,7 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
             src_code = (
                 partial_code
                 if isinstance(partial_code, str)
-                else partial_code.finalize()
+                else partial_code.finalize_all()
             )
 
             # For consistency, white space could make wrong write_path
@@ -753,7 +749,7 @@ def hook():
         return "<REDUCTION_OUTPUT>"
 
     def def_function(self):
-        _, call_args, _ = self.kernel_group.args.python_argdefs()
+        _, call_args, _, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
             partial_code, function_name = self.outer_func_render(input_args=call_args)
             return PartialRender(
@@ -1153,7 +1149,7 @@ def __init__(self, name, input_nodes, layout, input_reorder = None):
         """
         super().__init__(name)
         self.input_nodes = [node for node in input_nodes if node is not None]
-        self.output_node: Buffer = Buffer("buf_out", layout)
+        self.output_node: Buffer = Buffer(name="buf_out", layout=layout)
         self.input_reorder = input_reorder
         self.layout = layout
 
@@ -1218,7 +1214,10 @@ def make_kernel_render(
             self.output_node.get_layout(),
             make_kernel_render,
             bmreq,
+            False,  # supports_epilogue_fusion
             self,
+            kwargs,
+            "" # Currently Empty description
         )
 
     def get_tile_candidates(self, **kwargs):
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 8aa849b1..04fa3c8d 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -1,5 +1,6 @@
 from typing import List
 import os
+import sys
 import numpy as np
 import torch
 from pathlib import Path
@@ -7,6 +8,10 @@
 from PyTorchSimFrontend.extension_codecache import hash_prefix
 from Simulator.simulator import TOGSimulator
 from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend.extension_device_interface import ExtensionDeviceInterface
+
+from torch._dynamo.device_interface import register_interface_for_device
+
 
 def import_module_from_path(module_name, path):
     module_path = Path(path)  # Convert to Path object for safety
@@ -194,17 +199,21 @@ def setup_device(cls):
         from PyTorchSimFrontend.mlir.mlir_scheduling import (
             MLIRScheduling
         )
+
         register_backend_for_device(
-            "npu", MLIRScheduling, ExtensionWrapperCodegen
-        )
-        assert(
-            get_scheduling_for_device("npu") == MLIRScheduling
+            "npu",
+            lambda scheduling: MLIRScheduling(scheduling),
+            ExtensionWrapperCodegen
         )
+        import PyTorchSimFrontend.extension_device_op_overrides
+
         assert(
         get_wrapper_codegen_for_device("npu")
             == ExtensionWrapperCodegen
         )
         cls.NPU_MODULE = module
+        sys.modules['torch.npu'] = module
+        register_interface_for_device(module.custom_device(), ExtensionDeviceInterface)
         return module
 
     def submit(self, batched_req, partition_idx) -> List[RequestReturn]:

From b7a275e186ff24f68f91442c6b763e43cfceb2c1 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Wed, 24 Sep 2025 13:28:55 +0000
Subject: [PATCH 043/194] [Test] Add torch.no_grad(), change to use
 torch.nn.ReLU, fuion off

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py |  1 +
 tests/test_activation.py                   |  5 +++--
 tests/test_conv2d.py                       | 25 +++++++++++-----------
 tests/test_layernorm.py                    |  5 +++--
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 66155e9c..b6b8dea5 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -97,6 +97,7 @@ def can_fuse_vertical(self, node1, node2):
     def can_fuse_horizontal(self, node1, node2):
         if not extension_config.CONFIG_FUSION:
             return False
+
         if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size:
             return False
         _, (vars1, reduce1) = node1.group
diff --git a/tests/test_activation.py b/tests/test_activation.py
index 575fc7e8..49a9467c 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -23,9 +23,10 @@ def test_ReLU(device, size=(128, 128)):
     input = torch.randn(size)
     x1 = input.to(device=device)
     x2 = input.to("cpu")
-    opt_fn = torch.compile(dynamic=False)(torch.nn.functional.relu)
+    ReLU = torch.nn.ReLU()
+    opt_fn = torch.compile(dynamic=False)(ReLU)
     y = opt_fn(x1)
-    cpu_y = torch.nn.functional.relu(x2)
+    cpu_y = ReLU(x2)
     test_result("ReLU", y, cpu_y)
 
 def test_GeLU(device, size=(128, 128), approximate='none'):
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index e964319d..97e5cdea 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -44,15 +44,16 @@ def custom_conv2d(a, b, bias):
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
     torch._dynamo.config.cache_size_limit = 64
-    test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0)
-    test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3)
-    test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3)
-    test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
-    test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
-    test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2)
-    test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, padding=3)
-    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=3, stride=2, padding=1)
-    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=7, kernel_size=3, stride=2, padding=1)
-    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0)
-    test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0)
-    test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0)
+    with torch.no_grad():
+        test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0)
+        test_conv2d(device, batch_size=1, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=2, padding=3)
+        test_conv2d(device, batch_size=2, in_channels=3, out_channels=64, input_size=32//2, kernel_size=7, stride=1, padding=3)
+        test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
+        test_conv2d(device, batch_size=4, in_channels=3, out_channels=64, input_size=64//2, kernel_size=7, stride=1, padding=3)
+        test_conv2d(device, batch_size=2, in_channels=128, out_channels=256, input_size=13, kernel_size=5, stride=1, padding=2)
+        test_conv2d(device, batch_size=2, in_channels=128, out_channels=512, input_size=14, kernel_size=7, stride=1, padding=3)
+        test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=3, stride=2, padding=1)
+        test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=7, kernel_size=3, stride=2, padding=1)
+        test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0)
+        test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0)
+        test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0)
diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py
index 28e38d37..a2e842d0 100644
--- a/tests/test_layernorm.py
+++ b/tests/test_layernorm.py
@@ -44,5 +44,6 @@ def test_LayerNorm(device, size=(64, 64)):
     from Scheduler.scheduler import PyTorchSimRunner
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
-    #test_LayerNorm(device)
-    test_LayerNorm(device, shape)
+    with torch.no_grad():
+        #test_LayerNorm(device)
+        test_LayerNorm(device, shape)

From 5c5e61c82b1482ec8b2eb48cf64e956bfccd4d94 Mon Sep 17 00:00:00 2001
From: OkkyunWoo <okkyun.w@postech.ac.kr>
Date: Thu, 6 Nov 2025 05:28:52 +0000
Subject: [PATCH 044/194] [Implement] Hook and GuardImpl for extension device

---
 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp |   8 ++
 PyTorchSimDevice/ExtensionDeviceGuardImpl.h   | 127 ++++++++++++++++++
 .../extension_device.cpp                      |  10 +-
 .../extension_device_interface.py             |   0
 .../extension_device_op_overrides.py          |   0
 PyTorchSimDevice/extension_hooks.cpp          |  48 +++++++
 PyTorchSimDevice/extension_hooks.h            |  30 +++++
 Scheduler/scheduler.py                        |   8 +-
 8 files changed, 221 insertions(+), 10 deletions(-)
 create mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
 create mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.h
 rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device.cpp (99%)
 rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device_interface.py (100%)
 rename {PyTorchSimFrontend => PyTorchSimDevice}/extension_device_op_overrides.py (100%)
 create mode 100644 PyTorchSimDevice/extension_hooks.cpp
 create mode 100644 PyTorchSimDevice/extension_hooks.h

diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
new file mode 100644
index 00000000..a0b1395d
--- /dev/null
+++ b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
@@ -0,0 +1,8 @@
+#include "ExtensionDeviceGuardImpl.h"
+#include <c10/core/impl/DeviceGuardImplRegistry.h>
+
+namespace c10::extension_device::impl {
+
+C10_REGISTER_GUARD_IMPL(extension_device, ExtensionDeviceGuardImpl);
+
+} // namespace c10::extension_device::impl
diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h
new file mode 100644
index 00000000..6d35677b
--- /dev/null
+++ b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <c10/core/DeviceGuard.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/Stream.h>
+#include <c10/core/Event.h>
+#include <c10/core/DeviceType.h>
+#include <c10/util/Optional.h>
+
+namespace c10::extension_device::impl {
+
+struct ExtensionDeviceGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr DeviceType static_type = DeviceType::PrivateUse1; // ✅ your backend type
+
+  ExtensionDeviceGuardImpl() = default;
+
+  explicit ExtensionDeviceGuardImpl(DeviceType t) {
+    TORCH_CHECK(
+        t == static_type,
+        "ExtensionDeviceGuardImpl initialized with non-extension_device DeviceType: ",
+        t);
+  }
+
+  // --------------------------------------------------------------------------
+  // 기본적인 device guard (CPU처럼 동작)
+  // --------------------------------------------------------------------------
+  DeviceType type() const override {
+    return static_type;
+  }
+
+  Device exchangeDevice(Device d) const override {
+    TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d);
+    return d; // nothing to exchange, CPU-like
+  }
+
+  Device getDevice() const override {
+    return Device(static_type, 0);
+  }
+
+  void setDevice(Device d) const override {
+    TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d);
+  }
+
+  void uncheckedSetDevice(Device d) const noexcept override {}
+
+  DeviceIndex deviceCount() const noexcept override {
+    return 1; // pretend single device
+  }
+
+  // --------------------------------------------------------------------------
+  // Stream handling (동기식이므로 기본 stream만 사용)
+  // --------------------------------------------------------------------------
+  Stream getStream(Device d) const override {
+    return Stream(Stream::DEFAULT, d);
+  }
+
+  Stream getNewStream(Device d, int priority = 0) const override {
+    return Stream(Stream::DEFAULT, d);
+  }
+
+  Stream getStreamFromGlobalPool(Device d, bool = false) const override {
+    return Stream(Stream::DEFAULT, d);
+  }
+
+  Stream exchangeStream(Stream s) const override {
+    return s;
+  }
+
+  bool queryStream(const Stream& stream) const override {
+    (void)stream;
+    return true;
+  }
+
+  void synchronizeStream(const Stream& stream) const override {
+    (void)stream;
+  }
+
+  void synchronizeDevice(DeviceIndex device_index) const override {
+    (void)device_index;
+  }
+
+  // --------------------------------------------------------------------------
+  // Event handling (전부 no-op)
+  // --------------------------------------------------------------------------
+  void destroyEvent(void* event, const DeviceIndex device_index) const noexcept override {
+    (void)event;
+    (void)device_index;
+  }
+
+  void record(void** event, const Stream& stream, const DeviceIndex device_index, const EventFlag flag) const override {
+    (void)event;
+    (void)stream;
+    (void)device_index;
+    (void)flag;
+  }
+
+  void block(void* event, const Stream& stream) const override {
+    (void)event;
+    (void)stream;
+  }
+
+  bool queryEvent(void* event) const override {
+    (void)event;
+    return true;
+  }
+
+  void synchronizeEvent(void* event) const override {
+    (void)event;
+  }
+
+  double elapsedTime(void* start_event, void* end_event, const DeviceIndex device_index) const override {
+    (void)start_event;
+    (void)end_event;
+    (void)device_index;
+    return 0.0;
+  }
+
+  // --------------------------------------------------------------------------
+  // Misc (allocator integration)
+  // --------------------------------------------------------------------------
+  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override {
+    (void)data_ptr;
+    (void)stream;
+  }
+};
+
+} // namespace c10::extension_device::impl
diff --git a/PyTorchSimFrontend/extension_device.cpp b/PyTorchSimDevice/extension_device.cpp
similarity index 99%
rename from PyTorchSimFrontend/extension_device.cpp
rename to PyTorchSimDevice/extension_device.cpp
index b8a6e092..a1dcfcf4 100644
--- a/PyTorchSimFrontend/extension_device.cpp
+++ b/PyTorchSimDevice/extension_device.cpp
@@ -55,16 +55,12 @@ static inline at::MemoryFormat fix_memory_format(c10::optional<at::MemoryFormat>
     return mf;
 }
 
+#include "ExtensionDeviceGuardImpl.h"
+
 static uint64_t op_counter = 0;
 static uint64_t last_saved_value = 0;
 
-// register guard
-namespace at {
-namespace detail {
-
-C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::impl::NoOpDeviceGuardImpl<DeviceType::PrivateUse1>);
-
-}} // namespace at::detail
+C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::extension_device::impl::ExtensionDeviceGuardImpl);
 
 // basic dummy add function
 at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
diff --git a/PyTorchSimFrontend/extension_device_interface.py b/PyTorchSimDevice/extension_device_interface.py
similarity index 100%
rename from PyTorchSimFrontend/extension_device_interface.py
rename to PyTorchSimDevice/extension_device_interface.py
diff --git a/PyTorchSimFrontend/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py
similarity index 100%
rename from PyTorchSimFrontend/extension_device_op_overrides.py
rename to PyTorchSimDevice/extension_device_op_overrides.py
diff --git a/PyTorchSimDevice/extension_hooks.cpp b/PyTorchSimDevice/extension_hooks.cpp
new file mode 100644
index 00000000..aadd6d2a
--- /dev/null
+++ b/PyTorchSimDevice/extension_hooks.cpp
@@ -0,0 +1,48 @@
+#include "extension_hooks.h"
+
+bool ExtensionPU1Hooks::isBuilt() const { return true; }
+bool ExtensionPU1Hooks::isAvailable() const { return true; }
+
+const at::Generator& ExtensionPU1Hooks::getDefaultGenerator(c10::DeviceIndex idx) const {
+  if (idx < 0) idx = 0;
+  static std::vector<at::Generator> gens;
+  static std::mutex m;
+  std::lock_guard<std::mutex> g(m);
+  if (gens.size() <= (size_t)idx) gens.resize((size_t)idx + 1);
+  if (!gens[idx].defined()) gens[idx] = at::GetGeneratorForPrivateuse1(idx);
+  return gens[idx]; // 영속 객체 참조 반환
+}
+
+at::Generator ExtensionPU1Hooks::getNewGenerator(c10::DeviceIndex idx) const {
+  if (idx < 0) idx = 0;
+  return at::GetGeneratorForPrivateuse1(idx);
+}
+
+at::Device ExtensionPU1Hooks::getDeviceFromPtr(void* data) const {
+  return at::Device(at::kPrivateUse1, 0); // MVP: 단일 디바이스 가정
+}
+
+bool ExtensionPU1Hooks::isPinnedPtr(const void* data) const {
+  return false;
+}
+
+at::Allocator* ExtensionPU1Hooks::getPinnedMemoryAllocator() const {
+  return at::getHostAllocator(at::kPrivateUse1);
+}
+
+bool ExtensionPU1Hooks::hasPrimaryContext(c10::DeviceIndex device_index) const { return true; }
+
+void ExtensionPU1Hooks::resizePrivateUse1Bytes(const c10::Storage&, size_t) const {
+  TORCH_CHECK(false, "resizePrivateUse1Bytes not implemented");
+}
+
+// REGISTER_EXTENSION_HOOKS(ExtensionPU1Hooks);
+
+namespace {
+struct AutoRegistrar {
+  AutoRegistrar() {
+    at::RegisterPrivateUse1HooksInterface(new ExtensionPU1Hooks());
+  }
+};
+static AutoRegistrar _auto_registrar;
+}
diff --git a/PyTorchSimDevice/extension_hooks.h b/PyTorchSimDevice/extension_hooks.h
new file mode 100644
index 00000000..fdf3505a
--- /dev/null
+++ b/PyTorchSimDevice/extension_hooks.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <ATen/core/CachingHostAllocator.h>
+#include <ATen/detail/PrivateUse1HooksInterface.h>
+
+#include <ATen/core/Generator.h>
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+#include <c10/core/Storage.h>
+#include <c10/util/Exception.h>
+
+struct ExtensionPU1Hooks final : public at::PrivateUse1HooksInterface {
+  ExtensionPU1Hooks() {}
+  bool isBuilt() const;
+  bool isAvailable() const;
+
+  const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) const override;
+
+  at::Generator getNewGenerator(c10::DeviceIndex device_index = -1) const override;
+
+  at::Device getDeviceFromPtr(void* data) const override;
+
+  bool isPinnedPtr(const void* data) const override;
+
+  at::Allocator* getPinnedMemoryAllocator() const override;
+
+  bool hasPrimaryContext(c10::DeviceIndex device_index) const override;
+
+  void resizePrivateUse1Bytes(const c10::Storage& /*storage*/, size_t /*newsize*/) const override;
+};
\ No newline at end of file
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 04fa3c8d..215700eb 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -8,7 +8,7 @@
 from PyTorchSimFrontend.extension_codecache import hash_prefix
 from Simulator.simulator import TOGSimulator
 from PyTorchSimFrontend import extension_config
-from PyTorchSimFrontend.extension_device_interface import ExtensionDeviceInterface
+from PyTorchSimDevice.extension_device_interface import ExtensionDeviceInterface
 
 from torch._dynamo.device_interface import register_interface_for_device
 
@@ -173,14 +173,16 @@ def setup_device(cls):
             return cls.NPU_MODULE
         source_file_path = os.path.dirname(os.path.abspath(__file__))
         source_file = os.path.join(
-            source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimFrontend/extension_device.cpp"
+            source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_device.cpp"
         )
+        hook_file = os.path.join(source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_hooks.cpp")
 
         import torch.utils.cpp_extension
         module = torch.utils.cpp_extension.load(
             name="npu",
             sources=[
                 str(source_file),
+                str(hook_file),
             ],
             extra_cflags=["-g"],
             verbose=True,
@@ -205,7 +207,7 @@ def setup_device(cls):
             lambda scheduling: MLIRScheduling(scheduling),
             ExtensionWrapperCodegen
         )
-        import PyTorchSimFrontend.extension_device_op_overrides
+        import PyTorchSimDevice.extension_device_op_overrides
 
         assert(
         get_wrapper_codegen_for_device("npu")

From 74704b8fbbc38763b7214c1ca4dd0679623c5b98 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 6 Jan 2026 11:21:20 +0000
Subject: [PATCH 045/194] [CI] Change the trigger condition

---
 .github/workflows/docker-image-2-8.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml
index cb5f73d1..4d511a1a 100644
--- a/.github/workflows/docker-image-2-8.yml
+++ b/.github/workflows/docker-image-2-8.yml
@@ -1,7 +1,7 @@
 name: Docker image CI (PyTorch 2.8)
 
 on:
-  pull_request:
+  push:
     branches: [ "torch_v2.8" ]
   workflow_dispatch:
 

From d3f32988da41de1334159d2d8c783a4a1fdd059a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 6 Jan 2026 12:42:26 +0000
Subject: [PATCH 046/194] [CI] Use CMake 3 to build pytorchsim

---
 Dockerfile.base | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.base b/Dockerfile.base
index 897b8195..c5f200bc 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -34,7 +34,7 @@ RUN apt -y update && \
     python3-dev python-is-python3 libboost-all-dev \
     libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \
     python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \
-    pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 && rm -rf /var/lib/apt/lists/* 
+    pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/*
 
 # Download RISC-V tool chain
 RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \

From 07633630d7c008bddb1bdbee3c288e7d8b771aae Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 6 Jan 2026 12:44:55 +0000
Subject: [PATCH 047/194] [CI] Seperate base image

---
 .github/workflows/docker-base-image-2-8.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml
index f8649303..3a1d97a1 100644
--- a/.github/workflows/docker-base-image-2-8.yml
+++ b/.github/workflows/docker-base-image-2-8.yml
@@ -2,7 +2,7 @@ name: Docker Base Image CI (PyTorch 2.8)
 
 on:
   push:
-    branches: [ "base" ]
+    branches: [ "base_v2.8" ]
   workflow_dispatch:
   repository_dispatch:
     types: [ build_base ]

From 45914036118126799b762e06d990115f4372fde5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 04:43:59 +0000
Subject: [PATCH 048/194] [Fix] PyTorch2.8 support (WIP)

---
 PyTorchSimDevice/extension_device_op_overrides.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_common.py            | 2 --
 PyTorchSimFrontend/mlir/mlir_scheduling.py        | 8 +++++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py
index b76dae0f..17439b95 100644
--- a/PyTorchSimDevice/extension_device_op_overrides.py
+++ b/PyTorchSimDevice/extension_device_op_overrides.py
@@ -22,4 +22,4 @@ def synchronize(self) -> str:
     def device_guard(self, device_idx: int) -> str:
         return "pass"
 
-register_device_op_overrides("extension_device", ExtensionDeviceOpOverrides())
\ No newline at end of file
+register_device_op_overrides("npu", ExtensionDeviceOpOverrides())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f98a2132..6888f9a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -23,7 +23,6 @@
 
 import sympy
 
-import torch.fx
 from torch.utils._sympy.value_ranges import ValueRanges
 from torch._inductor.utils import (
     get_sympy_Expr_dtype,
@@ -33,7 +32,6 @@
 )
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend import extension_codecache
-from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 
 from PyTorchSimFrontend.extension_utils import (
     free_symbol_startswith,
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index b6b8dea5..2d578c61 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -167,6 +167,8 @@ def revert_group(self, act_nodes, args=None, var_ranges=None):
                 act_node.node.get_store_function(),
                 (args if act_node.node.get_reduction_type() else args[:1]),
                 var_ranges,
+                args[0],
+                args[1]
             )
             index_size = []
             reduce_size = []
@@ -188,7 +190,7 @@ def codegen_node(self, _node):
             nodes, key=lambda x: int(x.is_reduction())
         ).group
 
-        # Note: We assume that ther is at least one loop in the nodes
+        # Note: We assume that there is at least one loop in the nodes
         # But, inductor simplifies the group, there could be no loop
         # In that case, we add dummy loop(size=1) to the group
         if len(group) == 0:
@@ -263,9 +265,9 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info
             codecache_def.writeline(f"loop_size={loop_size},")
             codecache_def.writeline(f"spad_info={spad_info},")
             codecache_def.writeline(f"origins={origins},")
-            codecache_def.writeline("arg_attributes=arg_attributes,")
+            codecache_def.writeline(f"arg_attributes={meta_code},")
             codecache_def.writeline(f"vlen={extension_config.vpu_vector_length_bits})")
-            wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False)
+            wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False)
         return kernel_name
 
     def codegen_template(self, template_node, prologue_nodes, epilogue_nodes):

From b9d4144bdba3c4007079c180934570eac245f61c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 07:51:11 +0000
Subject: [PATCH 049/194] [Fix] Use official prologue fusion path

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 150 ++++++++++-----------
 1 file changed, 75 insertions(+), 75 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 2d578c61..3799633c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -29,62 +29,6 @@ def __init__(self, scheduler):
         config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
         self.max_fusion_size = 5
 
-    def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
-        # Extract base template node
-        base_template_node1 = [node for node in node1.get_nodes() if node.is_template()]
-        base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
-        if node1.get_device() != node2.get_device():
-            return False
-        if not (isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance(node2, (SchedulerNode, FusedSchedulerNode))):
-            return False
-
-        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            if (isinstance(base_template_node1[0].node.template, MLIRGemmTemplate) or isinstance(base_template_node1[0].node.template, MLIRBMMTemplate)) and node2.is_reduction():
-                # For matmul/bmm+reduction case
-                size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1)
-                target_symbol = symbols("r0")
-                try:
-                    stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1]
-                    stride = int(sympify(stride).coeff(target_symbol))
-                except:
-                    return False
-
-                # We can't fuse dim=-1
-                layout_possible = stride != 1
-                # Directed linked?
-                dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
-                dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
-                return size_match and layout_possible and dependency_check and dependency_size
-
-        # For prologue fusion case
-        if extension_config.CONFIG_FUSION_PROLOGUE and len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(base_template_node2) == 1:
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            target_node = base_template_node2[0].node
-            if target_node.origin_node is not None and hasattr(target_node.origin_node.target, "_name") and target_node.origin_node.target._name == 'aten::convolution':
-                return False
-            if node1.is_reduction():
-                return False
-            if len(node1.read_writes.writes) != 1:
-                return False
-            if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
-                return False
-
-            # Currently only BMM, MM support prologue fusion
-            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
-                return False
-            # We don't fuse this edge case...
-            if base_template_node2[0].group[1][0][0] == 1:
-                return False
-
-            if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
-                node1 = self.revert_group(node1)
-                return True
-
-        return self.scheduler.can_fuse_origin(node1, node2)
-
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
@@ -100,15 +44,10 @@ def can_fuse_horizontal(self, node1, node2):
 
         if (len(node1.get_nodes())+ len(node2.get_nodes())) > self.max_fusion_size:
             return False
+
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
 
-        # Reduction is currently not supported
-        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION:
-            return vars1 == vars2 and reduce1 == reduce2 # and node1.inverse_users == node2.inverse_users
-        if node1.is_reduction() or node2.is_reduction():
-            return False
-
         # Can't fuse two template node
         if node1.is_template() and node2.is_template():
             return False
@@ -116,17 +55,25 @@ def can_fuse_horizontal(self, node1, node2):
         if '_unsafe_index' in node1.get_nodes()[0].node.origins or "_unsafe_index" in node2.get_nodes()[0].node.origins:
             return False
 
-        # Check template node fusion
-        if node1.is_template() or node2.is_template():
+        # Extract base template node
+        base_template_node1 = [node for node in node1.get_nodes() if node.is_template()]
+        base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
+
+        # Case 0: Reduction fusion
+        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION:
+            return vars1 == vars2 and reduce1 == reduce2
+
+        # Case 1: Template + Pointwise fusion
+        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            template_node1 = next((n for n in node1.get_nodes() if n.is_template()), None)
-            template_node2 = next((n for n in node2.get_nodes() if n.is_template()), None)
 
-            if template_node1 and len(node1.get_nodes()) == 1 and isinstance(template_node1.node.template, MLIRMaxPoolTemplate) or \
-               template_node2 and len(node2.get_nodes()) == 1 and isinstance(template_node2.node.template, MLIRMaxPoolTemplate):
+            template_node = base_template_node1[0]
+            epilogue_node = node2
+
+            if isinstance(template_node.node.template, MLIRMaxPoolTemplate):
                 return False
 
             # Pointwise check
@@ -135,23 +82,76 @@ def can_fuse_horizontal(self, node1, node2):
             if v1_total != v2_total:
                 return False
 
-            # Pattern check
-            template_node, act_node = (template_node1, node2) if template_node1 else (template_node2, node1)
-            has_depedency = set(act_node.inverse_users) <= set(template_node.get_nodes())
+            # Pattern check: check data dependency between act_node and template_node
+            template_sched_nodes = list(template_node.get_nodes())
+            # Buffers produced by the template (its outputs)
+            template_writes = {
+                dep
+                for n in template_sched_nodes
+                for dep in n.read_writes.writes
+            }
+            # Buffers still required by the activation node (unmet) or read by it
+            epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies }
+            has_depedency = bool(template_writes) and template_writes.issubset(epilogue_unmet)
             if not has_depedency:
                 return False
 
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
-            if template_node.group != act_node.group:
+            if template_node.group != epilogue_node.group:
                 # We don't fuse this case...
                 if (isinstance(template_node.node.template, MLIRBMMTemplate) or isinstance(template_node.node.template, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
                     return False
 
-                if list(template_node.group[1][0]) != list(act_node.get_nodes()[0].node.data.get_size()):
+                if list(template_node.group[1][0]) != list(epilogue_node.get_nodes()[0].node.data.get_size()):
                     return False
-                self.revert_group(act_node)
+                self.revert_group(epilogue_node)
             return True
 
+        # Case 2: Tempalte + Reduction fusion
+        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+                return False
+
+            size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1)
+            target_symbol = symbols("r0")
+            try:
+                stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1]
+                stride = int(sympify(stride).coeff(target_symbol))
+            except:
+                return False
+
+            # We can't fuse dim=-1
+            layout_possible = stride != 1
+            # Directed linked?
+            dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
+            dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
+            return size_match and layout_possible and dependency_check and dependency_size
+
+        # Case 3: Prologue(Pointwise) + Tempalte
+        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+
+            target_node = base_template_node2[0].node
+            # Currently only BMM, MM support prologue fusion
+            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+                return False
+
+            if len(node1.read_writes.writes) != 1:
+                return False
+            if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
+                return False
+
+            # We don't fuse this edge case...
+            if base_template_node2[0].group[1][0][0] == 1:
+                return False
+
+            if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+                node1 = self.revert_group(node1)
+                return True
+
         # Check elementwise fusion
         if vars1 == vars2 and reduce1 == reduce2:
             return True
@@ -270,7 +270,7 @@ def define_kernel(self, src_code, meta_code, kernel_name, vector_lane, spad_info
             wrapper.define_kernel(kernel_name, codecache_def.getvalue(), gpu=False)
         return kernel_name
 
-    def codegen_template(self, template_node, prologue_nodes, epilogue_nodes):
+    def codegen_template(self, template_node, epilogue_nodes, prologue_nodes):
         # Generate template code
         template_buffer = template_node.node
         kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)

From 9abc0602b7b279e064ec2a4ec3ac921fae658d64 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 10:25:51 +0000
Subject: [PATCH 050/194] [Fix] Don't split a reduce kernel

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 3799633c..640a00be 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -7,12 +7,14 @@
 from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import MLIRKernel
 
+from torch.utils._ordered_set import OrderedSet
 from torch._inductor import config
 from torch._inductor.scheduler import BaseScheduling, FusedSchedulerNode, SchedulerNode, BaseSchedulerNode
 from torch._inductor.utils import IndentedBuffer
 from torch._inductor.virtualized import V
 from torch._inductor.ir import LoopBody
 from torch._inductor import dependencies
+from torch._inductor.codegen.common import BackendFeature
 
 from . import mlir_common
 from . import mlir_lowering # DO NOT REMOVE THIS LINE, it is used for lowering
@@ -35,6 +37,10 @@ def _set_flush_status(self, status: bool):
     def reset_kernel_group(self):
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
 
+    def get_backend_features(self, device):
+        """Return a set of .codegen.common.BackendFeature()"""
+        return OrderedSet([BackendFeature.REDUCE_TO_SINGLE_ELEMENT])
+
     def can_fuse_vertical(self, node1, node2):
         return self.can_fuse_horizontal(node1, node2)
 

From 2c7264b903bc2aae7d215a2c1f9de592c2ac94a3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 10:43:44 +0000
Subject: [PATCH 051/194] [Fix] Add a missing reduction fusion condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 640a00be..35ccfee8 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -53,6 +53,11 @@ def can_fuse_horizontal(self, node1, node2):
 
         _, (vars1, reduce1) = node1.group
         _, (vars2, reduce2) = node2.group
+        # For input/dependency checks
+        reads1 = {dep.name for dep in node1.read_writes.reads}
+        reads2 = {dep.name for dep in node2.read_writes.reads}
+        writes1 = {dep.name for dep in node1.read_writes.writes}
+        writes2 = {dep.name for dep in node2.read_writes.writes}
 
         # Can't fuse two template node
         if node1.is_template() and node2.is_template():
@@ -66,8 +71,20 @@ def can_fuse_horizontal(self, node1, node2):
         base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
 
         # Case 0: Reduction fusion
-        if node1.is_reduction() and node2.is_reduction() and not node1.is_template() and not node2.is_template() and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION:
-            return vars1 == vars2 and reduce1 == reduce2
+        if (
+            node1.is_reduction()
+            and node2.is_reduction()
+            and not node1.is_template()
+            and not node2.is_template()
+            and extension_config.CONFIG_FUSION_REDUCTION_REDUCTION
+        ):
+            # 1) Same loop/iteration domain
+            same_iter = vars1 == vars2 and reduce1 == reduce2
+            # 2) No data dependency between the two reductions
+            no_dependency = not (
+                writes1 & (reads2 | writes2) or writes2 & (reads1 | writes1)
+            )
+            return same_iter and no_dependency
 
         # Case 1: Template + Pointwise fusion
         if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction():

From b951b95ac596692a83fca926d0a44de3776d5e30 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 11:20:55 +0000
Subject: [PATCH 052/194] [Fix] update indirect_index interface for v2.8

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_common.py          | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 9f5c0674..bc4592b4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -667,7 +667,7 @@ def store_reduction(self, name, index, value):
                                     dram_shape, tile_shape, attribute)
             self.reductions_suffix.writeline(common.DeferredLine(name, code))
 
-    def indirect_indexing(self, index_var, size, check=True):
+    def indirect_indexing(self, index_var, size, check=True, wrap_neg=True):
         return str(index_var)
 
     def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 6888f9a1..468f1a47 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -645,7 +645,7 @@ def store(self, name, index, value, mode=None):
     def reduction(self, dtype, src_dtype, reduction_type, value):
         raise NotImplementedError()
 
-    def indirect_indexing(self, index_var, size, check):
+    def indirect_indexing(self, index_var, size, check, wrap_neg):
         raise NotImplementedError()
 
     def codegen_global_init(self):
@@ -888,9 +888,9 @@ def inner(*args, **kwargs):
                 return inner
 
             @staticmethod
-            def indirect_indexing(index_var, size, check=True):
+            def indirect_indexing(index_var, size, check=True, wrap_neg=True):
                 # Skip CSE since this doesn't return an expression
-                return self.indirect_indexing(index_var, size, check)
+                return self.indirect_indexing(index_var, size, check, wrap_neg)
 
             @staticmethod
             def load(name: str, index: sympy.Expr):

From c6ba98c6e0d82bdadc013918c54aeaa56a0520df Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 7 Jan 2026 13:53:31 +0000
Subject: [PATCH 053/194] [Fix] Allow cpp kernel code in the wrapper function

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index bc4592b4..654099c1 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -91,6 +91,7 @@ def write_header(self):
                 from torch._inductor.hooks import run_intermediate_hooks
                 from torch._inductor.utils import maybe_profile
                 from torch._inductor.codegen.memory_planning import _align as align
+                from torch._inductor.async_compile import AsyncCompile
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
@@ -105,6 +106,7 @@ def write_header(self):
                 alloc_from_pool = torch.ops.inductor._alloc_from_pool
                 reinterpret_tensor = torch.ops.aten._reinterpret_tensor
                 custom_async_compile = CustomAsyncCompile()
+                async_compile = AsyncCompile()
                 os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
                 print(f\'Wrapper Codegen Path = {{__file__}}\')
             """
@@ -138,6 +140,7 @@ def device2host_memcpy(buffer):
         )
 
     def write_prefix(self):
+        self.write_async_compile_wait()
         self.prefix.splice(
             """
             def call(args):

From fd07eda99e4f8ceea01d0388a39d3d8952f0c139 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 05:42:25 +0000
Subject: [PATCH 054/194] [Ops] Use V.kernel instead of argument passing

---
 PyTorchSimFrontend/mlir/mlir_common.py   |   2 +-
 PyTorchSimFrontend/mlir/mlir_ops.py      | 413 +++++++++++------------
 PyTorchSimFrontend/mlir/mlir_template.py |   2 +-
 3 files changed, 208 insertions(+), 209 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 468f1a47..7b6ee11c 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -867,7 +867,7 @@ class CSEProxy:
             @staticmethod
             def __getattr__(name: str) -> Callable[..., common.CSEVariable]:  # type: ignore[misc]
                 def inner(*args, **kwargs):
-                    code, ret_info = getattr(parent_handler, name)(*args, var_info=self.var_info, **kwargs)
+                    code, ret_info = getattr(parent_handler, name)(*args, **kwargs)
                     target_buffer = self.target_buffer_override.get()
                     target_cse = self.target_cse_override.get()
                     if isinstance(code, common.DeferredLine):
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 21995512..2b964c55 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -20,7 +20,7 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape,
 
 class ExtensionOverrides(common.OpOverrides):
     @staticmethod
-    def constant(value, src_type, *args, var_info=None, **kwargs):
+    def constant(value, src_type, *args, **kwargs):
         if isinstance(src_type, torch.dtype):
             src_type = mlir_common.DTYPE_TO_MLIR[src_type]
 
@@ -37,8 +37,8 @@ def constant(value, src_type, *args, var_info=None, **kwargs):
         return f'arith.constant {value} : {src_type}', [1, src_type]
 
     @staticmethod
-    def broadcast(operand, target_size, *args, var_info=None, **kwargs):
-        src_size, dtype = var_info[operand]
+    def broadcast(operand, target_size, *args, **kwargs):
+        src_size, dtype = V.kernel.var_info[operand]
 
         src_shape = f"vector<{src_size}x{dtype}>" if src_size > 1 else dtype
         dst_shape = f"vector<{target_size}x{dtype}>"
@@ -63,8 +63,8 @@ def broadcast(operand, target_size, *args, var_info=None, **kwargs):
         return op_str, [target_size, dtype]
 
     @staticmethod
-    def broadcast_unflat(operand, target_size, *args, var_info=None, **kwargs):
-        src_size, dtype = var_info[operand]
+    def broadcast_unflat(operand, target_size, *args, **kwargs):
+        src_size, dtype = V.kernel.var_info[operand]
 
         outer_dim = target_size // src_size
         src_shape = f"vector<{src_size}x{dtype}>"
@@ -87,33 +87,33 @@ def randint64(self, *args, **kwargs):
 
     # Special operaitons
     @staticmethod
-    def masked(mask, body, other, *args, var_info=None, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
+    def masked(mask, body, other, *args, tile_size=16, dtype="f32", ninf_declared=False, **kwargs):
         result = body()
         val = ops.constant(other, dtype, *args, **kwargs)
         result = ops.where(mask, result, val)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def where(condition, operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
-        cond_type = var_info[condition]
-        operand_type = var_info[operand1]
+    def where(condition, operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
+        cond_type = V.kernel.var_info[condition]
+        operand_type = V.kernel.var_info[operand1]
         condition = ops.to_bool(condition)
         if cond_type[0] < tile_size:
             condition = ops.broadcast(condition, tile_size)
         elif cond_type[0] > tile_size:
             operand1 = ops.broadcast(operand1, cond_type[0])
             operand2 = ops.broadcast(operand2, cond_type[0])
-        tile_size, ret_type = var_info[operand1]
+        tile_size, ret_type = V.kernel.var_info[operand1]
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else ""
         return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape}, {shape}", [tile_size, ret_type]
 
     @staticmethod
-    def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
+    def to_dtype(operand, dst_mlir_dtype, *args, **kwargs):
         # Extract source information
-        src_mlir_dtype = var_info[operand][1]
-        tile_size = var_info[operand][0]
+        src_mlir_dtype = V.kernel.var_info[operand][1]
+        tile_size = V.kernel.var_info[operand][0]
 
         # Normalize destination type (Torch dtype -> MLIR string)
         if isinstance(dst_mlir_dtype, torch.dtype):
@@ -172,13 +172,13 @@ def to_dtype(operand, dst_mlir_dtype, *args, var_info=None, **kwargs):
         return op_str, [tile_size, dst_mlir_dtype]
 
     @staticmethod
-    def identity(operand, *args, var_info=None, **kwargs):
-        operand_info = var_info[operand]
+    def identity(operand, *args, **kwargs):
+        operand_info = V.kernel.var_info[operand]
         return operand, operand_info
 
     @staticmethod
-    def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs):
-        tile_size, current_src_type = var_info[operand]
+    def to_dtype_bitcast(operand, dtype, *args, **kwargs):
+        tile_size, current_src_type = V.kernel.var_info[operand]
 
         if isinstance(dtype, torch.dtype):
             dst_mlir_type = mlir_common.DTYPE_TO_MLIR[dtype]
@@ -201,11 +201,12 @@ def to_dtype_bitcast(operand, dtype, *args, var_info=None, **kwargs):
 
     # Binary element wise operations
     @staticmethod
-    def binary_elementwise_common(operand1, operand2, var_info):
+    def binary_elementwise_common(operand1, operand2):
+        V.kernel.var_info = V.kernel.var_info
         operand1.bounds = operand1.bounds.unknown()
         operand2.bounds = operand2.bounds.unknown()
-        op_type1 = var_info[operand1]
-        op_type2 = var_info[operand2]
+        op_type1 = V.kernel.var_info[operand1]
+        op_type2 = V.kernel.var_info[operand2]
         # Tile size check
         if op_type1[0] != op_type2[0]:
             # Try to broad cast
@@ -213,33 +214,33 @@ def binary_elementwise_common(operand1, operand2, var_info):
             rhs_tile_size, rhs_dtype = op_type2
             if lhs_tile_size > rhs_tile_size:
                 operand2 = ops.broadcast(operand2, lhs_tile_size)
-                op_type2 = var_info[operand2]
+                op_type2 = V.kernel.var_info[operand2]
             elif lhs_tile_size < rhs_tile_size:
                 operand1 = ops.broadcast(operand1, rhs_tile_size)
-                op_type1 = var_info[operand1]
+                op_type1 = V.kernel.var_info[operand1]
 
         # Data type check
         if op_type1[1] != op_type2[1]:
             if op_type1[1] == "index" or op_type1 == "index":
                 if op_type1[1] == "index":
                     operand1 = ops.index_cast(operand1, op_type2[1])
-                    op_type1 = var_info[operand1]
+                    op_type1 = V.kernel.var_info[operand1]
                 if op_type2[1] == "index":
                     operand2 = ops.index_cast(operand2, op_type1[1])
-                    op_type2 = var_info[operand2]
+                    op_type2 = V.kernel.var_info[operand2]
             elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
                 operand1 = ops.to_dtype(operand1, op_type2[1])
-                op_type1 = var_info[operand1]
+                op_type1 = V.kernel.var_info[operand1]
             elif op_type1[1][0] == "f" and op_type2[1][0] == "i":
                 operand2 = ops.to_dtype(operand2, op_type1[1])
-                op_type2 = var_info[operand2]
+                op_type2 = V.kernel.var_info[operand2]
             elif op_type1[1][0] == op_type2[1][0]:
                 if mlir_common.MLIR_TO_BIT[op_type1[1]] > mlir_common.MLIR_TO_BIT[op_type2[1]]:
                    operand2 = ops.ext(operand2, op_type1[1])
-                   op_type2 = var_info[operand2]
+                   op_type2 = V.kernel.var_info[operand2]
                 elif mlir_common.MLIR_TO_BIT[op_type1[1]] < mlir_common.MLIR_TO_BIT[op_type2[1]]:
                    operand1 = ops.ext(operand1, op_type2[1])
-                   op_type1 = var_info[operand1]
+                   op_type1 = V.kernel.var_info[operand1]
             else:
                 raise NotImplementedError("Unsupported type converting")
 
@@ -249,45 +250,45 @@ def binary_elementwise_common(operand1, operand2, var_info):
         return tile_size, ret_type, operand1, operand2
 
     @staticmethod
-    def abs(operand, *args, var_info=None, **kwargs):
+    def abs(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def exp(operand, *args, var_info=None, **kwargs):
+    def exp(operand, *args, **kwargs):
         # Check scalar
-        op_type = var_info[operand]
+        op_type = V.kernel.var_info[operand]
         if op_type[0] == 1:
             operand = ops.broadcast(operand, 4)
             val = ops.exp(operand)
             result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
+            return result, V.kernel.var_info[result]
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.exp %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def exp2(operand, *args, var_info=None, **kwargs):
+    def exp2(operand, *args, **kwargs):
         # Hands-on part: implement exp2 using math.exp2
-        # var_info = {operand: [tile_size, dtype]}
-        # Ex) var_info[operand] = [8, "f32"]
+        # V.kernel.var_info = {operand: [tile_size, dtype]}
+        # Ex) V.kernel.var_info[operand] = [8, "f32"]
 
         ln2 = math.log(2)
         coeff = ops.constant(ln2, "f32")
         operand = ops.mul(operand, coeff)
-        return ops.exp(operand), var_info[operand]
+        return ops.exp(operand), V.kernel.var_info[operand]
 
     @staticmethod
-    def expm1(operand, *args, var_info=None, **kwargs):
+    def expm1(operand, *args, **kwargs):
         coeff = ops.constant(1.0, "f32")
         operand = ops.exp(operand)
         operand = ops.sub(operand, coeff)
-        return operand, var_info[operand]
+        return operand, V.kernel.var_info[operand]
 
     @staticmethod
-    def sqrt(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def sqrt(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
 
         tile_size = op_type[0]
         dtype = op_type[1]
@@ -300,14 +301,14 @@ def sqrt(operand, *args, var_info=None, **kwargs):
         return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def relu(operand, *args, var_info=None, **kwargs):
-        src_mlir_dtype = var_info[operand][1]
-        tile_size = var_info[operand][0]
+    def relu(operand, *args, **kwargs):
+        src_mlir_dtype = V.kernel.var_info[operand][1]
+        tile_size = V.kernel.var_info[operand][0]
         return ops.maximum(operand, ops.constant(0, src_mlir_dtype)), [tile_size, src_mlir_dtype]
 
     @staticmethod
-    def minimum(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def minimum(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
             opcode = f'arith.minimumf'
@@ -316,8 +317,8 @@ def minimum(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def maximum(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def maximum(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
             opcode = f'arith.maximumf'
@@ -326,17 +327,17 @@ def maximum(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def cos(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def cos(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
 
         # Check scalar
-        op_type = var_info[operand]
+        op_type = V.kernel.var_info[operand]
         if op_type[0] == 1:
             operand = ops.broadcast(operand, 4)
             val = ops.cos(operand)
             result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
+            return result, V.kernel.var_info[result]
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -347,17 +348,17 @@ def cos(operand, *args, var_info=None, **kwargs):
         return f'math.cos %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def sin(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def sin(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
 
         # Check scalar
-        op_type = var_info[operand]
+        op_type = V.kernel.var_info[operand]
         if op_type[0] == 1:
             operand = ops.broadcast(operand, 4)
             val = ops.sin(operand)
             result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
+            return result, V.kernel.var_info[result]
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -368,51 +369,51 @@ def sin(operand, *args, var_info=None, **kwargs):
         return f'math.sin %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def tan(operand, *args, var_info=None, **kwargs):
+    def tan(operand, *args, **kwargs):
         sin_res = ops.sin(operand)
         cos_res = ops.cos(operand)
         operand = ops.truediv(sin_res, cos_res)
-        return operand, var_info[operand]
+        return operand, V.kernel.var_info[operand]
 
     @staticmethod
-    def lgamma(operand, *args, var_info=None, **kwargs):
+    def lgamma(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def erf(operand, *args, var_info=None, **kwargs):
+    def erf(operand, *args, **kwargs):
         # Check scalar
-        op_type = var_info[operand]
+        op_type = V.kernel.var_info[operand]
         if op_type[0] == 1:
             operand = ops.broadcast(operand, 4)
             val = ops.erf(operand)
             result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
+            return result, V.kernel.var_info[result]
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f'math.erf %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def cosh(operand, *args, var_info=None, **kwargs):
+    def cosh(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def sinh(operand, *args, var_info=None, **kwargs):
+    def sinh(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def tanh(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def tanh(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
 
         # Check scalar
-        op_type = var_info[operand]
+        op_type = V.kernel.var_info[operand]
         if op_type[0] == 1:
             operand = ops.broadcast(operand, 4)
             val = ops.tanh(operand)
             result = ops.extractelement(val, 0)
-            return result, var_info[result]
-        op_type = var_info[operand]
+            return result, V.kernel.var_info[result]
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -423,80 +424,80 @@ def tanh(operand, *args, var_info=None, **kwargs):
         return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def acos(operand, *args, var_info=None, **kwargs):
+    def acos(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def acosh(operand, *args, var_info=None, **kwargs):
+    def acosh(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def asin(operand, *args, var_info=None, **kwargs):
+    def asin(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def asinh(operand, *args, var_info=None, **kwargs):
+    def asinh(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def atan2(operand1, operand2, *args, var_info=None, **kwargs):
+    def atan2(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def atan(operand, *args, var_info=None, **kwargs):
+    def atan(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def atanh(operand, *args, var_info=None, **kwargs):
+    def atanh(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def copysign(operand1, operand2, *args, var_info=None, **kwargs):
+    def copysign(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def erfc(operand, *args, var_info=None, **kwargs):
+    def erfc(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def erfinv(operand, *args, var_info=None, **kwargs):
+    def erfinv(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def frexp(operand, *args, var_info=None, **kwargs):
+    def frexp(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def hypot(operand1, operand2, *args, var_info=None, **kwargs):
+    def hypot(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def log10(operand, *args, var_info=None, **kwargs):
+    def log10(operand, *args, **kwargs):
         val_ln = ops.log(operand)
         
-        tile_size, dtype = var_info[val_ln]
+        tile_size, dtype = V.kernel.var_info[val_ln]
         inv_ln10 = 1/math.log(10)
         const_op = ops.constant(inv_ln10, dtype)
         
         # Multiply: ln(x) * (1/ln(10))
         result = ops.mul(val_ln, const_op)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def log2(operand, *args, var_info=None, **kwargs):
+    def log2(operand, *args, **kwargs):
         val_ln = ops.log(operand)
         
-        tile_size, dtype = var_info[val_ln]
+        tile_size, dtype = V.kernel.var_info[val_ln]
         inv_ln10 = 1/math.log(2)
         const_op = ops.constant(inv_ln10, dtype)
         
         # Multiply: ln(x) * (1/ln(10))
         result = ops.mul(val_ln, const_op)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def log(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def log(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -508,109 +509,107 @@ def log(operand, *args, var_info=None, **kwargs):
         return f'math.log %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def log1p(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def log1p(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         const_one = ops.constant(1, dtype)
 
-        # 3. 덧셈 연산: (x + 1)
-        # ops.add가 (result_ssa, result_info)를 반환한다고 가정
         val_add = ops.add(operand, const_one)
         result = ops.log(val_add)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def nextafter(operand1, operand2, *args, var_info=None, **kwargs):
+    def nextafter(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def logical_and(operand1, operand2, *args, var_info=None, **kwargs):
-        if var_info[operand1][1] != "i1":
+    def logical_and(operand1, operand2, *args, **kwargs):
+        if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
         
-        if var_info[operand2][1] != "i1":
+        if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.and_(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def logical_or(operand1, operand2, *args, var_info=None, **kwargs):
-        if var_info[operand1][1] != "i1":
+    def logical_or(operand1, operand2, *args, **kwargs):
+        if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
         
-        if var_info[operand2][1] != "i1":
+        if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.or_(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def logical_xor(operand1, operand2, *args, var_info=None, **kwargs):
-        if var_info[operand1][1] != "i1":
+    def logical_xor(operand1, operand2, *args, **kwargs):
+        if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
         
-        if var_info[operand2][1] != "i1":
+        if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.xor(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
     
     @staticmethod
-    def logical_not(operand, *args, var_info=None, **kwargs):
-        op_info = var_info[operand]
+    def logical_not(operand, *args, **kwargs):
+        op_info = V.kernel.var_info[operand]
         tile_size = op_info[0]
         dtype = op_info[1]
         
         zero_const = ops.constant(0, dtype)
         result = ops.eq(operand, zero_const)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def bitwise_and(operand1, operand2, *args, var_info=None, **kwargs):
+    def bitwise_and(operand1, operand2, *args, **kwargs):
         # Float check
-        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+        if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
             
         result = ops.and_(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def bitwise_not(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def bitwise_not(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         # Float check
-        if var_info[operand][1].startswith("f"):
+        if V.kernel.var_info[operand][1].startswith("f"):
             raise ValueError("Bitwise NOT not supported for floats")
         
         neg_one = ops.constant(-1, dtype)
         result = ops.xor(operand, neg_one) 
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def bitwise_or(operand1, operand2, *args, var_info=None, **kwargs):
+    def bitwise_or(operand1, operand2, *args, **kwargs):
         # Float check
-        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+        if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
             
         result = ops.or_(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def bitwise_xor(operand1, operand2, *args, var_info=None, **kwargs):
+    def bitwise_xor(operand1, operand2, *args, **kwargs):
                 # Float check
-        if var_info[operand1][1].startswith("f") or var_info[operand2][1].startswith("f"):
+        if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
             
         result = ops.xor(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def bitwise_left_shift(operand1, operand2, *args, var_info=None, **kwargs):
+    def bitwise_left_shift(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def bitwise_right_shift(operand1, operand2, *args, var_info=None, **kwargs):
+    def bitwise_right_shift(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def rsqrt(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def rsqrt(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -622,28 +621,28 @@ def rsqrt(operand, *args, var_info=None, **kwargs):
         return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def sigmoid(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def sigmoid(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
         one = ops.constant(1, dtype)
         return ops.truediv(one, ops.add(one, ops.exp(ops.neg(operand)))), [tile_size, dtype]
 
     @staticmethod
-    def fmod(operand1, operand2, *args, var_info=None, **kwargs):
+    def fmod(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def isinf(operand, *args, var_info=None, **kwargs):
+    def isinf(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def isnan(operand, *args, var_info=None, **kwargs):
+    def isnan(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def round(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def round(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
@@ -652,8 +651,8 @@ def round(operand, *args, var_info=None, **kwargs):
             return operand, [tile_size, dtype]
 
     @staticmethod
-    def floor(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def floor(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
@@ -662,12 +661,12 @@ def floor(operand, *args, var_info=None, **kwargs):
             return operand, [tile_size, dtype]
 
     @staticmethod
-    def sign(operand, *args, var_info=None, **kwargs):
+    def sign(operand, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def trunc(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def trunc(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
@@ -676,8 +675,8 @@ def trunc(operand, *args, var_info=None, **kwargs):
             return operand, [tile_size, dtype]
 
     @staticmethod
-    def ceil(operand, *args, var_info=None, **kwargs):
-        tile_size, dtype = var_info[operand]
+    def ceil(operand, *args, **kwargs):
+        tile_size, dtype = V.kernel.var_info[operand]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
@@ -687,8 +686,8 @@ def ceil(operand, *args, var_info=None, **kwargs):
 
     # Logical operations
     @staticmethod
-    def neg(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def neg(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -700,8 +699,8 @@ def neg(operand, *args, var_info=None, **kwargs):
         return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
 
     @staticmethod
-    def reciprocal(operand, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def reciprocal(operand, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
 
@@ -712,8 +711,8 @@ def reciprocal(operand, *args, var_info=None, **kwargs):
         return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype]
 
     @staticmethod
-    def eq(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def eq(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "oeq"
@@ -727,8 +726,8 @@ def eq(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def ne(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def ne(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "one"
@@ -742,8 +741,8 @@ def ne(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def lt(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def lt(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "olt"
@@ -757,8 +756,8 @@ def lt(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def gt(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def gt(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "ogt"
@@ -772,8 +771,8 @@ def gt(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def le(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def le(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "ole"
@@ -787,8 +786,8 @@ def le(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def ge(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def ge(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if ret_type[0] == "f":
             op_type = "arith.cmpf"
             attribute = "oge"
@@ -802,29 +801,29 @@ def ge(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
 
     @staticmethod
-    def add(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def add(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.add{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def sub(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def sub(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.sub{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def mul(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def mul(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.mul{ret_type[0]}'
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def pow(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def pow(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         # Type check & auto cast
         if ret_type.startswith("f"):
             operand1 = ops.to_dtype(operand1, "f32")
@@ -837,37 +836,37 @@ def pow(operand1, operand2, *args, var_info=None, **kwargs):
         return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type]
 
     @staticmethod
-    def and_(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def and_(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def or_(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def or_(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def xor(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def xor(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def lshift(operand1, operand2, *args, var_info=None, **kwargs):
+    def lshift(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def rshift(operand1, operand2, *args, var_info=None, **kwargs):
+    def rshift(operand1, operand2, *args, **kwargs):
         raise NotImplementedError
 
     @staticmethod
-    def truncdiv(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def truncdiv(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
 
         if ret_type.startswith("f"):
@@ -877,8 +876,8 @@ def truncdiv(operand1, operand2, *args, var_info=None, **kwargs):
         return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def floordiv(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def floordiv(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
 
         if ret_type.startswith("f"):
@@ -889,8 +888,8 @@ def floordiv(operand1, operand2, *args, var_info=None, **kwargs):
         return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def truediv(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def truediv(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
 
         if not ret_type.startswith("f"):
@@ -899,12 +898,12 @@ def truediv(operand1, operand2, *args, var_info=None, **kwargs):
         return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def int_truediv(operand1, operand2, *args, var_info=None, **kwargs):
+    def int_truediv(operand1, operand2, *args, **kwargs):
         """
         True division for Integers (Int -> Float).
         Promotes integers to floats, then performs floating-point division.
         """
-        tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+        tile_size, src_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         if not src_type.startswith("f"):
             target_float_type = "f32"
             operand1 = ops.to_dtype(operand1, target_float_type)
@@ -912,11 +911,11 @@ def int_truediv(operand1, operand2, *args, var_info=None, **kwargs):
             src_type = target_float_type
 
         result = ops.truediv(operand1, operand2)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     @staticmethod
-    def mod(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def mod(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         if ret_type[0] == "f":
             raise NotImplementedError("Not support remainder operation for floating point")
@@ -925,8 +924,8 @@ def mod(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def remainder(operand1, operand2, *args, var_info=None, **kwargs):
-        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2, var_info)
+    def remainder(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
 
         if ret_type.startswith("f"):
@@ -937,28 +936,28 @@ def remainder(operand1, operand2, *args, var_info=None, **kwargs):
         return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
 
     @staticmethod
-    def square(operand, *args, var_info=None, **kwargs):
+    def square(operand, *args, **kwargs):
         result = ops.mul(operand, operand)
-        return result, var_info[result]
+        return result, V.kernel.var_info[result]
 
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # PyTorchSim specific operations 
 
     @staticmethod
-    def alloc(size, src_type, *args, var_info=None, **kwargs):
+    def alloc(size, src_type, *args, **kwargs):
         return f"memref.alloc() : memref<{size}x{src_type}>", [size, src_type]
 
     @staticmethod
-    def extractelement(operand, idx, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def extractelement(operand, idx, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
         return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype]
 
     @staticmethod
-    def ext(operand, dtype, *args, var_info=None, **kwargs):
-        op_type = var_info[operand]
+    def ext(operand, dtype, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
         shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}"
         target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}"
         if op_type[0] == "f":
@@ -968,8 +967,8 @@ def ext(operand, dtype, *args, var_info=None, **kwargs):
         return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype]
 
     @staticmethod
-    def to_bool(operand, *args, var_info=None, **kwargs):
-        tile_size, ret_type = var_info[operand]
+    def to_bool(operand, *args, **kwargs):
+        tile_size, ret_type = V.kernel.var_info[operand]
         if ret_type == "i1":
             return operand, [tile_size, ret_type]
 
@@ -984,15 +983,15 @@ def step(size, dtype, *args, **kwargs):
         return f"vector.step : {index_shape}", [size, dtype]
 
     @staticmethod
-    def index_cast(operand, target_type, *args, var_info=None, **kwrags):
-        op_type = var_info[operand]
+    def index_cast(operand, target_type, *args, **kwrags):
+        op_type = V.kernel.var_info[operand]
         src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1]
         des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
         return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
 
     @staticmethod
-    def shape_cast(operand, src_shape, dst_shape, *args, var_info=None, **kwargs):
-        operand_type = var_info[operand]
+    def shape_cast(operand, src_shape, dst_shape, *args, **kwargs):
+        operand_type = V.kernel.var_info[operand]
         return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type
 
     @staticmethod
@@ -1008,7 +1007,7 @@ def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_nam
         return line, [red_size, type_name]
 
     @staticmethod
-    def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, var_info=None, **kwargs):
+    def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, **kwargs):
         if compute_vec_size == 1:
             vshape = f"{mlir_dtype}"
             operation = "affine.load"
@@ -1020,8 +1019,8 @@ def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, va
         return line, [compute_vec_size, mlir_dtype]
 
     @staticmethod
-    def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, var_info=None, **kwargs):
-        compute_vec_size, mlir_dtype = var_info[operand][0], var_info[operand][1]
+    def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kwargs):
+        compute_vec_size, mlir_dtype = V.kernel.var_info[operand][0], V.kernel.var_info[operand][1]
 
         if compute_vec_size == 1:
             vshape = f"{mlir_dtype}"
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 4cfe71bf..8f92554c 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -925,7 +925,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         _, operand_type = self.var_info[value]
         if mlir_dtype != operand_type:
-            value = ops.to_dtype(value, mlir_dtype, var_info=self.var_info)
+            value = ops.to_dtype(value, mlir_dtype)
         compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{self.compute_idx}"])
         # Generate vector load instruction
         buffer_name = name if not store_force else None

From 4bed31b4e48031ac4dacfeb4062180c34b166ca8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 07:52:40 +0000
Subject: [PATCH 055/194] [Fix] Set epilogue fusoin condition

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 35ccfee8..f5fadbc3 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -115,7 +115,7 @@ def can_fuse_horizontal(self, node1, node2):
             }
             # Buffers still required by the activation node (unmet) or read by it
             epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies }
-            has_depedency = bool(template_writes) and template_writes.issubset(epilogue_unmet)
+            has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes)
             if not has_depedency:
                 return False
 

From 758b5b379b5880c3214b5d2a3356f5603850d9f6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 07:53:39 +0000
Subject: [PATCH 056/194] [Fix] Support Identity indexing + Fix wrapper codegen

---
 .../mlir/mlir_codegen_backend.py              | 48 +++++++++++--------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 654099c1..72cd691e 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -21,7 +21,7 @@
     is_welford_reduction,
     sympy_product
 )
-from torch.utils._sympy.functions import ModularIndexing, FloorDiv
+from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Identity
 from PyTorchSimFrontend import extension_codecache
 from PyTorchSimFrontend import extension_config
 from . import mlir_common
@@ -198,26 +198,27 @@ def generate(self, is_inference):
         with contextlib.ExitStack() as stack:
             stack.enter_context(self.wrapper_call.indent())
             self.memory_plan_reuse()
-            for line in self.lines:
-                # Add buffer plan hook for dealloc
-                if isinstance(line, memory_planning.DeallocFromPoolLine):
-                    self.wrapper_call.writeline(f"sram_plan_postfix('{line.node.get_name()}', {line.node.get_name()})")
-                elif isinstance(line, str) and "del" in line:
-                    name = line.split(" ")[1]
-                    self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
-
-                if isinstance(line, wrapper.MemoryPlanningLine):
-                    line.codegen(self.wrapper_call)
-                elif isinstance(line, wrapper.KernelCallLine):
-                    self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args))
-                else:
-                    if isinstance(line, wrapper.WrapperLine):
+            with self.set_writeline(self.wrapper_call.writeline):
+                for line in self.lines:
+                    # Add buffer plan hook for dealloc
+                    if isinstance(line, memory_planning.DeallocFromPoolLine):
+                        self.wrapper_call.writeline(f"sram_plan_postfix('{line.node.get_name()}', {line.node.get_name()})")
+                    elif isinstance(line, str) and "del" in line:
+                        name = line.split(" ")[1]
+                        self.wrapper_call.writeline(f"sram_plan_postfix('{name}', {name})")
+
+                    if isinstance(line, wrapper.MemoryPlanningLine):
                         line.codegen(self.wrapper_call)
+                    elif isinstance(line, wrapper.KernelCallLine):
+                        self.wrapper_call.writeline(self.wrap_kernel_call(line.kernel_name, line.call_args))
                     else:
-                        self.wrapper_call.writeline(line)
-                # Add buffer plan hook for alloc
-                if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine):
-                    self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
+                        if isinstance(line, wrapper.WrapperLine):
+                            line.codegen(self.wrapper_call)
+                        else:
+                            self.wrapper_call.writeline(line)
+                    # Add buffer plan hook for alloc
+                    if isinstance(line, memory_planning.AllocFromPoolLine) or isinstance(line, wrapper.AllocateLine):
+                        self.wrapper_call.writeline(f"sram_plan_prefix('{line.node.get_name()}', {line.node.get_name()})")
             output_refs = self.get_output_refs()
             self.codegen_sram_plan_postfix(output_refs)
             self.mark_output_type()
@@ -334,6 +335,7 @@ def convert_index(self, expr, buffer):
             expr_str = expr_str.replace("//", " floordiv ")
         else:
             raise NotImplementedError("What is this case?")
+
         first_arg = expr.args[0]
         if len(first_arg.free_symbols) != 1:
             raise NotImplementedError("What is this case?")
@@ -356,6 +358,11 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com
         if len(expr.args) == 0 and len(indirect_dims) == 0:
             return expr
 
+        # Replace Identity arguments with Identity.args[0]
+        for arg in expr.args:
+            if isinstance(arg, Identity):
+                expr = expr.replace(arg, arg.args[0] if arg.args else arg)
+
         if len(expr.args) == 0:
             args = [expr]
         else:
@@ -677,9 +684,10 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
         # In case of index expr, dimension size should be divisible by tile size
         if not self.kernel_group.tile_desc.is_dim_dividable(self.ranges):
             new_tile_size = self.kernel_group.tile_desc.adjust_tile_to_divisible(self.ranges)
+            prior_tile_size, prior_ranges = self.kernel_group.tile_desc.get_tile_size(), self.ranges
             self.kernel_group.tile_desc.set_tile_size(new_tile_size)
             self.reset("recompile")
-            raise mlir_common.RecompileSignal(f"Index access (tile size {self.kernel_group.tile_desc.get_tile_size()} is not divisible by {self.ranges})")
+            raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})")
 
         tile_size = tile_desc.get_tile_size_per_lane()
         compute_vec_size = tile_desc.get_compute_vec_size()

From a7ab604788e84f2ddfef55cd46e370deac5bc44d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 07:54:31 +0000
Subject: [PATCH 057/194] [Fix] Keep contextvar after reset()

---
 PyTorchSimFrontend/mlir/mlir_common.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 7b6ee11c..3bbf3db7 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -609,9 +609,14 @@ def __init__(self, kernel_group, reason=None):
         self.recodegen = reason # spad overflow, tile size, vlane stride
         self.stop_autotune = False
 
-        # Context var for codegen
-        self.target_buffer_override = contextvars.ContextVar("Handler_compute_override", default=self.compute)
-        self.target_cse_override = contextvars.ContextVar("Handler_cse_override", default=self.cse)
+        # Context var for codegen - preserve existing ContextVar on reset to avoid Token mismatch
+        # Don't recreate if already exists (e.g., when reset() is called during active context manager)
+        if not hasattr(self, 'target_buffer_override'):
+            instance_id = id(self)
+            self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute)
+            self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse)
+        else:
+            pass
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:

From cd52f57713e2ec18439d28eae47d8f8346aaa4f9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 11:19:26 +0000
Subject: [PATCH 058/194] [Frontend] Add decompsition of default attetnion

---
 .../extension_device_op_overrides.py          |   4 +-
 PyTorchSimFrontend/mlir/mlir_decomposition.py | 146 ++++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |   1 +
 3 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_decomposition.py

diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice/extension_device_op_overrides.py
index 17439b95..27a47357 100644
--- a/PyTorchSimDevice/extension_device_op_overrides.py
+++ b/PyTorchSimDevice/extension_device_op_overrides.py
@@ -3,6 +3,7 @@
 from textwrap import dedent
 
 from torch._inductor.codegen.common import DeviceOpOverrides, register_device_op_overrides
+from torch._inductor.codegen.cpu_device_op_overrides import CpuDeviceOpOverrides
 
 class ExtensionDeviceOpOverrides(DeviceOpOverrides):
     def import_get_raw_stream_as(self, name: str) -> str:
@@ -22,4 +23,5 @@ def synchronize(self) -> str:
     def device_guard(self, device_idx: int) -> str:
         return "pass"
 
-register_device_op_overrides("npu", ExtensionDeviceOpOverrides())
\ No newline at end of file
+register_device_op_overrides("npu", ExtensionDeviceOpOverrides())
+register_device_op_overrides("cpu", CpuDeviceOpOverrides())
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py
new file mode 100644
index 00000000..33389a91
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py
@@ -0,0 +1,146 @@
+import math
+import torch
+import torch.nn.functional as F
+from torch._inductor.decomposition import register_decomposition
+
+aten = torch.ops.aten
+
+@register_decomposition(aten._native_multi_head_attention.default)
+def decompose_native_multi_head_attention(
+    query,
+    key,
+    value,
+    embed_dim: int,
+    num_heads: int,
+    qkv_weight,
+    qkv_bias,
+    proj_weight,
+    proj_bias,
+    mask=None,
+    need_weights: bool = False,
+):
+    """
+    Decompose _native_multi_head_attention into scaled_dot_product_attention operations.
+
+    Based on F.scaled_dot_product_attention and nn.MultiheadAttention implementation:
+    1. QKV projection (if needed - but query/key/value may already be projected)
+    2. Reshape to multi-head format
+    3. Scaled dot product: Q @ K^T / sqrt(head_dim)
+    4. Softmax
+    5. Attention @ V
+    6. Reshape back and output projection
+    """
+    head_dim = embed_dim // num_heads
+    scale_factor = 1.0 / math.sqrt(head_dim)
+
+    # Get input shapes - assuming [batch, seq_len, embed_dim] format
+    query_shape = query.shape
+    if len(query_shape) == 3:
+        # [batch, seq_len, embed_dim] format
+        batch_size = query_shape[0]
+        seq_len = query_shape[1]
+    elif len(query_shape) == 2:
+        # [seq_len, embed_dim] -> add batch dimension
+        batch_size = 1
+        seq_len = query_shape[0]
+        query = query.unsqueeze(0)  # [1, seq_len, embed_dim]
+        key = key.unsqueeze(0)
+        value = value.unsqueeze(0)
+    else:
+        # Fallback: assume first dim is batch, second is seq_len
+        batch_size = query_shape[0] if len(query_shape) > 0 else 1
+        seq_len = query_shape[1] if len(query_shape) > 1 else query_shape[0]
+
+    # Step 1: QKV projection (if query/key/value are not already projected)
+    # In many cases, query/key/value are already projected, so we check if qkv_weight is used
+    # For now, assume they might need projection
+    # Note: In practice, _native_multi_head_attention often receives already projected inputs
+
+    # Reshape for projection: [batch, seq_len, embed_dim] -> [batch*seq_len, embed_dim]
+    if len(query.shape) == 3:
+        query_flat = query.view(-1, embed_dim)
+        key_flat = key.view(-1, embed_dim)
+        value_flat = value.view(-1, embed_dim)
+    else:
+        query_flat = query
+        key_flat = key
+        value_flat = value
+
+    # QKV projection using qkv_weight and qkv_bias
+    # qkv_weight shape: [3*embed_dim, embed_dim] -> split into 3 parts
+    # Split qkv_weight into Q, K, V weights
+    qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0)
+    if qkv_bias is not None:
+        # qkv_bias shape: [3*embed_dim] -> split into 3 parts
+        qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0)
+    else:
+        qkv_bias_q = qkv_bias_k = qkv_bias_v = None
+
+    # Project Q, K, V
+    q = torch.nn.functional.linear(query_flat, qkv_weight_q, qkv_bias_q)
+    k = torch.nn.functional.linear(key_flat, qkv_weight_k, qkv_bias_k)
+    v = torch.nn.functional.linear(value_flat, qkv_weight_v, qkv_bias_v)
+
+    # Reshape back: [batch*seq_len, embed_dim] -> [batch, seq_len, embed_dim]
+    q = q.view(batch_size, seq_len, embed_dim)
+    k = k.view(batch_size, seq_len, embed_dim)
+    v = v.view(batch_size, seq_len, embed_dim)
+
+    # Step 2: Reshape to multi-head format
+    # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim]
+    q = q.view(batch_size, seq_len, num_heads, head_dim)
+    k = k.view(batch_size, seq_len, num_heads, head_dim)
+    v = v.view(batch_size, seq_len, num_heads, head_dim)
+
+    # Transpose to [batch, num_heads, seq_len, head_dim] for bmm
+    # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim]
+    q = q.view(batch_size, seq_len, num_heads, head_dim)
+    k = k.view(batch_size, seq_len, num_heads, head_dim)
+    v = v.view(batch_size, seq_len, num_heads, head_dim)
+
+    # Transpose to [batch, num_heads, seq_len, head_dim] for bmm
+    q = q.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
+    k = k.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
+    v = v.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
+
+    # Step 3: Scaled dot product attention
+    # Scale Q
+    q_scaled = q * scale_factor
+
+    # Q @ K^T: [batch, num_heads, seq_len, head_dim] @ [batch, num_heads, head_dim, seq_len]
+    # -> [batch, num_heads, seq_len, seq_len]
+    k_transposed = k.transpose(-2, -1)  # [batch, num_heads, head_dim, seq_len]
+    scores = torch.matmul(q_scaled, k_transposed)  # [batch, num_heads, seq_len, seq_len]
+
+    # Step 4: Apply mask if provided
+    if mask is not None:
+        scores = scores + mask
+
+    # Step 5: Softmax along the last dimension (seq_len dimension)
+    # Stable softmax: subtract max, exp, divide by sum
+    scores_max = scores.amax(dim=-1, keepdim=True)  # [batch, num_heads, seq_len, 1]
+    scores_shifted = scores - scores_max
+    scores_exp = scores_shifted.exp()
+    scores_sum = scores_exp.sum(dim=-1, keepdim=True)  # [batch, num_heads, seq_len, 1]
+    attn_weights = scores_exp / scores_sum  # [batch, num_heads, seq_len, seq_len]
+
+    # Step 6: Attention @ V
+    # [batch, num_heads, seq_len, seq_len] @ [batch, num_heads, seq_len, head_dim]
+    # -> [batch, num_heads, seq_len, head_dim]
+    attn_output = torch.matmul(attn_weights, v)
+
+    # Step 7: Reshape back to [batch, seq_len, embed_dim]
+    attn_output = attn_output.transpose(1, 2)  # [batch, seq_len, num_heads, head_dim]
+    attn_output = attn_output.contiguous().view(batch_size, seq_len, embed_dim)
+
+    # Step 8: Output projection
+    attn_output_flat = attn_output.view(-1, embed_dim)
+    output = torch.nn.functional.linear(attn_output_flat, proj_weight, proj_bias)
+    output = output.view(batch_size, seq_len, embed_dim)
+
+    if need_weights:
+        # Return attention weights: [batch, num_heads, seq_len, seq_len] -> [batch, seq_len, seq_len]
+        attn_weights_mean = attn_weights.mean(dim=1)  # Average over heads
+        return output, attn_weights_mean
+    else:
+        return (output, None)
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f5fadbc3..bfcda258 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -18,6 +18,7 @@
 
 from . import mlir_common
 from . import mlir_lowering # DO NOT REMOVE THIS LINE, it is used for lowering
+from . import mlir_decomposition # DO NOT REMOVE THIS LINE, it is used for decomposition
 
 class MLIRScheduling(BaseScheduling):
     count = 0

From 08e0c8be825a8c41633ca02da5972c1f8089d053 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 12:54:16 +0000
Subject: [PATCH 059/194] [Fix] Add missing case

---
 .../mlir/mlir_codegen_backend.py              |  2 ++
 PyTorchSimFrontend/mlir/mlir_common.py        | 25 +++++++++----------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 72cd691e..27fdf757 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -360,6 +360,8 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com
 
         # Replace Identity arguments with Identity.args[0]
         for arg in expr.args:
+            if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity):
+                expr = expr.replace(arg.args[1], arg.args[1].args[0])
             if isinstance(arg, Identity):
                 expr = expr.replace(arg, arg.args[0] if arg.args else arg)
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 3bbf3db7..d96eb452 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -609,14 +609,9 @@ def __init__(self, kernel_group, reason=None):
         self.recodegen = reason # spad overflow, tile size, vlane stride
         self.stop_autotune = False
 
-        # Context var for codegen - preserve existing ContextVar on reset to avoid Token mismatch
-        # Don't recreate if already exists (e.g., when reset() is called during active context manager)
-        if not hasattr(self, 'target_buffer_override'):
-            instance_id = id(self)
-            self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute)
-            self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse)
-        else:
-            pass
+        instance_id = id(self)
+        self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute)
+        self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse)
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
@@ -697,7 +692,9 @@ def extract_dividers(self, implicit_ops):
             }
             new_index = operand.index.subs(subs_map)
             for arg in new_index.args:
-                if len(arg.free_symbols) != 1:
+                if arg.is_number:
+                    continue
+                if len(arg.free_symbols) > 1:
                     raise NotImplementedError("Not supporting this view operation...!")
                 if arg.is_Mul and arg.args[0].is_number:
                     arg = arg.args[1]
@@ -852,18 +849,20 @@ def rename_indexing(self, index) -> sympy.Expr:
 
     @contextmanager
     def override_buffer_cse(self, *, buffer=None, cse=None):
+        buffer_override = self.target_buffer_override
+        cse_override = self.target_cse_override
         target_buffer = target_cse = None
         try:
             if buffer is not None:
-                target_buffer = self.target_buffer_override.set(buffer)
+                target_buffer = buffer_override.set(buffer)
             if cse is not None:
-                target_cse = self.target_cse_override.set(cse)
+                target_cse = cse_override.set(cse)
             yield self
         finally:
             if target_cse is not None:
-                self.target_cse_override.reset(target_cse)
+                cse_override.reset(target_cse)
             if target_buffer is not None:
-                self.target_buffer_override.reset(target_buffer)
+                buffer_override.reset(target_buffer)
 
     def __enter__(self):
         class CSEProxy:

From 1d1508acc3be5623c0a3672b03e3c63e9d664414 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 8 Jan 2026 12:54:47 +0000
Subject: [PATCH 060/194] [Test] Add GQA test file

---
 PyTorchSimFrontend/mlir/mlir_decomposition.py |  61 +++-
 tests/test_gqa.py                             | 335 ++++++++++++++++++
 2 files changed, 377 insertions(+), 19 deletions(-)
 create mode 100644 tests/test_gqa.py

diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py
index 33389a91..141fa9e4 100644
--- a/PyTorchSimFrontend/mlir/mlir_decomposition.py
+++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py
@@ -67,14 +67,37 @@ def decompose_native_multi_head_attention(
         value_flat = value
 
     # QKV projection using qkv_weight and qkv_bias
-    # qkv_weight shape: [3*embed_dim, embed_dim] -> split into 3 parts
-    # Split qkv_weight into Q, K, V weights
-    qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0)
-    if qkv_bias is not None:
-        # qkv_bias shape: [3*embed_dim] -> split into 3 parts
-        qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0)
+    # Check if GQA (Grouped Query Attention) is used
+    # Standard MHA: qkv_weight shape = [3*embed_dim, embed_dim]
+    # GQA: qkv_weight shape = [embed_dim + 2*kv_embed_dim, embed_dim] where kv_embed_dim < embed_dim
+    qkv_weight_total = qkv_weight.shape[0]
+
+    # Determine if GQA: if qkv_weight is not exactly 3*embed_dim, it might be GQA
+    if qkv_weight_total == 3 * embed_dim:
+        # Standard MHA: split equally
+        qkv_weight_q, qkv_weight_k, qkv_weight_v = torch.split(qkv_weight, embed_dim, dim=0)
+        if qkv_bias is not None:
+            qkv_bias_q, qkv_bias_k, qkv_bias_v = torch.split(qkv_bias, embed_dim, dim=0)
+        else:
+            qkv_bias_q = qkv_bias_k = qkv_bias_v = None
+        kv_embed_dim = embed_dim
+        kv_heads = num_heads
     else:
-        qkv_bias_q = qkv_bias_k = qkv_bias_v = None
+        # GQA: Q has embed_dim, K and V share the rest
+        # Assume Q = embed_dim, K = V = (qkv_weight_total - embed_dim) / 2
+        q_dim = embed_dim
+        kv_dim = (qkv_weight_total - embed_dim) // 2
+        qkv_weight_q = qkv_weight[:q_dim]
+        qkv_weight_k = qkv_weight[q_dim:q_dim + kv_dim]
+        qkv_weight_v = qkv_weight[q_dim + kv_dim:]
+        if qkv_bias is not None:
+            qkv_bias_q = qkv_bias[:q_dim]
+            qkv_bias_k = qkv_bias[q_dim:q_dim + kv_dim]
+            qkv_bias_v = qkv_bias[q_dim + kv_dim:]
+        else:
+            qkv_bias_q = qkv_bias_k = qkv_bias_v = None
+        kv_embed_dim = kv_dim
+        kv_heads = kv_embed_dim // head_dim  # Number of KV heads
 
     # Project Q, K, V
     q = torch.nn.functional.linear(query_flat, qkv_weight_q, qkv_bias_q)
@@ -83,25 +106,25 @@ def decompose_native_multi_head_attention(
 
     # Reshape back: [batch*seq_len, embed_dim] -> [batch, seq_len, embed_dim]
     q = q.view(batch_size, seq_len, embed_dim)
-    k = k.view(batch_size, seq_len, embed_dim)
-    v = v.view(batch_size, seq_len, embed_dim)
+    k = k.view(batch_size, seq_len, kv_embed_dim)
+    v = v.view(batch_size, seq_len, kv_embed_dim)
 
     # Step 2: Reshape to multi-head format
     # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim]
     q = q.view(batch_size, seq_len, num_heads, head_dim)
-    k = k.view(batch_size, seq_len, num_heads, head_dim)
-    v = v.view(batch_size, seq_len, num_heads, head_dim)
-
-    # Transpose to [batch, num_heads, seq_len, head_dim] for bmm
-    # [batch, seq_len, embed_dim] -> [batch, seq_len, num_heads, head_dim]
-    q = q.view(batch_size, seq_len, num_heads, head_dim)
-    k = k.view(batch_size, seq_len, num_heads, head_dim)
-    v = v.view(batch_size, seq_len, num_heads, head_dim)
+    k = k.view(batch_size, seq_len, kv_heads, head_dim)
+    v = v.view(batch_size, seq_len, kv_heads, head_dim)
 
     # Transpose to [batch, num_heads, seq_len, head_dim] for bmm
     q = q.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
-    k = k.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
-    v = v.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
+    k = k.transpose(1, 2)  # [batch, kv_heads, seq_len, head_dim]
+    v = v.transpose(1, 2)  # [batch, kv_heads, seq_len, head_dim]
+
+    # GQA: If key/value have fewer heads, repeat them to match query heads
+    if kv_heads < num_heads:
+        repeat_factor = num_heads // kv_heads
+        k = k.repeat_interleave(repeat_factor, dim=1)  # [batch, num_heads, seq_len, head_dim]
+        v = v.repeat_interleave(repeat_factor, dim=1)  # [batch, num_heads, seq_len, head_dim]
 
     # Step 3: Scaled dot product attention
     # Scale Q
diff --git a/tests/test_gqa.py b/tests/test_gqa.py
new file mode 100644
index 00000000..c5f2f6f6
--- /dev/null
+++ b/tests/test_gqa.py
@@ -0,0 +1,335 @@
+import sys
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch._dynamo
+import argparse
+
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+
+class GQAMultiheadAttention(nn.Module):
+    """
+    Grouped Query Attention (GQA) implementation.
+    Query has num_heads, but key/value have num_kv_heads (num_kv_heads < num_heads).
+    """
+    def __init__(self, embed_dim, num_heads, num_kv_heads=None, head_dim=None, bias=True, dropout=0.0):
+        super().__init__()
+        assert embed_dim % num_heads == 0
+        if head_dim is None:
+            head_dim = embed_dim // num_heads
+        assert embed_dim == num_heads * head_dim
+        
+        # If num_kv_heads is not specified, use num_heads (standard MHA)
+        if num_kv_heads is None:
+            num_kv_heads = num_heads
+        
+        assert num_kv_heads <= num_heads
+        assert embed_dim % num_kv_heads == 0
+        
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        self.dropout = dropout
+        
+        # QKV projection: Q has embed_dim, K and V have kv_embed_dim each
+        kv_embed_dim = num_kv_heads * head_dim
+        total_qkv_dim = embed_dim + 2 * kv_embed_dim
+        
+        self.qkv_proj = nn.Linear(embed_dim, total_qkv_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        
+    def forward(self, query, key=None, value=None, attn_mask=None, need_weights=False):
+        """
+        Args:
+            query: [batch, seq_len, embed_dim] or [seq_len, batch, embed_dim]
+            key: optional, same shape as query
+            value: optional, same shape as query
+            attn_mask: optional attention mask
+            need_weights: whether to return attention weights
+        """
+        # For compatibility with nn.MultiheadAttention API
+        if key is None:
+            key = query
+        if value is None:
+            value = query
+        
+        # Handle batch_first vs batch_second
+        if query.dim() == 3:
+            batch_first = True
+            batch_size, seq_len, _ = query.shape
+        else:
+            batch_first = False
+            seq_len, batch_size, _ = query.shape
+            query = query.transpose(0, 1)
+            key = key.transpose(0, 1)
+            value = value.transpose(0, 1)
+        
+        # Project QKV
+        # Use query for QKV projection (standard MHA/GQA pattern)
+        qkv = self.qkv_proj(query)  # [batch, seq_len, total_qkv_dim]
+        
+        # Split into Q, K, V
+        kv_embed_dim = self.num_kv_heads * self.head_dim
+        q = qkv[:, :, :self.embed_dim]  # [batch, seq_len, embed_dim]
+        k = qkv[:, :, self.embed_dim:self.embed_dim + kv_embed_dim]  # [batch, seq_len, kv_embed_dim]
+        v = qkv[:, :, self.embed_dim + kv_embed_dim:]  # [batch, seq_len, kv_embed_dim]
+        
+        # Reshape to multi-head format
+        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)  # [batch, seq_len, num_heads, head_dim]
+        k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)  # [batch, seq_len, num_kv_heads, head_dim]
+        v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)  # [batch, seq_len, num_kv_heads, head_dim]
+        
+        # Transpose for attention: [batch, num_heads, seq_len, head_dim]
+        q = q.transpose(1, 2)  # [batch, num_heads, seq_len, head_dim]
+        k = k.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]
+        v = v.transpose(1, 2)  # [batch, num_kv_heads, seq_len, head_dim]
+        
+        # Scaled dot product attention with GQA support
+        # enable_gqa=True allows different number of heads for Q vs K/V
+        attn_output = F.scaled_dot_product_attention(
+            q, k, v,
+            attn_mask=attn_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=False,
+            enable_gqa=(self.num_kv_heads < self.num_heads)
+        )  # [batch, num_heads, seq_len, head_dim]
+        
+        # Reshape back: [batch, num_heads, seq_len, head_dim] -> [batch, seq_len, embed_dim]
+        attn_output = attn_output.transpose(1, 2)  # [batch, seq_len, num_heads, head_dim]
+        attn_output = attn_output.contiguous().view(batch_size, seq_len, self.embed_dim)
+        
+        # Output projection
+        output = self.out_proj(attn_output)  # [batch, seq_len, embed_dim]
+        
+        if not batch_first:
+            output = output.transpose(0, 1)  # [seq_len, batch, embed_dim]
+        
+        if need_weights:
+            # Compute attention weights for return
+            # This is simplified - in practice you'd want the actual attention weights
+            attn_weights = None
+            return output, attn_weights
+        else:
+            return output
+
+
+def test_gqa_attention(device, batch=1, seq_len=32, embed_dim=768, num_heads=12, num_kv_heads=4):
+    """
+    Test Grouped Query Attention (GQA) where num_kv_heads < num_heads.
+    
+    Args:
+        device: target device
+        batch: batch size
+        seq_len: sequence length
+        embed_dim: embedding dimension
+        num_heads: number of query heads
+        num_kv_heads: number of key/value heads (should be <= num_heads)
+    """
+    print(f"Testing GQA Attention (batch={batch}, seq_len={seq_len}, embed_dim={embed_dim}, "
+          f"num_heads={num_heads}, num_kv_heads={num_kv_heads})")
+    
+    # Create GQA model
+    gqa = GQAMultiheadAttention(
+        embed_dim=embed_dim,
+        num_heads=num_heads,
+        num_kv_heads=num_kv_heads,
+        bias=True,
+        dropout=0.0
+    ).eval()
+    
+    # Initialize weights
+    torch.nn.init.normal_(gqa.qkv_proj.weight, mean=0.0, std=0.02)
+    torch.nn.init.normal_(gqa.qkv_proj.bias, mean=0.0, std=0.02)
+    torch.nn.init.normal_(gqa.out_proj.weight, mean=0.0, std=0.02)
+    torch.nn.init.normal_(gqa.out_proj.bias, mean=0.0, std=0.02)
+    
+    # Create input
+    x = torch.randn(batch, seq_len, embed_dim)
+    query = x.clone()
+    key = x.clone()
+    value = x.clone()
+    
+    # Run on custom device
+    gqa_device = gqa.to(device)
+    q1, k1, v1 = query.to(device), key.to(device), value.to(device)
+    
+    compiled_gqa = torch.compile(gqa_device, dynamic=False)
+    with torch.no_grad():
+        out_device = compiled_gqa(q1, k1, v1)
+    
+    # Run on CPU
+    gqa_cpu = gqa.cpu()
+    q2, k2, v2 = query.cpu(), key.cpu(), value.cpu()
+    with torch.no_grad():
+        out_cpu = gqa_cpu(q2, k2, v2)
+    
+    test_result("GQA Attention", out_device, out_cpu)
+    print("Max diff > ", torch.max(torch.abs(out_device.cpu() - out_cpu)))
+    print("GQA Attention Simulation Done")
+
+
+def test_standard_mha_via_gqa(device, batch=1, seq_len=32, embed_dim=768, num_heads=12):
+    """
+    Test standard Multi-Head Attention using GQA with num_kv_heads == num_heads.
+    This should behave the same as standard MHA.
+    """
+    print(f"Testing Standard MHA via GQA (batch={batch}, seq_len={seq_len}, "
+          f"embed_dim={embed_dim}, num_heads={num_heads})")
+    
+    test_gqa_attention(device, batch, seq_len, embed_dim, num_heads, num_kv_heads=num_heads)
+
+
+def test_repeat_interleave_compilation(device, batch=1, seq_len=32, embed_dim=768, num_heads=12, num_kv_heads=4):
+    """
+    Test that repeat_interleave operation compiles and works correctly using scaled_dot_product_attention implementation.
+    
+    This test uses the exact implementation from F.scaled_dot_product_attention to verify
+    that repeat_interleave works correctly when enable_gqa=True.
+    
+    Args:
+        device: target device
+        batch: batch size
+        seq_len: sequence length
+        embed_dim: embedding dimension
+        num_heads: number of query heads
+        num_kv_heads: number of key/value heads (should be < num_heads)
+    """
+    import math
+    
+    print(f"Testing repeat_interleave compilation using scaled_dot_product_attention implementation "
+          f"(batch={batch}, seq_len={seq_len}, embed_dim={embed_dim}, "
+          f"num_heads={num_heads}, num_kv_heads={num_kv_heads})")
+    
+    head_dim = embed_dim // num_heads
+    assert num_kv_heads < num_heads, "num_kv_heads must be less than num_heads for GQA"
+    
+    # Create Q, K, V tensors
+    # Q: [batch, num_heads, seq_len, head_dim]
+    # K, V: [batch, num_kv_heads, seq_len, head_dim]
+    q = torch.randn(batch, num_heads, seq_len, head_dim)
+    k = torch.randn(batch, num_kv_heads, seq_len, head_dim)
+    v = torch.randn(batch, num_kv_heads, seq_len, head_dim)
+    
+    # Move to device
+    q_device = q.to(device)
+    k_device = k.to(device)
+    v_device = v.to(device)
+    
+    # Implementation from F.scaled_dot_product_attention
+    def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
+            is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
+        L, S = query.size(-2), key.size(-2)
+        scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+        attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
+        if is_causal:
+            assert attn_mask is None
+            temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+            attn_bias.to(query.dtype)
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+            else:
+                attn_bias = attn_mask + attn_bias
+
+        if enable_gqa:
+            key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
+            value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
+
+        attn_weight = query @ key.transpose(-2, -1) * scale_factor
+        attn_weight += attn_bias
+        attn_weight = torch.softmax(attn_weight, dim=-1)
+        return attn_weight, value, attn_weight @ value
+    
+    # Compile the function
+    compiled_attn = torch.compile(scaled_dot_product_attention, dynamic=False)
+    
+    # Run on custom device with enable_gqa=True
+    with torch.no_grad():
+        output_device = compiled_attn(q_device, k_device, v_device, 
+                                      attn_mask=None, dropout_p=0.0, 
+                                      is_causal=False, scale=None, enable_gqa=True)
+    
+    # Run on CPU for comparison
+    q_cpu = q.cpu()
+    k_cpu = k.cpu()
+    v_cpu = v.cpu()
+    with torch.no_grad():
+        output_cpu = scaled_dot_product_attention(q_cpu, k_cpu, v_cpu,
+                                                  attn_mask=None, dropout_p=0.0,
+                                                  is_causal=False, scale=None, enable_gqa=True)
+    
+    # Compare results
+    test_result("repeat_interleave in scaled_dot_product_attention", output_device[0], output_cpu[0])
+    print("Max diff > ", torch.max(torch.abs(output_device[0].cpu() - output_cpu[0])))
+    test_result("repeat_interleave in scaled_dot_product_attention", output_device[1], output_cpu[1])
+    print("Max diff > ", torch.max(torch.abs(output_device[1].cpu() - output_cpu[1])))
+    test_result("repeat_interleave in scaled_dot_product_attention", output_device[2], output_cpu[2])
+    print("Max diff > ", torch.max(torch.abs(output_device[2].cpu() - output_cpu[2])))
+    print("repeat_interleave compilation test Done")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", type=str, default="npu", help="Device to use")
+    parser.add_argument("--batch", type=int, default=1, help="Batch size")
+    parser.add_argument("--seq_len", type=int, default=32, help="Sequence length")
+    parser.add_argument("--embed_dim", type=int, default=768, help="Embedding dimension")
+    parser.add_argument("--num_heads", type=int, default=8, help="Number of query heads")
+    parser.add_argument("--num_kv_heads", type=int, default=4, help="Number of key/value heads")
+    parser.add_argument("--test_standard", action="store_true", help="Also test standard MHA via GQA")
+    parser.add_argument("--test_repeat_interleave", action="store_true", help="Test repeat_interleave compilation")
+    
+    args = parser.parse_args()
+
+    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+    from Scheduler.scheduler import PyTorchSimRunner
+    module = PyTorchSimRunner.setup_device()
+    device = module.custom_device()
+    
+    test_repeat_interleave_compilation(
+        device=device,
+        batch=args.batch,
+        seq_len=args.seq_len,
+        embed_dim=args.embed_dim,
+        num_heads=args.num_heads,
+        num_kv_heads=args.num_kv_heads
+    )
+    
+    # Test GQA
+    test_gqa_attention(
+        device=device,
+        batch=args.batch,
+        seq_len=args.seq_len,
+        embed_dim=args.embed_dim,
+        num_heads=args.num_heads,
+        num_kv_heads=args.num_kv_heads
+    )
+    
+    # Optionally test standard MHA via GQA
+    # if args.test_standard:
+    #    test_standard_mha_via_gqa(
+    #        device=args.device,
+    #        batch=args.batch,
+    #        seq_len=args.seq_len,
+    #        embed_dim=args.embed_dim,
+    #        num_heads=args.num_heads
+    #    )

From 862ba443c81b910c66bb2dd80b151571a11add8d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 9 Jan 2026 09:55:53 +0000
Subject: [PATCH 061/194] [Fix+Log] Change logging system + Fix meta_code
 interface

---
 PyTorchSimFrontend/extension_codecache.py     |  17 +-
 PyTorchSimFrontend/extension_config.py        |  42 ++++-
 .../mlir/mlir_codegen_backend.py              |  51 +++---
 PyTorchSimFrontend/mlir/mlir_ops.py           |   3 +
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |   3 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |  30 ++--
 Scheduler/scheduler.py                        |  21 ++-
 Simulator/simulator.py                        | 151 +++++++++---------
 8 files changed, 189 insertions(+), 129 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index ef8c63e6..5066d214 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -10,6 +10,9 @@
 from PyTorchSimFrontend import extension_config
 from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator
 
+# Configure logger for extension_codecache module (WARNING level by default)
+logger = extension_config.setup_logger()
+
 LOCK_TIMEOUT = 600
 
 def hash_prefix(hash_value):
@@ -166,8 +169,8 @@ def load(cls, source_code,
                     subprocess.check_call(translate_cmd)
                     subprocess.check_call(llc_cmd)
                 except subprocess.CalledProcessError as e:
-                    print("Command failed with exit code", e.returncode)
-                    print("Error output:", e.output)
+                    logger.error(f"Command failed with exit code {e.returncode}")
+                    logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
                     assert(0)
 
                 val_llvm_caller = MLIRKernelCallerCodeGen(extension_config.pytorchsim_functional_mode, arg_attributes)
@@ -179,8 +182,10 @@ def load(cls, source_code,
                 spad_size =  val_llvm_caller.get_spad_size(target)
                 spad_usage = stack_size + spad_size # Spad usage per lane
                 if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
-                    print(f"[Warning] Scratchpad size exceeded: required {spad_usage} bytes, "
-                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available.")
+                    logger.debug(
+                        f"Scratchpad size exceeded: required {spad_usage} bytes, "
+                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available."
+                    )
                     raise SpadOverflowError()
 
         # Launch tile graph generator
@@ -197,8 +202,8 @@ def load(cls, source_code,
                 subprocess.check_call(gem5_translate_cmd)
                 subprocess.check_call(gem5_llc_cmd)
             except subprocess.CalledProcessError as e:
-                print("Command failed with exit code", e.returncode)
-                print("Error output:", e.output)
+                logger.error(f"Command failed with exit code {e.returncode}")
+                logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
                 assert(0)
 
             if not extension_config.pytorchsim_timing_mode:
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 2b1b3102..b0bcac7f 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -2,6 +2,7 @@
 import sys
 import importlib
 import yaml
+import logging
 
 CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt")
@@ -134,4 +135,43 @@ def load_plan_from_module(module_path):
 
 CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0))
 
-CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0))
\ No newline at end of file
+CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0))
+
+
+def setup_logger(name=None, level=None):
+    """
+    Setup a logger with consistent formatting across all modules.
+
+    Args:
+        name: Logger name (default: __name__ of calling module)
+        level: Logging level (default: DEBUG if CONFIG_DEBUG_MODE else INFO)
+
+    Returns:
+        Logger instance
+    """
+    if name is None:
+        import inspect
+        # Get the calling module's name
+        frame = inspect.currentframe().f_back
+        name = frame.f_globals.get('__name__', 'PyTorchSim')
+
+    # Convert logger name to lowercase
+    name = name.lower()
+    logger = logging.getLogger(name)
+
+    # Only configure if not already configured (avoid duplicate handlers)
+    if not logger.handlers:
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter(
+            fmt='[%(asctime)s.%(msecs)03d] [%(levelname)s] [%(name)s] %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+        # Set log level
+        if level is None:
+            level = logging.DEBUG if CONFIG_DEBUG_MODE else logging.INFO
+        logger.setLevel(level)
+
+    return logger
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 27fdf757..d0c8f815 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -2,7 +2,6 @@
 import sympy
 import re
 import os
-import math
 from functools import reduce
 from operator import mul
 import torch
@@ -29,6 +28,9 @@
 from .mlir_ops import ExtensionOverrides
 from PyTorchSimFrontend.mlir.mlir_autotune import MLIRBenchmarkRequest
 
+# Configure logger for mlir_codegen_backend module
+logger = extension_config.setup_logger()
+
 def reduction_init(reduction_type, dtype):
     if dtype in cpp.DTYPE_LOWP_FP:
         # Since load promotes all half-precision inputs to float, the initial
@@ -95,11 +97,14 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
-                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE
+                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE, setup_logger
                 from Simulator.simulator import TOGSimulator
                 from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer
                 from torch._inductor.select_algorithm import extern_kernels
 
+                # Configure logger for generated wrapper code
+                _logger = setup_logger("PyTorchSimFrontend.mlir.generated_wrapper")
+
                 aten = torch.ops.aten
                 inductor_ops = torch.ops.inductor
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
@@ -108,7 +113,7 @@ def write_header(self):
                 custom_async_compile = CustomAsyncCompile()
                 async_compile = AsyncCompile()
                 os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
-                print(f\'Wrapper Codegen Path = {{__file__}}\')
+                _logger.info(f'Wrapper Codegen Path = {{__file__}}')
             """
         )
         self.header.splice(
@@ -909,15 +914,14 @@ def make_choices(self, nodes, kernel_name):
 
             # Try initial tile size
             self.reset(None)
-            src_code = super().codegen_nodes(nodes, kernel_name)
+            src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
             current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size())
             search_space.add(current_tile_sz)
 
-            if extension_config.CONFIG_DEBUG_MODE:
-                print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}")
+            logger.debug(f"Auto-tune: Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}")
             self._prepare_simulator_headers(src_code)
             bench_runner = self.run_bench(nodes, kernel_name, src_code)
-            choices.append((bench_runner, src_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride))
+            choices.append((bench_runner, src_code, meta_code, current_tile_sz, self.kernel_group.tile_desc.vmap.vlane_stride))
 
             while prevent_infinite_loop < 10 and candidate_axes:
                 for axis in list(candidate_axes):
@@ -939,7 +943,7 @@ def make_choices(self, nodes, kernel_name):
                         continue
 
                     self.reset(None)
-                    src_code = super().codegen_nodes(nodes, kernel_name)
+                    src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
                     current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size())
 
                     # FIXME. How to intergrate this constraint to tile system?
@@ -956,11 +960,10 @@ def make_choices(self, nodes, kernel_name):
 
                     # Add this choice
                     search_space.add(current_tile_sz)
-                    if extension_config.CONFIG_DEBUG_MODE:
-                        print(f"[Auto-tune] Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}")
+                    logger.debug(f"Auto-tune: Trying tile size: {list(current_tile_sz)}, vlane_stride: {self.kernel_group.tile_desc.vmap.vlane_stride}, split_axis: {self.kernel_group.tile_desc.vmap.vlane_split_axis}")
                     self._prepare_simulator_headers(src_code)
                     bench_runner = self.run_bench(nodes, kernel_name, src_code)
-                    choices.append((bench_runner, src_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride))
+                    choices.append((bench_runner, src_code, meta_code, self.kernel_group.tile_desc.get_tile_size(), self.kernel_group.tile_desc.vmap.vlane_stride))
                     prevent_infinite_loop += 1
         self.kernel_group.tile_desc.prev_tail_threshold = prev_tail_threshold
         return choices
@@ -976,18 +979,20 @@ def get_cycle(choice):
                     return float("inf")
             return float("inf") # Exceeded maximum number of autotuning attempts
         choices = self.make_choices(*args)
-
         if len(choices) == 0: # Can't autotune
-            return [None, None]
+            return [None, None, None]
+
+        # Get cycle time for each choice
         with ThreadPoolExecutor(max_workers=8) as executor:
             results = list(executor.map(get_cycle, choices))
-        max_idx = results.index(min(results))
+        min_idx = results.index(min(results))
         if min(results) == float("inf"):
             raise RuntimeError("Failed to find optimal tile size...")
-        if extension_config.CONFIG_DEBUG_MODE:
-            self._log_autotune_result(choices[max_idx], results[max_idx])
-        optimal_src_code, loop_size = choices[max_idx][1], choices[max_idx][-1]
-        return optimal_src_code, loop_size
+
+        self._log_autotune_result(choices[min_idx], results[min_idx])
+
+        optimal_src_code, meta_code, loop_size = choices[min_idx][1], choices[min_idx][2], choices[min_idx][-1]
+        return optimal_src_code, meta_code, loop_size
 
     def run_bench(self, nodes, kernel_name, src_code):
         _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
@@ -1015,9 +1020,9 @@ def run_bench(self, nodes, kernel_name, src_code):
         return bmreq.make_run_fn(dummy_inputs, dummy_outputs)
 
     def _log_autotune_result(self, best_choice, best_cycle):
-        print(
-            f"[Auto-tune] Optimal tile size: {list(best_choice[2])}, "
-            f"vlane_stride: {best_choice[3]}, "
+        logger.debug(
+            f"Auto-tune: Optimal tile size: {list(best_choice[3])}, "
+            f"vlane_stride: {best_choice[4]}, "
             f"cycles: {best_cycle}"
         )
 
@@ -1025,9 +1030,9 @@ def codegen_nodes(self, nodes, kernel_name):
         src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
         self._prepare_simulator_headers(src_code)
         if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
-            optimal_src_code = self.autotune(nodes, kernel_name)[0]
+            optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2]
             if optimal_src_code is not None:
-                return optimal_src_code
+                return optimal_src_code, meta_code
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 2b964c55..dce59ed6 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -1,10 +1,13 @@
 import math
 import torch
+import warnings
 
 from torch._inductor.codegen import common
 from torch._inductor.virtualized import V, _ops as ops
 from . import mlir_common
 
+warnings.filterwarnings('ignore', message='undefined OpHandler\\..*, please add missing op schema')
+
 def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape, reduced_shape):
     if reduction_type == "sum":
         return f"vector.multi_reduction <add>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index bfcda258..f2bcba7e 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -299,8 +299,7 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes):
         template_buffer = template_node.node
         kernel, tile_candidates, render = template_buffer.make_kernel_render(template_buffer, prologue_nodes=prologue_nodes, epilogue_nodes=epilogue_nodes, kernel_group=self.kernel_group)
         _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
-        src_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
-        meta_code = kernel.meta_kernel()
+        src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
 
         with V.set_kernel_handler(kernel):
             kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 8f92554c..304d0090 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -32,6 +32,9 @@
 from PyTorchSimFrontend import extension_config
 from . import mlir_common
 
+# Configure logger for mlir_template module
+logger = extension_config.setup_logger()
+
 class IndentedBufferGroup:
     def __init__(self, kernel: 'MLIRTemplateKernel', prefix=""):
         self.kernel = kernel
@@ -386,7 +389,6 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         return tile_candidates
 
     def meta_kernel(self):
-        wrapper = V.graph.wrapper_code
         kernel_arg_attributes = self.kernel_arg_attributes
         _, _, arg_attributes, _ = self.kernel_group.args.mlir_argdefs()
         if kernel_arg_attributes is not None:
@@ -483,38 +485,36 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
             buffer.splice(src_code)
             src_code = buffer.getvalue()
             self._prepare_simulator_headers(src_code)
-        return src_code
+        meta_code = self.meta_kernel()
+        return src_code, meta_code
 
     def make_choices(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes):
         choices = []
         for tile_info in tile_candidates:
-            if extension_config.CONFIG_DEBUG_MODE:
-                # Compute Tile M, N, K DMA Tile M, N, K
-                print(f"[Auto-tune] Trying tile size: {list(tile_info)}")
-            src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info)
+            # Compute Tile M, N, K DMA Tile M, N, K
+            logger.debug(f"Auto-tune: Trying tile size: {list(tile_info)}")
+            src_code, meta_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info)
             bench_runner = self.run_bench([template_node], self.kernel_name, src_code)
-            choices.append((bench_runner, src_code, tile_info, self.loop_size))
+            choices.append((bench_runner, src_code, meta_code, tile_info, self.loop_size))
             self.reset(reason=None)
         return choices
 
     def _log_autotune_result(self, best_choice, best_cycle):
-        tile_size = best_choice[2]
-        print(
-            f"[Auto-tune] Optimal tile size: {list(tile_size)}, "
+        tile_size = best_choice[3]
+        logger.debug(
+            f"Auto-tune: Optimal tile size: {list(tile_size)}, "
             f"cycles: {best_cycle}"
         )
 
     def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes, epilogue_nodes):
         if "autotune" in extension_config.codegen_mapping_strategy and len(tile_candidates):
-            src_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
+            src_code, meta_code, loop_size = self.autotune(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
             self.loop_size = loop_size
         else:
             tile_info = tile_candidates[0] if tile_candidates else None
-            src_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info)
+            src_code, meta_code = self.codegen_template_code(render, template_node, prologue_nodes, epilogue_nodes, tile_info)
 
-        with V.set_kernel_handler(self):
-            self.meta_kernel()
-        return src_code
+        return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
         spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 215700eb..3f5673a8 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -12,6 +12,9 @@
 
 from torch._dynamo.device_interface import register_interface_for_device
 
+# Configure logger for Scheduler module
+logger = extension_config.setup_logger()
+
 
 def import_module_from_path(module_name, path):
     module_path = Path(path)  # Convert to Path object for safety
@@ -380,7 +383,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
         elif engine_select == Scheduler.RR_ENGINE:
             self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue)
         else:
-            print(f"Not supporetd engine type {engine_select}")
+            logger.error(f"Not supported engine type {engine_select}")
             exit(1)
 
     def add_request(self, request: Request, request_time=-1):
@@ -441,9 +444,11 @@ def finish_request(self, req : Request):
         self.finish_queue.append(req)
         self.request_queue[req.request_queue_idx].remove(req)
         turnaround_time, response_time, tbt_time = req.get_latency()
-        print(f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: "
-              f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, "
-              f"response time: {response_time} tbt_time: {tbt_time}")
+        logger.info(
+            f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: "
+            f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, "
+            f"response time: {response_time} tbt_time: {tbt_time}"
+        )
 
     def per_schedule(self, request_queue_idx):
         # Wait partition is idle
@@ -454,11 +459,13 @@ def per_schedule(self, request_queue_idx):
         if not request_list:
             return False
 
-        print(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}", flush=True)
+        logger.info(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}")
         for req in request_list:
             req.set_start(self.current_time())
-            print(f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
-                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}", flush=True)
+            logger.info(
+                f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
+                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}"
+            )
         # Submit batched request
         self.execution_engine.submit(request_list, request_queue_idx)
 
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 672ae6ec..6ed679d6 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -17,7 +17,46 @@
 from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
 from PyTorchSimFrontend import extension_config
 
-print_lock = threading.Lock()
+# Configure logger for Simulator module
+logger = extension_config.setup_logger()
+from tqdm import tqdm
+
+
+class ProgressBar:
+    def __init__(self, desc, silent_mode=False, update_interval=0.5):
+        self.desc = desc
+        self.silent_mode = silent_mode
+        self.update_interval = update_interval
+        self.pbar = None
+        self.finished = False
+        self.progress_thread = None
+
+    def __enter__(self):
+        if not self.silent_mode:
+            self.pbar = tqdm(
+                desc=self.desc,
+                bar_format='{desc}: {elapsed}',
+                leave=False,  # Don't leave the bar when done (it will disappear)
+                ncols=80,
+                disable=False,
+                total=100,  # Use a total for smooth animation
+            )
+            # Update progress bar in a separate thread
+            def update_progress():
+                while not self.finished:
+                    self.pbar.update(1)
+                    time.sleep(self.update_interval)
+
+            self.progress_thread = threading.Thread(target=update_progress, daemon=True)
+            self.progress_thread.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.finished = True
+        if not self.silent_mode and self.pbar is not None:
+            self.pbar.close()
+        return False
+
 
 TORCH_TO_NUMPY = {
     torch.float32: np.float32,
@@ -105,9 +144,9 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True)
         os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True)
         run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
-        if not silent_mode and extension_config.CONFIG_DEBUG_MODE:
-            print("[Spike] cmd> ", run)
-        print("[Spike] Running Spike simulator")
+        if not silent_mode:
+            logger.debug(f"[Spike] cmd> {run}")
+        logger.info("[Spike] Running Spike simulator")
         run_cmd = shlex.split(run)
         try:
             stdout_setting = subprocess.DEVNULL if silent_mode else None
@@ -115,7 +154,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
             subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting)
         except subprocess.CalledProcessError as e:
             if not silent_mode:
-                print("[Spike] Command failed with exit code", e.returncode)
+                logger.error(f"[Spike] Command failed with exit code {e.returncode}")
             error_msg = ""
             if e.returncode == 200:
                 error_msg = "INVALID_SPAD_ACCESS"
@@ -155,41 +194,23 @@ def __init__(self) -> None:
         pass
 
     def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silent_mode=False):
-        def show_progress():
-            i = 0
-            while not finished:
-                i = (i + 1) % 3
-                tail = "." * i + " " * (3-i)
-                with print_lock:
-                    sys.stdout.write("\r[Gem5] Gem5 is running." + tail)
-                    sys.stdout.flush()
-                time.sleep(1)
-            with print_lock:
-                print("")
-
         dir_path = os.path.join(os.path.dirname(target_binary), "m5out")
         gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py")
         gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, gem5_script_path, "-c", target_binary, "--vlane", str(vectorlane_size)]
+
+        is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode
+
+        if not is_dryrun:
+            logger.debug(f"[Gem5] cmd> {' '.join(gem5_cmd)}")
+            logger.info("[Gem5] Gem5 simulation started")
+
         try:
-            # Create progress thread
-            is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode
-            if not is_dryrun:
-                if extension_config.CONFIG_DEBUG_MODE:
-                    print("[Gem5] cmd> ", " ".join(gem5_cmd))
-                finished = False
-                progress_thread = threading.Thread(target=show_progress)
-                progress_thread.start()
-                output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL)
-                finished = True
-                progress_thread.join()
-            else:
-                output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL)
+            #with ProgressBar("[Gem5] Running simulation", silent_mode=is_dryrun):
+            output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL)
         except subprocess.CalledProcessError as e:
-            print(f"[Gem5] Gem5 simulation failed with error: \"{e.output.decode()}\"")
-            if not is_dryrun:
-                finished = True
-                progress_thread.join()
-            raise RuntimeError(f"Gem5 Simulation Failed: \"{e.output.decode()}\"")
+            output_error = e.output.decode() if isinstance(e.output, bytes) else str(e.output)
+            logger.error(f"[Gem5] Gem5 simulation failed with error: \"{output_error}\"")
+            raise RuntimeError(f"Gem5 Simulation Failed: \"{output_error}\"")
 
         with open(f"{dir_path}/stats.txt", "r") as stat_file:
             raw_list = stat_file.readlines()
@@ -216,39 +237,21 @@ def get_togsim_command(self):
         return cmd
 
     def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False):
-        def show_progress():
-            i = 0
-            while not finished:
-                i = (i + 1) % 3
-                tail = "." * i + " " * (3-i)
-                sys.stdout.write("\r[TOGSim] TOGSim is running." + tail)
-                time.sleep(1)
-            print("")
         cmd = f"{self.get_togsim_command()} --models_list {model_path}"
         if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
             cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
         if attribute_path:
             cmd = f"{cmd} --attributes_list {attribute_path}"
-        if not silent_mode and extension_config.CONFIG_DEBUG_MODE:
-            print("[TOGSim] cmd> ", cmd)
-
-        # Create progress thread
         if not silent_mode:
-            finished = False
-            progress_thread = threading.Thread(target=show_progress)
-            progress_thread.start()
+            logger.debug(f"[TOGSim] cmd> {cmd}")
+            logger.info("[TOGSim] TOGSim simulation started")
+
         try:
-            result = subprocess.check_output(shlex.split(cmd))
-            if not silent_mode:
-                finished = True
-                progress_thread.join()
+            with ProgressBar("[TOGSim] Running simulation", silent_mode=silent_mode):
+                result = subprocess.check_output(shlex.split(cmd))
         except subprocess.CalledProcessError as e:
-            if not silent_mode:
-                finished = True
-                progress_thread.join()
-                with print_lock:
-                    print("[TOGSim] Command failed with exit code", e.returncode)
-                    print("[TOGSim] Error output:", e.output)
+            logger.error(f"[TOGSim] Command failed with exit code {e.returncode}")
+            logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
             assert 0
 
         # Separate Autotune logs
@@ -271,10 +274,10 @@ def show_progress():
             f.flush()
             os.fsync(f.fileno())
 
-        if not silent_mode or extension_config.CONFIG_DEBUG_MODE:
-            model_path_log = f' of "{model_path}" ' if extension_config.CONFIG_DEBUG_MODE else " "
-            with print_lock:
-                print(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
+        if not silent_mode:
+            import logging as _logging
+            model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " "
+            logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
         return result_path
 
     def interactive_simulation(self):
@@ -282,8 +285,7 @@ def interactive_simulation(self):
         if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
             cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
 
-        if extension_config.CONFIG_DEBUG_MODE:
-            print("[TOGSim] cmd> ", cmd)
+        logger.debug(f"[TOGSim] cmd> {cmd}")
         if self.process is None:
             self.process = subprocess.Popen(
                 shlex.split(cmd),
@@ -292,28 +294,27 @@ def interactive_simulation(self):
                 universal_newlines=True
             )
         else:
-            print("[TOGSim] Simulator is already running.")
+            logger.warning("[TOGSim] Simulator is already running.")
 
     def stop(self):
         if self.process:
             self.process.terminate()
             self.process.wait()
             self.process = None
-            print("[TOGSim] Simulator stopped.")
+            logger.info("[TOGSim] Simulator stopped.")
 
     def wait(self):
         if self.process:
-            print("[TOGSim] Waiting for simulation to complete...")
+            logger.info("[TOGSim] Waiting for simulation to complete...")
             self.quit()
             self.process.wait()
             self.process = None
-            print("[TOGSim] Simulation completed.")
+            logger.info("[TOGSim] Simulation completed.")
 
     def send_command(self, command):
         if self.process:
             try:
-                if extension_config.CONFIG_TORCHSIM_DEBUG_MODE:
-                    print(command, flush=True)
+                logger.debug(command)
                 self.process.stdin.write(command + '\n')
                 self.process.stdin.flush()
                 ret = self.process.stderr.readline().strip()
@@ -321,11 +322,11 @@ def send_command(self, command):
             except BrokenPipeError:
                 err = self.process.stderr.readlines()
                 for line in err:
-                    print(line)
+                    logger.error(line.strip())
                 self.process = None
                 exit(1)
         else:
-            print("Simulator is not running.")
+            logger.warning("Simulator is not running.")
             return None
 
     def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0):
@@ -440,7 +441,7 @@ def get_result_from_file(result_path):
                 break
 
         if simulation_finished_idx == -1:
-            print(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). The file may be improperly formatted.")
+            logger.warning(f"[TOGSim] Warning: Unable to parse the output file ({result_path}). The file may be improperly formatted.")
             return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time
 
         total_stat_lines = lines[simulation_finished_idx:]

From 75207a45ad3940834aa4c20dac043b12a6f9bb95 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 9 Jan 2026 11:11:45 +0000
Subject: [PATCH 062/194] [Test] Wrap softmax module

---
 tests/test_softmax.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index e6e8cc1e..005c3ed2 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -42,8 +42,17 @@ def test_softmax(device, size=(128, 128), dim=1):
     #cpu_y = softmax3(x2, cpu_max, cpu_sum)
     #test_result("Softmax", y, cpu_y)
 
-    opt_fn = torch.compile(dynamic=False)(torch.nn.functional.softmax)
-    y = opt_fn(x1, dim=dim)
+    class SoftmaxModule(torch.nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.dim = dim
+
+        def forward(self, x):
+            return torch.nn.functional.softmax(x, dim=self.dim)
+
+    softmax_module = SoftmaxModule(dim=dim).to(device)
+    opt_fn = torch.compile(dynamic=False)(softmax_module)
+    y = opt_fn(x1)
     cpu_y = torch.nn.functional.softmax(x2, dim=dim)
     test_result("Softmax", y, cpu_y)
 

From 8df5fef0291444c0f2feaa929983f4a5ca011c2b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 9 Jan 2026 11:48:48 +0000
Subject: [PATCH 063/194] [Log] Add progress bar for auto-tuning

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 ++++++++--
 Simulator/simulator.py                          |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d0c8f815..28605e33 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -31,6 +31,8 @@
 # Configure logger for mlir_codegen_backend module
 logger = extension_config.setup_logger()
 
+from Simulator.simulator import ProgressBar
+
 def reduction_init(reduction_type, dtype):
     if dtype in cpp.DTYPE_LOWP_FP:
         # Since load promotes all half-precision inputs to float, the initial
@@ -983,8 +985,12 @@ def get_cycle(choice):
             return [None, None, None]
 
         # Get cycle time for each choice
-        with ThreadPoolExecutor(max_workers=8) as executor:
-            results = list(executor.map(get_cycle, choices))
+        # Show progress bar only when CONFIG_DEBUG_MODE is off
+        show_progress = not extension_config.CONFIG_DEBUG_MODE
+        with ProgressBar("[Auto-tune] Running benchmarks", silent_mode=not show_progress) if show_progress else contextlib.nullcontext():
+            with ThreadPoolExecutor(max_workers=8) as executor:
+                results = list(executor.map(get_cycle, choices))
+
         min_idx = results.index(min(results))
         if min(results) == float("inf"):
             raise RuntimeError("Failed to find optimal tile size...")
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 6ed679d6..7a4f7e0d 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -209,7 +209,7 @@ def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silen
             output = subprocess.check_output(gem5_cmd, stderr=subprocess.DEVNULL)
         except subprocess.CalledProcessError as e:
             output_error = e.output.decode() if isinstance(e.output, bytes) else str(e.output)
-            logger.error(f"[Gem5] Gem5 simulation failed with error: \"{output_error}\"")
+            logger.debug(f"[Gem5] Gem5 simulation failed with error: \"{output_error}\"")
             raise RuntimeError(f"Gem5 Simulation Failed: \"{output_error}\"")
 
         with open(f"{dir_path}/stats.txt", "r") as stat_file:

From d7c16b17c0aa082cb7c69b98b157ab66081809a4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 9 Jan 2026 13:41:01 +0000
Subject: [PATCH 064/194] [Test/MoE] Disable compiling sparse dispatcher

---
 PyTorchSimFrontend/mlir/mlir_ops.py | 6 ++++++
 tests/MoE/test_moe.py               | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index dce59ed6..74629b00 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -943,6 +943,12 @@ def square(operand, *args, **kwargs):
         result = ops.mul(operand, operand)
         return result, V.kernel.var_info[result]
 
+    @staticmethod
+    def fma(operand1, operand2, operand3, *args, **kwargs):
+        result = ops.mul(operand1, operand2)
+        result = ops.add(result, operand3)
+        return result, V.kernel.var_info[result]
+
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     # PyTorchSim specific operations 
 
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index ae16f0b0..1030e59f 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -4,7 +4,6 @@
 import copy
 import matplotlib.pyplot as plt
 
-
 import torch
 import torch.nn as nn
 from torch.distributions.normal import Normal
@@ -64,6 +63,7 @@ class SparseDispatcher(object):
     `Tensor`s for expert i only the batch elements for which `gates[b, i] > 0`.
     """
 
+    @torch.compiler.disable(recursive=True)
     def __init__(self, num_experts, gates):
         """Create a SparseDispatcher."""
         gates = gates.cpu()

From c88cabceff908be57649357d8be20055036c9c0d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 12 Jan 2026 03:06:03 +0000
Subject: [PATCH 065/194] [Fix] Support identity in the dram_stride extraction

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 28605e33..e0a7d949 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1179,7 +1179,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
             max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1
             for i in range(max_dim):
                 target_dim = f"index{i}"
-                if target_dim not in str(index):
+                if sympy.Symbol(target_dim) not in index.free_symbols:
                     dram_dict[target_dim] = [0]
             sorted_keys = sorted(dram_dict.keys())
             dram_stride = sum((dram_dict[key] for key in sorted_keys), [])

From 67612bb823be2992eaac36d7c9ddbbc24c017335 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 12 Jan 2026 03:39:17 +0000
Subject: [PATCH 066/194] [Fix] index to float casting

---
 PyTorchSimFrontend/mlir/mlir_ops.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 74629b00..59a6be78 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -226,11 +226,25 @@ def binary_elementwise_common(operand1, operand2):
         if op_type1[1] != op_type2[1]:
             if op_type1[1] == "index" or op_type1 == "index":
                 if op_type1[1] == "index":
-                    operand1 = ops.index_cast(operand1, op_type2[1])
-                    op_type1 = V.kernel.var_info[operand1]
+                    # index -> target type: 2-step casting if target is float
+                    if op_type2[1][0] == "f":
+                        operand1 = ops.index_cast(operand1, "i64")
+                        operand1 = ops.to_dtype(operand1, op_type2[1])
+                        op_type1 = V.kernel.var_info[operand1]
+                    else:
+                        # index -> integer: direct casting
+                        operand1 = ops.index_cast(operand1, op_type2[1])
+                        op_type1 = V.kernel.var_info[operand1]
                 if op_type2[1] == "index":
-                    operand2 = ops.index_cast(operand2, op_type1[1])
-                    op_type2 = V.kernel.var_info[operand2]
+                    # index -> target type: 2-step casting if target is float
+                    if op_type1[1][0] == "f":
+                        operand2 = ops.index_cast(operand2, "i64")
+                        operand2 = ops.to_dtype(operand2, op_type1[1])
+                        op_type2 = V.kernel.var_info[operand2]
+                    else:
+                        # index -> integer: direct casting
+                        operand2 = ops.index_cast(operand2, op_type1[1])
+                        op_type2 = V.kernel.var_info[operand2]
             elif op_type1[1][0] == "i" and op_type2[1][0] == "f":
                 operand1 = ops.to_dtype(operand1, op_type2[1])
                 op_type1 = V.kernel.var_info[operand1]

From 50ceb5848baaa895230de9fb1cbe1f2e8ed44860 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 12 Jan 2026 12:22:10 +0000
Subject: [PATCH 067/194] [Fix] Change vlane_split_axis in case of group-dim

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py |  9 +++++++--
 tests/Diffusion/test_diffusion.py               | 16 ++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index e0a7d949..e5a1a273 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1196,14 +1196,19 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                     dim_idx = int((str(sub.args[0])[5:]))
                     if int(self.kernel_group.tile_desc.get_tile_size()[dim_idx] % sub.args[1]) != 0:
                         # In this case, need to recompile
-                        original_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx]
-                        divisor = sub.args[1]
+                        original_tile = self.kernel_group.tile_desc.get_tile_size()
+                        original_size = original_tile[dim_idx]
+                        divisor = sub.args[1] * self.kernel_group.tile_desc.vmap.vlane_stride
                         new_size = ((original_size + divisor - 1) // divisor) * divisor
                         new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size())
                         new_tile_sizes[dim_idx] = new_size
                         self.kernel_group.tile_desc.set_tile_size(new_tile_sizes)
                         self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True
 
+                        # Can't use dim_idx as vlane_split_axis
+                        if dim_idx == self.kernel_group.tile_desc.vmap.vlane_split_axis:
+                            self.kernel_group.tile_desc.vmap.vlane_split_axis = (dim_idx + 1) % len(original_tile)
+
                         # Send recompile signal
                         self.reset("recompile")
                         raise mlir_common.RecompileSignal(f"Tile size {self.kernel_group.tile_desc.get_tile_size()[dim_idx]} is not divisible by {sub.args[1]}")
diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py
index c5170209..d6d740fe 100644
--- a/tests/Diffusion/test_diffusion.py
+++ b/tests/Diffusion/test_diffusion.py
@@ -557,14 +557,14 @@ def test_upsample2d(
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
 
-    #test_upsample2d(device)
-    #test_groupnorm(device)
-    #test_groupnorm(device, stride=[1, 1, 320*32, 320])
-    #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320)
-    #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280)
-    #test_cross_attn_down_block2d(device)
-    #test_unet_mid_block2d_cross_attn(device)
-    #test_cross_attn_up_block2d(device)
+    test_upsample2d(device)
+    test_groupnorm(device)
+    test_groupnorm(device, stride=[1, 1, 320*32, 320])
+    test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320)
+    test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280)
+    test_cross_attn_down_block2d(device)
+    test_unet_mid_block2d_cross_attn(device)
+    test_cross_attn_up_block2d(device)
     test_unet2d_condition_model(device)
     #test_unet_conditional(
     #    device=device,

From 319fd6cd6b98793573000bac138e976bae8cf22d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 Jan 2026 06:54:16 +0000
Subject: [PATCH 068/194] [Frontend] Fix any operation codegen

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_ops.py             | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index e5a1a273..87c6a628 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -60,7 +60,7 @@ def reduction_partial_combine_vec(reduction_type, vector_value, init_value):
     if reduction_type == "min":
         return ops.minimum(vector_value, init_value)
     if reduction_type == "any":
-        return ops.logical_and(vector_value, init_value)
+        return ops.logical_or(vector_value, init_value)
     raise AssertionError(reduction_type)
 
 class ExtensionWrapperCodegen(wrapper.PythonWrapperCodegen):
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 59a6be78..c3d3952e 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -18,7 +18,7 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape,
     if reduction_type == "min":
         return f"vector.multi_reduction <minimumf>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
     if reduction_type == "any":
-        return f"vector.multi_reduction <and>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
+        return f"vector.multi_reduction <or>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
     raise AssertionError(reduction_type)
 
 class ExtensionOverrides(common.OpOverrides):
@@ -995,10 +995,10 @@ def to_bool(operand, *args, **kwargs):
         if ret_type == "i1":
             return operand, [tile_size, ret_type]
 
-        const_one = ops.constant(0, ret_type)
+        const_zero = ops.constant(0, ret_type)
         if tile_size > 1:
-            const_one = ops.broadcast(const_one, tile_size)
-        ret = ops.ne(operand, const_one)
+            const_zero = ops.broadcast(const_zero, tile_size)
+        ret = ops.ne(operand, const_zero)
         return ret, [tile_size, "i1"]
     @staticmethod
     def step(size, dtype, *args, **kwargs):

From c223258d091ed4fe928ea11437a838b1e4de69d9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 Jan 2026 06:55:25 +0000
Subject: [PATCH 069/194] [Decompose] Use F.softmax for decomposed SDPA

---
 PyTorchSimFrontend/mlir/mlir_decomposition.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py
index 141fa9e4..284d25d7 100644
--- a/PyTorchSimFrontend/mlir/mlir_decomposition.py
+++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py
@@ -137,15 +137,13 @@ def decompose_native_multi_head_attention(
 
     # Step 4: Apply mask if provided
     if mask is not None:
-        scores = scores + mask
+        if mask.dtype == torch.bool:
+            attn_bias.masked_fill_(mask.logical_not(), float("-inf"))
+        else:
+            attn_bias = mask + attn_bias
 
     # Step 5: Softmax along the last dimension (seq_len dimension)
-    # Stable softmax: subtract max, exp, divide by sum
-    scores_max = scores.amax(dim=-1, keepdim=True)  # [batch, num_heads, seq_len, 1]
-    scores_shifted = scores - scores_max
-    scores_exp = scores_shifted.exp()
-    scores_sum = scores_exp.sum(dim=-1, keepdim=True)  # [batch, num_heads, seq_len, 1]
-    attn_weights = scores_exp / scores_sum  # [batch, num_heads, seq_len, seq_len]
+    attn_weights = F.softmax(scores, dim=-1)  # [batch, num_heads, seq_len, seq_len]
 
     # Step 6: Attention @ V
     # [batch, num_heads, seq_len, seq_len] @ [batch, num_heads, seq_len, head_dim]

From 07be94b0d47cd61a0170f360110fe440296b43c9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 Jan 2026 09:41:00 +0000
Subject: [PATCH 070/194] [Frontend] Add recompiliation for ModularIndexing

---
 .../mlir/mlir_codegen_backend.py              | 61 ++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 17 +++++-
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 87c6a628..3d65c0a4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -20,7 +20,7 @@
     is_welford_reduction,
     sympy_product
 )
-from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Identity
+from torch.utils._sympy.functions import ModularIndexing, FloorDiv
 from PyTorchSimFrontend import extension_codecache
 from PyTorchSimFrontend import extension_config
 from . import mlir_common
@@ -365,13 +365,6 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com
         if len(expr.args) == 0 and len(indirect_dims) == 0:
             return expr
 
-        # Replace Identity arguments with Identity.args[0]
-        for arg in expr.args:
-            if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity):
-                expr = expr.replace(arg.args[1], arg.args[1].args[0])
-            if isinstance(arg, Identity):
-                expr = expr.replace(arg, arg.args[0] if arg.args else arg)
-
         if len(expr.args) == 0:
             args = [expr]
         else:
@@ -784,6 +777,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
         return accum
 
     def index_expr(self, index, dtype):
+        index = self.rename_indexing(index)
         base_tile_desc = self.kernel_group.tile_desc
         if len(self.ranges) != self.reduction_depth:
             # FIXME. This is a temporary solution to get tile stride of the reduction case
@@ -1224,6 +1218,57 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                 local_tile_desc.apply_divisor(dim_idx+offset, divisor, "split")
                 offset = offset+1
 
+        # Support ModularIndexing pattern
+        # This pattern can be used to broadcast ex) torch.cat([a,a])
+        # ModularIndexing(x, y, z) means (x // y) % z
+        # tile_size must be: multiple of y (floorDiv divisor) and divisor of z (modular divisor)
+        if index.has(ModularIndexing):
+            for sub in sympy.preorder_traversal(index):
+                if isinstance(sub, ModularIndexing):
+                    if not str(sub.args[0]).startswith("index"):
+                        continue
+                    dim_idx = int((str(sub.args[0])[5:]))
+                    floor_divisor = sub.args[1]  # y: floorDiv divisor
+                    mod_divisor = sub.args[2]    # z: modular divisor
+                    current_tile_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx]
+
+                    # Check if tile_size is multiple of floorDiv divisor
+                    if int(current_tile_size % floor_divisor) != 0:
+                        original_tile = self.kernel_group.tile_desc.get_tile_size()
+                        original_size = original_tile[dim_idx]
+                        divisor = floor_divisor * self.kernel_group.tile_desc.vmap.vlane_stride
+                        new_size = ((original_size + divisor - 1) // divisor) * divisor
+                        new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size())
+                        new_tile_sizes[dim_idx] = new_size
+                        self.kernel_group.tile_desc.set_tile_size(new_tile_sizes)
+                        self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True
+
+                        self.reset("recompile")
+                        raise mlir_common.RecompileSignal(f"Tile size {current_tile_size} is not a multiple of floorDiv divisor {floor_divisor} in ModularIndexing")
+
+                    # Check if tile_size is a divisor of modular divisor
+                    if int((mod_divisor * floor_divisor) % current_tile_size) != 0:
+                        original_tile = self.kernel_group.tile_desc.get_tile_size()
+                        original_size = original_tile[dim_idx]
+                        # Find the largest divisor of mod_divisor that is <= original_size
+                        # and is a multiple of floor_divisor
+                        new_size = original_size
+                        while new_size > 0:
+                            if mod_divisor % new_size == 0 and new_size % floor_divisor == 0:
+                                break
+                            new_size -= floor_divisor
+
+                        if new_size <= 0:
+                            new_size = mod_divisor * floor_divisor
+
+                        new_tile_sizes = list(self.kernel_group.tile_desc.get_tile_size())
+                        new_tile_sizes[dim_idx] = new_size
+                        self.kernel_group.tile_desc.set_tile_size(new_tile_sizes)
+                        self.kernel_group.tile_desc.tile_constraint[dim_idx].fixed = True
+
+                        self.reset("recompile")
+                        raise mlir_common.RecompileSignal(f"Tile size {current_tile_size} is not a divisor of modular divisor {mod_divisor} in ModularIndexing")
+
         # FIXME. It will be nice to modify node instead of this exception handling...
         if len(self.itervars) == 1 and self.reduction_depth == 0:
             # In case of reduction loop only case, we will add dummy loop so shift it once
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index d96eb452..e31555ba 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -15,7 +15,7 @@
 from torch._inductor.ir import MultiOutputLayout
 from torch._inductor.dependencies import MemoryDep, StarDep, WeakDep
 from torch._inductor.codegen.wrapper import KernelDefinitionLine
-from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod
+from torch.utils._sympy.functions import ModularIndexing, FloorDiv, Mod, Identity
 import sympy
 import contextlib
 
@@ -838,6 +838,21 @@ def rename_indexing(self, index) -> sympy.Expr:
         # and renames variables in index expressions to kernel arg names
         if isinstance(index, (list, tuple)):
             return [self.rename_indexing(x) for x in index]
+
+        # FIXME. This is a temporary solution to remove Identity wrappers from index expression.
+        # Remove Identity wrappers from index expression
+        # Check if index itself is Identity
+        if isinstance(index, Identity):
+            index = index.args[0] if index.args else index
+
+        # Replace Identity arguments with Identity.args[0]
+        if hasattr(index, 'args') and len(index.args) > 0:
+            for arg in index.args:
+                if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity):
+                    index = index.replace(arg.args[1], arg.args[1].args[0] if arg.args[1].args else arg.args[1])
+                if isinstance(arg, Identity):
+                    index = index.replace(arg, arg.args[0] if arg.args else arg)
+
         index = V.graph.sizevars.simplify(index)
         sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)
         replacements = {

From e999bfc34b8527ea8253339eea9556c684758d65 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 Jan 2026 09:57:11 +0000
Subject: [PATCH 071/194] [Test] Fix minor bugs in the test folder

---
 tests/Llama/test_llama.py | 19 ++++++++++---------
 tests/MoE/test_moe.py     | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py
index 443f3fc2..889e5fa8 100644
--- a/tests/Llama/test_llama.py
+++ b/tests/Llama/test_llama.py
@@ -101,7 +101,8 @@ def run_rotary_embedding_test(
         vocab_size=8192,
         _attn_implementation = "sdpa"
     )
-    base_rope = LlamaRotaryEmbedding(cfg)
+    # Pass dim explicitly to avoid config parsing issues
+    base_rope = LlamaRotaryEmbedding(dim=head_dim, max_position_embeddings=cfg.max_position_embeddings, base=cfg.rope_theta, config=cfg)
 
     cpu_rope = copy.deepcopy(base_rope)
 
@@ -375,14 +376,14 @@ def run_llama_model_test(
     torch.compiler.is_compiling = lambda: True # FIXME. How to fix this?
     #run_rmsnorm_test(device)
     #run_rotary_embedding_test(device)
-    #run_decoder_layer_test(
-    #    device=device,
-    #    batch=args.batch,
-    #    seq_len=args.seq_len,
-    #    dtype=args.dtype,
-    #    rtol=args.rtol,
-    #    atol=args.atol,
-    #)
+    run_decoder_layer_test(
+        device=device,
+        batch=args.batch,
+        seq_len=args.seq_len,
+        dtype=args.dtype,
+        rtol=args.rtol,
+        atol=args.atol,
+    )
     run_llama_model_test(device)
     #run_custom_llama_test(
     #    device=device,
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index 1030e59f..9ebfb11e 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -16,6 +16,32 @@
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
+# FIXME. This is a temporary solution to avoid is_forward conflict during backward
+def patch_compile_event_logger():
+    """Patch CompileEventLogger.compilation_metric to avoid is_forward conflict during backward."""
+    from torch._dynamo.utils import CompileEventLogger
+    from torch._dynamo.utils import get_metrics_context
+
+    original_compilation_metric = CompileEventLogger.compilation_metric
+
+    @staticmethod
+    def patched_compilation_metric(is_forward=True, **kwargs):
+        """Patched version that clears is_forward before setting it if there's a conflict."""
+        try:
+            metrics_context = get_metrics_context()
+            if metrics_context.in_progress() and hasattr(metrics_context, '_metrics'):
+                # If is_forward is already set and we're trying to set it to a different value, clear it first
+                current_is_forward = metrics_context._metrics.get('is_forward')
+                if current_is_forward is not None and current_is_forward != is_forward:
+                    metrics_context._metrics.pop('is_forward', None)
+        except:
+            pass
+        # Call the original function
+        return original_compilation_metric(is_forward=is_forward, **kwargs)
+
+    # Patch the method
+    CompileEventLogger.compilation_metric = patched_compilation_metric
+
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     pass_message = f"|{name} Test Passed|"
     fail_message = f"|{name} Test Failed|"
@@ -469,6 +495,9 @@ def test_moe(device):
         print("\n")
 
 def train_moe(device):
+    # Patch CompileEventLogger to avoid metric conflicts
+    patch_compile_event_logger()
+
     def perceptron(a, b, c):
         return a * b + c
 
@@ -589,6 +618,9 @@ def weight_update(a, b, lr):
     plt.savefig('result.png')
 
 def train_moe_mnist(device):
+    # Patch CompileEventLogger to avoid metric conflicts
+    patch_compile_event_logger()
+
     torch.manual_seed(0)
     batch_size = 32
     input_size = 28*28
@@ -670,6 +702,9 @@ def train(model, device, train_loader, optimizer, epochs):
     plt.savefig(f'{name}_result.png')
 
 def train_moe_single_iteration(device, iter_idx, is_evaluation=0):
+    # Patch CompileEventLogger to avoid metric conflicts
+    patch_compile_event_logger()
+
     # Training moe with mnist dataset for sinlge iteration
     torch.manual_seed(0)
     batch_size = 128

From d747e7ee7e505f74d7abb0296e098567b951cb47 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 13 Jan 2026 10:42:18 +0000
Subject: [PATCH 072/194] [Log] Add progress bar in spike simulation

---
 Simulator/simulator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 7a4f7e0d..96a1fc86 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -151,7 +151,8 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         try:
             stdout_setting = subprocess.DEVNULL if silent_mode else None
             stderr_setting = subprocess.DEVNULL if silent_mode else None
-            subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting)
+            with ProgressBar("[Spike] Running simulation", silent_mode=silent_mode):
+                subprocess.check_call(run_cmd, stdout=stdout_setting, stderr=stderr_setting)
         except subprocess.CalledProcessError as e:
             if not silent_mode:
                 logger.error(f"[Spike] Command failed with exit code {e.returncode}")

From b49b6795d92088489fc0a5fb685c35307ae968b6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 07:28:24 +0000
Subject: [PATCH 073/194] [Fix] Use extraction for vlane_offset + Register
 extract op

---
 .../mlir/mlir_codegen_backend.py              |  13 +-
 PyTorchSimFrontend/mlir/mlir_ops.py           | 280 +++++++++++++-----
 2 files changed, 206 insertions(+), 87 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 3d65c0a4..912c618a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -534,8 +534,8 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs)
                 value = ops.to_dtype(value, mlir_dtype)
 
             if compute_vec_size < self.var_info[value][0]:
-                value = self.cse.generate(self.stores, f"vector.extract_strided_slice  %{value} {{offsets = [0], sizes = [{compute_vec_size}], strides = [1]}}: vector<{self.var_info[value][0]}x{self.var_info[value][1]}> to {vshape}")
-                self.register_var_info(value, [compute_vec_size, mlir_dtype])
+                with self.override_buffer_cse(buffer=self.stores):
+                    value = ops.extract_strided_slice(value, compute_vec_size)
 
             with self.override_buffer_cse(buffer=self.stores):
                 ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=name)
@@ -729,9 +729,11 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
                 outer_dim = ops.remainder(ops.truncdiv(dim, vlane_stride_vec), vlane_outer_vec)
                 dim = ops.add(stride_dim, ops.mul(outer_dim, nr_vector_lane_vec))
 
-                vlane_offset = self.const_cse.generate(self.const_buffer, f"arith.addi %{vlane_vec}, %{vlane_vec} {{ vlane_offset={offset} }} : vector<{vlane_vec_size}xi64> // vlane offset")
-                self.register_var_info(vlane_offset, [vlane_vec_size, "i64"])
-                vlane_offset = ops.index_cast(vlane_offset, "index")
+                with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
+                    vlane_offset = ops.vlane_offset(vlane_vec, vlane_vec, attributes={"vlane_offset": offset}, comment="vlane offset")
+                    if compute_vec_size < self.var_info[vlane_offset][0]:
+                        vlane_offset = ops.extract_strided_slice(vlane_offset, compute_vec_size)
+                    vlane_offset = ops.index_cast(vlane_offset, "index")
                 dim = ops.add(dim, vlane_offset)
             dim_list.append(dim)
 
@@ -795,7 +797,6 @@ def index_expr(self, index, dtype):
             tile_desc = base_tile_desc
         compute_vec_size = tile_desc.get_compute_vec_size()
 
-
         tile_shape = f"memref<{compute_vec_size*self.vector_lane}xindex, 1>"
         vshape = f"vector<{compute_vec_size}xindex>"
 
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index c3d3952e..4cf031d2 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -21,6 +21,35 @@ def reduction_combine_vec(reduction_type, vector_value, init_value, axis, shape,
         return f"vector.multi_reduction <or>, %{vector_value}, %{init_value} [{axis}] : {shape} to {reduced_shape}"
     raise AssertionError(reduction_type)
 
+def format_mlir_op(op_str, shape, **kwargs):
+    """
+    Format MLIR operation string with optional attributes and comment.
+
+    Args:
+        op_str: Base operation string (e.g., "arith.addi %0, %1")
+        shape: Type shape string (e.g., "vector<4xi64>" or "i64")
+        **kwargs: May contain 'attributes' (dict or str) and 'comment' (str)
+
+    Returns:
+        Formatted MLIR operation string
+    """
+    result = op_str
+    attributes = kwargs.get('attributes', None)
+    comment = kwargs.get('comment', None)
+
+    if attributes:
+        if isinstance(attributes, dict):
+            # Format: { key1=value1, key2=value2 }
+            attrs_str = ", ".join(f"{k}={v}" for k, v in attributes.items())
+            result += f" {{ {attrs_str} }}"
+        elif isinstance(attributes, str):
+            # Direct string format
+            result += f" {{ {attributes} }}"
+    result += f" : {shape}"
+    if comment:
+        result += f" // {comment}"
+    return result
+
 class ExtensionOverrides(common.OpOverrides):
     @staticmethod
     def constant(value, src_type, *args, **kwargs):
@@ -36,8 +65,8 @@ def constant(value, src_type, *args, **kwargs):
         elif src_type[0] == "f":
             value = format(float(value), ".20f")
         elif src_type[0] == "i":
-            value = int(float(value)) 
-        return f'arith.constant {value} : {src_type}', [1, src_type]
+            value = int(float(value))
+        return format_mlir_op(f'arith.constant {value}', src_type, **kwargs), [1, src_type]
 
     @staticmethod
     def broadcast(operand, target_size, *args, **kwargs):
@@ -54,16 +83,18 @@ def broadcast(operand, target_size, *args, **kwargs):
                 outer_dim = target_size // src_size
                 unflat_shape = f"vector<{outer_dim}x{src_size}x{dtype}>"
                 # Flatten back to 1D
-                op_str = f"vector.shape_cast %{unflat_operand} : {unflat_shape} to {dst_shape}"
+                op_str = f"vector.shape_cast %{unflat_operand}"
+                shape = f"{unflat_shape} to {dst_shape}"
             else:
                 raise NotImplementedError(
                     f"Vector broadcast size mismatch: src={src_size} cannot broadcast to target={target_size}"
                 )
         elif src_size == 1:
-            op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}"
+            op_str = f"vector.broadcast %{operand}"
+            shape = f"{src_shape} to {dst_shape}"
         else:
             raise ValueError(f"Invalid source size: {src_size}")
-        return op_str, [target_size, dtype]
+        return format_mlir_op(op_str, shape, **kwargs), [target_size, dtype]
 
     @staticmethod
     def broadcast_unflat(operand, target_size, *args, **kwargs):
@@ -73,8 +104,9 @@ def broadcast_unflat(operand, target_size, *args, **kwargs):
         src_shape = f"vector<{src_size}x{dtype}>"
         dst_shape = f"vector<{outer_dim}x{src_size}x{dtype}>"
 
-        op_str = f"vector.broadcast %{operand} : {src_shape} to {dst_shape}"
-        return op_str, [target_size, dtype]
+        op_str = f"vector.broadcast %{operand}"
+        shape = f"{src_shape} to {dst_shape}"
+        return format_mlir_op(op_str, shape, **kwargs), [target_size, dtype]
 
     def load_seed(self, *args, **kwargs):
         raise NotImplementedError
@@ -110,7 +142,10 @@ def where(condition, operand1, operand2, *args, **kwargs):
         tile_size, ret_type = V.kernel.var_info[operand1]
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         cond_shape = f"vector<{tile_size}xi1>" if tile_size > 1 else ""
-        return f"arith.select %{condition}, %{operand1}, %{operand2} : {cond_shape}, {shape}", [tile_size, ret_type]
+
+        op_str = f"arith.select %{condition}, %{operand1}, %{operand2}"
+        shape = f"{cond_shape}, {shape}"
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def to_dtype(operand, dst_mlir_dtype, *args, **kwargs):
@@ -157,7 +192,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs):
                 op_str = f"arith.extsi %{operand} : {src_shape} to {shape}"
             elif dst_bits < src_bits:
                 # Use arith.trunci for integer truncation
-                op_str = f"arith.trunci %{operand} : {src_shape} to {shape}" 
+                op_str = f"arith.trunci %{operand} : {src_shape} to {shape}"
             else:
                 return operand, [tile_size, dst_mlir_dtype]
         # Case D: Float -> Float (Extension / Truncation)
@@ -166,7 +201,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs):
                 op_str = f"arith.extf %{operand} : {src_shape} to {shape}"
             elif dst_bits < src_bits:
                 # Corrected 'trunf' to 'truncf'
-                op_str = f"arith.truncf %{operand} : {src_shape} to {shape}" 
+                op_str = f"arith.truncf %{operand} : {src_shape} to {shape}"
             else:
                 return operand, [tile_size, dst_mlir_dtype]
         else:
@@ -200,7 +235,9 @@ def to_dtype_bitcast(operand, dtype, *args, **kwargs):
         src_shape = f"vector<{tile_size}x{current_src_type}>" if tile_size > 1 else current_src_type
         dst_shape = f"vector<{tile_size}x{dst_mlir_type}>" if tile_size > 1 else dst_mlir_type
 
-        return f"arith.bitcast %{operand} : {src_shape} to {dst_shape}", [tile_size, dst_mlir_type]
+        op_str = f"arith.bitcast %{operand}"
+        shape = f"{src_shape} to {dst_shape}"
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, dst_mlir_type]
 
     # Binary element wise operations
     @staticmethod
@@ -283,7 +320,7 @@ def exp(operand, *args, **kwargs):
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.exp %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.exp %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def exp2(operand, *args, **kwargs):
@@ -315,7 +352,7 @@ def sqrt(operand, *args, **kwargs):
             operand = ops.to_dtype(operand, "f32")
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.sqrt %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.sqrt %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def relu(operand, *args, **kwargs):
@@ -331,7 +368,8 @@ def minimum(operand1, operand2, *args, **kwargs):
             opcode = f'arith.minimumf'
         else:
             opcode = f'arith.minsi'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def maximum(operand1, operand2, *args, **kwargs):
@@ -341,7 +379,8 @@ def maximum(operand1, operand2, *args, **kwargs):
             opcode = f'arith.maximumf'
         else:
             opcode = f'arith.maxsi'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def cos(operand, *args, **kwargs):
@@ -362,7 +401,7 @@ def cos(operand, *args, **kwargs):
         if dtype.startswith("f"):
             operand = ops.to_dtype(operand, "f32")
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.cos %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.cos %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def sin(operand, *args, **kwargs):
@@ -383,7 +422,7 @@ def sin(operand, *args, **kwargs):
         if dtype.startswith("f"):
             operand = ops.to_dtype(operand, "f32")
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.sin %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.sin %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def tan(operand, *args, **kwargs):
@@ -409,7 +448,7 @@ def erf(operand, *args, **kwargs):
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.erf %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.erf %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def cosh(operand, *args, **kwargs):
@@ -438,7 +477,7 @@ def tanh(operand, *args, **kwargs):
         if dtype.startswith("f"):
             operand = ops.to_dtype(operand, "f32")
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.tanh %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.tanh %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def acos(operand, *args, **kwargs):
@@ -491,11 +530,11 @@ def hypot(operand1, operand2, *args, **kwargs):
     @staticmethod
     def log10(operand, *args, **kwargs):
         val_ln = ops.log(operand)
-        
+
         tile_size, dtype = V.kernel.var_info[val_ln]
         inv_ln10 = 1/math.log(10)
         const_op = ops.constant(inv_ln10, dtype)
-        
+
         # Multiply: ln(x) * (1/ln(10))
         result = ops.mul(val_ln, const_op)
         return result, V.kernel.var_info[result]
@@ -503,11 +542,10 @@ def log10(operand, *args, **kwargs):
     @staticmethod
     def log2(operand, *args, **kwargs):
         val_ln = ops.log(operand)
-        
         tile_size, dtype = V.kernel.var_info[val_ln]
         inv_ln10 = 1/math.log(2)
         const_op = ops.constant(inv_ln10, dtype)
-        
+
         # Multiply: ln(x) * (1/ln(10))
         result = ops.mul(val_ln, const_op)
         return result, V.kernel.var_info[result]
@@ -523,7 +561,7 @@ def log(operand, *args, **kwargs):
             operand = ops.to_dtype(operand, "f32")
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.log %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.log %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def log1p(operand, *args, **kwargs):
@@ -542,7 +580,6 @@ def nextafter(operand1, operand2, *args, **kwargs):
     def logical_and(operand1, operand2, *args, **kwargs):
         if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
-        
         if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.and_(operand1, operand2)
@@ -552,7 +589,6 @@ def logical_and(operand1, operand2, *args, **kwargs):
     def logical_or(operand1, operand2, *args, **kwargs):
         if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
-        
         if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.or_(operand1, operand2)
@@ -562,18 +598,16 @@ def logical_or(operand1, operand2, *args, **kwargs):
     def logical_xor(operand1, operand2, *args, **kwargs):
         if V.kernel.var_info[operand1][1] != "i1":
             operand1 = ops.to_bool(operand1)
-        
         if V.kernel.var_info[operand2][1] != "i1":
             operand2 = ops.to_bool(operand2)
         result = ops.xor(operand1, operand2)
         return result, V.kernel.var_info[result]
-    
+
     @staticmethod
     def logical_not(operand, *args, **kwargs):
         op_info = V.kernel.var_info[operand]
         tile_size = op_info[0]
         dtype = op_info[1]
-        
         zero_const = ops.constant(0, dtype)
         result = ops.eq(operand, zero_const)
         return result, V.kernel.var_info[result]
@@ -583,7 +617,6 @@ def bitwise_and(operand1, operand2, *args, **kwargs):
         # Float check
         if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
-            
         result = ops.and_(operand1, operand2)
         return result, V.kernel.var_info[result]
 
@@ -593,9 +626,8 @@ def bitwise_not(operand, *args, **kwargs):
         # Float check
         if V.kernel.var_info[operand][1].startswith("f"):
             raise ValueError("Bitwise NOT not supported for floats")
-        
         neg_one = ops.constant(-1, dtype)
-        result = ops.xor(operand, neg_one) 
+        result = ops.xor(operand, neg_one)
         return result, V.kernel.var_info[result]
 
     @staticmethod
@@ -603,7 +635,7 @@ def bitwise_or(operand1, operand2, *args, **kwargs):
         # Float check
         if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
-            
+
         result = ops.or_(operand1, operand2)
         return result, V.kernel.var_info[result]
 
@@ -612,7 +644,6 @@ def bitwise_xor(operand1, operand2, *args, **kwargs):
                 # Float check
         if V.kernel.var_info[operand1][1].startswith("f") or V.kernel.var_info[operand2][1].startswith("f"):
             raise ValueError("Bitwise AND not supported for floats")
-            
         result = ops.xor(operand1, operand2)
         return result, V.kernel.var_info[result]
 
@@ -635,7 +666,7 @@ def rsqrt(operand, *args, **kwargs):
             operand = ops.to_dtype(operand, "f32")
 
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'math.rsqrt %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(f'math.rsqrt %{operand}', shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def sigmoid(operand, *args, **kwargs):
@@ -663,7 +694,8 @@ def round(operand, *args, **kwargs):
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
-            return f"math.roundeven %{operand} : {shape}", [tile_size, dtype]
+            op_str = f"math.roundeven %{operand}"
+            return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype]
         else:
             return operand, [tile_size, dtype]
 
@@ -673,7 +705,8 @@ def floor(operand, *args, **kwargs):
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
-            return f"math.floor %{operand} : {shape}", [tile_size, dtype]
+            op_str = f"math.floor %{operand}"
+            return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype]
         else:
             return operand, [tile_size, dtype]
 
@@ -687,7 +720,8 @@ def trunc(operand, *args, **kwargs):
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
-            return f"math.trunc %{operand} : {shape}", [tile_size, dtype]
+            op_str = f"math.trunc %{operand}"
+            return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype]
         else:
             return operand, [tile_size, dtype]
 
@@ -697,7 +731,8 @@ def ceil(operand, *args, **kwargs):
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
 
         if dtype.startswith("f"):
-            return f"math.ceil %{operand} : {shape}", [tile_size, dtype]
+            op_str = f"math.ceil %{operand}"
+            return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype]
         else:
             return operand, [tile_size, dtype]
 
@@ -711,19 +746,18 @@ def neg(operand, *args, **kwargs):
         # Type check & auto cast
         if dtype.startswith("f"):
             operand = ops.to_dtype(operand, "f32")
-
+        op_str = f"arith.negf %{operand}"
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f'arith.negf %{operand} : {shape}', [tile_size, dtype]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, dtype]
 
     @staticmethod
     def reciprocal(operand, *args, **kwargs):
         op_type = V.kernel.var_info[operand]
-        tile_size = op_type[0]
-        dtype = op_type[1]
-
-        # Type check & auto cast
-        if dtype.startswith("f"):
-            operand = ops.to_dtype(operand, "f32")
+        tile_size, dtype = op_type[0], op_type[1]
+        if dtype.startswith("i"):
+            openand = ops.to_dtype(operand, "f32")
+            op_type = V.kernel.var_info[operand]
+            tile_size, dtype = op_type[0], op_type[1]
 
         return ops.truediv(ops.constant(1.0, dtype), operand), [tile_size, dtype]
 
@@ -739,8 +773,9 @@ def eq(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'eq' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def ne(operand1, operand2, *args, **kwargs):
@@ -754,8 +789,9 @@ def ne(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def lt(operand1, operand2, *args, **kwargs):
@@ -769,8 +805,9 @@ def lt(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'lt' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def gt(operand1, operand2, *args, **kwargs):
@@ -784,8 +821,9 @@ def gt(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'gt' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def le(operand1, operand2, *args, **kwargs):
@@ -799,8 +837,9 @@ def le(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'le' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def ge(operand1, operand2, *args, **kwargs):
@@ -814,29 +853,33 @@ def ge(operand1, operand2, *args, **kwargs):
         else:
             raise ValueError(f"Unsupported data type for 'ne' operation: {ret_type}")
 
+        op_str = f'{op_type} {attribute}, %{operand1}, %{operand2}'
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'{op_type} {attribute}, %{operand1}, %{operand2} : {shape}', [tile_size, "i1"]
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, "i1"]
 
     @staticmethod
     def add(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.add{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def sub(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.sub{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def mul(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
         opcode = f'arith.mul{ret_type[0]}'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def pow(operand1, operand2, *args, **kwargs):
@@ -850,28 +893,32 @@ def pow(operand1, operand2, *args, **kwargs):
             operand2 = ops.to_dtype(operand2, "f32")
 
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f"math.pow{ret_type[0]} %{operand1}, %{operand2} : {shape}", [tile_size, ret_type]
+        op_str = f"math.pow{ret_type[0]} %{operand1}, %{operand2}"
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def and_(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
-        
+
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.andi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.andi %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def or_(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
-        
+
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.ori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.ori %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def xor(operand1, operand2, *args, **kwargs):
         tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
-        
+
         shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
-        return f'arith.xori %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.xori %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def lshift(operand1, operand2, *args, **kwargs):
@@ -888,9 +935,10 @@ def truncdiv(operand1, operand2, *args, **kwargs):
 
         if ret_type.startswith("f"):
             raise ValueError("truncdiv is strictly for integers. Use truediv for floats.")
-        
+
         # arith.divsi: Signed Integer Division (Result is truncated)
-        return f'arith.divsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.divsi %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def floordiv(operand1, operand2, *args, **kwargs):
@@ -902,7 +950,8 @@ def floordiv(operand1, operand2, *args, **kwargs):
              raise ValueError("floordiv implementation expects integers based on definition.")
 
         # arith.floordivsi: Floor Division for Signed Integers
-        return f'arith.floordivsi %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.floordivsi %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def truediv(operand1, operand2, *args, **kwargs):
@@ -912,7 +961,8 @@ def truediv(operand1, operand2, *args, **kwargs):
         if not ret_type.startswith("f"):
             raise ValueError(f"truediv expects float inputs, but got {ret_type}. Use int_truediv for integers.")
 
-        return f'arith.divf %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'arith.divf %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def int_truediv(operand1, operand2, *args, **kwargs):
@@ -938,7 +988,8 @@ def mod(operand1, operand2, *args, **kwargs):
             raise NotImplementedError("Not support remainder operation for floating point")
         else:
             opcode = f'arith.remsi'
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def remainder(operand1, operand2, *args, **kwargs):
@@ -950,7 +1001,8 @@ def remainder(operand1, operand2, *args, **kwargs):
         else:
             opcode = 'arith.remsi' # Signed Integer Remainder (LHS sign)
 
-        return f'{opcode} %{operand1}, %{operand2} : {shape}', [tile_size, ret_type]
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def square(operand, *args, **kwargs):
@@ -964,7 +1016,7 @@ def fma(operand1, operand2, operand3, *args, **kwargs):
         return result, V.kernel.var_info[result]
 
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    # PyTorchSim specific operations 
+    # PyTorchSim specific operations
 
     @staticmethod
     def alloc(size, src_type, *args, **kwargs):
@@ -976,7 +1028,9 @@ def extractelement(operand, idx, *args, **kwargs):
         tile_size = op_type[0]
         dtype = op_type[1]
         shape = f"vector<{tile_size}x{dtype}>" if tile_size > 1 else dtype
-        return f"vector.extract %{operand}[{idx}]: {dtype} from {shape}", [1, dtype]
+        op_str = f"vector.extract %{operand}[{idx}]"
+        shape = f"{dtype} from {shape}"
+        return format_mlir_op(op_str, shape, **kwargs), [1, dtype]
 
     @staticmethod
     def ext(operand, dtype, *args, **kwargs):
@@ -987,7 +1041,9 @@ def ext(operand, dtype, *args, **kwargs):
             opcode = f'arith.extf'
         else:
             opcode = f'arith.extui'
-        return f'{opcode} %{operand} : {shape} to {target_type}', [op_type[0], dtype]
+        op_str = f'{opcode} %{operand}'
+        shape = f"{shape} to {target_type}"
+        return format_mlir_op(op_str, shape, **kwargs), [op_type[0], dtype]
 
     @staticmethod
     def to_bool(operand, *args, **kwargs):
@@ -1003,19 +1059,76 @@ def to_bool(operand, *args, **kwargs):
     @staticmethod
     def step(size, dtype, *args, **kwargs):
         index_shape = f"vector<{size}x{dtype}>"
-        return f"vector.step : {index_shape}", [size, dtype]
+        op_str = f"vector.step"
+        return format_mlir_op(op_str, index_shape, **kwargs), [size, dtype]
 
     @staticmethod
-    def index_cast(operand, target_type, *args, **kwrags):
+    def index_cast(operand, target_type, *args, **kwargs):
         op_type = V.kernel.var_info[operand]
         src_shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else op_type[1]
         des_shape = f"vector<{op_type[0]}x{target_type}>" if op_type[0] > 1 else target_type
-        return f"arith.index_cast %{operand} : {src_shape} to {des_shape}", [op_type[0], target_type]
+        op_str = f"arith.index_cast %{operand}"
+        shape = f"{src_shape} to {des_shape}"
+        return format_mlir_op(op_str, shape, **kwargs), [op_type[0], target_type]
 
     @staticmethod
     def shape_cast(operand, src_shape, dst_shape, *args, **kwargs):
         operand_type = V.kernel.var_info[operand]
-        return f"vector.shape_cast %{operand} : {src_shape} to {dst_shape}", operand_type
+        op_str = f"vector.shape_cast %{operand}"
+        shape = f"{src_shape} to {dst_shape}"
+        return format_mlir_op(op_str, shape, **kwargs), operand_type
+
+    @staticmethod
+    def extract_strided_slice(operand, target_size, offsets=None, sizes=None, strides=None, *args, **kwargs):
+        op_type = V.kernel.var_info[operand]
+        src_size = op_type[0]
+        dtype = op_type[1]
+
+        if offsets is None:
+            offsets = [0]
+        if sizes is None:
+            sizes = [target_size]
+        if strides is None:
+            strides = [1]
+
+        src_shape = f"vector<{src_size}x{dtype}>"
+        dst_shape = f"vector<{target_size}x{dtype}>"
+
+        offsets_str = ", ".join(str(o) for o in offsets)
+        sizes_str = ", ".join(str(s) for s in sizes)
+        strides_str = ", ".join(str(s) for s in strides)
+
+        # Build attributes dict for offsets, sizes, strides
+        built_attributes = {
+            "offsets": f"[{offsets_str}]",
+            "sizes": f"[{sizes_str}]",
+            "strides": f"[{strides_str}]"
+        }
+
+        # Merge with any existing attributes from kwargs
+        existing_attributes = kwargs.get('attributes', {})
+        if isinstance(existing_attributes, dict):
+            merged_attributes = {**built_attributes, **existing_attributes}
+        elif isinstance(existing_attributes, str):
+            built_attrs_str = ", ".join(f"{k}={v}" for k, v in built_attributes.items())
+            merged_attributes = f"{built_attrs_str}, {existing_attributes}"
+        else:
+            merged_attributes = built_attributes
+
+        op_str = f"vector.extract_strided_slice %{operand}"
+        shape = f"{src_shape} to {dst_shape}"
+
+        # Pass merged attributes to format_mlir_op
+        updated_kwargs = {**kwargs, 'attributes': merged_attributes}
+        return format_mlir_op(op_str, shape, **updated_kwargs), [target_size, dtype]
+
+    @staticmethod
+    def vlane_offset(operand1, operand2, *args, **kwargs):
+        tile_size, ret_type, operand1, operand2 = ExtensionOverrides.binary_elementwise_common(operand1, operand2)
+        shape = f"vector<{tile_size}x{ret_type}>" if tile_size > 1 else ret_type
+        opcode = f'arith.add{ret_type[0]}'
+        op_str = f'{opcode} %{operand1}, %{operand2}'
+        return format_mlir_op(op_str, shape, **kwargs), [tile_size, ret_type]
 
     @staticmethod
     def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_name, *args, **kwargs):
@@ -1034,12 +1147,14 @@ def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, **
         if compute_vec_size == 1:
             vshape = f"{mlir_dtype}"
             operation = "affine.load"
-            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}"
+            line = f"{operation} %{buffer}[{indices}]"
+            shape = buffer_shape
         else:
             vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
             operation = "affine.vector_load"
-            line = f"{operation} %{buffer}[{indices}] : {buffer_shape}, {vshape}"
-        return line, [compute_vec_size, mlir_dtype]
+            line = f"{operation} %{buffer}[{indices}]"
+            shape = f"{buffer_shape}, {vshape}"
+        return format_mlir_op(line, shape, **kwargs), [compute_vec_size, mlir_dtype]
 
     @staticmethod
     def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kwargs):
@@ -1048,11 +1163,14 @@ def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kw
         if compute_vec_size == 1:
             vshape = f"{mlir_dtype}"
             operation = "affine.store"
-            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}"
+            line = f"{operation} %{operand}, %{buffer}[{indices}]"
+            shape = buffer_shape
         else:
             vshape = f"vector<{compute_vec_size}x{mlir_dtype}>"
             operation = "affine.vector_store"
-            line = f"{operation} %{operand}, %{buffer}[{indices}] : {buffer_shape}, {vshape}"
+            line = f"{operation} %{operand}, %{buffer}[{indices}]"
+            shape = f"{buffer_shape}, {vshape}"
+        line = format_mlir_op(line, shape, **kwargs)
 
         if buffer_name is not None:
             return common.DeferredLine(buffer_name, line), [None, None]

From 729b999d37f563cdd51a1ef112965645b4ec8db9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 07:35:40 +0000
Subject: [PATCH 074/194] [Tests/Diffusion] Add embedding test case

---
 tests/Diffusion/test_diffusion.py | 122 +++++++++++++++++++++++++-----
 1 file changed, 104 insertions(+), 18 deletions(-)

diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py
index d6d740fe..082ed865 100644
--- a/tests/Diffusion/test_diffusion.py
+++ b/tests/Diffusion/test_diffusion.py
@@ -8,6 +8,7 @@
 from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
 from diffusers.models.upsampling import Upsample2D
 from diffusers.models.resnet import ResnetBlock2D
+from diffusers.models.embeddings import Timesteps
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -313,7 +314,7 @@ def test_cross_attn_down_block2d(
     dual_cross_attention=False
 ):
     print(f"Testing CrossAttnDownBlock2D on device: {device}")
-    
+
     # 1. Initialize the module on CPU
     cpu_block = CrossAttnDownBlock2D(
         in_channels=in_channels,
@@ -338,7 +339,7 @@ def test_cross_attn_down_block2d(
             temb=temb_cpu,
             encoder_hidden_states=encoder_hidden_states_cpu,
         )
-    
+
     # 4. Initialize the module on the custom device
     device_block = cpu_block.to(device).eval()
     device_block = torch.compile(device_block, dynamic=False)
@@ -347,7 +348,7 @@ def test_cross_attn_down_block2d(
     hidden_states_dev = hidden_states_cpu.to(device)
     temb_dev = temb_cpu.to(device)
     encoder_hidden_states_dev = encoder_hidden_states_cpu.to(device)
-    
+
     # 6. Get the output from the custom device module
     with torch.no_grad():
         dev_out, _ = device_block(
@@ -442,9 +443,9 @@ def test_groupnorm(
 
     # 1. Initialize the module on CPU
     cpu_norm = torch.nn.GroupNorm(
-        num_groups=num_groups, 
-        num_channels=channels, 
-        eps=eps, 
+        num_groups=num_groups,
+        num_channels=channels,
+        eps=eps,
         affine=True
     ).to("cpu").eval()
 
@@ -462,13 +463,13 @@ def test_groupnorm(
 
     # 4. Initialize the module on the custom device
     device_norm = torch.nn.GroupNorm(
-        num_groups=num_groups, 
-        num_channels=channels, 
-        eps=eps, 
+        num_groups=num_groups,
+        num_channels=channels,
+        eps=eps,
         affine=True
     ).to(device).eval()
     device_norm = torch.compile(device_norm, dynamic=False)
-    
+
     # Copy the weights from the CPU module to ensure they are identical
     device_norm.weight.data.copy_(cpu_norm.weight.data)
     device_norm.bias.data.copy_(cpu_norm.bias.data)
@@ -541,6 +542,89 @@ def test_upsample2d(
     print("Max diff >", torch.max(torch.abs(y_dev.cpu() - y_cpu)).item())
     print("Upsample2D simulation done.")
 
+
+def test_flip_sin_to_cos_embedding(
+    device,
+    batch=1,
+    embedding_dim=256,
+    rtol=1e-4,
+    atol=1e-4,
+):
+    def create_embeddings(timesteps, embedding_dim, scale=1.0, flip_sin_to_cos=False):
+        """
+        Replicate the embedding creation logic from Timesteps class.
+        """
+        half_dim = embedding_dim // 2
+        exponent = -math.log(10000) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
+        exponent = exponent / half_dim
+        emb = torch.exp(exponent)
+        emb = timesteps[:, None].float() * emb[None, :]
+        emb = scale * emb
+
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+        # flip sine and cosine embeddings
+        if flip_sin_to_cos:
+            new_emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+            return emb, new_emb
+        return emb, emb
+
+    g = torch.Generator().manual_seed(0)
+    timesteps_cpu = torch.randint(low=0, high=1000, size=(batch,), generator=g, dtype=torch.long)
+
+    # Test with flip_sin_to_cos=True
+    with torch.no_grad():
+        emb_flip_cpu = create_embeddings(timesteps_cpu, embedding_dim, flip_sin_to_cos=True)
+
+    # Move to device and test
+    timesteps_dev = timesteps_cpu.to(device)
+    @torch.compile(dynamic=False)
+    def create_embeddings_compiled(timesteps, embedding_dim, scale=1.0, flip_sin_to_cos=False):
+        return create_embeddings(timesteps, embedding_dim, scale, flip_sin_to_cos)
+
+    with torch.no_grad():
+        emb_flip_dev = create_embeddings_compiled(timesteps_dev, embedding_dim, flip_sin_to_cos=True)
+
+    # Verify flip case
+    test_result("Embedding (flip_sin_to_cos=True)", emb_flip_dev[0], emb_flip_cpu[0], rtol=rtol, atol=atol)
+    print("Max diff (flip) >", torch.max(torch.abs(emb_flip_dev[0].cpu() - emb_flip_cpu[0])).item())
+    test_result("Embedding (flip_sin_to_cos=True)", emb_flip_dev[1], emb_flip_cpu[1], rtol=rtol, atol=atol)
+    print("Max diff (flip) >", torch.max(torch.abs(emb_flip_dev[1].cpu() - emb_flip_cpu[1])).item())
+
+
+def test_timesteps(
+    device,
+    batch=1,
+    num_channels=64,
+    flip_sin_to_cos=True,
+    downscale_freq_shift=1.0,
+    rtol=1e-4,
+    atol=1e-4,
+):
+    print(f"Testing Timesteps on device: {device}")
+
+    cpu_timesteps = Timesteps(
+        num_channels=num_channels,
+        flip_sin_to_cos=flip_sin_to_cos,
+        downscale_freq_shift=downscale_freq_shift,
+    ).to("cpu").eval()
+
+    g = torch.Generator().manual_seed(0)
+    timesteps_cpu = torch.randint(low=0, high=1000, size=(batch,), generator=g, dtype=torch.long)
+
+    with torch.no_grad():
+        cpu_out = cpu_timesteps(timesteps_cpu)
+
+    dev_timesteps = cpu_timesteps.to(device).eval()
+    dev_timesteps = torch.compile(dev_timesteps, dynamic=False)
+
+    timesteps_dev = timesteps_cpu.to(device)
+    with torch.no_grad():
+        dev_out = dev_timesteps(timesteps_dev)
+
+    test_result("Timesteps", dev_out, cpu_out, rtol=rtol, atol=atol)
+    print("Max diff >", torch.max(torch.abs(dev_out.cpu() - cpu_out)).item())
+    print("Timesteps simulation done.")
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run UNet (diffusers) test with comparison")
     parser.add_argument("--model", type=str, default="runwayml/stable-diffusion-v1-5",
@@ -557,14 +641,16 @@ def test_upsample2d(
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
 
-    test_upsample2d(device)
-    test_groupnorm(device)
-    test_groupnorm(device, stride=[1, 1, 320*32, 320])
-    test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=320)
-    test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280)
-    test_cross_attn_down_block2d(device)
-    test_unet_mid_block2d_cross_attn(device)
-    test_cross_attn_up_block2d(device)
+    #test_upsample2d(device)
+    #test_groupnorm(device)
+    #test_groupnorm(device, stride=[1, 1, 320*32, 320])
+    #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=256, resnet_act_fn='silu')
+    #test_resnetblock2d(device, in_channels=640, out_channels=320, temb_channels=1280)
+    #test_cross_attn_down_block2d(device)
+    #test_unet_mid_block2d_cross_attn(device)
+    #test_cross_attn_up_block2d(device)
+    #test_flip_sin_to_cos_embedding(device)
+    #test_timesteps(device)
     test_unet2d_condition_model(device)
     #test_unet_conditional(
     #    device=device,

From 7fa8d5425b94a60fcb6b25c1d2f0bebb63cfba56 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 08:10:55 +0000
Subject: [PATCH 075/194] [Tests/MoE] Add patch to avoid dynamo bug

---
 tests/MoE/test_moe.py | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index 9ebfb11e..f9c96aff 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -16,31 +16,19 @@
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-# FIXME. This is a temporary solution to avoid is_forward conflict during backward
-def patch_compile_event_logger():
-    """Patch CompileEventLogger.compilation_metric to avoid is_forward conflict during backward."""
-    from torch._dynamo.utils import CompileEventLogger
+# FIXME. This is a Dynamo bug. Solution to avoid is_forward conflict during backward
+def patch_metrics_context_update():
+    """Patch MetricsContext.update to set overwrite=True by default."""
     from torch._dynamo.utils import get_metrics_context
+    ctx = get_metrics_context()
+    original_update = ctx.update
 
-    original_compilation_metric = CompileEventLogger.compilation_metric
-
-    @staticmethod
-    def patched_compilation_metric(is_forward=True, **kwargs):
-        """Patched version that clears is_forward before setting it if there's a conflict."""
-        try:
-            metrics_context = get_metrics_context()
-            if metrics_context.in_progress() and hasattr(metrics_context, '_metrics'):
-                # If is_forward is already set and we're trying to set it to a different value, clear it first
-                current_is_forward = metrics_context._metrics.get('is_forward')
-                if current_is_forward is not None and current_is_forward != is_forward:
-                    metrics_context._metrics.pop('is_forward', None)
-        except:
-            pass
-        # Call the original function
-        return original_compilation_metric(is_forward=is_forward, **kwargs)
+    def patched_update(values, overwrite=True):
+        """Patched version that sets overwrite=True by default."""
+        return original_update(values, overwrite=True)
 
     # Patch the method
-    CompileEventLogger.compilation_metric = patched_compilation_metric
+    get_metrics_context().update = patched_update
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     pass_message = f"|{name} Test Passed|"
@@ -469,6 +457,7 @@ def test_moe(device):
     total_cpu_loss = cpu_loss + cpu_aux_loss
     total_loss.to(device)
 
+    patch_metrics_context_update()
     print("Backward Started!")
     total_loss.backward()
     total_cpu_loss.backward()
@@ -496,7 +485,7 @@ def test_moe(device):
 
 def train_moe(device):
     # Patch CompileEventLogger to avoid metric conflicts
-    patch_compile_event_logger()
+    patch_metrics_context_update()
 
     def perceptron(a, b, c):
         return a * b + c
@@ -619,7 +608,7 @@ def weight_update(a, b, lr):
 
 def train_moe_mnist(device):
     # Patch CompileEventLogger to avoid metric conflicts
-    patch_compile_event_logger()
+    patch_metrics_context_update()
 
     torch.manual_seed(0)
     batch_size = 32
@@ -703,7 +692,7 @@ def train(model, device, train_loader, optimizer, epochs):
 
 def train_moe_single_iteration(device, iter_idx, is_evaluation=0):
     # Patch CompileEventLogger to avoid metric conflicts
-    patch_compile_event_logger()
+    patch_metrics_context_update()
 
     # Training moe with mnist dataset for sinlge iteration
     torch.manual_seed(0)

From 7919094fe10c40434f0cf7ecd599a09cd12c08d9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 11:20:24 +0000
Subject: [PATCH 076/194] [Fix] Change wrong TORCHSIM_DUMP_PATH usage

---
 README.md                          | 6 +++---
 experiments/BERT.py                | 2 +-
 experiments/attention.py           | 2 +-
 experiments/conv.py                | 2 +-
 experiments/gemm.py                | 2 +-
 experiments/layernorm.py           | 2 +-
 experiments/resnet18.py            | 2 +-
 experiments/resnet50.py            | 2 +-
 experiments/softmax.py             | 2 +-
 scripts/chiplet_prep.py            | 2 +-
 scripts/chiplet_prep.sh            | 2 +-
 scripts/sparsity_experiment/run.sh | 2 +-
 tutorial/session2/Hands_on.ipynb   | 2 +-
 13 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 4d98baa4..4a3ef145 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,7 @@ The `tests` directory contains several AI workloads examples.
 ```bash
 python tests/test_matmul.py 
 ```
-The result is stored to `TORCHSIM_DUMP_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats.
+The result is stored to `TORCHSIM_LOG_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats.
 
 ### Run Your Own Model on PyTorchSim
 You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device.  
@@ -197,9 +197,9 @@ Log contains memory & core stats.
 [2025-12-05 08:05:52.538] [info] Total execution cycles: 2065
 [2025-12-05 08:05:52.538] [info] Wall-clock time for simulation: 0.147463 seconds
 ```
-The log is dumped in `TORCHSIM_DUMP_PATH` and you can set the path as below.
+The log is dumped in `TORCHSIM_LOG_PATH` and you can set the path as below.
 ```bash
-export TORCHSIM_DUMP_PATH=/tmp/torchinductor # output file dump path
+export TORCHSIM_LOG_PATH=/tmp/torchinductor # output file dump path
 ```
 
 ## Training
diff --git a/experiments/BERT.py b/experiments/BERT.py
index 5ccd3084..fd671833 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -48,7 +48,7 @@ def run_BERT(size, input_seq, config):
     input_seq = args.input_size
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'pytorchsim_functional_mode' in os.environ:
diff --git a/experiments/attention.py b/experiments/attention.py
index 842f105a..211433f1 100644
--- a/experiments/attention.py
+++ b/experiments/attention.py
@@ -47,7 +47,7 @@ def attention(query, key, value):
     size_str = "x".join([str(i) for i in size])
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"attention_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'pytorchsim_functional_mode' in os.environ:
diff --git a/experiments/conv.py b/experiments/conv.py
index 25952fb0..61f7ad80 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -48,7 +48,7 @@ def custom_conv2d(a, b, bias):
     size_str = "_".join([str(i) for i in size])
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'pytorchsim_functional_mode' in os.environ:
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 3090e331..44be689a 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -42,7 +42,7 @@ def custom_matmul(a, b):
     size_str = "x".join([str(i) for i in size])
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'pytorchsim_functional_mode' in os.environ:
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
index 9c9934a1..a6b16986 100644
--- a/experiments/layernorm.py
+++ b/experiments/layernorm.py
@@ -38,7 +38,7 @@ def run_layernorm(size, config):
     size_str = "x".join([str(i) for i in size])
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 5451e0f5..c7763d86 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -39,7 +39,7 @@ def run_resnet(batch, config):
     batch = args.batch
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 83d82db4..4e611541 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -39,7 +39,7 @@ def run_resnet(batch, config):
     batch = args.batch
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
diff --git a/experiments/softmax.py b/experiments/softmax.py
index 580d56ca..d30559f7 100644
--- a/experiments/softmax.py
+++ b/experiments/softmax.py
@@ -38,7 +38,7 @@ def run_softmax(size, config, dim=1):
     size_str = "x".join([str(i) for i in size])
     result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"Softmax_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
     # setting environment variables
-    os.environ['TORCHSIM_DUMP_PATH'] = result_path
+    os.environ['TORCHSIM_LOG_PATH'] = result_path
     # only timing simulation
     os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
     if 'pytorchsim_functional_mode' in os.environ:
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
index 4f8b7f7c..213eb85b 100644
--- a/scripts/chiplet_prep.py
+++ b/scripts/chiplet_prep.py
@@ -73,7 +73,7 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
 
     folder = int(args.size)
     print("Taget size: ", folder)
-    folder_path = os.environ.get("TORCHSIM_DUMP_PATH")
+    folder_path = os.environ.get("TORCHSIM_LOG_PATH")
     print(folder_path)
     os.makedirs(folder_path, exist_ok=True)
     test_matmul(device, folder, folder, folder)
diff --git a/scripts/chiplet_prep.sh b/scripts/chiplet_prep.sh
index cddf1a58..f3bd1a1c 100755
--- a/scripts/chiplet_prep.sh
+++ b/scripts/chiplet_prep.sh
@@ -8,7 +8,7 @@ for size in "${sizes[@]}"; do
     export TORCHSIM_TILE_M=$((size / 2))
     export TORCHSIM_TILE_K=$((size / 2))
     export TORCHSIM_TILE_N=$((size / 2))
-    export TORCHSIM_DUMP_PATH=$(pwd)/chiplet_result/$size
+    export TORCHSIM_LOG_PATH=$(pwd)/chiplet_result/$size
     python3 chiplet_prep.py $size
     #python3 chiplet_run.py $(pwd)/chiplet_result
 done
\ No newline at end of file
diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh
index 84c818ac..da9b73cc 100755
--- a/scripts/sparsity_experiment/run.sh
+++ b/scripts/sparsity_experiment/run.sh
@@ -1,4 +1,4 @@
-export TORCHSIM_DUMP_PATH=$(pwd)/result
+export TORCHSIM_LOG_PATH=$(pwd)/result
 export SPIKE_DUMP_SPARSE_TILE=1
 export TORCHSIM_FORCE_TIME_K=8
 export TORCHSIM_FORCE_TIME_M=8
diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb
index 2d5a5cdc..2964f293 100644
--- a/tutorial/session2/Hands_on.ipynb
+++ b/tutorial/session2/Hands_on.ipynb
@@ -32,7 +32,7 @@
     "import torch._dynamo\n",
     "import torch.utils.cpp_extension\n",
     "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n",
+    "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n",
     "sys.path.append(base_dir)\n",
     "\n",
     "from Scheduler.scheduler import PyTorchSimRunner\n",

From 1ca33488eb464bd766d042764f562d6b3fe616d1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 11:34:12 +0000
Subject: [PATCH 077/194] [Scheduler] Validate pytorchsim_timing_mode != 0 in
 Scheduler constructor

---
 Scheduler/scheduler.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 3f5673a8..dfd4aab6 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -376,6 +376,12 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
 
         togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
         self.tog_simulator = TOGSimulator(togsim_path, togsim_config)
+        if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0:
+            # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).
+            logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. ")
+            logger.error(f"Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).")
+            exit(0)
+
         os.environ['TOGSIM_CONFIG'] = togsim_config
         self.tog_simulator.interactive_simulation()
         if engine_select == Scheduler.FIFO_ENGINE:

From 8df3beeab76256b2dbb2472bd6b73b80c43d1aa8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 15 Jan 2026 14:51:44 +0000
Subject: [PATCH 078/194] [Fix] Move rename_indexing before load cacheing

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 4 ----
 PyTorchSimFrontend/mlir/mlir_common.py          | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 912c618a..01485d2e 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -435,7 +435,6 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0))
         return index
 
     def load(self, name: str, index: sympy.Expr):
-        index = self.rename_indexing(index)
         index, comptute_depedency = self.convert_indirect_indexing(index)
         padding = self.get_padding_type()
 
@@ -489,7 +488,6 @@ def load(self, name: str, index: sympy.Expr):
         return out
 
     def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs):
-        index = self.rename_indexing(index)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
@@ -642,7 +640,6 @@ def store_reduction(self, name, index, value):
         dram_var = self.kernel_group.args.output(name)
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
-        index = self.rename_indexing(index)
 
         with self.override_buffer_cse(cse=self.reduction_cse):
             # Tile is always reuduced in inner loop
@@ -779,7 +776,6 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
         return accum
 
     def index_expr(self, index, dtype):
-        index = self.rename_indexing(index)
         base_tile_desc = self.kernel_group.tile_desc
         if len(self.ranges) != self.reduction_depth:
             # FIXME. This is a temporary solution to get tile stride of the reduction case
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index e31555ba..ad755c6e 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -913,6 +913,7 @@ def indirect_indexing(index_var, size, check=True, wrap_neg=True):
 
             @staticmethod
             def load(name: str, index: sympy.Expr):
+                index = self.rename_indexing(index)
                 if name in self.cse.invalidated_stores:
                     # A load from an invalidated store requires us to
                     # keep the actual buffer around
@@ -937,6 +938,7 @@ def store(name, index, value, mode=None):
                         for other_name in self.current_node.get_output(name).get_mutations():
                             self.cse.store_cache[other_name] = value
                 if name not in V.graph.removed_buffers:
+                    index = self.rename_indexing(index)
                     return self.store(name, index, value, mode=mode)
 
             @staticmethod
@@ -948,6 +950,7 @@ def store_reduction(name, index, value):
                         self.cse.store_cache[other_name] = value
 
                 if name not in V.graph.removed_buffers:
+                    index = self.rename_indexing(index)
                     return self.store_reduction(name, index, value)
 
             @staticmethod
@@ -960,6 +963,7 @@ def _index_expr(tile_size, buffer, renamed_expression, index):
 
             @staticmethod
             def index_expr(index, dtype):
+                index = self.rename_indexing(index)
                 return self.index_expr(index, dtype)
 
             @staticmethod

From ea79ad0cda4ddffa0ba8e1abca78bfd92a285463 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 16 Jan 2026 10:27:17 +0000
Subject: [PATCH 079/194] [Fusion] Fix template codegen + Add custom fusion
 hook

---
 PyTorchSimFrontend/mlir/mlir_scheduling.py | 51 +++++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_template.py   | 23 ++++++----
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index f2bcba7e..aff2f0b0 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -25,13 +25,48 @@ class MLIRScheduling(BaseScheduling):
     target_kernel = MLIRKernel
     def __init__(self, scheduler):
         self.scheduler = scheduler
-        #self.scheduler.enter_context = self.enter_context_fixed # FIXME. Monkey patch: For fixing the inductor bug
+        if scheduler is not None:
+            self.scheduler.can_fuse_origin = self.scheduler.can_fuse
+            self.scheduler.can_fuse = self.can_fuse_with_exceptions # FIXME. Monkey patch: For prolouge fusion
         self.kernel_group = mlir_common.MLIRWrapperKenrelGroup()
         self._ready_to_flush = False
         self.outer_function = set()
         config.inplace_buffers = False # FIXME. inout kernel makes trouble.. So disabled it!
         self.max_fusion_size = 5
 
+    def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
+        if not extension_config.CONFIG_FUSION:
+            return False
+
+        # Extract base template node
+        base_template_node1 = [node for node in node1.get_nodes() if node.is_template()]
+        base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
+
+        # Case 3: Prologue(Pointwise) + Tempalte
+        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
+            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+
+            target_node = base_template_node2[0].node
+            # Currently only BMM, MM support prologue fusion
+            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+                return False
+
+            if len(node1.read_writes.writes) != 1:
+                return False
+            if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
+                return False
+
+            # We don't fuse this edge case...
+            if base_template_node2[0].group[1][0][0] == 1:
+                return False
+
+            if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+                node1 = self.revert_group(node1)
+                return True
+        return self.scheduler.can_fuse_origin(node1, node2)
+
+
     def _set_flush_status(self, status: bool):
         self._ready_to_flush = status
 
@@ -45,6 +80,9 @@ def get_backend_features(self, device):
     def can_fuse_vertical(self, node1, node2):
         return self.can_fuse_horizontal(node1, node2)
 
+    def can_fuse_multi_outputs_template(self, node1, node2):
+        return self.can_fuse_horizontal(node1, node2)
+
     def can_fuse_horizontal(self, node1, node2):
         if not extension_config.CONFIG_FUSION:
             return False
@@ -88,7 +126,7 @@ def can_fuse_horizontal(self, node1, node2):
             return same_iter and no_dependency
 
         # Case 1: Template + Pointwise fusion
-        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and not node2.is_reduction():
+        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
@@ -132,9 +170,10 @@ def can_fuse_horizontal(self, node1, node2):
             return True
 
         # Case 2: Tempalte + Reduction fusion
-        if len(base_template_node1) == 1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
+        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+            target_node = base_template_node1[0].node
             if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
                 return False
 
@@ -149,7 +188,7 @@ def can_fuse_horizontal(self, node1, node2):
             # We can't fuse dim=-1
             layout_possible = stride != 1
             # Directed linked?
-            dependency_check = node2.get_nodes()[0] in [node.node for node in base_template_node1[0].users]# and len(node2.read_writes.reads)==1
+            dependency_check = writes1 & reads2
             dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
             return size_match and layout_possible and dependency_check and dependency_size
 
@@ -177,8 +216,8 @@ def can_fuse_horizontal(self, node1, node2):
                 return True
 
         # Check elementwise fusion
-        if vars1 == vars2 and reduce1 == reduce2:
-            return True
+        if vars1 == vars2 and reduce1 == reduce2 and not node1.is_reduction() and not node2.is_reduction():
+            return writes1 & reads2
         return False
 
     def revert_group(self, act_nodes, args=None, var_ranges=None):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 304d0090..31796a8b 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -573,8 +573,8 @@ def template_store():
             with contextlib.ExitStack() as stack:
                 stack.enter_context(compute_body.indent(attribute="{inner_loop=false}",suffix=self.compute_body_loop.epilogue_line()))
                 if self.reduction_fusion:
-                    compute_body.writelines(self.reduction_body_loop.lines())
                     compute_body.splice(self.masks)
+                    compute_body.writelines(self.reduction_body_loop.lines())
                     stack.enter_context(compute_body.indent(attribute="{inner_loop=false}"))
                     compute_body.splice(self.loads)
                     compute_body.splice(self.compute)
@@ -848,7 +848,6 @@ def get_spad_size_per_lane(self, tile_m, tile_n):
         return max(size, 2) # vector load/store
 
     def load_epilogue(self, name: str, index: sympy.Expr):
-        index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.input(name)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
@@ -898,7 +897,6 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         return out
 
     def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
-        index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.output(name)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
@@ -1000,7 +998,6 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         return sram_var
 
     def store_reduction_epilogue(self, name, index, value):
-        index = self.rename_indexing(index)
         dram_var = self.kernel_group.args.output(name)
         dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
         dtype = V.graph.get_dtype(name)
@@ -1119,11 +1116,19 @@ def set_tile_size(self, template_fusion_info, prologue=False):
         return tile_desc
 
     def rename_indexing(self, index) -> sympy.Expr:
-        for dim_name, dim_aliased_name in self.dim_aliasing.items():
-            index = index.subs(sympy.Symbol(dim_name), sympy.Symbol("tmp_"+dim_aliased_name))
-        # To avoid this case ({"index0":"index1", "index1":"index0"})
-        for dim_aliased_name in self.dim_aliasing.values():
-            index = index.subs(sympy.Symbol("tmp_"+dim_aliased_name), sympy.Symbol(dim_aliased_name))
+        # First step: replace dim_name with tmp_+dim_aliased_name to avoid circular dependencies
+        # (e.g., {"index0":"index1", "index1":"index0"})
+        tmp_subs = {
+            sympy.Symbol(dim_name): sympy.Symbol("tmp_"+dim_aliased_name)
+            for dim_name, dim_aliased_name in self.dim_aliasing.items()
+        }
+        index = index.subs(tmp_subs)
+        # Second step: replace tmp_+dim_aliased_name with dim_aliased_name
+        final_subs = {
+            sympy.Symbol("tmp_"+dim_aliased_name): sympy.Symbol(dim_aliased_name)
+            for dim_aliased_name in self.dim_aliasing.values()
+        }
+        index = index.subs(final_subs)
         return index
 
 class MLIRTemplateCaller(CUDATemplateCaller):

From 0c6175fdc0e354284cbc3f12cf64405dfb319113 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 19 Jan 2026 15:23:17 +0000
Subject: [PATCH 080/194] [Template] Fix template fusion codegen

---
 .../mlir/mlir_codegen_backend.py              | 87 +++++++++++--------
 PyTorchSimFrontend/mlir/mlir_common.py        | 18 ++--
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |  4 +-
 PyTorchSimFrontend/mlir/mlir_ops.py           | 39 ++++++++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    | 62 +++++++------
 PyTorchSimFrontend/mlir/mlir_template.py      | 34 +++++---
 6 files changed, 153 insertions(+), 91 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 01485d2e..671d0e09 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -327,7 +327,7 @@ def get_padding_type(self):
         #         return 1
         return 0
 
-    def convert_index(self, expr, buffer):
+    def convert_index(self, expr):
         if len(expr.free_symbols) != 1:
             raise NotImplementedError("Not supporting this view operation...!")
 
@@ -346,17 +346,37 @@ def convert_index(self, expr, buffer):
         first_arg = expr.args[0]
         if len(first_arg.free_symbols) != 1:
             raise NotImplementedError("What is this case?")
+
+        # Create affine.apply operation
         indices = [list(first_arg.free_symbols)[0]]
-        args = ", ".join(map(str, indices))
-        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args}) -> ({expr_str})>")
-        args = ", ".join([f"%{i}" for i in indices])
-        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})")
+        with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse):
+            map_var = ops.affine_map(indices, expr_str)
+        index = ops.affine_apply(map_var, indices)
         return index
 
-    def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> common.CSEVariable:
-        if buffer is None:
-            buffer = self.applys
+    def _convert_sympy_to_mlir_expr(self, expr, sorted_args):
+        """
+        Convert sympy expression to MLIR affine map expression by replacing index variables.
+        """
+        indices = []
+
+        for arg in sorted_args:
+            if arg.is_Mul and arg.args[0].is_number:
+                target_arg = arg.args[1]
+            elif not arg.is_number:
+                target_arg = arg
+            else:
+                continue
+            new_arg = sympy.Symbol(str(self.convert_index(target_arg)))
+            expr = expr.replace(target_arg, new_arg)
+            indices.append(str(new_arg))
+
+        expr_str = str(expr)
+        if "//" in expr_str:
+            expr_str = expr_str.replace("//", " floordiv ")
+        return expr_str, indices
 
+    def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> common.CSEVariable:
         # Constant case
         if expr.is_number and len(indirect_dims) == 0:
             return self.get_const_cse(int(expr))
@@ -372,33 +392,25 @@ def parse_indices(self, expr, buffer=None, comments="", indirect_dims=[]) -> com
         # Sort index variable.. ex) (%index1, %index0)
         args_dict = {term: list(term.free_symbols)[0] for term in args if term.free_symbols}
         sorted_args = sorted(args_dict.keys(), key=lambda term: str(args_dict[term]))
-        indices = []
-        for arg in sorted_args:
-            if arg.is_Mul and arg.args[0].is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
-                expr = expr.replace(arg.args[1], new_arg)
-                indices.append(str(new_arg))
-            elif not arg.is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
-                expr = expr.replace(arg, new_arg)
-                indices.append(str(new_arg))
+
+        # Convert sympy expression to affine map expression
+        expr_str, indices = self._convert_sympy_to_mlir_expr(expr, sorted_args)
 
         # Extract index var
-        indirect_args = [f"%{i}" for i in indirect_dims]
-        if len(indirect_args):
+        if len(indirect_dims):
             comments = "{indirect_access} " + comments # Add indirect access attribute
-        expr_str = str(expr)
-        if "//" in expr_str:
-            expr_str = expr_str.replace("//", " floordiv ")
-        args = ", ".join(map(str, indices))
-        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[{','.join(indirect_dims)}] -> ({expr_str})>")
-        args = ", ".join([f"%{i}" for i in indices])
-        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[{','.join(indirect_args)}] {comments}")
+        indirect_args = [f"%{i}" for i in indirect_dims]
+        # Create affine.apply operation
+        with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse):
+            map_var = ops.affine_map(indices, expr_str, symbol_names=indirect_dims)
+
+        if hasattr(self, "dim_aliasing"):
+            indices = [self.dim_aliasing.get(index, index) for index in indices]
+        index = ops.affine_apply(map_var, indices, indirect_dims=indirect_args, comment=comments)
         return index
 
-    def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0)) -> common.CSEVariable:
-        if buffer is None:
-            buffer = self.applys
+    def parse_index_list(self, expr_list:list, offset=sympy.Number(0)) -> common.CSEVariable:
+        """ Need to override buffer and cse to use this function. """
         expr_list = [arg for arg in expr_list]
         dim_list = [f"d{i}" for i in range(len(expr_list))]
 
@@ -413,11 +425,11 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0))
         new_expr_list = [0] * len(expr_list)
         for idx, arg in enumerate(expr_list):
             if arg.is_Mul and arg.args[0].is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1], buffer)))
+                new_arg = sympy.Symbol(str(self.convert_index(arg.args[1])))
                 new_expr_list[idx] = arg.subs(arg.args[1], dim_list[idx])
                 indices.append(str(new_arg))
             elif not arg.is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg, buffer)))
+                new_arg = sympy.Symbol(str(self.convert_index(arg)))
                 new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx])
                 indices.append(str(new_arg))
             else:
@@ -427,11 +439,11 @@ def parse_index_list(self, expr_list:list, buffer=None, offset=sympy.Number(0))
                 indices.append(str(new_arg))
 
         # Extract index var
+        # Create affine.apply operation
         expr_str = str(sum(new_expr_list) + offset)
-        args = ", ".join(map(str, dim_list))
-        map_var = self.map_cse.generate(self.global_vars, f"affine_map<({args})[] -> ({expr_str})>")
-        args = ", ".join([f"%{i}" for i in indices])
-        index = self.apply_cse.generate(buffer, f"affine.apply #{map_var}({args})[]")
+        with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse):
+            map_var = ops.affine_map(dim_list, expr_str)
+        index = ops.affine_apply(map_var, indices)
         return index
 
     def load(self, name: str, index: sympy.Expr):
@@ -1080,7 +1092,8 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         if broadcast and (total_dims != local_dims or (self.reduction_depth!=len(total_dims) and total_dims[:self.reduction_depth] == local_dims)):
             local_dims = total_dims # Brodatcast tile shape
 
-        index_var = self.parse_indices(index, buffer=buffer, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}")
+        with self.override_buffer_cse(buffer=buffer, cse=self.apply_cse):
+            index_var = self.parse_indices(index, indirect_dims=indirect_dims, comments=f"// store_reduction={store_reduction}")
 
         if kg_tile_desc.vmap.vlane_split_axis in local_dims:
             local_vlane_split_axis = local_dims.index(kg_tile_desc.vmap.vlane_split_axis)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index ad755c6e..0717333a 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -612,6 +612,7 @@ def __init__(self, kernel_group, reason=None):
         instance_id = id(self)
         self.target_buffer_override = contextvars.ContextVar(f"Handler_compute_override_{instance_id}", default=self.compute)
         self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse)
+        self._nested_context_depth = 0
 
     def set_ranges(self, lengths, reduction_lengths):
         if self.call_ranges:
@@ -992,13 +993,20 @@ def bucketize(
                     values, offsets_name, offsets_size, indexing_dtype, right
                 )
 
-        super().__enter__()
-        assert self.overrides
-        parent_handler = self.overrides()
-        self.exit_stack.enter_context(V.set_ops_handler(CSEProxy()))
-        self.exit_stack.enter_context(V.set_kernel_handler(self))
+        if self._nested_context_depth == 0:
+            self.exit_stack.__enter__()
+            assert self.overrides
+            parent_handler = self.overrides()
+
+            self.exit_stack.enter_context(V.set_ops_handler(CSEProxy()))
+            self.exit_stack.enter_context(V.set_kernel_handler(self))
+        self._nested_context_depth += 1
         return self
 
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self._nested_context_depth -= 1
+        if self._nested_context_depth == 0:
+            super().__exit__(exc_type, exc_val, exc_tb)
 
 @dataclasses.dataclass
 class LoopLevel:
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index bbc63b45..0158caa6 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -154,7 +154,7 @@ def render(self,
         W_tile_desc.set_tile_size_stride(W_tile_size, W_tile_stride)
         W_tile_desc.set_name("W_buffer")
         W_tile_desc.offset = W.get_layout().offset
-        W_stride = W.get_layout().stride
+        W_stride = W.get_layout().stride if N>1 else [Y.get_layout().stride[0], 0]
         W_idx = [sympy.Symbol("index2") * W_stride[0], sympy.Symbol("index1") * W_stride[1]]
 
         vlane_split_axis = vlane_split_axis if nr_rdim==0 else 0
@@ -163,7 +163,7 @@ def render(self,
         Y_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
         Y_tile_desc.set_tile_size_stride(Y_tile_size, Y_tile_stride)
         Y_tile_desc.set_name("Y_buffer")
-        Y_stride = Y.get_layout().stride
+        Y_stride = Y.get_layout().stride if N>1 else [Y.get_layout().stride[0], 0]
         if nr_rdim == 0:
             Y_idx = [sympy.Symbol("index0") * Y_stride[0], sympy.Symbol("index1") * Y_stride[1]]
         else:
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 4cf031d2..fd0114e1 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -1175,4 +1175,41 @@ def _store(operand, buffer, indices, buffer_shape, *args, buffer_name=None, **kw
         if buffer_name is not None:
             return common.DeferredLine(buffer_name, line), [None, None]
         else:
-            return line, [None, None]
\ No newline at end of file
+            return line, [None, None]
+
+    @staticmethod
+    def affine_apply(map_var, indices, indirect_dims=None, comment=None, *args, **kwargs):
+        # Format indices arguments
+        indices_str = ", ".join([f"%{i}" for i in indices])
+        op_str = f"affine.apply #{map_var}({indices_str})"
+
+        # Add indirect dimensions if provided
+        if indirect_dims:
+            indirect_str = ", ".join(indirect_dims)
+            op_str += f"[{indirect_str}]"
+        if comment:
+            op_str += f" // {comment}"
+        return op_str, [1, "index"]
+
+    @staticmethod
+    def affine_map(dim_names, expr_str, symbol_names=None, comment=None, *args, **kwargs):
+        # Handle dim_names as list or string
+        if isinstance(dim_names, list):
+            dims_str = ", ".join([str(dim) for dim in dim_names])
+        else:
+            dims_str = dim_names
+
+        # Build the map string
+        if symbol_names:
+            if isinstance(symbol_names, list):
+                symbols_str = ", ".join(symbol_names)
+            else:
+                symbols_str = symbol_names
+            map_str = f"affine_map<({dims_str})[{symbols_str}] -> ({expr_str})>"
+        else:
+            map_str = f"affine_map<({dims_str}) -> ({expr_str})>"
+
+        if comment:
+            map_str += f" // {comment}"
+
+        return map_str, [1, "map"]
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index aff2f0b0..6c103829 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -35,15 +35,15 @@ def __init__(self, scheduler):
         self.max_fusion_size = 5
 
     def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode) -> bool:
-        if not extension_config.CONFIG_FUSION:
-            return False
+        if not extension_config.CONFIG_FUSION_PROLOGUE:
+            return self.scheduler.can_fuse_origin(node1, node2)
 
         # Extract base template node
         base_template_node1 = [node for node in node1.get_nodes() if node.is_template()]
         base_template_node2 = [node for node in node2.get_nodes() if node.is_template()]
 
         # Case 3: Prologue(Pointwise) + Tempalte
-        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
+        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
 
@@ -126,7 +126,7 @@ def can_fuse_horizontal(self, node1, node2):
             return same_iter and no_dependency
 
         # Case 1: Template + Pointwise fusion
-        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction():
+        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
@@ -170,7 +170,7 @@ def can_fuse_horizontal(self, node1, node2):
             return True
 
         # Case 2: Tempalte + Reduction fusion
-        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
+        if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
             from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
             from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             target_node = base_template_node1[0].node
@@ -185,39 +185,35 @@ def can_fuse_horizontal(self, node1, node2):
             except:
                 return False
 
-            # We can't fuse dim=-1
-            layout_possible = stride != 1
+            # We can't fuse dim=-1 & N == 1
+            layout_possible = stride != 1 and (1 not in node1.node.get_size())
             # Directed linked?
             dependency_check = writes1 & reads2
             dependency_size = all([i.get_numel() == node1.get_nodes()[0].node.get_numel() for i in node2.read_writes.reads])
             return size_match and layout_possible and dependency_check and dependency_size
 
         # Case 3: Prologue(Pointwise) + Tempalte
-        if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-
-            target_node = base_template_node2[0].node
-            # Currently only BMM, MM support prologue fusion
-            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
-                return False
-
-            if len(node1.read_writes.writes) != 1:
-                return False
-            if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
-                return False
-
-            # We don't fuse this edge case...
-            if base_template_node2[0].group[1][0][0] == 1:
-                return False
-
-            if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
-                node1 = self.revert_group(node1)
-                return True
-
-        # Check elementwise fusion
-        if vars1 == vars2 and reduce1 == reduce2 and not node1.is_reduction() and not node2.is_reduction():
-            return writes1 & reads2
+        # if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
+        #     from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
+        #     from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
+
+        #    target_node = base_template_node2[0].node
+        #    # Currently only BMM, MM support prologue fusion
+        #    if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+        #        return False
+
+        #    if len(node1.read_writes.writes) != 1:
+        #        return False
+        #    if node1.node not in target_node.inputs or any(["view" in str(ori) for ori in node1.node.origins]): #FIXME
+        #        return False
+
+        #    # We don't fuse this edge case...
+        #    if base_template_node2[0].group[1][0][0] == 1:
+        #        return False
+
+        #    if list(node1.read_writes.writes)[0].name in [dep.name for dep in node2.read_writes.reads]:
+        #        node1 = self.revert_group(node1)
+        #        return True
         return False
 
     def revert_group(self, act_nodes, args=None, var_ranges=None):
@@ -340,7 +336,7 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes):
         _, _, _, kernel.buffer_types = self.kernel_group.args.mlir_argdefs()
         src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
 
-        with V.set_kernel_handler(kernel):
+        with kernel:
             kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
                                              kernel.loop_size, origins={str(i) for i in template_node.node.origins})
             self.define_function(kernel)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 31796a8b..6ec043fb 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -473,7 +473,6 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
                     for node in epilogue_nodes:
                         node.codegen((vars, reduction_vars))
 
-        with V.set_kernel_handler(kernel):
             src_code = (
                 partial_code
                 if isinstance(partial_code, str)
@@ -785,8 +784,8 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
                    subtile_size:list=[], async_type=None, indent_size=0):
         # Prepare code block
         local_code = IndentedBuffer()
-        with V.set_kernel_handler(self):
-            index_var = self.parse_index_list(index_list, local_code, offset=tile_desc.offset)
+        with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse):
+            index_var = self.parse_index_list(index_list, offset=tile_desc.offset)
             node_layout = self.named_nodes[dram_var].get_layout()
             if dram_var in self.exception_nodes:
                 numel = self.exception_nodes[dram_var]["numel"]
@@ -826,7 +825,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
 
     def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0):
         # Prepare code block
-        with V.set_kernel_handler(self):
+        with self:
             dtype = self.named_nodes[dram_name].get_layout().dtype
             tile_shape = tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[dtype])
             buffer_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, id, forced_name=dram_name)
@@ -854,8 +853,9 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
         # Want to use tile_desc from epilogue_info
-        index_var = self.parse_indices(index)
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
+        with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse):
+            index_var = self.parse_indices(index)
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()]
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
@@ -888,7 +888,11 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             vsize = compute_vec_size//reduce_size
 
             if compute_vec_size > 1:
-                offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})")
+                with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse):
+                    map_var = ops.affine_map(["d0", "d1"], f"d0 + d1*{(self.r_tile_size)}")
+                with self.override_buffer_cse(buffer=self.loads):
+                    offset = ops.affine_apply(map_var, [self.compute_idx, self.reduction_loop_idx])
+                #offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})")
                 compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
 
             with self.override_buffer_cse(buffer=self.loads):
@@ -902,8 +906,9 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        index_var = self.parse_indices(index)
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
+        with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse):
+            index_var = self.parse_indices(index)
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()]
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
@@ -981,15 +986,17 @@ def reduction_epilogue(self, dtype, src_dtype, reduction_type, value):
         compute_index_var = ", ".join(zero_var_list)
         with self.override_buffer_cse(buffer=self.loads):
             out = ops._load(vec_size, type_name, sram_var, compute_index_var, tile_shape)
-
         # Reduction body codegen
         with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
             init = ops.constant(reduction_init(reduction_type, dtype), type_name)
             init_vec = ops.broadcast(init, compute_vec_size)
+            init_vec2 = ops.broadcast(init, local_tile_desc.get_numel_per_lane())
+            ops._store(init_vec2, sram_var, ", ".join([f"%{self.get_const_cse(0)}"] * local_tile_desc.get_nr_dim()), tile_shape)
 
         mask_shape, mask_var = self.get_mask()
         if mask_var is not None:
             value = ops.where(mask_var, value, init_vec)
+
         result = reduction_partial_combine_vec(reduction_type, value, out)
 
         # Store partial result
@@ -1003,8 +1010,9 @@ def store_reduction_epilogue(self, name, index, value):
         dtype = V.graph.get_dtype(name)
         mlir_dtype = mlir_common.DTYPE_TO_MLIR[dtype]
 
-        index_var = self.parse_indices(index, self.reductions_suffix, comments="// Store reduction")
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis
+        with self.override_buffer_cse(buffer=self.reductions_suffix, cse=self.apply_cse):
+            index_var = self.parse_indices(index, comments="// Store reduction")
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()][:-1] # Assume that there is only one reduction axis
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
 
@@ -1100,7 +1108,7 @@ def set_tile_size(self, template_fusion_info, prologue=False):
             self.r_tile_size = tile_desc.get_tile_size()[-1]
             self.r_dim_size = template_fusion_info['r_dim_size']
             self.reduction_nr_outer_loop = nr_outer_loop
-            self.reduction_loop_idx = "reduce_loop_idx"
+            self.reduction_loop_idx = self.register_var_cse("reduce_loop_idx", 1, "index")
             self.compute_body_loop.size = r_tile_size
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)

From a90f11483be095ca39928a91c11765f98d9285b0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 20 Jan 2026 11:17:38 +0000
Subject: [PATCH 081/194] [Fix] Fusion axis mechanism change

---
 .../mlir/mlir_codegen_backend.py              |  8 ++----
 PyTorchSimFrontend/mlir/mlir_common.py        | 27 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_ops.py           |  2 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  2 +-
 PyTorchSimFrontend/mlir/mlir_template.py      | 28 ++++---------------
 5 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 671d0e09..34ba1031 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -313,7 +313,9 @@ def __init__(self, kernel_group, reason=None):
         self.base_vector_initialized = False
 
     def reset(self, reason):
+        save = self.exit_stack, self._nested_context_depth
         self.__init__(self.kernel_group, reason=reason)
+        self.exit_stack, self._nested_context_depth = save
 
     # padding type 0: zero-padding 1: negative-padding(-inf) ...
     def get_padding_type(self):
@@ -395,17 +397,11 @@ def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> co
 
         # Convert sympy expression to affine map expression
         expr_str, indices = self._convert_sympy_to_mlir_expr(expr, sorted_args)
-
-        # Extract index var
-        if len(indirect_dims):
-            comments = "{indirect_access} " + comments # Add indirect access attribute
         indirect_args = [f"%{i}" for i in indirect_dims]
         # Create affine.apply operation
         with self.override_buffer_cse(buffer=self.global_vars, cse=self.map_cse):
             map_var = ops.affine_map(indices, expr_str, symbol_names=indirect_dims)
 
-        if hasattr(self, "dim_aliasing"):
-            indices = [self.dim_aliasing.get(index, index) for index in indices]
         index = ops.affine_apply(map_var, indices, indirect_dims=indirect_args, comment=comments)
         return index
 
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 0717333a..be491925 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -614,7 +614,7 @@ def __init__(self, kernel_group, reason=None):
         self.target_cse_override = contextvars.ContextVar(f"Handler_cse_override_{instance_id}", default=self.cse)
         self._nested_context_depth = 0
 
-    def set_ranges(self, lengths, reduction_lengths):
+    def set_ranges(self, lengths, reduction_lengths, index_names=None):
         if self.call_ranges:
             assert self.call_ranges == tuple(lengths) + tuple(
                 reduction_lengths
@@ -623,7 +623,12 @@ def set_ranges(self, lengths, reduction_lengths):
         else:
             self.call_ranges = tuple(lengths) + tuple(reduction_lengths)
             self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
-            self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
+            if index_names is None:
+                self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))]
+            else:
+                assert len(index_names) == len(self.ranges), f"Index names length mismatch: {len(index_names)} != {len(self.ranges)}"
+                self.itervars = [sympy.Symbol(str(n)) for n in index_names]
+
             self.itervar_cses = {str(index) : self.register_var_cse(str(index), 1, "index") for index in self.itervars}
             self.reduction_depth = len(lengths)
         return (
@@ -867,18 +872,22 @@ def rename_indexing(self, index) -> sympy.Expr:
     def override_buffer_cse(self, *, buffer=None, cse=None):
         buffer_override = self.target_buffer_override
         cse_override = self.target_cse_override
-        target_buffer = target_cse = None
+        buffer_token = cse_token = None
         try:
+            # Store tokens for proper restoration in nested contexts
+            # contextvars.set() returns the previous value (token) which can be used for reset()
             if buffer is not None:
-                target_buffer = buffer_override.set(buffer)
+                buffer_token = buffer_override.set(buffer)
             if cse is not None:
-                target_cse = cse_override.set(cse)
+                cse_token = cse_override.set(cse)
             yield self
         finally:
-            if target_cse is not None:
-                cse_override.reset(target_cse)
-            if target_buffer is not None:
-                buffer_override.reset(target_buffer)
+            # Restore using tokens - contextvars automatically handles nested contexts
+            # Each level restores to its own previous value
+            if cse_token is not None:
+                cse_override.reset(cse_token)
+            if buffer_token is not None:
+                buffer_override.reset(buffer_token)
 
     def __enter__(self):
         class CSEProxy:
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index fd0114e1..9edd2e44 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -1186,7 +1186,7 @@ def affine_apply(map_var, indices, indirect_dims=None, comment=None, *args, **kw
         # Add indirect dimensions if provided
         if indirect_dims:
             indirect_str = ", ".join(indirect_dims)
-            op_str += f"[{indirect_str}]"
+            op_str += f"[{indirect_str}] {{indirect_access}}"
         if comment:
             op_str += f" // {comment}"
         return op_str, [1, "index"]
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 6c103829..faf5e69c 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -178,7 +178,7 @@ def can_fuse_horizontal(self, node1, node2):
                 return False
 
             size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1)
-            target_symbol = symbols("r0")
+            target_symbol = symbols("r0_0")
             try:
                 stride = [i.strip()[:-1].split(",")[-1].strip() for i in str(node2.get_nodes()[0].node).split("\n") if "r0" in i][1]
                 stride = int(sympify(stride).coeff(target_symbol))
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 6ec043fb..b864e5f2 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -429,7 +429,7 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
                     ).group
                     prologue_tile_desc = kernel.set_tile_size(kernel.prologue_info, prologue=True)
                     kernel.kernel_group.set_tile_info(prologue_tile_desc)
-                    vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+                    vars, reduction_vars = kernel.set_ranges(group, reduction_group, list(self.dim_aliasing.values()))
                     for node in prologue_nodes:
                         # Reuse created spad
                         read_list = sorted([i.name for i in node.read_writes.reads])
@@ -469,10 +469,11 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
                     _, (group, reduction_group) = max(
                         epilogue_nodes, key=lambda x: int(x.is_reduction())
                     ).group
-                    vars, reduction_vars = kernel.set_ranges(group, reduction_group)
+                    vars, reduction_vars = kernel.set_ranges(group, reduction_group, list(self.dim_aliasing.values()))
                     for node in epilogue_nodes:
                         node.codegen((vars, reduction_vars))
 
+        with self as kernel:
             src_code = (
                 partial_code
                 if isinstance(partial_code, str)
@@ -855,7 +856,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         # Want to use tile_desc from epilogue_info
         with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse):
             index_var = self.parse_indices(index)
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()]
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
@@ -892,7 +893,6 @@ def load_epilogue(self, name: str, index: sympy.Expr):
                     map_var = ops.affine_map(["d0", "d1"], f"d0 + d1*{(self.r_tile_size)}")
                 with self.override_buffer_cse(buffer=self.loads):
                     offset = ops.affine_apply(map_var, [self.compute_idx, self.reduction_loop_idx])
-                #offset = self.cse.generate(self.loads, f"affine.apply affine_map<(d0, d1) -> (d0 + d1*{(self.r_tile_size)})>(%{self.compute_idx}, %{self.reduction_loop_idx})")
                 compute_index_var = ",".join([f"%{zero_var}"] * (self.kernel_group.tile_desc.get_nr_dim()-1) + [f"%{offset}"])
 
             with self.override_buffer_cse(buffer=self.loads):
@@ -908,7 +908,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
 
         with self.override_buffer_cse(buffer=self.applys, cse=self.apply_cse):
             index_var = self.parse_indices(index)
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()]
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()]
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
@@ -1012,7 +1012,7 @@ def store_reduction_epilogue(self, name, index, value):
 
         with self.override_buffer_cse(buffer=self.reductions_suffix, cse=self.apply_cse):
             index_var = self.parse_indices(index, comments="// Store reduction")
-        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.keys()][:-1] # Assume that there is only one reduction axis
+        dram_stride = [index.coeff(sympy.Symbol(val)) for val in self.dim_aliasing.values()][:-1] # Assume that there is only one reduction axis
         vlane_split_axis = self.kernel_group.tile_desc.vmap.vlane_split_axis
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
 
@@ -1123,22 +1123,6 @@ def set_tile_size(self, template_fusion_info, prologue=False):
                 self.compute_body_loop.step = tile_desc.get_compute_vec_size()
         return tile_desc
 
-    def rename_indexing(self, index) -> sympy.Expr:
-        # First step: replace dim_name with tmp_+dim_aliased_name to avoid circular dependencies
-        # (e.g., {"index0":"index1", "index1":"index0"})
-        tmp_subs = {
-            sympy.Symbol(dim_name): sympy.Symbol("tmp_"+dim_aliased_name)
-            for dim_name, dim_aliased_name in self.dim_aliasing.items()
-        }
-        index = index.subs(tmp_subs)
-        # Second step: replace tmp_+dim_aliased_name with dim_aliased_name
-        final_subs = {
-            sympy.Symbol("tmp_"+dim_aliased_name): sympy.Symbol(dim_aliased_name)
-            for dim_aliased_name in self.dim_aliasing.values()
-        }
-        index = index.subs(final_subs)
-        return index
-
 class MLIRTemplateCaller(CUDATemplateCaller):
     def __str__(self):
         return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})"

From 78613ad5e21441b1a6c9221410a8c5b83ff3cc46 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 22 Jan 2026 04:30:31 +0000
Subject: [PATCH 082/194] [Test] Fix syntax error in experiment scripts

---
 experiments/gemm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/experiments/gemm.py b/experiments/gemm.py
index 44be689a..6b6ece4d 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -31,7 +31,7 @@ def custom_matmul(a, b):
     import os
     import sys
     base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml)
+    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
     config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
     sys.path.append(base_dir)
     args = argparse.ArgumentParser()

From 21d08f219b2cb25ce5cb4da0b173c6340bb94f02 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 22 Jan 2026 06:13:11 +0000
Subject: [PATCH 083/194] [CI] Change base image for OpenReg build

---
 .github/workflows/docker-base-image-2-8.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml
index 3a1d97a1..74e81e07 100644
--- a/.github/workflows/docker-base-image-2-8.yml
+++ b/.github/workflows/docker-base-image-2-8.yml
@@ -63,7 +63,7 @@ jobs:
           file: ./Dockerfile.base
           push: true
           build-args: |
-            PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime
+            PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel
             GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
             LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
             SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}

From 24e67eded3496b011f73472c5fcac06de35f8e1a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 22 Jan 2026 06:15:25 +0000
Subject: [PATCH 084/194] [OpenReg] Use OpenReg style Custom device

---
 .gitignore                                    |   1 -
 Dockerfile                                    |   5 +-
 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp |   8 -
 PyTorchSimDevice/ExtensionDeviceGuardImpl.h   | 127 ----
 PyTorchSimDevice/extension_device.cpp         | 711 ------------------
 PyTorchSimDevice/extension_hooks.cpp          |  48 --
 PyTorchSimDevice/extension_hooks.h            |  30 -
 PyTorchSimDevice2/CMakeLists.txt              |  44 ++
 PyTorchSimDevice2/README.md                   | 175 +++++
 .../cmake/TorchPythonTargets.cmake            |  22 +
 PyTorchSimDevice2/csrc/CMakeLists.txt         |  16 +
 PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp  | 195 +++++
 .../csrc/aten/OpenRegMinimal.cpp              | 148 ++++
 PyTorchSimDevice2/csrc/aten/native/Common.h   |  97 +++
 PyTorchSimDevice2/csrc/aten/native/Extra.cpp  | 210 ++++++
 PyTorchSimDevice2/csrc/aten/native/Extra.h    |  69 ++
 .../csrc/aten/native/Minimal.cpp              | 185 +++++
 PyTorchSimDevice2/csrc/aten/native/Minimal.h  |  61 ++
 .../csrc/runtime/OpenRegDeviceAllocator.cpp   |   8 +
 .../csrc/runtime/OpenRegDeviceAllocator.h     |  43 ++
 PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h | 146 ++++
 .../csrc/runtime/OpenRegException.cpp         |   9 +
 .../csrc/runtime/OpenRegException.h           |  20 +
 .../csrc/runtime/OpenRegFunctions.cpp         |  74 ++
 .../csrc/runtime/OpenRegFunctions.h           |  18 +
 .../csrc/runtime/OpenRegGenerator.cpp         |  28 +
 .../csrc/runtime/OpenRegGenerator.h           |  21 +
 .../csrc/runtime/OpenRegGuard.cpp             |   7 +
 PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h | 197 +++++
 .../csrc/runtime/OpenRegHooks.cpp             |  11 +
 PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h |  41 +
 .../csrc/runtime/OpenRegHostAllocator.cpp     |   8 +
 .../csrc/runtime/OpenRegHostAllocator.h       |  48 ++
 .../csrc/runtime/OpenRegSerialization.cpp     |  48 ++
 .../csrc/runtime/OpenRegSerialization.h       |  10 +
 .../csrc/runtime/OpenRegStream.cpp            | 253 +++++++
 .../csrc/runtime/OpenRegStream.h              | 162 ++++
 PyTorchSimDevice2/include/Macros.h            |   7 +
 PyTorchSimDevice2/pyproject.toml              |  35 +
 PyTorchSimDevice2/setup.py                    | 148 ++++
 .../third_party/openreg/CMakeLists.txt        |  21 +
 .../third_party/openreg/README.md             | 151 ++++
 .../openreg/cmake/GTestTargets.cmake          |  12 +
 .../third_party/openreg/csrc/device.cpp       |  37 +
 .../third_party/openreg/csrc/memory.cpp       | 259 +++++++
 .../third_party/openreg/csrc/memory.h         |  96 +++
 .../third_party/openreg/csrc/stream.cpp       | 313 ++++++++
 .../third_party/openreg/example/example.cpp   | 112 +++
 .../third_party/openreg/include/openreg.h     | 109 +++
 .../third_party/openreg/include/openreg.inl   |  42 ++
 .../_C.cpython-311-x86_64-linux-gnu.so        | Bin 0 -> 15312 bytes
 PyTorchSimDevice2/torch_openreg/__init__.py   |  24 +
 PyTorchSimDevice2/torch_openreg/_utils.py     |  42 ++
 .../torch_openreg/csrc/CMakeLists.txt         |  24 +
 .../torch_openreg/csrc/Module.cpp             |  99 +++
 PyTorchSimDevice2/torch_openreg/csrc/stub.c   |  20 +
 .../torch_openreg/lib/libopenreg.so           | Bin 0 -> 59728 bytes
 .../torch_openreg/lib/libtorch_bindings.so    | Bin 0 -> 166144 bytes
 .../torch_openreg/lib/libtorch_openreg.so     | Bin 0 -> 569736 bytes
 .../torch_openreg/openreg/__init__.py         |  86 +++
 .../openreg}/extension_device_interface.py    |   0
 .../openreg}/extension_device_op_overrides.py |   0
 .../torch_openreg/openreg/meta.py             |  13 +
 .../torch_openreg/openreg/random.py           |  61 ++
 .../mlir/mlir_codegen_backend.py              |   2 +-
 Scheduler/scheduler.py                        |  51 +-
 66 files changed, 4100 insertions(+), 968 deletions(-)
 delete mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
 delete mode 100644 PyTorchSimDevice/ExtensionDeviceGuardImpl.h
 delete mode 100644 PyTorchSimDevice/extension_device.cpp
 delete mode 100644 PyTorchSimDevice/extension_hooks.cpp
 delete mode 100644 PyTorchSimDevice/extension_hooks.h
 create mode 100644 PyTorchSimDevice2/CMakeLists.txt
 create mode 100644 PyTorchSimDevice2/README.md
 create mode 100644 PyTorchSimDevice2/cmake/TorchPythonTargets.cmake
 create mode 100644 PyTorchSimDevice2/csrc/CMakeLists.txt
 create mode 100644 PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp
 create mode 100644 PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
 create mode 100644 PyTorchSimDevice2/csrc/aten/native/Common.h
 create mode 100644 PyTorchSimDevice2/csrc/aten/native/Extra.cpp
 create mode 100644 PyTorchSimDevice2/csrc/aten/native/Extra.h
 create mode 100644 PyTorchSimDevice2/csrc/aten/native/Minimal.cpp
 create mode 100644 PyTorchSimDevice2/csrc/aten/native/Minimal.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegException.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp
 create mode 100644 PyTorchSimDevice2/csrc/runtime/OpenRegStream.h
 create mode 100644 PyTorchSimDevice2/include/Macros.h
 create mode 100644 PyTorchSimDevice2/pyproject.toml
 create mode 100644 PyTorchSimDevice2/setup.py
 create mode 100644 PyTorchSimDevice2/third_party/openreg/CMakeLists.txt
 create mode 100644 PyTorchSimDevice2/third_party/openreg/README.md
 create mode 100644 PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake
 create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/device.cpp
 create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp
 create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/memory.h
 create mode 100644 PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp
 create mode 100644 PyTorchSimDevice2/third_party/openreg/example/example.cpp
 create mode 100644 PyTorchSimDevice2/third_party/openreg/include/openreg.h
 create mode 100644 PyTorchSimDevice2/third_party/openreg/include/openreg.inl
 create mode 100755 PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
 create mode 100644 PyTorchSimDevice2/torch_openreg/__init__.py
 create mode 100644 PyTorchSimDevice2/torch_openreg/_utils.py
 create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt
 create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
 create mode 100644 PyTorchSimDevice2/torch_openreg/csrc/stub.c
 create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libopenreg.so
 create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so
 create mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so
 create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/__init__.py
 rename {PyTorchSimDevice => PyTorchSimDevice2/torch_openreg/openreg}/extension_device_interface.py (100%)
 rename {PyTorchSimDevice => PyTorchSimDevice2/torch_openreg/openreg}/extension_device_op_overrides.py (100%)
 create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/meta.py
 create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/random.py

diff --git a/.gitignore b/.gitignore
index b42d5f6b..3ca1e54b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 __pycache__/
 TOGSim/build/
 .vscode
-*.txt
 *.ipynb_checkpoints
 output
 togsim_results/*
diff --git a/Dockerfile b/Dockerfile
index 088daa43..1b4d08f3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,4 +10,7 @@ RUN cd PyTorchSim/TOGSim && \
     cd build && \
     conan install .. --build=missing && \
     cmake .. && \
-    make -j$(nproc)
\ No newline at end of file
+    make -j$(nproc)
+
+RUN cd PyTorchSim/PyTorchSimDevice2 && \
+    python -m pip install --no-build-isolation -e .
\ No newline at end of file
diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp b/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
deleted file mode 100644
index a0b1395d..00000000
--- a/PyTorchSimDevice/ExtensionDeviceGuardImpl.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "ExtensionDeviceGuardImpl.h"
-#include <c10/core/impl/DeviceGuardImplRegistry.h>
-
-namespace c10::extension_device::impl {
-
-C10_REGISTER_GUARD_IMPL(extension_device, ExtensionDeviceGuardImpl);
-
-} // namespace c10::extension_device::impl
diff --git a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h b/PyTorchSimDevice/ExtensionDeviceGuardImpl.h
deleted file mode 100644
index 6d35677b..00000000
--- a/PyTorchSimDevice/ExtensionDeviceGuardImpl.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#pragma once
-
-#include <c10/core/DeviceGuard.h>
-#include <c10/core/impl/DeviceGuardImplInterface.h>
-#include <c10/core/Stream.h>
-#include <c10/core/Event.h>
-#include <c10/core/DeviceType.h>
-#include <c10/util/Optional.h>
-
-namespace c10::extension_device::impl {
-
-struct ExtensionDeviceGuardImpl final : public c10::impl::DeviceGuardImplInterface {
-  static constexpr DeviceType static_type = DeviceType::PrivateUse1; // ✅ your backend type
-
-  ExtensionDeviceGuardImpl() = default;
-
-  explicit ExtensionDeviceGuardImpl(DeviceType t) {
-    TORCH_CHECK(
-        t == static_type,
-        "ExtensionDeviceGuardImpl initialized with non-extension_device DeviceType: ",
-        t);
-  }
-
-  // --------------------------------------------------------------------------
-  // 기본적인 device guard (CPU처럼 동작)
-  // --------------------------------------------------------------------------
-  DeviceType type() const override {
-    return static_type;
-  }
-
-  Device exchangeDevice(Device d) const override {
-    TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d);
-    return d; // nothing to exchange, CPU-like
-  }
-
-  Device getDevice() const override {
-    return Device(static_type, 0);
-  }
-
-  void setDevice(Device d) const override {
-    TORCH_CHECK(d.type() == static_type, "Expected extension_device but got ", d);
-  }
-
-  void uncheckedSetDevice(Device d) const noexcept override {}
-
-  DeviceIndex deviceCount() const noexcept override {
-    return 1; // pretend single device
-  }
-
-  // --------------------------------------------------------------------------
-  // Stream handling (동기식이므로 기본 stream만 사용)
-  // --------------------------------------------------------------------------
-  Stream getStream(Device d) const override {
-    return Stream(Stream::DEFAULT, d);
-  }
-
-  Stream getNewStream(Device d, int priority = 0) const override {
-    return Stream(Stream::DEFAULT, d);
-  }
-
-  Stream getStreamFromGlobalPool(Device d, bool = false) const override {
-    return Stream(Stream::DEFAULT, d);
-  }
-
-  Stream exchangeStream(Stream s) const override {
-    return s;
-  }
-
-  bool queryStream(const Stream& stream) const override {
-    (void)stream;
-    return true;
-  }
-
-  void synchronizeStream(const Stream& stream) const override {
-    (void)stream;
-  }
-
-  void synchronizeDevice(DeviceIndex device_index) const override {
-    (void)device_index;
-  }
-
-  // --------------------------------------------------------------------------
-  // Event handling (전부 no-op)
-  // --------------------------------------------------------------------------
-  void destroyEvent(void* event, const DeviceIndex device_index) const noexcept override {
-    (void)event;
-    (void)device_index;
-  }
-
-  void record(void** event, const Stream& stream, const DeviceIndex device_index, const EventFlag flag) const override {
-    (void)event;
-    (void)stream;
-    (void)device_index;
-    (void)flag;
-  }
-
-  void block(void* event, const Stream& stream) const override {
-    (void)event;
-    (void)stream;
-  }
-
-  bool queryEvent(void* event) const override {
-    (void)event;
-    return true;
-  }
-
-  void synchronizeEvent(void* event) const override {
-    (void)event;
-  }
-
-  double elapsedTime(void* start_event, void* end_event, const DeviceIndex device_index) const override {
-    (void)start_event;
-    (void)end_event;
-    (void)device_index;
-    return 0.0;
-  }
-
-  // --------------------------------------------------------------------------
-  // Misc (allocator integration)
-  // --------------------------------------------------------------------------
-  void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) const override {
-    (void)data_ptr;
-    (void)stream;
-  }
-};
-
-} // namespace c10::extension_device::impl
diff --git a/PyTorchSimDevice/extension_device.cpp b/PyTorchSimDevice/extension_device.cpp
deleted file mode 100644
index a1dcfcf4..00000000
--- a/PyTorchSimDevice/extension_device.cpp
+++ /dev/null
@@ -1,711 +0,0 @@
-#include <c10/core/impl/alloc_cpu.h>
-#include <c10/core/Allocator.h>
-
-#include <torch/csrc/Device.h>
-#include <torch/csrc/inductor/inductor_ops.h>
-#include <c10/core/impl/DeviceGuardImplInterface.h>
-#include <c10/core/MemoryFormat.h>
-#include <c10/macros/Macros.h>
-#include <torch/extension.h>
-
-#include <ATen/native/cpu/Loops.h>
-#include <ATen/native/DispatchStub.h>
-#include <ATen/native/Resize.h>
-#include <ATen/native/TensorFactories.h>
-#include <ATen/EmptyTensor.h>
-#include <ATen/core/GeneratorForPrivateuseone.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/native/CPUFallback.h>
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-
-namespace {
-  bool g_amp_enabled = false;
-  at::ScalarType g_amp_dtype = at::kFloat;
-}
-
-static at::ScalarType to_scalar_type(const py::object& dtype_obj) {
-  py::module torch_mod = py::module::import("torch");
-  if (dtype_obj.is(torch_mod.attr("bfloat16"))) return at::kBFloat16;
-  if (dtype_obj.is(torch_mod.attr("float16")))  return at::kHalf;
-  if (dtype_obj.is(torch_mod.attr("float32")))  return at::kFloat;
-  if (dtype_obj.is(torch_mod.attr("float64")))  return at::kDouble;
-  throw std::runtime_error("Unsupported dtype for extension_device AMP");
-}
-
-static py::object to_torch_dtype(at::ScalarType st) {
-  py::module torch_mod = py::module::import("torch");
-  switch (st) {
-    case at::kBFloat16: return torch_mod.attr("bfloat16");
-    case at::kHalf:     return torch_mod.attr("float16");
-    case at::kFloat:    return torch_mod.attr("float32");
-    case at::kDouble:   return torch_mod.attr("float64");
-    default:
-      throw std::runtime_error("Unsupported scalar type in get_autocast_dtype");
-  }
-}
-
-static inline at::MemoryFormat fix_memory_format(c10::optional<at::MemoryFormat> mf_opt) {
-    if (!mf_opt.has_value()) return at::MemoryFormat::Contiguous;
-
-    auto mf = mf_opt.value();
-    if (mf == at::MemoryFormat::Preserve) {
-        return at::MemoryFormat::Contiguous;
-    }
-    return mf;
-}
-
-#include "ExtensionDeviceGuardImpl.h"
-
-static uint64_t op_counter = 0;
-static uint64_t last_saved_value = 0;
-
-C10_REGISTER_GUARD_IMPL(PrivateUse1, c10::extension_device::impl::ExtensionDeviceGuardImpl);
-
-// basic dummy add function
-at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
-  op_counter += 1;
-  // Since this custom device is just for testing, not bothering to implement kernels.
-  return at::empty(self.sizes(), self.options());
-}
-
-// basic dummy mul function
-at::Tensor custom_mul_Tensor(const at::Tensor & self, const at::Tensor & other) {
-  op_counter += 1;
-  // Since this custom device is just for testing, not bothering to implement kernels.
-  return at::empty(self.sizes(), self.options());
-}
-
-at::Tensor _reinterpret_tensor(
-    const at::Tensor& self,
-    c10::IntArrayRef size,
-    c10::IntArrayRef stride,
-    int64_t offset_increment) {
-  at::Tensor self_ = at::detail::make_tensor<c10::TensorImpl>(
-      c10::Storage(self.storage()), self.key_set(), self.dtype());
-  auto* self_tmp_ = self_.unsafeGetTensorImpl();
-  self_tmp_->set_storage_offset(self.storage_offset() + offset_increment);
-  self_tmp_->set_sizes_and_strides(size, stride);
-  return self_;
-}
-
-at::Tensor& zero_inplace_batching_rule(at::Tensor &self) {
-  op_counter += 1;
-  // Since this custom device is just for testing, not bothering to implement kernels.
-  return self;
-}
-
-const at::Tensor& custom_resize_(const at::Tensor& self, at::IntArrayRef size,
-                          std::optional<at::MemoryFormat> optional_memory_format) {
-  at::TensorImpl* tensor_impl = self.unsafeGetTensorImpl();
-  tensor_impl->set_sizes_contiguous(size);
-  const auto itemsize = tensor_impl->dtype().itemsize();
-  const auto offset = tensor_impl->storage_offset();
-  const auto storage_size = at::detail::computeStorageNbytesContiguous(size, itemsize, offset);
-  // Dummy device is using cpu allocator, so here just call cpu
-  // function maybe_resize_storage_cpu in aten/src/ATen/native/Resize.h
-  // to get a sufficient memory space.
-  at::native::maybe_resize_storage_cpu(tensor_impl, storage_size);
-  if (optional_memory_format.has_value()) {
-    auto memory_format =
-        optional_memory_format.value();
-    TORCH_CHECK(
-        memory_format != at::MemoryFormat::Preserve,
-        "Unsupported memory format",
-        memory_format);
-    tensor_impl->empty_tensor_restride(memory_format);
-  }
-  return self;
-}
-
-// basic dummy eq function: Only support CPU
-at::Tensor custom_to_device(
-    const at::Tensor & self,
-    at::Device device,
-    at::ScalarType dtype,
-    bool non_blocking,
-    bool copy,
-    c10::optional<at::MemoryFormat> memory_format) {
-  TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");
-  TORCH_CHECK(device.is_cpu() || device.type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");
-  // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
-  TORCH_CHECK(self.scalar_type() == dtype);
-  TORCH_CHECK(self.is_contiguous());
-
-  op_counter += 1;
-  if (device.type() == at::DeviceType::CPU) {
-    auto out = at::empty(self.sizes(), dtype, self.options().layout(),
-                         device, false, memory_format);
-    std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes());
-    return out;
-  } else {
-    auto opts = self.options().device(device).dtype(dtype);
-    auto out = at::empty(self.sizes(), opts);
-    std::memcpy(out.mutable_data_ptr(), self.data_ptr(), self.nbytes());
-    return out;
-  }
-
-  auto out = at::empty(self.sizes(), dtype, self.options().layout(), device, false, memory_format);
-  memcpy(out.mutable_data_ptr(), self.mutable_data_ptr(), self.nbytes());
-  // Since this custom device is just for testing, not bothering to implement kernels.
-  return out;
-}
-
-
-// A dummy allocator for our custom device, that secretly uses the CPU
-struct DummyCustomAllocator final : at::Allocator {
-  DummyCustomAllocator() = default;
-  at::DataPtr allocate(size_t nbytes) override {
-    void* data = c10::alloc_cpu(nbytes);
-    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
-  }
-
-  static void ReportAndDelete(void* ptr) {
-    if (!ptr) {
-      return;
-    }
-    c10::free_cpu(ptr);
-  }
-
-  at::DeleterFnPtr raw_deleter() const override {
-    return &ReportAndDelete;
-  }
-
-  void copy_data(void* dest, const void* src, std::size_t count) const override {
-    std::memcpy(dest, src, count);
-  }
-};
-
-// Register our dummy allocator
-static DummyCustomAllocator global_custom_alloc;
-REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc);
-
-at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) {
-  TORCH_CHECK(self.device().type() == c10::DeviceType::PrivateUse1,
-              "Dummy test only allows dummy device.");
-  TORCH_CHECK(self.is_contiguous());
-
-  op_counter += 1;
-
-  switch (self.scalar_type()) {
-    case c10::ScalarType::Float: {
-      auto* data = self.mutable_data_ptr<float>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = value.toFloat();
-      }
-      break;
-    }
-    case c10::ScalarType::Double: {
-      auto* data = self.mutable_data_ptr<double>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = value.toDouble();
-      }
-      break;
-    }
-    case c10::ScalarType::Half: {
-      auto* data = self.mutable_data_ptr<at::Half>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = at::Half(value.toHalf());
-      }
-      break;
-    }
-    case c10::ScalarType::BFloat16: {
-      auto* data = self.mutable_data_ptr<at::BFloat16>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = at::BFloat16(value.toBFloat16());
-      }
-      break;
-    }
-    case c10::ScalarType::Int: {
-      auto* data = self.mutable_data_ptr<int>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = value.toInt();
-      }
-      break;
-    }
-    case c10::ScalarType::Long: {
-      auto* data = self.mutable_data_ptr<int64_t>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = value.toLong();
-      }
-      break;
-    }
-    case c10::ScalarType::Short: {
-      auto* data = self.mutable_data_ptr<int16_t>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = static_cast<int16_t>(value.toShort());
-      }
-      break;
-    }
-    case c10::ScalarType::Char: {
-      auto* data = self.mutable_data_ptr<int8_t>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = static_cast<int8_t>(value.toChar());
-      }
-      break;
-    }
-    case c10::ScalarType::Byte: {
-      auto* data = self.mutable_data_ptr<uint8_t>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = static_cast<uint8_t>(value.toByte());
-      }
-      break;
-    }
-    case c10::ScalarType::Bool: {
-      auto* data = self.mutable_data_ptr<bool>();
-      for (int64_t i = 0; i < self.numel(); i++) {
-        data[i] = value.toBool();
-      }
-      break;
-    }
-    default:
-      TORCH_CHECK(false, "Unsupported scalar type: ", self.scalar_type());
-  }
-  return self;
-}
-
-at::Tensor unsafe_create_cpu_tensor_from_dummy_tensor(const at::Tensor& src) {
-  // TORCH_CHECK(src.device().type() == c10::DeviceType::PrivateUse1,
-  //             "Only support dummy device.");
-  const auto& sizes_ = src.sizes();
-  const auto& strides_ = src.strides();
-  auto storage_offset_ = src.storage_offset();
-  at::detail::check_size_nonnegative(sizes_);
-
-  size_t size_bytes = at::detail::computeStorageNbytes(sizes_, strides_,
-                                                       src.element_size(),
-                                                       storage_offset_);
-
-  at::DataPtr data_ptr =
-    c10::InefficientStdFunctionContext::makeDataPtr(src.storage().mutable_data_ptr().get(),
-                                                    [](void*){}, at::kCPU);
-
-  c10::Storage storage{c10::Storage::use_byte_size_t{}, size_bytes, std::move(data_ptr),
-    /*allocator=*/&global_custom_alloc, /*resizeable=*/false};
-
-  constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
-  at::Tensor tensor = at::detail::make_tensor<c10::TensorImpl>(
-       std::move(storage), cpu_ks, src.dtype());
-
-  c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
-  tensor_impl->set_sizes_and_strides(sizes_, strides_);
-  tensor_impl->set_storage_offset(storage_offset_);
-  return tensor;
-}
-
-// basic dummy copy_() function, so we can copy from the custom device to/from CPU
-at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) {
-  TORCH_CHECK(
-      self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1,
-      "Dummy test only allows copy from cpu -> dummy device.");
-  TORCH_CHECK(
-      dst.is_cpu() || dst.device().type() == c10::DeviceType::PrivateUse1,
-      "Dummy test only allows copy from cpu -> dummy device.");
-
-  // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
-  if (self.numel() != dst.numel()) {
-    custom_resize_(dst, self.sizes(), c10::nullopt);
-  }
-  TORCH_CHECK(self.sizes() == dst.sizes());
-
-  const bool same_dtype = (self.scalar_type() == dst.scalar_type());
-  const bool both_contig = self.is_contiguous() && dst.is_contiguous();
-
-  // 1) fast path
-  if (same_dtype && both_contig) {
-    std::memcpy(dst.mutable_data_ptr(),
-                self.data_ptr(),
-                dst.storage().nbytes());
-    return dst;
-  }
-
-  // 2) slow path
-  at::Tensor cpu_self = unsafe_create_cpu_tensor_from_dummy_tensor(self);
-  at::Tensor cpu_dst  = unsafe_create_cpu_tensor_from_dummy_tensor(dst);
-  if (!same_dtype) {
-    cpu_self = cpu_self.to(cpu_dst.scalar_type(), /*non_blocking=*/false, /*copy=*/true);
-  }
-  cpu_dst.copy_(cpu_self);
-  return dst;
-}
-
-at::Tensor custom__copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst) {
-  return custom__copy_from(self, dst, false);
-}
-
-at::Tensor& custom_abs_out(const at::Tensor& self, at::Tensor& out) {
-  return at::native::abs_out(self, out);
-}
-
-at::Tensor custom_empty_strided(c10::IntArrayRef size, c10::IntArrayRef stride, c10::optional<at::ScalarType> dtype_opt, c10::optional<at::Layout> layout_opt, c10::optional<at::Device> device_opt, c10::optional<bool> pin_memory_opt) {
-  op_counter += 1;
-  constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
-  auto dtype = c10::dtype_or_default(dtype_opt);
-  return  at::detail::empty_strided_generic(size, stride, &global_custom_alloc, private_use_ks, dtype);
-}
-
-at::Tensor custom_empty(c10::IntArrayRef size, c10::optional<at::ScalarType> dtype_opt, c10::optional<at::Layout> layout_opt, c10::optional<at::Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> optional_memory_format) {
-  op_counter += 1;
-
-  constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
-  auto dtype = c10::dtype_or_default(dtype_opt);
-  return  at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, dtype, fix_memory_format(optional_memory_format));
-}
-
-at::Tensor& custom_arange_start_out_impl(
-    const c10::Scalar& start,
-    const c10::Scalar& end,
-    const c10::Scalar& step,
-    at::Tensor& out) {
-  double s = start.toDouble();
-  double e = end.toDouble();
-  double st = step.toDouble();
-  TORCH_CHECK(st != 0.0, "step must be nonzero");
-
-  int64_t length = 0;
-  if (st > 0) {
-    if (e > s) length = static_cast<int64_t>(std::ceil((e - s) / st));
-  } else {
-    if (e < s) length = static_cast<int64_t>(std::ceil((e - s) / st));
-  }
-
-  // Resize out tensor
-  custom_resize_(out, {length}, c10::nullopt);
-
-  if (out.scalar_type() == at::kFloat || out.scalar_type() == at::kDouble) {
-    double* data = out.mutable_data_ptr<double>();
-    for (int64_t i = 0; i < length; i++) {
-      data[i] = s + i * st;
-    }
-  } else if (out.scalar_type() == at::kLong) {
-    int64_t* data = out.mutable_data_ptr<int64_t>();
-    for (int64_t i = 0; i < length; i++) {
-      data[i] = static_cast<int64_t>(s + i * st);
-    }
-  } else {
-    TORCH_CHECK(false, "Unsupported dtype for arange on dummy device");
-  }
-
-  return out;
-}
-
-static at::Tensor custom_to_dtype_impl(const at::Tensor& self,
-                                       c10::ScalarType dtype,
-                                       bool non_blocking, bool copy,
-                                       c10::optional<c10::MemoryFormat> memory_format) {
-  return at::native::to(self, dtype, non_blocking, copy, memory_format);
-}
-
-at::Tensor custom_zeros_like(
-    const at::Tensor& input,
-    c10::optional<at::ScalarType> dtype_opt,
-    c10::optional<at::Layout> layout_opt,
-    c10::optional<c10::Device> device_opt,
-    c10::optional<bool> pin_memory_opt,
-    c10::optional<c10::MemoryFormat> memory_format_opt)
-{
-  // dtype / layout / device fallback
-  auto dtype   = dtype_opt.value_or(input.scalar_type());
-  auto layout  = layout_opt.value_or(input.layout());
-  auto device  = device_opt.value_or(input.device());
-  auto memfmt  = memory_format_opt.value_or(c10::MemoryFormat::Contiguous);
-
-  TORCH_CHECK(
-      device.type() == c10::DeviceType::PrivateUse1,
-      "custom_zeros_like: device must be PrivateUse1");
-
-  at::Tensor out = custom_empty(
-      input.sizes(),
-      dtype,
-      layout,
-      device,
-      pin_memory_opt,
-      memfmt
-  );
-  size_t nbytes = out.numel() * out.element_size();
-  void* ptr = out.mutable_data_ptr();
-
-  TORCH_CHECK(ptr != nullptr,
-      "custom_zeros_like: out.mutable_data_ptr() returned NULL");
-  std::memset(ptr, 0, nbytes);
-  return out;
-}
-
-at::Tensor& custom_zero_impl(at::Tensor& self)
-{
-    TORCH_CHECK(
-        self.device().type() == c10::DeviceType::PrivateUse1,
-        "custom_zero_: expected a PrivateUse1 device tensor");
-
-    if (self.numel() == 0) {
-        return self;
-    }
-
-    void* data = self.mutable_data_ptr();
-    TORCH_CHECK(data != nullptr,
-        "custom_zero_: self.mutable_data_ptr() returned NULL "
-        "(storage was not allocated)");
-
-    size_t nbytes = self.numel() * self.element_size();
-    std::memset(data, 0, nbytes);
-
-    return self;
-}
-
-// With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend.
-// For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key.
-// Later in this file, we map a custom device to the PrivateUse1 device type,
-// which allows user code that puts a tensor on your custom_device to eventually get plumbed
-// into the kernels registered here.
-//
-// This macro registers your kernels to the PyTorch Dispatcher.
-// More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/.
-TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl("to.Device",             &custom_to_device);
-  m.impl("to.dtype",              &custom_to_dtype_impl);
-  m.impl("fill_.Scalar",          &custom_fill__scalar);
-  m.impl("_copy_from",            &custom__copy_from);
-  m.impl("_copy_from_and_resize", &custom__copy_from_and_resize);
-  m.impl("empty_strided",         &custom_empty_strided);
-  m.impl("empty.memory_format",   &custom_empty);
-  m.impl("as_strided",            at::native::as_strided_tensorimpl);
-  m.impl("view",                  at::native::view);
-  m.impl("arange.start_out",      &custom_arange_start_out_impl);
-  m.impl("zeros_like",            &custom_zeros_like);
-  m.impl("zero_",                 &custom_zero_impl);
-}
-
-TORCH_LIBRARY_IMPL(aten, AutogradPrivateUse1, m) {
-  m.impl("to.dtype", &custom_to_dtype_impl);
-}
-
-TORCH_LIBRARY_FRAGMENT(aten, m) {
-  m.def(
-    "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor",
-    torch::dispatch(c10::DispatchKey::AutogradPrivateUse1, _reinterpret_tensor),
-    {at::Tag::pt2_compliant_tag}
-  );
-}
-
-void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  at::native::cpu_fallback(op, stack);
-}
-
-TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl("abs", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("abs.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("abs_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("absolute", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("absolute.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("absolute_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("add_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("cat", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("cat.names", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("cat.names_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("cat.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("div.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("div_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("eq.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("eq.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("eq.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("eq.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("equal", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("erf", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("erf.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("erf_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("erfc", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("erfc.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("erfc_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("exp", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("exp.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("ge.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ge.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ge.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ge.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("le.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("lt.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ne.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ne.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ne.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("ne.Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("logical_and", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_and.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_and_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_not", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_not.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_not_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_or", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_or.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_or_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_xor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_xor.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("logical_xor_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("neg", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("neg.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("neg_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("mul.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mul.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mul_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("pow.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow.Tensor_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("pow_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("sub.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sub_.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("sum", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.DimnameList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.IntList_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.dim_DimnameList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sum.dim_IntList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("resize_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("resize_as_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  // Foreach ops
-  m.impl("_foreach_add.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.ScalarList", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_foreach_add_.List", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  // Indexed
-  m.impl("index_add.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_add_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_copy.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_copy_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_fill.int_Scalar", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_fill.int_Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_fill.int_Scalar_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_fill.int_Tensor_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_fill_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("tril", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("tril_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu_", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("triu_indices", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("nll_loss2d_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss2d_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_backward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nll_loss_forward", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("scatter.src_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("scatter.value_out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("index_put.Default", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index.Tensor", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("mm.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("sigmoid.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("gather.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("silu.out", torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-
-  m.impl("all.all_out",                   torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_local_scalar_dense",           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_log_softmax",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_log_softmax_backward_data",    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("mse_loss.out",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("_native_multi_head_attention",  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("where.self",                    torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("min",                           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("max",                           torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("index_select",                  torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-  m.impl("nonzero",                       torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
-}
-
-// This basic implementation doesn't bother dealing with different device indices
-// (e.g. custom_device:0 vs. custom_device:1).
-// We could do that by letting the user pass in a device index in our exposed device function.
-// Note that if you do that, you'll also need to register a device guard to core.
-// See `c10/core/impl/DeviceGuardImplInterface.h:C10_REGISTER_GUARD_IMPL`.
-c10::Device get_custom_device() {
-  return c10::Device(c10::DeviceType::PrivateUse1, 0);
-}
-
-bool custom_op_called() {
-  bool called = false;
-  if (op_counter > last_saved_value) {
-    called = true;
-    last_saved_value = op_counter;
-  }
-  return called;
-}
-
-class PrivateGeneratorImpl : public at::CPUGeneratorImpl {
-public:
-  PrivateGeneratorImpl(c10::DeviceIndex device_index) {
-    device_ = c10::Device(c10::DeviceType::PrivateUse1, device_index);
-    key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1);
-  }
-  ~PrivateGeneratorImpl() override = default;
-};
-
-// this is used to register generator
-at::Generator make_generator_privateuse1(c10::DeviceIndex device_index) {
-  return at::make_generator<PrivateGeneratorImpl>(device_index);
-}
-
-void register_generator() {
-  REGISTER_GENERATOR_PRIVATEUSE1(make_generator_privateuse1)
-}
-
-// Here, we're exposing a custom device object that corresponds to our custom backend.
-// We do this using pybind: exposing an "extension_name.custom_device()" function in python,
-// that's implemented in C++.
-// The implementation in this file maps directly to the `PrivateUse1` device type.
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("custom_device", &get_custom_device, "get custom device object");
-  m.def("custom_op_called", &custom_op_called, "check if our custom function was called");
-  m.def("register_generator", &register_generator, "register generator for custom device");
-  m.def("is_autocast_enabled", []() -> bool { return g_amp_enabled;});
-  m.def("set_autocast_enabled", [](bool flag) -> void {g_amp_enabled = flag;});
-  m.def("get_autocast_dtype", []() -> py::object { return to_torch_dtype(g_amp_dtype); });
-  m.def("set_autocast_dtype", [](py::object dtype_obj) -> void {
-    auto st = to_scalar_type(dtype_obj);
-    g_amp_dtype = st;
-  });
-  m.def("get_amp_supported_dtype", []() -> py::list {
-    py::module torch_mod = py::module::import("torch");
-    py::list lst;
-    lst.append(torch_mod.attr("float16"));
-    lst.append(torch_mod.attr("float32"));
-    return lst;
-  });
-}
\ No newline at end of file
diff --git a/PyTorchSimDevice/extension_hooks.cpp b/PyTorchSimDevice/extension_hooks.cpp
deleted file mode 100644
index aadd6d2a..00000000
--- a/PyTorchSimDevice/extension_hooks.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "extension_hooks.h"
-
-bool ExtensionPU1Hooks::isBuilt() const { return true; }
-bool ExtensionPU1Hooks::isAvailable() const { return true; }
-
-const at::Generator& ExtensionPU1Hooks::getDefaultGenerator(c10::DeviceIndex idx) const {
-  if (idx < 0) idx = 0;
-  static std::vector<at::Generator> gens;
-  static std::mutex m;
-  std::lock_guard<std::mutex> g(m);
-  if (gens.size() <= (size_t)idx) gens.resize((size_t)idx + 1);
-  if (!gens[idx].defined()) gens[idx] = at::GetGeneratorForPrivateuse1(idx);
-  return gens[idx]; // 영속 객체 참조 반환
-}
-
-at::Generator ExtensionPU1Hooks::getNewGenerator(c10::DeviceIndex idx) const {
-  if (idx < 0) idx = 0;
-  return at::GetGeneratorForPrivateuse1(idx);
-}
-
-at::Device ExtensionPU1Hooks::getDeviceFromPtr(void* data) const {
-  return at::Device(at::kPrivateUse1, 0); // MVP: 단일 디바이스 가정
-}
-
-bool ExtensionPU1Hooks::isPinnedPtr(const void* data) const {
-  return false;
-}
-
-at::Allocator* ExtensionPU1Hooks::getPinnedMemoryAllocator() const {
-  return at::getHostAllocator(at::kPrivateUse1);
-}
-
-bool ExtensionPU1Hooks::hasPrimaryContext(c10::DeviceIndex device_index) const { return true; }
-
-void ExtensionPU1Hooks::resizePrivateUse1Bytes(const c10::Storage&, size_t) const {
-  TORCH_CHECK(false, "resizePrivateUse1Bytes not implemented");
-}
-
-// REGISTER_EXTENSION_HOOKS(ExtensionPU1Hooks);
-
-namespace {
-struct AutoRegistrar {
-  AutoRegistrar() {
-    at::RegisterPrivateUse1HooksInterface(new ExtensionPU1Hooks());
-  }
-};
-static AutoRegistrar _auto_registrar;
-}
diff --git a/PyTorchSimDevice/extension_hooks.h b/PyTorchSimDevice/extension_hooks.h
deleted file mode 100644
index fdf3505a..00000000
--- a/PyTorchSimDevice/extension_hooks.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#pragma once
-
-#include <ATen/core/CachingHostAllocator.h>
-#include <ATen/detail/PrivateUse1HooksInterface.h>
-
-#include <ATen/core/Generator.h>
-#include <c10/core/Allocator.h>
-#include <c10/core/Device.h>
-#include <c10/core/Storage.h>
-#include <c10/util/Exception.h>
-
-struct ExtensionPU1Hooks final : public at::PrivateUse1HooksInterface {
-  ExtensionPU1Hooks() {}
-  bool isBuilt() const;
-  bool isAvailable() const;
-
-  const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) const override;
-
-  at::Generator getNewGenerator(c10::DeviceIndex device_index = -1) const override;
-
-  at::Device getDeviceFromPtr(void* data) const override;
-
-  bool isPinnedPtr(const void* data) const override;
-
-  at::Allocator* getPinnedMemoryAllocator() const override;
-
-  bool hasPrimaryContext(c10::DeviceIndex device_index) const override;
-
-  void resizePrivateUse1Bytes(const c10::Storage& /*storage*/, size_t /*newsize*/) const override;
-};
\ No newline at end of file
diff --git a/PyTorchSimDevice2/CMakeLists.txt b/PyTorchSimDevice2/CMakeLists.txt
new file mode 100644
index 00000000..2c207ca6
--- /dev/null
+++ b/PyTorchSimDevice2/CMakeLists.txt
@@ -0,0 +1,44 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+project(TORCH_OPENREG CXX C)
+
+include(GNUInstallDirs)
+include(CheckCXXCompilerFlag)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_SKIP_BUILD_RPATH  FALSE)
+set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE)
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+
+set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+
+if(APPLE)
+  set(CMAKE_INSTALL_RPATH "@loader_path/lib;@loader_path")
+elseif(UNIX)
+  set(CMAKE_INSTALL_RPATH "$ORIGIN/lib:$ORIGIN")
+elseif(WIN32)
+  set(CMAKE_INSTALL_RPATH "")
+endif()
+set(CMAKE_INSTALL_LIBDIR lib)
+set(CMAKE_INSTALL_MESSAGE NEVER)
+
+set(Torch_DIR ${PYTORCH_INSTALL_DIR}/share/cmake/Torch)
+find_package(Torch REQUIRED)
+
+if(DEFINED PYTHON_INCLUDE_DIR)
+  include_directories(${PYTHON_INCLUDE_DIR})
+else()
+  message(FATAL_ERROR "Cannot find Python directory")
+endif()
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include(${PROJECT_SOURCE_DIR}/cmake/TorchPythonTargets.cmake)
+
+add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/openreg)
+add_subdirectory(${PROJECT_SOURCE_DIR}/csrc)
+add_subdirectory(${PROJECT_SOURCE_DIR}/torch_openreg/csrc)
diff --git a/PyTorchSimDevice2/README.md b/PyTorchSimDevice2/README.md
new file mode 100644
index 00000000..83ec85b1
--- /dev/null
+++ b/PyTorchSimDevice2/README.md
@@ -0,0 +1,175 @@
+# PyTorch OpenReg
+
+## Background
+
+The third-party device integration mechanism based on PrivateUse1 has become the official mainstream method for new backends to integrate with PyTorch. Ensuring the availability of this mechanism is crucial for enriching PyTorch's hardware ecosystem.
+
+**Note:**
+
+The goal of `torch_openreg` is **not to implement a fully functional, high-performance PyTorch backend**, but to serve as a **minimalist reference implementation for mechanism verification**.
+
+### Purpose
+
+- **Test Backend**: To serve as an in-tree test backend for PrivateUse1, ensuring quality stability through CI/CD.
+- **Integration Example**: To serve as a reference example for new backend integration.
+- **Integration Documentation**: To provide module-level integration documentation that corresponds with the code.
+
+### Design Principles
+
+- **Minimality Principle**: The fundamental goal is to enable/verify all integration paths/mechanisms for a new backend to integrate to PyTorch. All functions follow a "just right" strategy to ensure the correctness of relevant integration capabilities.
+- **Authenticity Principle**: To complete the OpenReg integration in the same way a real accelerator backend would integrate with PyTorch.
+
+## Directory Structure
+
+```shell
+torch_openreg/
+├── CMakeLists.txt
+├── csrc
+│   ├── aten
+│   │   ├── native
+│   │   │   ├── Extra.cpp
+│   │   │   ├── Minimal.cpp
+│   │   │   └── ...
+│   │   ├── OpenRegExtra.cpp
+│   │   └── OpenRegMinimal.cpp
+│   ├── CMakeLists.txt
+│   └── runtime
+│       ├── OpenRegDeviceAllocator.cpp
+│       ├── OpenRegDeviceAllocator.h
+│       ├── OpenRegFunctions.cpp
+│       ├── OpenRegFunctions.h
+│       ├── OpenRegGenerator.cpp
+│       ├── OpenRegGenerator.h
+│       ├── OpenRegGuard.cpp
+│       ├── OpenRegGuard.h
+│       ├── OpenRegHooks.cpp
+│       ├── OpenRegHooks.h
+│       ├── OpenRegHostAllocator.cpp
+│       ├── OpenRegHostAllocator.h
+│       └── ...
+├── pyproject.toml
+├── README.md
+├── setup.py
+├── third_party
+│   └── openreg
+└── torch_openreg
+    ├── csrc
+    │   ├── CMakeLists.txt
+    │   ├── Module.cpp
+    │   └── stub.c
+    ├── __init__.py
+    └── openreg
+        ├── __init__.py
+        ├── meta.py
+        └── random.py
+```
+
+**Dependencies**:
+
+```mermaid
+graph LR
+    A[Python]
+    B[_C.so]
+    C[libtorch_bindings.so]
+    D[libtorch_openreg.so]
+    E[libopenreg.so]
+
+    A --> B --> C --> D --> E
+```
+
+There are 4 DSOs in torch_openreg, and the dependencies between them are as follows:
+
+- `_C.so`:
+  - **sources**: torch_openreg/csrc/stub.c
+  - **description**: Python C module entry point.
+- `libtorch_bindings.so`: The bridging code between Python and C++ should go here.
+  - **sources**: torch_openreg/csrc
+  - **description**: A thin glue layer between Python and C++.
+- `libtorch_openreg.so`: All core implementations should go here.
+  - **sources**: csrc
+  - **description**: All core functionality, such as device runtime, operators, etc.
+- `libopenreg.so`: A DSO that uses the CPU to emulate a CUDA-like device, you can ignore it.
+  - **sources**: third_party/openreg
+  - **description**: Provides low-level device functionality similar to libcudart.so.
+
+**Key Directories**:
+
+- `csrc/`: Core device implementation, including operator registration, runtime, etc.
+  - `csrc/aten/`: Operator registration
+    - `csrc/aten/native/`: Specific operator implementations for the OpenReg device.
+      - `csrc/aten/OpenRegMinimal.cpp`: The most minimal set of operator implementations (allowing for the creation of Tensors and related operations upon completion).
+      - `csrc/aten/OpenRegExtra.cpp`: Implementations for other types of operators.
+    - `csrc/runtime/`: Implementations for Host memory, device memory, Guard, Hooks, etc.
+- `third_party/`: A C++ library that simulates a CUDA-like device using the CPU.
+- `torch_openreg/`: Python interface implementation (Python code and C++ Bindings).
+  - `torch_openreg/csrc/`: Python C++ binding code.
+  - `torch_openreg/openreg/`: Python API.
+
+## Currently Implemented Features
+
+### Operator Registration
+
+- Operator Implementation
+
+  - Register for builtin PyTorch Operators
+    - `TORCH_LIBRARY_IMPL` form: See `empty.memory_format
+    - `STUB` form: See `abs_stub`
+  - Register for custom operators
+    - Schema Registration: See `custom_abs`
+    - Kernel Registration: See `custom_abs`
+    - Fallback Registration for `AutogradPriavateUse1`: See `custom_abs`
+    - Meta Registration: See `custom_abs`
+    - `torch.autograd.Function`: See `custom_autograd_fn_aliasing`
+  - Register for fallback
+    - Per-operator Fallback: See `sub.Tensor`
+    - Global Fallback: See `wrapper_cpu_fallback`
+
+## Installation and Usage
+
+### Installation
+
+```python
+pip3 install --no-build-isolation -e . # for develop
+pip3 install --no-build-isolation . # for install
+```
+
+### Usage Example
+
+After installation, you can use the `openreg` device in Python just like any other regular device.
+
+```python
+import torch
+import torch_openreg
+
+if not torch.openreg.is_available():
+    print("OpenReg backend is not available in this build.")
+    exit()
+
+print("OpenReg backend is available!")
+
+device = torch.device("openreg")
+
+x = torch.tensor([[1., 2.], [3., 4.]], device=device)
+y = x + 2
+print("Result y:\n", y)
+print(f"Device of y: {y.device}")
+
+z = y.cpu()
+print("Result z:\n", z)
+print(f"Device of z: {z.device}")
+```
+
+## Future Plans
+
+- **Enhance Features**:
+  - Autoload
+  - AMP
+  - Device-agnostic APIs
+  - Memory Management
+  - Generator
+  - Distrubuted
+  - Custom Tensor&Storage
+  - ...
+- **Improve Tests**: Add more test cases related to the integration mechanism.
+- **Improve Documentation**: Add a new chapter on third-party device integration in the `Developer Notes` section of the PyTorch documentation.
+- **Real-time Synchronization**: Keep the code and documentation updated iteratively and in sync.
diff --git a/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake b/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake
new file mode 100644
index 00000000..b7a807d2
--- /dev/null
+++ b/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake
@@ -0,0 +1,22 @@
+if(WIN32)
+  set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/torch_python.lib")
+elseif(APPLE)
+  set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.dylib")
+else()
+  set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so")
+endif()
+
+add_library(torch_python SHARED IMPORTED)
+
+set_target_properties(torch_python PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${PYTORCH_INSTALL_DIR}/include"
+  INTERFACE_LINK_LIBRARIES "c10;torch_cpu"
+  IMPORTED_LOCATION "${TORCH_PYTHON_IMPORTED_LOCATION}"
+)
+
+add_library(torch_python_library INTERFACE IMPORTED)
+
+set_target_properties(torch_python_library PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "\$<TARGET_PROPERTY:torch_python,INTERFACE_INCLUDE_DIRECTORIES>"
+  INTERFACE_LINK_LIBRARIES "\$<TARGET_FILE:torch_python>;\$<TARGET_PROPERTY:torch_python,INTERFACE_LINK_LIBRARIES>"
+)
diff --git a/PyTorchSimDevice2/csrc/CMakeLists.txt b/PyTorchSimDevice2/csrc/CMakeLists.txt
new file mode 100644
index 00000000..e2ae2b3f
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LIBRARY_NAME torch_openreg)
+
+file(GLOB_RECURSE SOURCE_FILES
+    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
+)
+
+add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
+
+target_link_libraries(${LIBRARY_NAME} PRIVATE torch_cpu_library openreg)
+target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+install(TARGETS ${LIBRARY_NAME}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp
new file mode 100644
index 00000000..04ba6d48
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp
@@ -0,0 +1,195 @@
+#include "native/Extra.h"
+
+#include <ATen/native/CPUFallback.h>
+#include <ATen/native/DispatchStub.h>
+
+#include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
+#include <torch/library.h>
+
+namespace at::openreg {
+
+namespace {
+at::Tensor wrapper_quantize_per_tensor(
+    const at::Tensor& self,
+    double scale,
+    int64_t zero_point,
+    at::ScalarType dtype) {
+  return at::native::openreg::quantize_per_tensor(
+      self, scale, zero_point, dtype);
+}
+
+int64_t wrapper__fused_sdp_choice(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_mask,
+    double dropout_p,
+    bool is_causal,
+    std::optional<double> scale,
+    bool enable_gqa) {
+  return at::native::openreg::_fused_sdp_choice(
+      query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa);
+}
+
+void wrapper_quantize_tensor_per_tensor_affine_stub(
+    const at::Tensor& rtensor,
+    at::Tensor& qtensor,
+    double scale,
+    int64_t zero_point) {
+  at::native::openreg::quantize_tensor_per_tensor_affine_stub(
+      rtensor, qtensor, scale, zero_point);
+}
+
+std::tuple<
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    c10::SymInt,
+    c10::SymInt,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor>
+wrapper__scaled_dot_product_fused_attention_overrideable(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_bias,
+    double dropout_p,
+    bool is_causal,
+    bool return_debug_mask,
+    std::optional<double> scale) {
+  return at::native::openreg::_scaled_dot_product_fused_attention_overrideable(
+      query,
+      key,
+      value,
+      attn_bias,
+      dropout_p,
+      is_causal,
+      return_debug_mask,
+      scale);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+wrapper_scaled_dot_product_fused_attention_overrideable_backward(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& attn_bias,
+    std::array<bool, 4> grad_input_mask,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    const at::Tensor& cum_seq_q,
+    const at::Tensor& cum_seq_k,
+    int64_t max_q,
+    int64_t max_k,
+    double dropout_p,
+    bool is_causal,
+    const at::Tensor& philox_seed,
+    const at::Tensor& philox_offset,
+    std::optional<double> scale) {
+  return at::native::openreg::
+      _scaled_dot_product_fused_attention_overrideable_backward(
+          grad_out,
+          query,
+          key,
+          value,
+          attn_bias,
+          grad_input_mask,
+          out,
+          logsumexp,
+          cum_seq_q,
+          cum_seq_k,
+          max_q,
+          max_k,
+          dropout_p,
+          is_causal,
+          philox_seed,
+          philox_offset,
+          scale);
+}
+
+at::Tensor wrapper_custom_autograd_fn_returns_self(at::Tensor x) {
+  return at::native::openreg::custom_autograd_fn_returns_self(x);
+}
+
+at::Tensor wrapper_custom_autograd_fn_aliasing(at::Tensor x) {
+  return at::native::openreg::custom_autograd_fn_aliasing(x);
+}
+
+at::Tensor& wrapper_abs_out(const at::Tensor& self, at::Tensor& out) {
+  return at::native::openreg::abs_out(self, out);
+}
+
+void wrapper_abs_stub(at::TensorIteratorBase& iter) {
+  at::native::openreg::abs_kernel(iter);
+}
+
+at::Tensor wrapper_custom_abs(at::Tensor x) {
+  return at::native::openreg::custom_abs(x);
+}
+} // namespace
+
+using namespace at::native;
+// Registration via STUB
+// LITERALINCLUDE START: STUB DEFAULT
+REGISTER_PRIVATEUSE1_DISPATCH(abs_stub, &wrapper_abs_stub);
+REGISTER_PRIVATEUSE1_DISPATCH(
+    quantize_tensor_per_tensor_affine_stub,
+    &wrapper_quantize_tensor_per_tensor_affine_stub);
+REGISTER_PRIVATEUSE1_DISPATCH(
+    _fused_sdp_choice_stub,
+    &wrapper__fused_sdp_choice);
+// LITERALINCLUDE END: STUB DEFAULT
+
+// Registration of custom operators
+// LITERALINCLUDE START: CUSTOM OPERATOR SCHEMA
+TORCH_LIBRARY(openreg, m) {
+  m.def("custom_abs(Tensor input)-> Tensor");
+}
+// LITERALINCLUDE END: CUSTOM OPERATOR SCHEMA
+
+// LITERALINCLUDE START: CUSTOM OPERATOR DEFAULT
+TORCH_LIBRARY_IMPL(openreg, PrivateUse1, m) {
+  m.impl("custom_abs", &wrapper_custom_abs);
+}
+// LITERALINCLUDE END: CUSTOM OPERATOR DEFAULT
+
+// LITERALINCLUDE START: CUSTOM OPERATOR FALLBACK
+TORCH_LIBRARY_IMPL(_, AutogradPrivateUse1, m) {
+  m.fallback(torch::autograd::autogradNotImplementedFallback());
+}
+// LITERALINCLUDE END: CUSTOM OPERATOR FALLBACK
+
+// The rest is for testing purposes
+TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+  /*
+   abs_stub only works if abs.out is also registered with PrivateUse1, because
+   abs.default is designed to redirect directly to abs.out, which calls
+   abs_stub.
+  */
+  m.impl("abs.out", &wrapper_abs_out);
+  m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor);
+  m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice);
+  m.impl(
+      "_scaled_dot_product_fused_attention_overrideable",
+      &wrapper__scaled_dot_product_fused_attention_overrideable);
+  m.impl(
+      "_scaled_dot_product_fused_attention_overrideable_backward",
+      &wrapper_scaled_dot_product_fused_attention_overrideable_backward);
+}
+
+TORCH_LIBRARY_FRAGMENT(openreg, m) {
+  m.def("custom_autograd_fn_returns_self(Tensor input)-> Tensor");
+  m.def("custom_autograd_fn_aliasing(Tensor(a) input)-> Tensor(a)");
+}
+
+TORCH_LIBRARY_IMPL(openreg, AutogradPrivateUse1, m) {
+  m.impl(
+      "custom_autograd_fn_returns_self",
+      &wrapper_custom_autograd_fn_returns_self);
+  m.impl("custom_autograd_fn_aliasing", &wrapper_custom_autograd_fn_aliasing);
+}
+
+} // namespace at::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
new file mode 100644
index 00000000..d54ae552
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
@@ -0,0 +1,148 @@
+#include "native/Minimal.h"
+
+#include <ATen/native/CPUFallback.h>
+#include <ATen/native/DispatchStub.h>
+
+#include <torch/library.h>
+
+namespace at::openreg {
+
+namespace {
+
+// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT WRAPPER
+at::Tensor wrapper_empty_memory_format(
+    c10::IntArrayRef size,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt) {
+  return at::native::openreg::empty_memory_format(
+      size,
+      dtype_opt,
+      layout_opt,
+      device_opt,
+      pin_memory_opt,
+      memory_format_opt);
+}
+// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT WRAPPER
+
+at::Tensor wrapper_empty_strided(
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt) {
+  return at::native::openreg::empty_strided(
+      size, stride, dtype_opt, layout_opt, device_opt, pin_memory_opt);
+}
+
+at::Tensor wrapper_as_strided(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride,
+    std::optional<c10::SymInt> storage_offset) {
+  return at::native::openreg::as_strided(self, size, stride, storage_offset);
+}
+
+const at::Tensor& wrapper_resize_(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    ::std::optional<at::MemoryFormat> memory_format) {
+  return at::native::openreg::resize_(self, size, memory_format);
+}
+
+at::Tensor wrapper__reshape_alias(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride) {
+  return at::native::openreg::_reshape_alias(self, size, stride);
+}
+
+at::Tensor wrapper__copy_from(
+    const at::Tensor& self,
+    const at::Tensor& dst,
+    bool non_blocking) {
+  return at::native::openreg::_copy_from(self, dst, non_blocking);
+}
+
+at::Tensor wrapper__copy_from_and_resize(
+    const at::Tensor& self,
+    const at::Tensor& dst) {
+  return at::native::openreg::_copy_from_and_resize(self, dst);
+}
+
+at::Scalar wrapper__local_scalar_densor(const at::Tensor& self) {
+  return at::native::openreg::_local_scalar_dense(self);
+}
+
+at::Tensor& wrapper_set_source_Tensor_(
+    at::Tensor& self,
+    const at::Tensor& source) {
+  return at::native::openreg::set_source_Tensor_(self, source);
+}
+
+at::Tensor& wrapper_set_source_Storage_(at::Tensor& self, at::Storage source) {
+  return at::native::openreg::set_source_Storage_(self, source);
+}
+
+at::Tensor& wrapper_set_source_Storage_storage_offsetset_(
+    at::Tensor& result,
+    at::Storage storage,
+    int64_t storage_offset,
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride) {
+  return at::native::openreg::set_source_Storage_storage_offset_(
+      result, storage, storage_offset, size, stride);
+}
+
+at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) {
+  return at::native::openreg::view(self, size);
+}
+
+// LITERALINCLUDE START: FALLBACK WRAPPER
+void wrapper_cpu_fallback(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack) {
+  at::native::openreg::cpu_fallback(op, stack);
+}
+// LITERALINCLUDE END: FALLBACK WRAPPER
+
+} // namespace
+
+// LITERALINCLUDE START: TORCH_LIBRARY_IMPL DEFAULT
+TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+  m.impl("empty.memory_format", wrapper_empty_memory_format);
+  m.impl("empty_strided", wrapper_empty_strided);
+  m.impl("as_strided", wrapper_as_strided);
+  m.impl("resize_", wrapper_resize_);
+  m.impl("_reshape_alias", wrapper__reshape_alias);
+  m.impl("_copy_from", wrapper__copy_from);
+  m.impl("_copy_from_and_resize", wrapper__copy_from_and_resize);
+  m.impl("_local_scalar_dense", wrapper__local_scalar_densor);
+  m.impl("set_.source_Tensor", wrapper_set_source_Tensor_);
+  m.impl("set_.source_Storage", wrapper_set_source_Storage_);
+  m.impl(
+      "set_.source_Storage_storage_offset",
+      wrapper_set_source_Storage_storage_offsetset_);
+  m.impl("view", wrapper_view);
+}
+// LITERALINCLUDE END: TORCH_LIBRARY_IMPL DEFAULT
+
+// LITERALINCLUDE START: FALLBACK GLOBAL
+TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
+  m.fallback(
+      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
+}
+// LITERALINCLUDE END: FALLBACK GLOBAL
+
+// LITERALINCLUDE START: FALLBACK SINGLE
+TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
+  m.impl(
+      "sub.Tensor",
+      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
+}
+// LITERALINCLUDE END: FALLBACK SINGLE
+
+} // namespace at::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/native/Common.h b/PyTorchSimDevice2/csrc/aten/native/Common.h
new file mode 100644
index 00000000..c17196d0
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/native/Common.h
@@ -0,0 +1,97 @@
+#include <ATen/EmptyTensor.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorOperators.h>
+#include <ATen/core/blob.h>
+#include <ATen/native/CPUFallback.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/UnaryOps.h>
+#include <ATen/native/quantized/AffineQuantizer.h>
+#include <ATen/native/transformers/attention.h>
+#include <ATen/native/transformers/sdp_utils_cpp.h>
+#include <ATen/ops/_local_scalar_dense_native.h>
+#include <ATen/ops/_reshape_alias_native.h>
+#include <ATen/ops/abs_native.h>
+#include <ATen/ops/as_strided_cpu_dispatch.h>
+#include <ATen/ops/copy_native.h>
+#include <ATen/ops/quantize_per_tensor_native.h>
+#include <ATen/ops/resize_as_native.h>
+#include <ATen/ops/resize_native.h>
+#include <ATen/ops/set_cpu_dispatch.h>
+#include <ATen/ops/set_native.h>
+#include <ATen/ops/view_native.h>
+
+#include <torch/csrc/autograd/custom_function.h>
+#include <torch/csrc/autograd/function_hook.h>
+
+#include <c10/core/Allocator.h>
+
+#include <include/openreg.h>
+
+namespace at::native::openreg {
+
+class MemoryGuard {
+ public:
+  template <typename... Args>
+  explicit MemoryGuard(const Args&... args) {
+    (find_and_unprotect_tensors(args), ...);
+  }
+
+  ~MemoryGuard() noexcept {
+    for (void* ptr : unprotected_pointers_) {
+      orMemoryProtect(ptr);
+    }
+  }
+
+  MemoryGuard(const MemoryGuard&) = delete;
+  MemoryGuard& operator=(const MemoryGuard&) = delete;
+  MemoryGuard(MemoryGuard&&) = delete;
+  MemoryGuard& operator=(MemoryGuard&&) = delete;
+
+ private:
+  template <typename T>
+  void find_and_unprotect_tensors(const T& item) {
+    if constexpr (std::is_base_of_v<at::TensorBase, T>) {
+      unprotect_if_needed(item);
+    } else if constexpr (std::is_same_v<T, c10::IValue>) {
+      if (item.isTensor()) {
+        unprotect_if_needed(item.toTensor());
+      } else if (item.isTensorList()) {
+        for (const at::Tensor& tensor : item.toTensorListRef()) {
+          unprotect_if_needed(tensor);
+        }
+      } else if (item.isList()) {
+        for (const c10::IValue& element : item.toListRef()) {
+          find_and_unprotect_tensors(element);
+        }
+      } else if (item.isGenericDict()) {
+        for (const auto& [key, value] : item.toGenericDict()) {
+          find_and_unprotect_tensors(key);
+          find_and_unprotect_tensors(value);
+        }
+      }
+    }
+  }
+
+  void unprotect_if_needed(const at::TensorBase& tensor) {
+    if (!tensor.defined() || !tensor.has_storage()) {
+      return;
+    }
+
+    void* ptr = tensor.data_ptr();
+    orPointerAttributes attr;
+
+    if (orPointerGetAttributes(&attr, ptr) != orSuccess ||
+        attr.type != orMemoryTypeDevice) {
+      return;
+    }
+
+    auto [it, inserted] = unprotected_pointers_.insert(attr.pointer);
+    if (inserted) {
+      orMemoryUnprotect(attr.pointer);
+    }
+  }
+
+  std::unordered_set<void*> unprotected_pointers_;
+};
+
+} // namespace at::native::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp
new file mode 100644
index 00000000..129ad621
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp
@@ -0,0 +1,210 @@
+#include "Extra.h"
+
+namespace at::native::openreg {
+
+at::Tensor quantize_per_tensor(
+    const at::Tensor& self,
+    double scale,
+    int64_t zero_point,
+    at::ScalarType dtype) {
+  return at::native::quantize_per_tensor(self, scale, zero_point, dtype);
+}
+
+int64_t _fused_sdp_choice(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_mask,
+    double dropout_p,
+    bool is_causal,
+    std::optional<double> scale,
+    bool enable_gqa) {
+  auto backend = sdp::SDPBackend::overrideable;
+  return static_cast<int64_t>(backend);
+}
+
+void quantize_tensor_per_tensor_affine_stub(
+    const at::Tensor& rtensor,
+    at::Tensor& qtensor,
+    double scale,
+    int64_t zero_point) {}
+
+std::tuple<
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    c10::SymInt,
+    c10::SymInt,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor>
+_scaled_dot_product_fused_attention_overrideable(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_bias,
+    double dropout_p,
+    bool is_causal,
+    bool return_debug_mask,
+    std::optional<double> scale) {
+  const int64_t batch_size = query.size(0);
+  const int64_t num_heads = query.size(1);
+  const int64_t head_dim_v = value.size(3);
+  const int64_t max_seqlen_q = query.size(2);
+  const int64_t max_seqlen_kv = key.size(2);
+
+  auto opts = query.options();
+  auto output =
+      at::empty({batch_size, num_heads, max_seqlen_q, head_dim_v}, opts);
+  auto logsumexp =
+      at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
+  auto debug_attn_mask = at::empty(
+      {batch_size, num_heads, max_seqlen_q, max_seqlen_kv},
+      opts.dtype(at::kFloat));
+  auto philox_seed = at::empty({}, at::dtype(at::kLong));
+  auto philox_offset = at::empty({}, at::dtype(at::kLong));
+
+  return std::make_tuple(
+      output,
+      logsumexp,
+      at::Tensor(),
+      at::Tensor(),
+      max_seqlen_q,
+      max_seqlen_kv,
+      philox_seed,
+      philox_offset,
+      debug_attn_mask);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+_scaled_dot_product_fused_attention_overrideable_backward(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& attn_bias,
+    std::array<bool, 4> grad_input_mask,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    const at::Tensor& cum_seq_q,
+    const at::Tensor& cum_seq_k,
+    int64_t max_q,
+    int64_t max_k,
+    double dropout_p,
+    bool is_causal,
+    const at::Tensor& philox_seed,
+    const at::Tensor& philox_offset,
+    std::optional<double> scale) {
+  return std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>(
+      at::empty_like(query),
+      at::empty_like(key),
+      at::empty_like(value),
+      at::empty_like(attn_bias));
+}
+
+namespace {
+struct CustomAutogradFnReturnsSelf
+    : public torch::autograd::Function<CustomAutogradFnReturnsSelf> {
+  static at::Tensor forward(
+      torch::autograd::AutogradContext* ctx,
+      at::Tensor self) {
+    return self;
+  }
+
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_output) {
+    return {grad_output[0] * 0.5};
+  }
+};
+
+struct CustomAutogradFnAliasing
+    : public torch::autograd::Function<CustomAutogradFnAliasing> {
+  static at::Tensor forward(
+      torch::autograd::AutogradContext* ctx,
+      at::Tensor self) {
+    return self.view_symint(self.sym_sizes());
+  }
+
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_output) {
+    return {grad_output[0] * 0.5};
+  }
+};
+} // namespace
+
+at::Tensor custom_autograd_fn_returns_self(at::Tensor x) {
+  return CustomAutogradFnReturnsSelf::apply(x);
+}
+
+at::Tensor custom_autograd_fn_aliasing(at::Tensor x) {
+  return CustomAutogradFnAliasing::apply(x);
+}
+
+/*
+ This implementation is only used to test stub registration, so not all
+ capabilities are fully supported.
+
+ Current Limitations:
+ - dtype: Float only
+ - input tensor: must be contiguous layout
+*/
+// LITERALINCLUDE START: STUB ABS
+void abs_kernel(at::TensorIteratorBase& iter) {
+  TORCH_CHECK(iter.ntensors() == 2, "Abs kernel expects 2 tensors");
+  TORCH_CHECK(
+      iter.common_dtype() == at::ScalarType::Float,
+      "Abs kernel only supports float type");
+
+  auto& output_tensor = iter.tensor(0);
+  auto& input_tensor = iter.tensor(1);
+
+  TORCH_CHECK(
+      input_tensor.sizes() == output_tensor.sizes(),
+      "Input and output tensor sizes must match.");
+
+  auto abs_loop = [](float* out_ptr, const float* in_ptr, int64_t n) {
+    for (int64_t i = 0; i < n; ++i) {
+      out_ptr[i] = std::abs(in_ptr[i]);
+    }
+  };
+
+  MemoryGuard guard(input_tensor, output_tensor);
+
+  if (iter.is_contiguous()) {
+    abs_loop(
+        static_cast<float*>(iter.data_ptr(0)),
+        static_cast<float*>(iter.data_ptr(1)),
+        iter.numel());
+  } else {
+    TORCH_CHECK(
+        input_tensor.is_contiguous(), "Input tensor must be contiguous.")
+
+    auto output = at::empty(
+        input_tensor.sizes(),
+        input_tensor.options().memory_format(
+            input_tensor.suggest_memory_format()));
+
+    MemoryGuard guard(output);
+
+    abs_loop(
+        static_cast<float*>(output.data_ptr()),
+        static_cast<float*>(iter.data_ptr(1)),
+        iter.numel());
+
+    output_tensor.copy_(output);
+  }
+}
+// LITERALINCLUDE END: STUB ABS
+
+at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out) {
+  return at::native::abs_out(self, out);
+}
+
+at::Tensor custom_abs(at::Tensor x) {
+  return at::abs(x);
+}
+
+} // namespace at::native::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.h b/PyTorchSimDevice2/csrc/aten/native/Extra.h
new file mode 100644
index 00000000..f002949a
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/native/Extra.h
@@ -0,0 +1,69 @@
+#include "Common.h"
+
+namespace at::native::openreg {
+
+at::Tensor quantize_per_tensor(
+    const at::Tensor& self,
+    double scale,
+    int64_t zero_point,
+    at::ScalarType dtype);
+int64_t _fused_sdp_choice(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_mask,
+    double dropout_p,
+    bool is_causal,
+    std::optional<double> scale,
+    bool enable_gqa);
+void quantize_tensor_per_tensor_affine_stub(
+    const at::Tensor& rtensor,
+    at::Tensor& qtensor,
+    double scale,
+    int64_t zero_point);
+std::tuple<
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor,
+    c10::SymInt,
+    c10::SymInt,
+    at::Tensor,
+    at::Tensor,
+    at::Tensor>
+_scaled_dot_product_fused_attention_overrideable(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const std::optional<at::Tensor>& attn_bias,
+    double dropout_p,
+    bool is_causal,
+    bool return_debug_mask,
+    std::optional<double> scale);
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+_scaled_dot_product_fused_attention_overrideable_backward(
+    const at::Tensor& grad_out,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& attn_bias,
+    std::array<bool, 4> grad_input_mask,
+    const at::Tensor& out,
+    const at::Tensor& logsumexp,
+    const at::Tensor& cum_seq_q,
+    const at::Tensor& cum_seq_k,
+    int64_t max_q,
+    int64_t max_k,
+    double dropout_p,
+    bool is_causal,
+    const at::Tensor& philox_seed,
+    const at::Tensor& philox_offset,
+    std::optional<double> scale);
+
+at::Tensor custom_autograd_fn_returns_self(at::Tensor x);
+at::Tensor custom_autograd_fn_aliasing(at::Tensor x);
+at::Tensor& abs_out(const at::Tensor& self, at::Tensor& out);
+void abs_kernel(at::TensorIteratorBase& iter);
+at::Tensor custom_abs(at::Tensor x);
+
+} // namespace at::native::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp b/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp
new file mode 100644
index 00000000..8a3263bb
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp
@@ -0,0 +1,185 @@
+#include "Minimal.h"
+
+#include <unordered_set>
+
+namespace at::native::openreg {
+
+// LITERALINCLUDE START: EMPTY.MEMORY_FORMAT IMPL
+at::Tensor empty_memory_format(
+    c10::IntArrayRef size,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt) {
+  const auto device = c10::device_or_default(device_opt);
+  const auto dtype = c10::dtype_or_default(dtype_opt);
+  TORCH_CHECK(device.is_privateuseone());
+  TORCH_CHECK(
+      c10::layout_or_default(layout_opt) == c10::Layout::Strided,
+      "Non strided layout not supported");
+  TORCH_CHECK(
+      !c10::pinned_memory_or_default(pin_memory_opt),
+      "Pin memory can only be on CPU");
+  const c10::DeviceGuard device_guard(device);
+  constexpr c10::DispatchKeySet pu1_dks(c10::DispatchKey::PrivateUse1);
+  auto allocator = at::GetAllocator(at::kPrivateUse1);
+  return at::detail::empty_generic(
+      size, allocator, pu1_dks, dtype, memory_format_opt);
+}
+// LITERALINCLUDE END: EMPTY.MEMORY_FORMAT IMPL
+
+at::Tensor empty_strided(
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt) {
+  const auto device = c10::device_or_default(device_opt);
+  const auto dtype = c10::dtype_or_default(dtype_opt);
+  TORCH_CHECK(device.is_privateuseone());
+  TORCH_CHECK(
+      c10::layout_or_default(layout_opt) == c10::Layout::Strided,
+      "Non strided layout not supported");
+  TORCH_CHECK(
+      !c10::pinned_memory_or_default(pin_memory_opt),
+      "Pin memory can only be on CPU");
+  const c10::DeviceGuard device_guard(device);
+  constexpr c10::DispatchKeySet pu1_dks(c10::DispatchKey::PrivateUse1);
+  auto allocator = at::GetAllocator(at::kPrivateUse1);
+  return at::detail::empty_strided_generic(
+      size, stride, allocator, pu1_dks, dtype);
+}
+
+at::Tensor as_strided(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride,
+    std::optional<c10::SymInt> storage_offset) {
+  MemoryGuard guard(self);
+
+  return at::cpu::as_strided_symint(self, size, stride, storage_offset);
+}
+
+const at::Tensor& resize_(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    ::std::optional<at::MemoryFormat> memory_format) {
+  return at::native::resize_(
+      self, C10_AS_INTARRAYREF_SLOW(size), memory_format);
+}
+
+at::Tensor _reshape_alias(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride) {
+  return at::native::_reshape_alias(
+      self, C10_AS_INTARRAYREF_SLOW(size), C10_AS_INTARRAYREF_SLOW(stride));
+}
+
+at::Tensor _copy_from(
+    const at::Tensor& self,
+    const at::Tensor& dst,
+    bool non_blocking) {
+  TORCH_CHECK(self.defined(), "Source tensor (self) is not defined.");
+  TORCH_CHECK(dst.defined(), "Destination tensor (dst) is not defined.");
+
+  MemoryGuard guard(self, dst);
+
+  if (self.device() == dst.device()) {
+    at::Tensor dst_as_cpu = at::from_blob(
+        dst.data_ptr(),
+        dst.sizes(),
+        dst.strides(),
+        dst.options().device(at::kCPU));
+    const at::Tensor self_as_cpu = at::from_blob(
+        self.data_ptr(),
+        self.sizes(),
+        self.strides(),
+        self.options().device(at::kCPU));
+
+    at::native::copy_(
+        const_cast<at::Tensor&>(dst_as_cpu), self_as_cpu, non_blocking);
+
+  } else {
+    if (self.is_cpu()) {
+      at::Tensor dst_as_cpu = at::from_blob(
+          dst.data_ptr(),
+          dst.sizes(),
+          dst.strides(),
+          dst.options().device(at::kCPU));
+
+      at::native::copy_(
+          const_cast<at::Tensor&>(dst_as_cpu), self, non_blocking);
+
+    } else {
+      at::Tensor self_as_cpu = at::from_blob(
+          self.data_ptr(),
+          self.sizes(),
+          self.strides(),
+          self.options().device(at::kCPU));
+
+      at::native::copy_(
+          const_cast<at::Tensor&>(dst), self_as_cpu, non_blocking);
+    }
+  }
+
+  return dst;
+}
+
+at::Tensor _copy_from_and_resize(
+    const at::Tensor& self,
+    const at::Tensor& dst) {
+  at::native::resize_(dst, self.sizes(), std::nullopt);
+  return at::native::copy_(const_cast<at::Tensor&>(dst), self, false);
+}
+
+at::Scalar _local_scalar_dense(const at::Tensor& self) {
+  MemoryGuard guard(self);
+  return at::native::_local_scalar_dense_cpu(self);
+}
+
+at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source) {
+  return at::native::set_tensor_(self, source);
+}
+
+at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source) {
+  return at::native::set_(self, source);
+}
+
+at::Tensor& set_source_Storage_storage_offset_(
+    at::Tensor& result,
+    at::Storage storage,
+    int64_t storage_offset,
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride) {
+  return at::cpu::set_(result, storage, storage_offset, size, stride);
+}
+
+at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size) {
+  MemoryGuard guard(self);
+  return at::native::view(self, C10_AS_INTARRAYREF_SLOW(size));
+}
+
+// LITERALINCLUDE START: FALLBACK IMPL
+void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  static const std::unordered_set<c10::OperatorName> cpu_fallback_blocklist = {
+      c10::OperatorName("aten::abs", ""),
+      c10::OperatorName("aten::abs", "out"),
+  };
+
+  const auto& op_name = op.schema().operator_name();
+  if (cpu_fallback_blocklist.count(op_name)) {
+    TORCH_CHECK(
+        false,
+        "Operator '",
+        op_name,
+        "' is not implemented for device openreg.");
+  } else {
+    at::native::cpu_fallback(op, stack);
+  }
+}
+// LITERALINCLUDE END: FALLBACK IMPL
+
+} // namespace at::native::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.h b/PyTorchSimDevice2/csrc/aten/native/Minimal.h
new file mode 100644
index 00000000..a2e5cf02
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/aten/native/Minimal.h
@@ -0,0 +1,61 @@
+#include "Common.h"
+
+namespace at::native::openreg {
+
+at::Tensor empty_memory_format(
+    c10::IntArrayRef size,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
+
+at::Tensor empty_strided(
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride,
+    std::optional<c10::ScalarType> dtype_opt,
+    std::optional<c10::Layout> layout_opt,
+    std::optional<c10::Device> device_opt,
+    std::optional<bool> pin_memory_opt);
+
+at::Tensor as_strided(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride,
+    std::optional<c10::SymInt> storage_offset);
+
+const at::Tensor& resize_(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    ::std::optional<at::MemoryFormat> memory_format);
+
+at::Tensor _reshape_alias(
+    const at::Tensor& self,
+    c10::SymIntArrayRef size,
+    c10::SymIntArrayRef stride);
+
+at::Tensor _copy_from(
+    const at::Tensor& self,
+    const at::Tensor& dst,
+    bool non_blocking);
+
+at::Tensor _copy_from_and_resize(const at::Tensor& self, const at::Tensor& dst);
+
+at::Scalar _local_scalar_dense(const at::Tensor& self);
+
+at::Tensor& set_source_Tensor_(at::Tensor& self, const at::Tensor& source);
+
+at::Tensor& set_source_Storage_(at::Tensor& self, at::Storage source);
+
+at::Tensor& set_source_Storage_storage_offset_(
+    at::Tensor& result,
+    at::Storage storage,
+    int64_t storage_offset,
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride);
+
+at::Tensor view(const at::Tensor& self, c10::SymIntArrayRef size);
+
+void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+
+} // namespace at::native::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp
new file mode 100644
index 00000000..3d35b677
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp
@@ -0,0 +1,8 @@
+#include "OpenRegDeviceAllocator.h"
+
+namespace c10::openreg {
+
+static OpenRegDeviceAllocator global_openreg_alloc;
+REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_openreg_alloc);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h
new file mode 100644
index 00000000..c9aea4a9
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h
@@ -0,0 +1,43 @@
+#include <ATen/core/CachingHostAllocator.h>
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+
+#include <include/openreg.h>
+
+namespace c10::openreg {
+struct OpenRegDeviceAllocator final : at::Allocator {
+  OpenRegDeviceAllocator() = default;
+
+  static void ReportAndDelete(void* ptr) {
+    if (!ptr) {
+      return;
+    }
+    orFreeHost(ptr);
+  }
+
+  at::DataPtr allocate(size_t nbytes) override {
+    int current_device_index = -1;
+    orGetDevice(&current_device_index);
+
+    auto curr_device =
+        c10::Device(c10::DeviceType::PrivateUse1, current_device_index);
+    void* data = nullptr;
+    if (nbytes > 0) {
+      orMalloc(&data, nbytes);
+      TORCH_CHECK(
+          data, "Failed to allocator ", nbytes, " bytes on openreg device.");
+    }
+    return {data, data, &ReportAndDelete, curr_device};
+  }
+
+  at::DeleterFnPtr raw_deleter() const override {
+    return &ReportAndDelete;
+  }
+
+  void copy_data(void* dest, const void* src, std::size_t count) const final {
+    orMemcpy(dest, src, count, orMemcpyDeviceToDevice);
+  }
+};
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h b/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h
new file mode 100644
index 00000000..e869cf0d
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <include/openreg.h>
+
+#include "OpenRegException.h"
+#include "OpenRegStream.h"
+
+namespace c10::openreg {
+
+struct OpenRegEvent {
+  OpenRegEvent(bool enable_timing) noexcept : enable_timing_{enable_timing} {}
+
+  ~OpenRegEvent() {
+    if (is_created_) {
+      OPENREG_CHECK(orEventDestroy(event_));
+    }
+  }
+
+  OpenRegEvent(const OpenRegEvent&) = delete;
+  OpenRegEvent& operator=(const OpenRegEvent&) = delete;
+
+  OpenRegEvent(OpenRegEvent&& other) noexcept {
+    moveHelper(std::move(other));
+  }
+  OpenRegEvent& operator=(OpenRegEvent&& other) noexcept {
+    if (this != &other) {
+      moveHelper(std::move(other));
+    }
+    return *this;
+  }
+
+  operator orEvent_t() const {
+    return event();
+  }
+
+  std::optional<at::Device> device() const {
+    if (is_created_) {
+      return at::Device(at::kPrivateUse1, device_index_);
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  bool isCreated() const {
+    return is_created_;
+  }
+
+  DeviceIndex device_index() const {
+    return device_index_;
+  }
+
+  orEvent_t event() const {
+    return event_;
+  }
+
+  bool query() const {
+    if (!is_created_) {
+      return true;
+    }
+
+    orError_t err = orEventQuery(event_);
+    if (err == orSuccess) {
+      return true;
+    }
+
+    return false;
+  }
+
+  void record() {
+    record(getCurrentOpenRegStream());
+  }
+
+  void recordOnce(const OpenRegStream& stream) {
+    if (!was_recorded_)
+      record(stream);
+  }
+
+  void record(const OpenRegStream& stream) {
+    if (!is_created_) {
+      createEvent(stream.device_index());
+    }
+
+    TORCH_CHECK(
+        device_index_ == stream.device_index(),
+        "Event device ",
+        device_index_,
+        " does not match recording stream's device ",
+        stream.device_index(),
+        ".");
+
+    OPENREG_CHECK(orEventRecord(event_, stream));
+    was_recorded_ = true;
+  }
+
+  void block(const OpenRegStream& stream) {
+    if (is_created_) {
+      OPENREG_CHECK(orStreamWaitEvent(stream, event_, 0));
+    }
+  }
+
+  float elapsed_time(const OpenRegEvent& other) const {
+    TORCH_CHECK_VALUE(
+        !(enable_timing_ & orEventDisableTiming) &&
+            !(other.enable_timing_ & orEventDisableTiming),
+        "Both events must be created with argument 'enable_timing=True'.");
+    TORCH_CHECK_VALUE(
+        is_created_ && other.isCreated(),
+        "Both events must be recorded before calculating elapsed time.");
+    TORCH_CHECK(
+        query() && other.query(),
+        "Both events must be completed before calculating elapsed time.");
+
+    float time_ms = 0;
+    OPENREG_CHECK(orEventElapsedTime(&time_ms, event_, other.event_));
+    return time_ms;
+  }
+
+  void synchronize() const {
+    if (is_created_) {
+      OPENREG_CHECK(orEventSynchronize(event_));
+    }
+  }
+
+ private:
+  unsigned int enable_timing_{orEventDisableTiming};
+  bool is_created_{false};
+  bool was_recorded_{false};
+  DeviceIndex device_index_{-1};
+  orEvent_t event_{};
+
+  void createEvent(DeviceIndex device_index) {
+    device_index_ = device_index;
+    OPENREG_CHECK(orEventCreateWithFlags(&event_, enable_timing_));
+    is_created_ = true;
+  }
+
+  void moveHelper(OpenRegEvent&& other) {
+    std::swap(enable_timing_, other.enable_timing_);
+    std::swap(is_created_, other.is_created_);
+    std::swap(was_recorded_, other.was_recorded_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+};
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp
new file mode 100644
index 00000000..09eb09b6
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp
@@ -0,0 +1,9 @@
+#include "OpenRegException.h"
+
+void orCheckFail(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg) {
+  throw ::c10::Error({func, file, line}, msg);
+}
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.h b/PyTorchSimDevice2/csrc/runtime/OpenRegException.h
new file mode 100644
index 00000000..16c1ee1c
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegException.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <include/openreg.h>
+
+#include <c10/util/Exception.h>
+
+void orCheckFail(
+    const char* func,
+    const char* file,
+    uint32_t line,
+    const char* msg = "");
+
+#define OPENREG_CHECK(EXPR, ...)                                               \
+  do {                                                                         \
+    const orError_t __err = EXPR;                                              \
+    if (__err != orSuccess) {                                                  \
+      orCheckFail(                                                             \
+          __func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
+    }                                                                          \
+  } while (0)
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp
new file mode 100644
index 00000000..566bacd0
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp
@@ -0,0 +1,74 @@
+#include <include/openreg.h>
+
+#include "OpenRegException.h"
+#include "OpenRegFunctions.h"
+
+namespace c10::openreg {
+
+orError_t GetDeviceCount(int* dev_count) {
+  return orGetDeviceCount(dev_count);
+}
+
+orError_t GetDevice(c10::DeviceIndex* device) {
+  int tmp_device = -1;
+  auto err = orGetDevice(&tmp_device);
+  *device = static_cast<c10::DeviceIndex>(tmp_device);
+  return err;
+}
+
+orError_t SetDevice(c10::DeviceIndex device) {
+  int cur_device = -1;
+  orGetDevice(&cur_device);
+  if (device == cur_device) {
+    return orSuccess;
+  }
+  return orSetDevice(device);
+}
+
+int device_count_impl() {
+  int count = 0;
+  GetDeviceCount(&count);
+  return count;
+}
+
+OPENREG_EXPORT c10::DeviceIndex device_count() noexcept {
+  // initialize number of devices only once
+  static int count = []() {
+    try {
+      auto result = device_count_impl();
+      TORCH_INTERNAL_ASSERT(
+          result <= std::numeric_limits<c10::DeviceIndex>::max(),
+          "Too many devices, DeviceIndex overflowed");
+      return result;
+    } catch (const c10::Error& ex) {
+      // We don't want to fail, but still log the warning
+      // msg() returns the message without the stack trace
+      TORCH_WARN("Device initialization: ", ex.msg());
+      return 0;
+    }
+  }();
+  return static_cast<c10::DeviceIndex>(count);
+}
+
+OPENREG_EXPORT c10::DeviceIndex current_device() {
+  c10::DeviceIndex cur_device = -1;
+  GetDevice(&cur_device);
+  return cur_device;
+}
+
+OPENREG_EXPORT void set_device(c10::DeviceIndex device) {
+  SetDevice(device);
+}
+
+OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device) {
+  int current_device = -1;
+  orGetDevice(&current_device);
+
+  if (device != current_device) {
+    orSetDevice(device);
+  }
+
+  return current_device;
+}
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h
new file mode 100644
index 00000000..c2eb1e80
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <c10/core/Device.h>
+#include <c10/macros/Macros.h>
+
+#include <include/Macros.h>
+
+#include <limits>
+
+namespace c10::openreg {
+
+OPENREG_EXPORT c10::DeviceIndex device_count() noexcept;
+OPENREG_EXPORT c10::DeviceIndex current_device();
+OPENREG_EXPORT void set_device(c10::DeviceIndex device);
+
+OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp
new file mode 100644
index 00000000..c2e03f66
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp
@@ -0,0 +1,28 @@
+#include "OpenRegGenerator.h"
+
+// Default, global generators, one per device.
+static std::vector<at::Generator> default_generators;
+
+namespace c10::openreg {
+
+const at::Generator& getDefaultOpenRegGenerator(c10::DeviceIndex device_index) {
+  static bool flag [[maybe_unused]] = []() {
+    auto deivce_nums = device_count();
+    default_generators.resize(deivce_nums);
+    for (auto i = 0; i < deivce_nums; i++) {
+      default_generators[i] = at::make_generator<OpenRegGeneratorImpl>(i);
+      default_generators[i].seed();
+    }
+    return true;
+  }();
+
+  c10::DeviceIndex idx = device_index;
+  if (idx == -1) {
+    idx = current_device();
+  } else {
+    TORCH_CHECK(idx >= 0 && idx < device_count());
+  }
+  return default_generators[idx];
+}
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h
new file mode 100644
index 00000000..877a9707
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h
@@ -0,0 +1,21 @@
+#include <ATen/CPUGeneratorImpl.h>
+#include <ATen/core/GeneratorForPrivateuseone.h>
+
+#include <c10/core/Device.h>
+
+#include "OpenRegFunctions.h"
+
+namespace c10::openreg {
+class OpenRegGeneratorImpl : public at::CPUGeneratorImpl {
+ public:
+  OpenRegGeneratorImpl(c10::DeviceIndex device_index) {
+    device_ = c10::Device(c10::DeviceType::PrivateUse1, device_index);
+    key_set_ = c10::DispatchKeySet(c10::DispatchKey::PrivateUse1);
+  }
+  ~OpenRegGeneratorImpl() override = default;
+};
+
+const at::Generator& getDefaultOpenRegGenerator(
+    c10::DeviceIndex device_index = -1);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp
new file mode 100644
index 00000000..d50e56e4
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp
@@ -0,0 +1,7 @@
+#include "OpenRegGuard.h"
+
+namespace c10::openreg {
+
+C10_REGISTER_GUARD_IMPL(PrivateUse1, OpenRegGuardImpl);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h
new file mode 100644
index 00000000..f0150fe6
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h
@@ -0,0 +1,197 @@
+#include <c10/core/Device.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+
+#include <include/openreg.h>
+
+#include "OpenRegFunctions.h"
+
+namespace c10::openreg {
+
+// Device guard registration
+struct OpenRegGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1;
+
+  OpenRegGuardImpl() = default;
+  explicit OpenRegGuardImpl(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == static_type);
+  }
+
+  /**
+   * Return the type of device managed by this guard implementation.
+   */
+  c10::DeviceType type() const override {
+    return static_type;
+  }
+
+  /**
+   * Set the current device to Device, and return the previous c10::Device.
+   */
+  c10::Device exchangeDevice(c10::Device d) const override {
+    TORCH_CHECK(d.is_privateuseone());
+
+    auto old_device_index = ExchangeDevice(d.index());
+    return c10::Device(static_type, old_device_index);
+  }
+
+  /**
+   * Get the current device.
+   */
+  c10::Device getDevice() const override {
+    int device_index = current_device();
+    return c10::Device(static_type, device_index);
+  }
+
+  /**
+   * Set the current device to c10::Device.
+   */
+  void setDevice(c10::Device d) const override {
+    TORCH_CHECK(d.is_privateuseone());
+
+    set_device(d.index());
+  }
+
+  /**
+   * Set the current device to c10::Device, without checking for errors
+   * (so, e.g., this can be called from a destructor).
+   */
+  void uncheckedSetDevice(c10::Device d) const noexcept override {
+    TORCH_CHECK(d.is_privateuseone());
+
+    set_device(d.index());
+  }
+
+  /**
+   * Get the current stream for a given device.
+   */
+  c10::Stream getStream(c10::Device d) const noexcept override {
+    return c10::Stream(c10::Stream::DEFAULT, d);
+  }
+
+  /**
+   * Get the default stream for a given device.
+   */
+  c10::Stream getDefaultStream(c10::Device d) const override {
+    return c10::Stream(c10::Stream::DEFAULT, d);
+  }
+
+  /**
+   * Get a stream from the global pool for a given device.
+   */
+  c10::Stream getStreamFromGlobalPool(
+      c10::Device d,
+      bool isHighPriority = false) const override {
+    return c10::Stream(c10::Stream::DEFAULT, d);
+  }
+
+  /**
+   * Return a new stream for a given device and priority. The stream will be
+   * copied and shared around, device backend should be able to correctly handle
+   * the lifetime of the stream.
+   */
+  c10::Stream getNewStream(c10::Device d, int priority = 0) const override {
+    return c10::Stream(c10::Stream::DEFAULT, d);
+  }
+
+  /**
+   * Set a stream to be the thread local current stream for its device.
+   * Return the previous stream for that device. You are NOT required
+   * to set the current device to match the device of this stream.
+   */
+  c10::Stream exchangeStream(c10::Stream s) const noexcept override {
+    return s;
+  }
+
+  /**
+   * Destroys the given event.
+   */
+  void destroyEvent(void* event, const c10::DeviceIndex device_index)
+      const noexcept override {}
+
+  /**
+   * Increments the event's version and enqueues a job with this version
+   * in the stream's work queue. When the stream process that job
+   * it notifies all streams waiting on / blocked by that version of the
+   * event to continue and marks that version as recorded.
+   * */
+  void record(
+      void** event,
+      const c10::Stream& stream,
+      const c10::DeviceIndex device_index,
+      const c10::EventFlag flag) const override {
+    static int event_id = 1;
+
+    if (!*event)
+      *event = reinterpret_cast<void*>(event_id++);
+  }
+
+  /**
+   * Does nothing if the event has not been scheduled to be recorded.
+   * If the event was previously enqueued to be recorded, a command
+   * to wait for the version of the event that exists at the time of this call
+   * is inserted in the stream's work queue.
+   * When the stream reaches this command it will stop processing
+   * additional commands until that version of the event is marked as recorded.
+   */
+  void block(void* event, const c10::Stream& stream) const override {}
+
+  /**
+   * Returns true if (and only if)
+   *  (1) the event has never been scheduled to be recorded
+   *  (2) the current version is marked as recorded.
+   * Returns false otherwise.
+   */
+  bool queryEvent(void* event) const override {
+    return true;
+  }
+
+  /**
+   * Get the number of devices.  WARNING: This is REQUIRED to not raise
+   * an exception.  If there is some sort of problem, e.g., driver error,
+   * you should report that there are zero available devices.
+   */
+  c10::DeviceIndex deviceCount() const noexcept override {
+    int device_index = -1;
+    orGetDeviceCount(&device_index);
+    return device_index;
+  }
+  /**
+   * Return true if all the work previously enqueued on the stream for
+   * asynchronous execution has completed running on the device.
+   */
+  bool queryStream(const c10::Stream& stream) const override {
+    return true;
+  }
+
+  /**
+   * Wait (by blocking the calling thread) until all the work previously
+   * enqueued on the stream has completed running on the device.
+   */
+  void synchronizeStream(const c10::Stream& stream) const override {}
+
+  /**
+   * Wait (by blocking the calling thread) until all the work previously
+   * recorded on the event has completed running on the device.
+   */
+  void synchronizeEvent(void* event) const override {}
+
+  /**
+   * Ensure the caching allocator (if any) is aware that the given DataPtr is
+   * being used on the given stream, and that it should thus avoid recycling the
+   * DataPtr until all work on that stream is done.
+   */
+  void recordDataPtrOnStream(
+      const c10::DataPtr& data_ptr,
+      const c10::Stream& stream) const override {}
+
+  /**
+   * Fetch the elapsed time between two recorded events.
+   */
+  double elapsedTime(
+      void* event1,
+      void* event2,
+      const c10::DeviceIndex device_index) const override {
+    return 1;
+  }
+};
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp
new file mode 100644
index 00000000..57bc2d9f
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp
@@ -0,0 +1,11 @@
+#include "OpenRegHooks.h"
+
+namespace c10::openreg {
+
+static bool register_hook_flag [[maybe_unused]] = []() {
+  at::RegisterPrivateUse1HooksInterface(new OpenRegHooksInterface());
+
+  return true;
+}();
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h
new file mode 100644
index 00000000..656fba8e
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h
@@ -0,0 +1,41 @@
+#include <ATen/core/CachingHostAllocator.h>
+#include <ATen/detail/PrivateUse1HooksInterface.h>
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+
+#include <include/openreg.h>
+
+#include "OpenRegGenerator.h"
+
+namespace c10::openreg {
+struct OpenRegHooksInterface : public at::PrivateUse1HooksInterface {
+  OpenRegHooksInterface() {};
+  ~OpenRegHooksInterface() override = default;
+
+  bool hasPrimaryContext(c10::DeviceIndex device_index) const override {
+    return true;
+  }
+
+  at::Allocator* getPinnedMemoryAllocator() const override {
+    return at::getHostAllocator(at::kPrivateUse1);
+  }
+
+  bool isPinnedPtr(const void* data) const override {
+    orPointerAttributes attr{};
+    orPointerGetAttributes(&attr, data);
+
+    return attr.type == orMemoryTypeHost;
+  }
+
+  const at::Generator& getDefaultGenerator(
+      c10::DeviceIndex device_index) const override {
+    return getDefaultOpenRegGenerator(device_index);
+  }
+
+  at::Generator getNewGenerator(c10::DeviceIndex device_index) const override {
+    return at::make_generator<OpenRegGeneratorImpl>(device_index);
+  }
+};
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp
new file mode 100644
index 00000000..55263803
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp
@@ -0,0 +1,8 @@
+#include "OpenRegHostAllocator.h"
+
+namespace c10::openreg {
+
+OpenRegHostAllocator caching_host_allocator;
+REGISTER_HOST_ALLOCATOR(at::kPrivateUse1, &caching_host_allocator);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h
new file mode 100644
index 00000000..edef545a
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h
@@ -0,0 +1,48 @@
+#include <ATen/core/CachingHostAllocator.h>
+
+#include <c10/core/Allocator.h>
+#include <c10/core/Device.h>
+
+#include <include/openreg.h>
+
+namespace c10::openreg {
+struct OpenRegHostAllocator final : at::HostAllocator {
+  OpenRegHostAllocator() = default;
+
+  static void ReportAndDelete(void* ptr) {
+    if (!ptr) {
+      return;
+    }
+    orFreeHost(ptr);
+  }
+
+  at::DataPtr allocate(size_t nbytes) override {
+    void* data = nullptr;
+    if (nbytes > 0) {
+      orMallocHost(&data, nbytes);
+      TORCH_CHECK(data, "Failed to allocator ", nbytes, " bytes on host.");
+    }
+    return {data, data, &ReportAndDelete, at::Device(at::kCPU)};
+  }
+
+  at::DeleterFnPtr raw_deleter() const override {
+    return &ReportAndDelete;
+  }
+
+  void copy_data(void* dest, const void* src, std::size_t count) const final {
+    orMemcpy(dest, src, count, orMemcpyHostToHost);
+  }
+
+  // ignore
+  bool record_event(void* ptr, void* ctx, c10::Stream stream) override {
+    return true;
+  }
+  void empty_cache() override {}
+  at::HostStats get_stats() override {
+    return at::HostStats();
+  }
+  void reset_accumulated_stats() override {}
+  void reset_peak_stats() override {}
+};
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp
new file mode 100644
index 00000000..43809d60
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp
@@ -0,0 +1,48 @@
+#include "OpenRegSerialization.h"
+
+namespace c10::openreg {
+struct OpenRegBackendMeta : public c10::BackendMeta {
+  OpenRegBackendMeta(int version_number, int format_number)
+      : version_number_(version_number), format_number_(format_number) {}
+
+  int version_number_{-1};
+  int format_number_{-1};
+};
+
+void for_serialization(
+    const at::Tensor& t,
+    std::unordered_map<std::string, bool>& m) {
+  auto meta_ptr = t.unsafeGetTensorImpl()->get_backend_meta();
+
+  if (meta_ptr != nullptr) {
+    auto o_meta_ptr = dynamic_cast<OpenRegBackendMeta*>(meta_ptr);
+    if (o_meta_ptr->version_number_ == 1) {
+      m["version_number"] = true;
+    }
+    if (o_meta_ptr->format_number_ == 29) {
+      m["format_number"] = true;
+    }
+  }
+}
+
+void for_deserialization(
+    const at::Tensor& t,
+    std::unordered_map<std::string, bool>& m) {
+  int version_number{-1};
+  int format_number{-1};
+
+  if (m.find("version_number") != m.end()) {
+    version_number = 1;
+  }
+  if (m.find("format_number") != m.end()) {
+    format_number = 29;
+  }
+
+  c10::intrusive_ptr<c10::BackendMeta> meta{std::unique_ptr<c10::BackendMeta>(
+      new OpenRegBackendMeta(version_number, format_number))};
+  t.unsafeGetTensorImpl()->set_backend_meta(meta);
+}
+
+REGISTER_PRIVATEUSE1_SERIALIZATION(&for_serialization, &for_deserialization)
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h
new file mode 100644
index 00000000..559e92ea
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h
@@ -0,0 +1,10 @@
+#include <torch/csrc/jit/serialization/pickler.h>
+
+#define REGISTER_PRIVATEUSE1_SERIALIZATION(                                    \
+    FOR_SERIALIZATION, FOR_DESERIALIZATION)                                    \
+  static int register_serialization() {                                        \
+    torch::jit::TensorBackendMetaRegistry(                                     \
+        c10::DeviceType::PrivateUse1, FOR_SERIALIZATION, FOR_DESERIALIZATION); \
+    return 0;                                                                  \
+  }                                                                            \
+  static const int _temp = register_serialization();
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp
new file mode 100644
index 00000000..aa6c325d
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp
@@ -0,0 +1,253 @@
+#include "OpenRegStream.h"
+
+#include <c10/util/CallOnce.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <deque>
+
+namespace c10::openreg {
+
+namespace {
+
+// Global stream state and constants
+static c10::once_flag init_flag;
+
+static DeviceIndex num_devices = -1;
+static constexpr int kStreamsPerPoolBits = 5;
+static constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;
+static constexpr int kStreamTypeBits = 2;
+
+/*
+ * The stream pools are lazily initialized when the first queue is requested
+ * for a device. The device flags track the initialization of each device. When
+ * a queue is requested, the next queue in the pool to be returned in a
+ * round-robin fashion, see Note [Stream Management].
+ */
+static std::deque<c10::once_flag> device_flags;
+static std::vector<std::array<
+    std::array<orStream_t, kStreamsPerPool>,
+    c10::openreg::max_compile_time_stream_priorities>>
+    streams;
+static std::deque<
+    std::array<std::atomic<uint32_t>, max_compile_time_stream_priorities>>
+    priority_counters;
+
+static thread_local std::unique_ptr<StreamId[]> current_streams = nullptr;
+
+/*
+ * Note [StreamId assignment]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * How do we assign stream IDs?
+ *
+ * -- 56 bits --    -- 5 bits --     -- 2 bits --     -- 1 bit --
+ *     zeros       StreamIdIndex     StreamIdType    Ext/native stream
+ *                ignored for ext   ignored for ext
+ *
+ * Where StreamIdType:
+ *  00 = default stream
+ *  01 = normal stream
+ *  11 = external stream
+ *
+ * For external stream, StreamID is a orStream_t pointer. This means that last
+ * bit will always be 0. So when constructing StreamId for a native stream we
+ * set last bit to 1 to distinguish between native and external streams.
+ *
+ * StreamId is 64-bit, so we can just rely on regular promotion rules.
+ * We rely on StreamIdIndex and StreamIdType being non-negative;
+ */
+using StreamIdIndex = uint8_t;
+enum class StreamIdType : uint8_t {
+  DEFAULT = 0x0,
+  NORMAL = 0x1,
+  EXT = 0x3,
+};
+
+inline std::ostream& operator<<(std::ostream& stream, StreamIdType s) {
+  switch (s) {
+    case StreamIdType::DEFAULT:
+      return stream << "DEFAULT";
+    case StreamIdType::NORMAL:
+      return stream << "NORMAL";
+    case StreamIdType::EXT:
+      return stream << "EXT";
+    default:
+      break;
+  }
+
+  return stream << static_cast<int16_t>(s);
+}
+
+static inline StreamIdType streamIdType(StreamId s) {
+  // Externally allocated streams have their id being the orStream_ptr
+  // so the last bit will be 0
+  if (!(s & 1)) {
+    return StreamIdType(StreamIdType::EXT);
+  }
+
+  int mask_for_type = (1 << kStreamTypeBits) - 1;
+  auto st = static_cast<StreamIdType>((s >> 1) & mask_for_type);
+  TORCH_CHECK(
+      st == StreamIdType::DEFAULT || st == StreamIdType::NORMAL,
+      "invalid StreamId: ",
+      s);
+  return st;
+}
+
+static inline size_t streamIdIndex(StreamId s) {
+  return static_cast<size_t>(
+      (s >> (kStreamTypeBits + 1)) & ((1 << kStreamsPerPoolBits) - 1));
+}
+
+StreamId makeStreamId(StreamIdType st, size_t si) {
+  if (st == StreamIdType::EXT) {
+    return static_cast<StreamId>(0);
+  }
+
+  return (static_cast<StreamId>(si) << (kStreamTypeBits + 1)) |
+      (static_cast<StreamId>(st) << 1) | 1;
+}
+
+static void initGlobalStreamState() {
+  num_devices = device_count();
+  device_flags.resize(num_devices);
+  streams.resize(num_devices);
+  priority_counters.resize(num_devices);
+}
+
+static void initSingleDeviceStream(
+    int priority,
+    DeviceIndex device_index,
+    int i) {
+  auto& stream = streams[device_index][priority][i];
+
+  OPENREG_CHECK(orStreamCreateWithPriority(&stream, 0, priority));
+  priority_counters[device_index][priority] = 0;
+}
+
+// Creates stream pools for the specified device. It should be call only once.
+static void initDeviceStreamState(DeviceIndex device_index) {
+  for (const auto i : c10::irange(kStreamsPerPool)) {
+    for (const auto p : c10::irange(max_compile_time_stream_priorities)) {
+      initSingleDeviceStream(p, device_index, i);
+    }
+  }
+}
+
+static void initOpenRegStreamsOnce() {
+  c10::call_once(init_flag, initGlobalStreamState);
+
+  if (current_streams) {
+    return;
+  }
+
+  // Inits current streams (thread local) to the last queue in the "normal
+  // priority" queue pool. Note: the queue pool have not been initialized yet.
+  // It will be initialized in initDeviceStreamState for the specified device.
+  current_streams = std::make_unique<StreamId[]>(num_devices);
+  for (const auto i : c10::irange(num_devices)) {
+    current_streams[i] = makeStreamId(StreamIdType::DEFAULT, 0);
+  }
+}
+
+static uint32_t get_idx(std::atomic<uint32_t>& counter) {
+  auto raw_idx = counter++;
+  return raw_idx % kStreamsPerPool;
+}
+
+OpenRegStream OpenRegStreamForId(DeviceIndex device_index, StreamId stream_id) {
+  return OpenRegStream(
+      OpenRegStream::UNCHECKED,
+      Stream(
+          Stream::UNSAFE,
+          c10::Device(DeviceType::PrivateUse1, device_index),
+          stream_id));
+}
+
+} // anonymous namespace
+
+// See Note [StreamId assignment]
+orStream_t OpenRegStream::stream() const {
+  c10::DeviceIndex device_index = stream_.device_index();
+  StreamId stream_id = stream_.id();
+  StreamIdType st = streamIdType(stream_id);
+  size_t si = streamIdIndex(stream_id);
+  switch (st) {
+    // The index 0 stream is default as well.
+    case StreamIdType::DEFAULT:
+    case StreamIdType::NORMAL:
+      return streams[device_index][static_cast<uint8_t>(st)][si];
+    case StreamIdType::EXT:
+      return reinterpret_cast<orStream_t>(stream_id);
+    default:
+      TORCH_CHECK(
+          false,
+          "Unrecognized stream ",
+          stream_,
+          " (I didn't recognize the stream type, ",
+          st,
+          ").",
+          " Did you manufacture the StreamId yourself?  Don't do that;");
+  }
+}
+
+// Returns a stream from the requested pool
+// Note: when called the first time on a device, this will create the
+// stream pools for that device.
+OpenRegStream getStreamFromPool(const int priority, DeviceIndex device_index) {
+  initOpenRegStreamsOnce();
+  if (device_index == -1) {
+    device_index = current_device();
+  }
+  c10::call_once(
+      device_flags[device_index], initDeviceStreamState, device_index);
+  auto pri_idx =
+      std::clamp(priority, 0, max_compile_time_stream_priorities - 1);
+  const auto idx = get_idx(priority_counters[device_index][pri_idx]);
+  auto id_type = static_cast<StreamIdType>(pri_idx);
+  return OpenRegStreamForId(device_index, makeStreamId(id_type, idx));
+}
+
+OpenRegStream getStreamFromPool(const bool isHighPriority, DeviceIndex device) {
+  initOpenRegStreamsOnce();
+  int priority = 0;
+  return getStreamFromPool(priority, device);
+}
+
+OpenRegStream getStreamFromExternal(
+    orStream_t ext_stream,
+    DeviceIndex device_index) {
+  return OpenRegStreamForId(
+      device_index, reinterpret_cast<int64_t>(ext_stream));
+}
+
+OpenRegStream getDefaultOpenRegStream(DeviceIndex device_index) {
+  initOpenRegStreamsOnce();
+  if (device_index == -1) {
+    device_index = current_device();
+  }
+  return OpenRegStreamForId(
+      device_index, makeStreamId(StreamIdType::DEFAULT, 0));
+}
+
+OpenRegStream getCurrentOpenRegStream(DeviceIndex device_index) {
+  initOpenRegStreamsOnce();
+  if (device_index == -1) {
+    device_index = current_device();
+  }
+  return OpenRegStreamForId(device_index, current_streams[device_index]);
+}
+
+void setCurrentOpenRegStream(OpenRegStream stream) {
+  initOpenRegStreamsOnce();
+  current_streams[stream.device_index()] = stream.id();
+}
+
+std::ostream& operator<<(std::ostream& stream, const OpenRegStream& s) {
+  return stream << s.unwrap();
+}
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h
new file mode 100644
index 00000000..e1fd0c71
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h
@@ -0,0 +1,162 @@
+#pragma once
+
+#include <include/openreg.h>
+
+#include "OpenRegException.h"
+#include "OpenRegFunctions.h"
+
+#include <c10/core/DeviceGuard.h>
+#include <c10/core/Stream.h>
+#include <c10/util/Exception.h>
+
+namespace c10::openreg {
+
+static constexpr int max_compile_time_stream_priorities = 1;
+
+class OpenRegStream {
+ public:
+  enum Unchecked { UNCHECKED };
+
+  explicit OpenRegStream(Stream stream) : stream_(stream) {
+    TORCH_CHECK(stream_.device_type() == DeviceType::PrivateUse1);
+  }
+
+  explicit OpenRegStream(Unchecked, Stream stream) : stream_(stream) {}
+
+  bool operator==(const OpenRegStream& other) const noexcept {
+    return unwrap() == other.unwrap();
+  }
+
+  bool operator!=(const OpenRegStream& other) const noexcept {
+    return unwrap() != other.unwrap();
+  }
+
+  operator orStream_t() const {
+    return stream();
+  }
+
+  operator Stream() const {
+    return unwrap();
+  }
+
+  DeviceType device_type() const {
+    return DeviceType::PrivateUse1;
+  }
+
+  DeviceIndex device_index() const {
+    return stream_.device_index();
+  }
+
+  Device device() const {
+    return Device(DeviceType::PrivateUse1, device_index());
+  }
+
+  StreamId id() const {
+    return stream_.id();
+  }
+
+  bool query() const {
+    DeviceGuard guard{stream_.device()};
+
+    if (orStreamQuery(stream()) == orSuccess) {
+      return true;
+    }
+
+    return false;
+  }
+
+  void synchronize() const {
+    DeviceGuard guard{stream_.device()};
+    OPENREG_CHECK(orStreamSynchronize(stream()));
+  }
+
+  int priority() const {
+    DeviceGuard guard{stream_.device()};
+    int priority = 0;
+    OPENREG_CHECK(orStreamGetPriority(stream(), &priority));
+    return priority;
+  }
+
+  orStream_t stream() const;
+
+  Stream unwrap() const {
+    return stream_;
+  }
+
+  struct c10::StreamData3 pack3() const {
+    return stream_.pack3();
+  }
+
+  static OpenRegStream unpack3(
+      StreamId stream_id,
+      DeviceIndex device_index,
+      DeviceType device_type) {
+    return OpenRegStream(Stream::unpack3(stream_id, device_index, device_type));
+  }
+
+ private:
+  Stream stream_;
+};
+
+/*
+ * Get a stream from the pool in a round-robin fashion.
+ *
+ * You can request a stream from the highest priority pool by setting
+ * isHighPriority to true for a specific device.
+ */
+OPENREG_EXPORT OpenRegStream
+getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
+
+/*
+ * Get a stream from the pool in a round-robin fashion.
+ *
+ * You can request a stream by setting a priority value for a specific device.
+ * The priority number lower, the priority higher.
+ */
+OPENREG_EXPORT OpenRegStream
+getStreamFromPool(const int priority, DeviceIndex device = -1);
+
+/*
+ * Get a OpenRegStream from a externally allocated one.
+ *
+ * This is mainly for interoperability with different libraries where we
+ * want to operate on a non-torch allocated stream for data exchange or similar
+ * purposes
+ */
+OPENREG_EXPORT OpenRegStream
+getStreamFromExternal(orStream_t ext_stream, DeviceIndex device_index);
+
+/*
+ * Get the default OpenReg stream, for the passed OpenReg device, or for the
+ * current device if no device index is passed.
+ */
+OPENREG_EXPORT OpenRegStream
+getDefaultOpenRegStream(DeviceIndex device_index = -1);
+
+/*
+ * Get the current OpenReg stream, for the passed OpenReg device, or for the
+ * current device if no device index is passed.
+ */
+OPENREG_EXPORT OpenRegStream
+getCurrentOpenRegStream(DeviceIndex device_index = -1);
+
+/*
+ * Set the current stream on the device of the passed in stream to be the passed
+ * in stream.
+ */
+OPENREG_EXPORT void setCurrentOpenRegStream(OpenRegStream stream);
+
+OPENREG_EXPORT std::ostream& operator<<(
+    std::ostream& stream,
+    const OpenRegStream& s);
+
+} // namespace c10::openreg
+
+namespace std {
+template <>
+struct hash<c10::openreg::OpenRegStream> {
+  size_t operator()(c10::openreg::OpenRegStream s) const noexcept {
+    return std::hash<c10::Stream>{}(s.unwrap());
+  }
+};
+} // namespace std
diff --git a/PyTorchSimDevice2/include/Macros.h b/PyTorchSimDevice2/include/Macros.h
new file mode 100644
index 00000000..c75523c2
--- /dev/null
+++ b/PyTorchSimDevice2/include/Macros.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#ifdef _WIN32
+#define OPENREG_EXPORT __declspec(dllexport)
+#else
+#define OPENREG_EXPORT __attribute__((visibility("default")))
+#endif
diff --git a/PyTorchSimDevice2/pyproject.toml b/PyTorchSimDevice2/pyproject.toml
new file mode 100644
index 00000000..774fe5cd
--- /dev/null
+++ b/PyTorchSimDevice2/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = [
+    "setuptools",
+    "wheel",
+    "torch", # Needed by setup.py for getting include of PyTorch
+]
+
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "torch_openreg"
+version = "0.0.1"
+description = "A minimal reference implementation of an out-of-tree backend"
+readme = "README.md"
+requires-python = ">=3.9"
+license = { text = "BSD-3-Clause" }
+authors = [{ name = "PyTorch Team", email = "packages@pytorch.org" }]
+dependencies = [
+    "torch",
+]
+# Add classifiers info for making lint happy
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Programming Language :: C++",
+    "Programming Language :: Python :: 3 :: Only",
+]
+
+[project.urls]
+Homepage = "https://pytorch.org"
+Repository = "https://github.com/pytorch/pytorch"
+Documentation = "https://pytorch.org/docs"
+Forum = "https://discuss.pytorch.org"
diff --git a/PyTorchSimDevice2/setup.py b/PyTorchSimDevice2/setup.py
new file mode 100644
index 00000000..01e2f065
--- /dev/null
+++ b/PyTorchSimDevice2/setup.py
@@ -0,0 +1,148 @@
+import multiprocessing
+import os
+import platform
+import shutil
+import subprocess
+import sys
+import sysconfig
+from distutils.command.clean import clean
+
+from setuptools import Extension, find_packages, setup
+
+
+# Env Variables
+IS_DARWIN = platform.system() == "Darwin"
+IS_WINDOWS = platform.system() == "Windows"
+
+BASE_DIR = os.path.dirname(os.path.realpath(__file__))
+RUN_BUILD_DEPS = any(arg in {"clean", "dist_info"} for arg in sys.argv)
+
+
+def make_relative_rpath_args(path):
+    if IS_DARWIN:
+        return ["-Wl,-rpath,@loader_path/" + path]
+    elif IS_WINDOWS:
+        return []
+    else:
+        return ["-Wl,-rpath,$ORIGIN/" + path]
+
+
+def get_pytorch_dir():
+    os.environ["TORCH_DEVICE_BACKEND_AUTOLOAD"] = "0"
+    import torch
+
+    return os.path.dirname(os.path.realpath(torch.__file__))
+
+
+def build_deps():
+    build_dir = os.path.join(BASE_DIR, "build")
+    os.makedirs(build_dir, exist_ok=True)
+
+    cmake_args = [
+        "-DCMAKE_INSTALL_PREFIX="
+        + os.path.realpath(os.path.join(BASE_DIR, "torch_openreg")),
+        "-DPYTHON_INCLUDE_DIR=" + sysconfig.get_paths().get("include"),
+        "-DPYTORCH_INSTALL_DIR=" + get_pytorch_dir(),
+    ]
+
+    subprocess.check_call(
+        ["cmake", BASE_DIR] + cmake_args, cwd=build_dir, env=os.environ
+    )
+
+    build_args = [
+        "--build",
+        ".",
+        "--target",
+        "install",
+        "--config",  # For multi-config generators
+        "Release",
+        "--",
+    ]
+
+    if IS_WINDOWS:
+        build_args += ["/m:" + str(multiprocessing.cpu_count())]
+    else:
+        build_args += ["-j", str(multiprocessing.cpu_count())]
+
+    command = ["cmake"] + build_args
+    subprocess.check_call(command, cwd=build_dir, env=os.environ)
+
+
+class BuildClean(clean):
+    def run(self):
+        for i in ["build", "install", "torch_openreg/lib"]:
+            dirs = os.path.join(BASE_DIR, i)
+            if os.path.exists(dirs) and os.path.isdir(dirs):
+                shutil.rmtree(dirs)
+
+        for dirpath, _, filenames in os.walk(os.path.join(BASE_DIR, "torch_openreg")):
+            for filename in filenames:
+                if filename.endswith(".so"):
+                    os.remove(os.path.join(dirpath, filename))
+
+
+def main():
+    if not RUN_BUILD_DEPS:
+        build_deps()
+
+    if IS_WINDOWS:
+        # /NODEFAULTLIB makes sure we only link to DLL runtime
+        # and matches the flags set for protobuf and ONNX
+        extra_link_args: list[str] = ["/NODEFAULTLIB:LIBCMT.LIB"] + [
+            *make_relative_rpath_args("lib")
+        ]
+        # /MD links against DLL runtime
+        # and matches the flags set for protobuf and ONNX
+        # /EHsc is about standard C++ exception handling
+        extra_compile_args: list[str] = ["/MD", "/FS", "/EHsc"]
+    else:
+        extra_link_args = [*make_relative_rpath_args("lib")]
+        extra_compile_args = [
+            "-Wall",
+            "-Wextra",
+            "-Wno-strict-overflow",
+            "-Wno-unused-parameter",
+            "-Wno-missing-field-initializers",
+            "-Wno-unknown-pragmas",
+            "-fno-strict-aliasing",
+        ]
+
+    ext_modules = [
+        Extension(
+            name="torch_openreg._C",
+            sources=["torch_openreg/csrc/stub.c"],
+            language="c",
+            extra_compile_args=extra_compile_args,
+            libraries=["torch_bindings"],
+            library_dirs=[os.path.join(BASE_DIR, "torch_openreg/lib")],
+            extra_link_args=extra_link_args,
+        )
+    ]
+
+    package_data = {
+        "torch_openreg": [
+            "lib/*.so*",
+            "lib/*.dylib*",
+            "lib/*.dll",
+            "lib/*.lib",
+        ]
+    }
+
+    setup(
+        packages=find_packages(),
+        package_data=package_data,
+        ext_modules=ext_modules,
+        cmdclass={
+            "clean": BuildClean,  # type: ignore[misc]
+        },
+        include_package_data=False,
+        entry_points={
+            "torch.backends": [
+                "torch_openreg = torch_openreg:_autoload",
+            ],
+        },
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt b/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt
new file mode 100644
index 00000000..1bde7e00
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+project(TORCH_OPENREG CXX C)
+
+
+set(LIBRARY_NAME openreg)
+set(LIBRARY_TEST ortests)
+
+file(GLOB_RECURSE SOURCE_FILES
+    "${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp"
+)
+
+add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
+
+target_include_directories(${LIBRARY_NAME} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+
+install(TARGETS ${LIBRARY_NAME}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
diff --git a/PyTorchSimDevice2/third_party/openreg/README.md b/PyTorchSimDevice2/third_party/openreg/README.md
new file mode 100644
index 00000000..0cee2c87
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/README.md
@@ -0,0 +1,151 @@
+# OpenReg: An Accelerator Backend that Simulates CUDA Behavior on a CPU
+
+## Introduction
+
+OpenReg is a C++ backend library that simulates the behavior of a CUDA-like device on a CPU. Its core objective is **not to accelerate computation or improve performance**, but rather to **simulate modern CUDA programming, enabling developers to prototype and test in an environment without actual GPU hardware**. The current design principles are as follows:
+
+* **API Consistency**: Provide an interface consistent with the CUDA Runtime API, allowing upper-level applications (like PyTorch's `PrivateUse1` backend) to switch and test seamlessly.
+* **Functional Consistency**: Provide behavior consistent with the CUDA Runtime, such as memory isolation, device context management, etc.
+* **Completeness**: Aim to support `PrivateUse1` device integration and safeguard the third-party device integration mechanism, without striving to cover all capabilities of the CUDA Runtime.
+
+## Directory Structure
+
+The project's code is organized with a clear structure and separation of responsibilities:
+
+```text
+openreg/
+├── README.md               # Comprehensive introduction of OpenReg.
+├── CMakeLists.txt          # Top-level CMake build script, used to compile and generate libopenreg.so
+├── cmake/
+│   └── GTestTargets.cmake  # Utils of fetching GoogleTest.
+├── include/
+│   ├── openreg.h           # Public API header file, external users only need to include this file
+│   └── openreg.inl         # Public API header file, as an extension of openreg.h, cannot be included separately.
+├── example/
+│   └── example.cpp         # Example for OpenReg.
+├── tests/
+│   ├── event_tests.cpp     # Testcases about OpenReg Event.
+│   ├── stream_tests.cpp    # Testcases about OpenReg Stream.
+│   ├── device_tests.cpp    # Testcases about OpenReg Device.
+│   └── memory_tests.cpp    # Testcases about OpenReg Memory.
+└── csrc/
+    ├── device.cpp          # Implementation of device management APIs
+    ├── memory.cpp          # Implementation of memory management APIs
+    └── stream.cpp          # Implementation of stream and event APIs.
+```
+
+* `CMakeLists.txt`: Responsible for compiling and linking all source files under the `csrc/` directory to generate the final `libopenreg.so` shared library.
+* `include`: Defines all externally exposed APIs, data structures, and enums.
+  * `openreg.h`: Defines all externally exposed C-style APIs.
+  * `openreg.inl`: Defines all externally exposed C++ APIs.
+* `csrc/`: Contains the C++ implementation source code for all core functionalities.
+  * `device.cpp`: Implements the core functions of device management: device discovery and context management.
+  * `memory.cpp`: Implements the core functions of memory management: allocation, free, copy and memory protection.
+  * `stream.cpp`: Implements the core functions of stream and event: creation, destroy, record, synchronization and so on.
+
+## Implemented APIs
+
+OpenReg currently provides a set of APIs covering basic memory and device management.
+
+### Device Management APIs
+
+| OpenReg                          | CUDA                               | Feature Description                |
+| :------------------------------- | :--------------------------------- | :--------------------------------- |
+| `orGetDeviceCount`               | `cudaGetDeviceCount`               | Get the number of available GPUs   |
+| `orSetDevice`                    | `cudaSetDevice`                    | Set the active GPU                 |
+| `orGetDevice`                    | `cudaGetDevice`                    | Get the current GPU                |
+| `orDeviceSynchronize`            | `cudaDeviceSynchronize`            | Wait for all GPU tasks to finish   |
+| `orDeviceGetStreamPriorityRange` | `cudaDeviceGetStreamPriorityRange` | Get the range of stream priorities |
+
+### Memory Management APIs
+
+| OpenReg                  | CUDA                       | Feature Description                       |
+| :----------------------- | :------------------------- | :---------------------------------------- |
+| `orMalloc`               | `cudaMalloc`               | Allocate device memory                    |
+| `orFree`                 | `cudaFree`                 | Free device memory                        |
+| `orMallocHost`           | `cudaMallocHost`           | Allocate page-locked (Pinned) host memory |
+| `orFreeHost`             | `cudaFreeHost`             | Free page-locked host memory              |
+| `orMemcpy`               | `cudaMemcpy`               | Synchronous memory copy                   |
+| `orMemcpyAsyn`           | `cudaMemcpyAsyn`           | Asynchronous memory copy                  |
+| `orPointerGetAttributes` | `cudaPointerGetAttributes` | Get pointer attributes                    |
+
+### Stream APIs
+
+| OpenReg                      | CUDA                           | Feature Description                    |
+| :--------------------------- | :----------------------------- | :------------------------------------- |
+| `orStreamCreate`             | `cudaStreamCreate`             |  Create a default-priority stream      |
+| `orStreamCreateWithPriority` | `cudaStreamCreateWithPriority` |  Create a stream with a given priority |
+| `orStreamDestroy`            | `cudaStreamDestroy`            |  Destroy a stream                      |
+| `orStreamQuery`              | `cudaStreamQuery`              |  Check if a stream has completed       |
+| `orStreamSynchronize`        | `cudaStreamSynchronize`        |  Wait for a stream to complete         |
+| `orStreamWaitEvent`          | `cudaStreamWaitEvent`          |  Make a stream wait for an event       |
+| `orStreamGetPriority`        | `cudaStreamGetPriority`        |  Get a stream’s priority               |
+
+### Event APIs
+
+| OpenReg                  | CUDA                       | Feature Description                 |
+| :----------------------- | :------------------------- | :---------------------------------- |
+| `orEventCreate`          | `cudaEventCreate`          | Create an event with default flag   |
+| `orEventCreateWithFlags` | `cudaEventCreateWithFlags` | Create an event with specific flag  |
+| `orEventDestroy`         | `cudaEventDestroy`         | Destroy an event                    |
+| `orEventRecord`          | `cudaEventRecord`          | Record an event in a stream         |
+| `orEventSynchronize`     | `cudaEventSynchronize`     | Wait for an event to complete       |
+| `orEventQuery`           | `cudaEventQuery`           | Check if an event has completed     |
+| `orEventElapsedTime`     | `cudaEventElapsedTime`     | Get time elapsed between two events |
+
+## Implementation Principles
+
+### Device Management Principles
+
+Simulating multiple devices and thread-safe device context switching:
+
+1. **Device Count**: The total number of simulated devices is defined by the compile-time constant `constexpr int kDeviceCount`.
+2. **Device Switching**: Device switching in multi-threaded scenarios is simulated using a **TLS (Thread-Local Storage) global variable**.
+
+### Memory Management Principles
+
+Simulating device memory, host memory, and memory copies:
+
+1. **Allocation**: A page-aligned memory block is allocated using `mmap` + `mprotect` with the permission flag `PROT_NONE`. Read, write, and execute operations on this memory region are all prohibited.
+2. **Deallocation**: Memory is freed using `munmap`.
+3. **Authorization**: When a legitimate memory access is required, an RAII guard restores the memory permissions to `PROT_READ | PROT_WRITE`. The permissions are automatically reverted to `PROT_NONE` when the scope is exited.
+
+### Stream&Event Principles
+
+Simulating creation, release and synchronization for event and steam:
+
+1. **Event**: Each event is encapsulated as a task function and placed into a stream, which acts as a thread. Upon completion of the task, a flag within the event is modified to simulate the event's status.
+2. **Stream**: When each stream is requested, a new thread is created, which sequentially processes each task in the task queue within the stream structure. Tasks can be wrappers around kernel functions or events.
+3. **Synchronization**: Synchronization between streams and events is achieved using multithreading, condition variables, and mutexes.
+
+## Usage Example
+
+Please refer to [example](example/example.cpp) for example.
+
+The command to compile example.cpp is as follow:
+
+```Shell
+mkdir build
+
+pushd build
+cmake ..
+make -j 32
+popd
+
+g++ -o out example/example.cpp -L ./build -lopenreg
+LD_LIBRARY_PATH=./build ./out
+```
+
+The output is as follow:
+
+```Shell
+Current environment have 2 devices
+Current is 0 device
+All tasks have been submitted.
+Kernel execution time: 0.238168 ms
+Verification PASSED!
+```
+
+## Next Steps
+
+The most basic functions of the OpenReg backend are currently supported, and will be dynamically optimized and expanded based on the needs of PyTorch integration.
diff --git a/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake b/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake
new file mode 100644
index 00000000..777fc489
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake
@@ -0,0 +1,12 @@
+set(GTest_REL_PATH "../../../../../../../third_party/googletest")
+get_filename_component(GTest_DIR "${CMAKE_CURRENT_LIST_DIR}/${GTest_REL_PATH}" ABSOLUTE)
+
+if(EXISTS "${GTest_DIR}/CMakeLists.txt")
+    message(STATUS "Found GTest: ${GTest_DIR}")
+
+    set(BUILD_GMOCK OFF CACHE BOOL "Disable GMock build")
+    set(INSTALL_GTEST OFF CACHE BOOL "Disable GTest install")
+    add_subdirectory(${GTest_DIR} "${CMAKE_BINARY_DIR}/gtest")
+else()
+    message(FATAL_ERROR "GTest Not Found")
+endif()
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp
new file mode 100644
index 00000000..9643bc59
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp
@@ -0,0 +1,37 @@
+#include <include/openreg.h>
+
+namespace {
+
+// Total device numbers
+constexpr int DEVICE_COUNT = 2;
+// Current device index
+thread_local int gCurrentDevice = 0;
+
+} // namespace
+
+orError_t orGetDeviceCount(int* count) {
+  if (!count) {
+    return orErrorUnknown;
+  }
+
+  *count = DEVICE_COUNT;
+  return orSuccess;
+}
+
+orError_t orGetDevice(int* device) {
+  if (!device) {
+    return orErrorUnknown;
+  }
+
+  *device = gCurrentDevice;
+  return orSuccess;
+}
+
+orError_t orSetDevice(int device) {
+  if (device < 0 || device >= DEVICE_COUNT) {
+    return orErrorUnknown;
+  }
+
+  gCurrentDevice = device;
+  return orSuccess;
+}
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp
new file mode 100644
index 00000000..6f02eeb0
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp
@@ -0,0 +1,259 @@
+#include "memory.h"
+
+#include <include/openreg.h>
+
+#include <map>
+#include <mutex>
+
+namespace {
+
+struct Block {
+  orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
+  int device = -1;
+  void* pointer = nullptr;
+  size_t size = 0;
+  int refcount{0};
+};
+
+class MemoryManager {
+ public:
+  static MemoryManager& getInstance() {
+    static MemoryManager instance;
+    return instance;
+  }
+
+  orError_t allocate(void** ptr, size_t size, orMemoryType type) {
+    if (!ptr || size == 0)
+      return orErrorUnknown;
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+    long page_size = openreg::get_pagesize();
+    size_t aligned_size = ((size - 1) / page_size + 1) * page_size;
+    void* mem = nullptr;
+    int current_device = -1;
+
+    if (type == orMemoryType::orMemoryTypeDevice) {
+      orGetDevice(&current_device);
+
+      mem = openreg::mmap(aligned_size);
+      if (mem == nullptr)
+        return orErrorUnknown;
+      if (openreg::mprotect(mem, aligned_size, F_PROT_NONE) != 0) {
+        openreg::munmap(mem, aligned_size);
+        return orErrorUnknown;
+      }
+    } else {
+      if (openreg::alloc(&mem, page_size, aligned_size) != 0) {
+        return orErrorUnknown;
+      }
+    }
+
+    m_registry[mem] = {type, current_device, mem, aligned_size, 0};
+    *ptr = mem;
+    return orSuccess;
+  }
+
+  orError_t free(void* ptr) {
+    if (!ptr)
+      return orSuccess;
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it = m_registry.find(ptr);
+    if (it == m_registry.end())
+      return orErrorUnknown;
+
+    const auto& info = it->second;
+    if (info.type == orMemoryType::orMemoryTypeDevice) {
+      openreg::mprotect(info.pointer, info.size, F_PROT_READ | F_PROT_WRITE);
+      openreg::munmap(info.pointer, info.size);
+    } else {
+      openreg::free(info.pointer);
+    }
+
+    m_registry.erase(it);
+    return orSuccess;
+  }
+
+  orError_t memcpy(
+      void* dst,
+      const void* src,
+      size_t count,
+      orMemcpyKind kind) {
+    if (!dst || !src || count == 0)
+      return orErrorUnknown;
+
+    std::lock_guard<std::mutex> lock(m_mutex);
+    Block* dst_info = getBlockInfoNoLock(dst);
+    Block* src_info = getBlockInfoNoLock(src);
+
+    switch (kind) {
+      case orMemcpyHostToDevice:
+        if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
+            (src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
+          return orErrorUnknown;
+        break;
+      case orMemcpyDeviceToHost:
+        if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
+            (!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
+          return orErrorUnknown;
+        break;
+      case orMemcpyDeviceToDevice:
+        if ((!dst_info || dst_info->type != orMemoryType::orMemoryTypeDevice) ||
+            (!src_info || src_info->type != orMemoryType::orMemoryTypeDevice))
+          return orErrorUnknown;
+        break;
+      case orMemcpyHostToHost:
+        if ((dst_info && dst_info->type == orMemoryType::orMemoryTypeDevice) ||
+            (src_info && src_info->type == orMemoryType::orMemoryTypeDevice))
+          return orErrorUnknown;
+        break;
+    }
+
+    unprotectNoLock(dst_info);
+    unprotectNoLock(src_info);
+    ::memcpy(dst, src, count);
+    protectNoLock(dst_info);
+    protectNoLock(src_info);
+
+    return orSuccess;
+  }
+
+  orError_t getPointerAttributes(
+      orPointerAttributes* attributes,
+      const void* ptr) {
+    if (!attributes || !ptr)
+      return orErrorUnknown;
+
+    std ::lock_guard<std::mutex> lock(m_mutex);
+    Block* info = getBlockInfoNoLock(ptr);
+
+    if (!info) {
+      attributes->type = orMemoryType::orMemoryTypeUnmanaged;
+      attributes->device = -1;
+      attributes->pointer = const_cast<void*>(ptr);
+    } else {
+      attributes->type = info->type;
+      attributes->device = info->device;
+      attributes->pointer = info->pointer;
+    }
+
+    return orSuccess;
+  }
+
+  orError_t unprotect(void* ptr) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return unprotectNoLock(getBlockInfoNoLock(ptr));
+  }
+
+  orError_t protect(void* ptr) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return protectNoLock(getBlockInfoNoLock(ptr));
+  }
+
+ private:
+  MemoryManager() = default;
+
+  orError_t unprotectNoLock(Block* info) {
+    if (info && info->type == orMemoryType::orMemoryTypeDevice) {
+      if (info->refcount == 0) {
+        if (openreg::mprotect(
+                info->pointer, info->size, F_PROT_READ | F_PROT_WRITE) != 0) {
+          return orErrorUnknown;
+        }
+      }
+
+      info->refcount++;
+    }
+
+    return orSuccess;
+  }
+
+  orError_t protectNoLock(Block* info) {
+    if (info && info->type == orMemoryType::orMemoryTypeDevice) {
+      if (info->refcount == 1) {
+        if (openreg::mprotect(info->pointer, info->size, F_PROT_NONE) != 0) {
+          return orErrorUnknown;
+        }
+      }
+
+      info->refcount--;
+    }
+
+    return orSuccess;
+  }
+
+  Block* getBlockInfoNoLock(const void* ptr) {
+    auto it = m_registry.upper_bound(const_cast<void*>(ptr));
+    if (it != m_registry.begin()) {
+      --it;
+      const char* p_char = static_cast<const char*>(ptr);
+      const char* base_char = static_cast<const char*>(it->first);
+      if (p_char >= base_char && p_char < (base_char + it->second.size)) {
+        return &it->second;
+      }
+    }
+
+    return nullptr;
+  }
+
+  std::map<void*, Block> m_registry;
+  std::mutex m_mutex;
+};
+
+} // namespace
+
+orError_t orMalloc(void** devPtr, size_t size) {
+  return MemoryManager::getInstance().allocate(
+      devPtr, size, orMemoryType::orMemoryTypeDevice);
+}
+
+orError_t orFree(void* devPtr) {
+  return MemoryManager::getInstance().free(devPtr);
+}
+
+orError_t orMallocHost(void** hostPtr, size_t size) {
+  return MemoryManager::getInstance().allocate(
+      hostPtr, size, orMemoryType::orMemoryTypeHost);
+}
+
+orError_t orFreeHost(void* hostPtr) {
+  return MemoryManager::getInstance().free(hostPtr);
+}
+
+orError_t orMemcpy(
+    void* dst,
+    const void* src,
+    size_t count,
+    orMemcpyKind kind) {
+  return MemoryManager::getInstance().memcpy(dst, src, count, kind);
+}
+
+orError_t orMemcpyAsync(
+    void* dst,
+    const void* src,
+    size_t count,
+    orMemcpyKind kind,
+    orStream_t stream) {
+  if (!stream) {
+    return orErrorUnknown;
+  }
+
+  auto& mm = MemoryManager::getInstance();
+
+  return orLaunchKernel(
+      stream, &MemoryManager::memcpy, &mm, dst, src, count, kind);
+}
+
+orError_t orPointerGetAttributes(
+    orPointerAttributes* attributes,
+    const void* ptr) {
+  return MemoryManager::getInstance().getPointerAttributes(attributes, ptr);
+}
+
+orError_t orMemoryUnprotect(void* devPtr) {
+  return MemoryManager::getInstance().unprotect(devPtr);
+}
+
+orError_t orMemoryProtect(void* devPtr) {
+  return MemoryManager::getInstance().protect(devPtr);
+}
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.h b/PyTorchSimDevice2/third_party/openreg/csrc/memory.h
new file mode 100644
index 00000000..35851ac9
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/csrc/memory.h
@@ -0,0 +1,96 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+#define F_PROT_NONE 0x0
+#define F_PROT_READ 0x1
+#define F_PROT_WRITE 0x2
+
+namespace openreg {
+
+void* mmap(size_t size) {
+#if defined(_WIN32)
+  return VirtualAlloc(nullptr, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+#else
+  void* addr = ::mmap(
+      nullptr,
+      size,
+      PROT_READ | PROT_WRITE,
+      MAP_PRIVATE | MAP_ANONYMOUS,
+      -1,
+      0);
+  return (addr == MAP_FAILED) ? nullptr : addr;
+#endif
+}
+
+void munmap(void* addr, size_t size) {
+#if defined(_WIN32)
+  VirtualFree(addr, 0, MEM_RELEASE);
+#else
+  ::munmap(addr, size);
+#endif
+}
+
+int mprotect(void* addr, size_t size, int prot) {
+#if defined(_WIN32)
+  DWORD win_prot = 0;
+  DWORD old;
+  if (prot == F_PROT_NONE) {
+    win_prot = PAGE_NOACCESS;
+  } else {
+    win_prot = PAGE_READWRITE;
+  }
+
+  return VirtualProtect(addr, size, win_prot, &old) ? 0 : -1;
+#else
+  int native_prot = 0;
+  if (prot == F_PROT_NONE)
+    native_prot = PROT_NONE;
+  else {
+    if (prot & F_PROT_READ)
+      native_prot |= PROT_READ;
+    if (prot & F_PROT_WRITE)
+      native_prot |= PROT_WRITE;
+  }
+
+  return ::mprotect(addr, size, native_prot);
+#endif
+}
+
+int alloc(void** mem, size_t alignment, size_t size) {
+#ifdef _WIN32
+  *mem = _aligned_malloc(size, alignment);
+  return *mem ? 0 : -1;
+#else
+  return posix_memalign(mem, alignment, size);
+#endif
+}
+
+void free(void* mem) {
+#ifdef _WIN32
+  _aligned_free(mem);
+#else
+  ::free(mem);
+#endif
+}
+
+long get_pagesize() {
+#ifdef _WIN32
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  return static_cast<long>(si.dwPageSize);
+#else
+  return sysconf(_SC_PAGESIZE);
+#endif
+}
+
+} // namespace openreg
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp b/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp
new file mode 100644
index 00000000..30f50b1a
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp
@@ -0,0 +1,313 @@
+#include <include/openreg.h>
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+
+static std::mutex g_mutex;
+static std::once_flag g_flag;
+static std::vector<std::set<orStream_t>> g_streams_per_device;
+
+static void initialize_registries() {
+  int device_count = 0;
+  orGetDeviceCount(&device_count);
+  g_streams_per_device.resize(device_count);
+}
+
+struct orEventImpl {
+  std::mutex mtx;
+  std::condition_variable cv;
+  std::atomic<bool> completed{true};
+  int device_index = -1;
+  bool timing_enabled{false};
+  std::chrono::high_resolution_clock::time_point completion_time;
+};
+
+struct orEvent {
+  std::shared_ptr<orEventImpl> impl;
+};
+
+struct orStream {
+  std::queue<std::function<void()>> tasks;
+  std::mutex mtx;
+  std::condition_variable cv;
+  std::thread worker;
+  std::atomic<bool> stop_flag{false};
+  int device_index = -1;
+
+  orStream() {
+    worker = std::thread([this] {
+      while (true) {
+        std::function<void()> task;
+        {
+          std::unique_lock<std::mutex> lock(this->mtx);
+          this->cv.wait(lock, [this] {
+            return this->stop_flag.load() || !this->tasks.empty();
+          });
+          if (this->stop_flag.load() && this->tasks.empty()) {
+            return;
+          }
+          task = std::move(this->tasks.front());
+          this->tasks.pop();
+        }
+        task();
+      }
+    });
+  }
+
+  ~orStream() {
+    stop_flag.store(true);
+    cv.notify_one();
+    worker.join();
+  }
+};
+
+orError_t openreg::addTaskToStream(
+    orStream_t stream,
+    std::function<void()> task) {
+  if (!stream)
+    return orErrorUnknown;
+
+  {
+    std::lock_guard<std::mutex> lock(stream->mtx);
+    stream->tasks.push(std::move(task));
+  }
+
+  stream->cv.notify_one();
+  return orSuccess;
+}
+
+orError_t orEventCreateWithFlags(orEvent_t* event, unsigned int flags) {
+  if (!event)
+    return orErrorUnknown;
+
+  auto impl = std::make_shared<orEventImpl>();
+  orGetDevice(&(impl->device_index));
+  if (flags & orEventEnableTiming) {
+    impl->timing_enabled = true;
+  }
+
+  *event = new orEvent{std::move(impl)};
+  return orSuccess;
+}
+
+orError_t orEventCreate(orEvent_t* event) {
+  return orEventCreateWithFlags(event, orEventDisableTiming);
+}
+
+orError_t orEventDestroy(orEvent_t event) {
+  if (!event)
+    return orErrorUnknown;
+
+  delete event;
+  return orSuccess;
+}
+
+orError_t orEventRecord(orEvent_t event, orStream_t stream) {
+  if (!event || !stream)
+    return orErrorUnknown;
+
+  auto event_impl = event->impl;
+  event_impl->completed.store(false);
+  auto record_task = [event_impl]() {
+    if (event_impl->timing_enabled) {
+      event_impl->completion_time = std::chrono::high_resolution_clock::now();
+    }
+
+    {
+      std::lock_guard<std::mutex> lock(event_impl->mtx);
+      event_impl->completed.store(true);
+    }
+
+    event_impl->cv.notify_all();
+  };
+
+  return openreg::addTaskToStream(stream, record_task);
+}
+
+orError_t orEventSynchronize(orEvent_t event) {
+  if (!event)
+    return orErrorUnknown;
+
+  auto event_impl = event->impl;
+  std::unique_lock<std::mutex> lock(event_impl->mtx);
+  event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
+
+  return orSuccess;
+}
+
+orError_t orEventQuery(orEvent_t event) {
+  if (!event)
+    return orErrorUnknown;
+
+  return event->impl->completed.load() ? orSuccess : orErrorNotReady;
+}
+
+orError_t orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end) {
+  if (!ms || !start || !end)
+    return orErrorUnknown;
+
+  auto start_impl = start->impl;
+  auto end_impl = end->impl;
+
+  if (start_impl->device_index != end_impl->device_index) {
+    return orErrorUnknown;
+  }
+
+  if (!start_impl->timing_enabled || !end_impl->timing_enabled) {
+    return orErrorUnknown;
+  }
+
+  if (!start_impl->completed.load() || !end_impl->completed.load()) {
+    return orErrorUnknown;
+  }
+
+  auto duration = end_impl->completion_time - start_impl->completion_time;
+  *ms = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(
+            duration)
+            .count();
+
+  return orSuccess;
+}
+
+orError_t orStreamCreateWithPriority(
+    orStream_t* stream,
+    [[maybe_unused]] unsigned int flag,
+    int priority) {
+  if (!stream) {
+    return orErrorUnknown;
+  }
+
+  int min_p, max_p;
+  orDeviceGetStreamPriorityRange(&min_p, &max_p);
+  if (priority < min_p || priority > max_p) {
+    return orErrorUnknown;
+  }
+
+  int current_device = 0;
+  orGetDevice(&current_device);
+
+  orStream_t new_stream = nullptr;
+  new_stream = new orStream();
+  new_stream->device_index = current_device;
+
+  {
+    std::lock_guard<std::mutex> lock(g_mutex);
+    std::call_once(g_flag, initialize_registries);
+    g_streams_per_device[current_device].insert(new_stream);
+  }
+
+  *stream = new_stream;
+
+  return orSuccess;
+}
+
+orError_t orStreamCreate(orStream_t* stream) {
+  int min_p, max_p;
+  orDeviceGetStreamPriorityRange(&min_p, &max_p);
+
+  return orStreamCreateWithPriority(stream, 0, max_p);
+}
+
+orError_t orStreamGetPriority(
+    [[maybe_unused]] orStream_t stream,
+    int* priority) {
+  // Since OpenReg has only one priority level, the following code
+  // returns 0 directly for convenience.
+  *priority = 0;
+
+  return orSuccess;
+}
+
+orError_t orStreamDestroy(orStream_t stream) {
+  if (!stream)
+    return orErrorUnknown;
+
+  {
+    std::lock_guard<std::mutex> lock(g_mutex);
+
+    int device_idx = stream->device_index;
+    if (device_idx >= 0 && device_idx < g_streams_per_device.size()) {
+      g_streams_per_device[device_idx].erase(stream);
+    }
+  }
+
+  delete stream;
+  return orSuccess;
+}
+
+orError_t orStreamQuery(orStream_t stream) {
+  if (!stream) {
+    return orErrorUnknown;
+  }
+
+  std::lock_guard<std::mutex> lock(stream->mtx);
+  return stream->tasks.empty() ? orSuccess : orErrorNotReady;
+}
+
+orError_t orStreamSynchronize(orStream_t stream) {
+  if (!stream)
+    return orErrorUnknown;
+
+  orEvent_t event;
+  orEventCreate(&event);
+  orEventRecord(event, stream);
+
+  orError_t status = orEventSynchronize(event);
+  orEventDestroy(event);
+
+  return status;
+}
+
+orError_t orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int) {
+  if (!stream || !event)
+    return orErrorUnknown;
+
+  auto event_impl = event->impl;
+  auto wait_task = [event_impl]() {
+    std::unique_lock<std::mutex> lock(event_impl->mtx);
+    event_impl->cv.wait(lock, [&] { return event_impl->completed.load(); });
+  };
+
+  return openreg::addTaskToStream(stream, wait_task);
+}
+
+orError_t orDeviceGetStreamPriorityRange(
+    int* leastPriority,
+    int* greatestPriority) {
+  if (!leastPriority || !greatestPriority) {
+    return orErrorUnknown;
+  }
+
+  // OpenReg have only one priority now.
+  *leastPriority = 0;
+  *greatestPriority = 0;
+  return orSuccess;
+}
+
+orError_t orDeviceSynchronize(void) {
+  int current_device = 0;
+  orGetDevice(&current_device);
+
+  std::vector<orStream_t> streams;
+  {
+    std::lock_guard<std::mutex> lock(g_mutex);
+    std::call_once(g_flag, initialize_registries);
+
+    auto& streams_on_device = g_streams_per_device[current_device];
+    streams.assign(streams_on_device.begin(), streams_on_device.end());
+  }
+
+  for (orStream_t stream : streams) {
+    orError_t status = orStreamSynchronize(stream);
+    if (status != orSuccess) {
+      return status;
+    }
+  }
+
+  return orSuccess;
+}
diff --git a/PyTorchSimDevice2/third_party/openreg/example/example.cpp b/PyTorchSimDevice2/third_party/openreg/example/example.cpp
new file mode 100644
index 00000000..f00f1909
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/example/example.cpp
@@ -0,0 +1,112 @@
+#include "include/openreg.h"
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <vector>
+
+struct MemoryGuard {
+  MemoryGuard(void* ptr) : ptr_(ptr) {
+    orMemoryUnprotect(ptr_);
+  }
+  ~MemoryGuard() {
+    orMemoryProtect(ptr_);
+  }
+
+ private:
+  void* ptr_{};
+};
+
+void add_kernel(float* out, float* a, float* b, int num) {
+  for (int i = 0; i < num; ++i) {
+    out[i] = a[i] + b[i];
+  }
+}
+
+int main() {
+  int device_count = 0;
+  orGetDeviceCount(&device_count);
+
+  std::cout << "Current environment have " << device_count << " devices"
+            << std::endl;
+
+  orSetDevice(0);
+  int current_device = -1;
+  orGetDevice(&current_device);
+
+  std::cout << "Current is " << current_device << " device" << std::endl;
+
+  constexpr int num = 50000;
+  constexpr size_t size = num * sizeof(float);
+
+  std::vector<float> host_a(num), host_b(num), host_out(num, 0.0f);
+  std::iota(host_a.begin(), host_a.end(), 0.0f);
+  for (int i = 0; i < num; ++i) {
+    host_b[i] = 2.0f;
+  }
+
+  float *dev_a, *dev_b, *dev_out;
+  orMalloc((void**)&dev_a, size);
+  orMalloc((void**)&dev_b, size);
+  orMalloc((void**)&dev_out, size);
+
+  // There will be subsequent memory access operations, so memory protection
+  // needs to be released
+  MemoryGuard a{dev_a};
+  MemoryGuard b{dev_b};
+  MemoryGuard c{dev_out};
+
+  orStream_t stream1, stream2;
+  orEvent_t start_event, stop_event;
+
+  orStreamCreate(&stream1);
+  orStreamCreate(&stream2);
+  orEventCreateWithFlags(&start_event, orEventEnableTiming);
+  orEventCreateWithFlags(&stop_event, orEventEnableTiming);
+
+  // Copy input from host to device
+  orMemcpyAsync(dev_a, host_a.data(), size, orMemcpyHostToDevice, stream1);
+  orMemcpyAsync(dev_b, host_b.data(), size, orMemcpyHostToDevice, stream1);
+
+  // Submit compute kernel and two events those are used for calculating time.
+  orEventRecord(start_event, stream1);
+  orLaunchKernel(stream1, add_kernel, dev_out, dev_a, dev_b, num);
+  orEventRecord(stop_event, stream1);
+
+  // Synchronization between streams.
+  orStreamWaitEvent(stream2, stop_event, 0);
+  orMemcpyAsync(host_out.data(), dev_out, size, orMemcpyDeviceToHost, stream2);
+  orStreamSynchronize(stream2);
+
+  std::cout << "All tasks have been submitted." << std::endl;
+
+  float elapsed_ms = 0.0f;
+  orEventElapsedTime(&elapsed_ms, start_event, stop_event);
+  std::cout << "Kernel execution time: " << elapsed_ms << " ms" << std::endl;
+
+  bool success = true;
+  for (int i = 0; i < num; ++i) {
+    if (std::abs(host_out[i] - (host_a[i] + host_b[i])) > 1e-5) {
+      std::cout << "Verification FAILED at index " << i << "! Expected "
+                << (host_a[i] + host_b[i]) << ", got " << host_out[i]
+                << std::endl;
+      success = false;
+      break;
+    }
+  }
+  if (success) {
+    std::cout << "Verification PASSED!" << std::endl;
+  }
+
+  orFree(dev_a);
+  orFree(dev_b);
+  orFree(dev_out);
+
+  orStreamDestroy(stream1);
+  orStreamDestroy(stream2);
+
+  orEventDestroy(start_event);
+  orEventDestroy(stop_event);
+
+  return 0;
+}
diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.h b/PyTorchSimDevice2/third_party/openreg/include/openreg.h
new file mode 100644
index 00000000..a5e4af55
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/include/openreg.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include <cstddef>
+
+#ifdef _WIN32
+#define OPENREG_EXPORT __declspec(dllexport)
+#else
+#define OPENREG_EXPORT __attribute__((visibility("default")))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum orError_t {
+  orSuccess = 0,
+  orErrorUnknown = 1,
+  orErrorNotReady = 2
+} orError_t;
+
+typedef enum orMemcpyKind {
+  orMemcpyHostToHost = 0,
+  orMemcpyHostToDevice = 1,
+  orMemcpyDeviceToHost = 2,
+  orMemcpyDeviceToDevice = 3
+} orMemcpyKind;
+
+typedef enum orMemoryType {
+  orMemoryTypeUnmanaged = 0,
+  orMemoryTypeHost = 1,
+  orMemoryTypeDevice = 2
+} orMemoryType;
+
+struct orPointerAttributes {
+  orMemoryType type = orMemoryType::orMemoryTypeUnmanaged;
+  int device;
+  void* pointer;
+};
+
+typedef enum orEventFlags {
+  orEventDisableTiming = 0x0,
+  orEventEnableTiming = 0x1,
+} orEventFlags;
+
+struct orStream;
+struct orEvent;
+typedef struct orStream* orStream_t;
+typedef struct orEvent* orEvent_t;
+
+// Memory
+OPENREG_EXPORT orError_t orMalloc(void** devPtr, size_t size);
+OPENREG_EXPORT orError_t orFree(void* devPtr);
+OPENREG_EXPORT orError_t orMallocHost(void** hostPtr, size_t size);
+OPENREG_EXPORT orError_t orFreeHost(void* hostPtr);
+OPENREG_EXPORT orError_t
+orMemcpy(void* dst, const void* src, size_t count, orMemcpyKind kind);
+OPENREG_EXPORT orError_t orMemcpyAsync(
+    void* dst,
+    const void* src,
+    size_t count,
+    orMemcpyKind kind,
+    orStream_t stream);
+OPENREG_EXPORT orError_t
+orPointerGetAttributes(orPointerAttributes* attributes, const void* ptr);
+OPENREG_EXPORT orError_t orMemoryUnprotect(void* devPtr);
+OPENREG_EXPORT orError_t orMemoryProtect(void* devPtr);
+
+// Device
+OPENREG_EXPORT orError_t orGetDeviceCount(int* count);
+OPENREG_EXPORT orError_t orSetDevice(int device);
+OPENREG_EXPORT orError_t orGetDevice(int* device);
+OPENREG_EXPORT orError_t
+orDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority);
+OPENREG_EXPORT orError_t orDeviceSynchronize(void);
+
+// Stream
+OPENREG_EXPORT orError_t orStreamCreateWithPriority(
+    orStream_t* stream,
+    unsigned int flags,
+    int priority);
+OPENREG_EXPORT orError_t orStreamCreate(orStream_t* stream);
+OPENREG_EXPORT orError_t orStreamGetPriority(orStream_t stream, int* priority);
+OPENREG_EXPORT orError_t orStreamDestroy(orStream_t stream);
+OPENREG_EXPORT orError_t orStreamQuery(orStream_t stream);
+OPENREG_EXPORT orError_t orStreamSynchronize(orStream_t stream);
+OPENREG_EXPORT orError_t
+orStreamWaitEvent(orStream_t stream, orEvent_t event, unsigned int flags);
+
+// Event
+OPENREG_EXPORT orError_t
+orEventCreateWithFlags(orEvent_t* event, unsigned int flags);
+OPENREG_EXPORT orError_t orEventCreate(orEvent_t* event);
+OPENREG_EXPORT orError_t orEventDestroy(orEvent_t event);
+OPENREG_EXPORT orError_t orEventRecord(orEvent_t event, orStream_t stream);
+OPENREG_EXPORT orError_t orEventSynchronize(orEvent_t event);
+OPENREG_EXPORT orError_t orEventQuery(orEvent_t event);
+OPENREG_EXPORT orError_t
+orEventElapsedTime(float* ms, orEvent_t start, orEvent_t end);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+
+#define OPENREG_H
+#include "openreg.inl"
+
+#endif
diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.inl b/PyTorchSimDevice2/third_party/openreg/include/openreg.inl
new file mode 100644
index 00000000..851be132
--- /dev/null
+++ b/PyTorchSimDevice2/third_party/openreg/include/openreg.inl
@@ -0,0 +1,42 @@
+#ifndef OPENREG_H
+#error "Don`t include openreg.inl directly, include openreg.h instead."
+#endif
+
+#include <functional>
+#include <tuple>
+#include <utility>
+
+namespace openreg {
+OPENREG_EXPORT orError_t
+addTaskToStream(orStream* stream, std::function<void()> task);
+}
+
+template <typename Func, typename... Args>
+OPENREG_EXPORT inline orError_t orLaunchKernel(
+    orStream* stream,
+    Func&& kernel_func,
+    Args&&... args) {
+  if (!stream) {
+    return orErrorUnknown;
+  }
+
+/*
+ * Some tests in PyTorch still use C++11, so we use conditional macro to
+ * select different approaches for different C++ version.
+ *
+ * Std::apply is only supported in C++17, so for C++11/14, std::bind is
+ * a more appropriate approach, but the former has better performance.
+ */
+#if __cplusplus >= 201703L
+  auto task = [func = std::forward<Func>(kernel_func),
+               args_tuple =
+                   std::make_tuple(std::forward<Args>(args)...)]() mutable {
+    std::apply(func, std::move(args_tuple));
+  };
+#else
+  auto task =
+      std::bind(std::forward<Func>(kernel_func), std::forward<Args>(args)...);
+#endif
+
+  return openreg::addTaskToStream(stream, std::move(task));
+}
diff --git a/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
new file mode 100755
index 0000000000000000000000000000000000000000..04b3b4e1cb7232dbb845c2f33fe24d94c640b705
GIT binary patch
literal 15312
zcmeHOU1%It6uz6Z8tqTk(rRqQj?!YQ?If)!+DdHFH0eZ}Hl`^e{*1G`lkCd=q`R}4
z^`Taaf0RN&d{bzB5Px0-1@%F#RHRBDT0sy6DPkd2(W)rbhmPmWoNqIo?urx<gnMD<
zyXW_wJ@@Y3JG*Bd8`wJ7l1M1DR&~3Yp_J+sL2?~5n0Kq1L)Rs0aiBYEMsz`M#q5v;
zRLPL$F#?g$-~-o75J2#m9gJs$epC$K5jHStcL~Wl%uu1@_Ve+qjHg<}fyBim+pQ40
z6=DZGD0aX$G3OZh^@{k5qak|0xZP2)i{mn}1M(Q$cAWAu7c<_7*v0b*dR6FN0WscP
zWXE~=rw!9QD&sYqhiS<tP{<&c?1R_LmTfzEnhtX;tr7tD7YfbtuYY;>r|frkcfRy^
z_nl9q2fuy)#V>nU74yk{IsPE;5**X5``+^jnzlib7!9PHpGJIk-H2}!e*wMXjpH)n
zOrm$vYk``pj4MLnWzezhi9)GpS3IZe*|xHW#)j>TTXrXM70)e?4fp3uMR&|e<=s%$
zSYoHA9D6)hbn>}JT{Ti0D(1*rzseDApLC0(?!<5@Qza+)T*@nz(^)%}D`s-ViHcb%
zsm{`**O@LAGfpSTH!RyeI<#eI8}|_=K5tm(NqHZJe4fBRE_2b=8(M-7`sl`x&vV7O
zLOUMe%SR^=eG%bft+3!^gfCxFp{w2yE+xQP4>|g(GoUk|GoUk|GoUk|GoUk|Gw}b<
zz_0Ds{%P&~q0QPi`$VTw){}?57XP@l_oKEW!JG5feM)S9`7ye-FYQ&VpJmDEZ+zb$
zKftuVd^btQ+m~)uf!tsIa-FvJ_q<AkwSV?Y4C~K&i)g3^xKBe}=AUd|x`*8z5hVRu
zpueunu=1Ss>>2CBA2(Vbo^7=fA6qBRc?-$GB5}~>pA5%^J;$@BXB~-E^`@QH-kxvx
z#@%}MlsDJf*K;NDr-vx;=?q;yo;{D~#QkJjAD`_{KSDx@C!lX2n!Ip7$W=W%#MDh^
zKxaT_KxaT_KxaT_KxaT_KxaT_KxaT_;D3;T#FEzA_`gg3ugf^&`xap@;UwV*;WNzt
z+4m0;a^wG4xg{PQRf(x&V#(Y~+YZnlam$Ez4ZV*4<ogDpnx7n;-*HF#oJZRBsEv!)
z-+1$~>zU2=^fQ4vN_z5FAF~7geT0W&eAe;kHAS)1|MMhTH=O~U0i6Mz0i6Mz0i6Mz
z0i6Mz0i6Mzfqy3hsLw<_CTcBriTifJrv&F>Sh=a2C-f?*^SoMU)PXJ$8uguvg+@In
z-%E0X{I_#{iRm*^+=-ga!&21A^`P83guXl^)gi$-f*pdW1)Y{UP}Gb<j{97$1LbyA
z2r4N9gP-LaXKIM^m5@}pSB1Yn{4jouD{B84pk6&bV3*7n>T9?3_ir#(>`YaQUe#E$
z#_Tq`R<EfB<eJ@U*P313=Gv7ai2I7tyk2IhpZRa|^BS*{7OG=@FnaS(y!paE5aVAb
ze5^wvgLVsD6Y5Hp><wwG&;Edag~B=trAj?S9Ud3!8vKU?{iMRW1pj&JG>X63`+6bp
z)2`k!;9&+E`FSnSU!dai@@BwaRDWOa@`3O%Pv9S;P7ANTSl&Jh6ez0(G($W^(4yuk
zd@dpVo;nHo6$YON@cGdY14IOq#BWrOzPO2gcN6|d;3wi(x~V4mDqZ#}UUhujOsknP
zciSu2X)biHDBI4I?1_9S<>c)QRjOBPr#hw5rNU(1_1uiv)mVdz-*dK8E;}{bEqdje
z8ZSEq*UnT6g&LVeD4UDo&r_v<QWTGE503T?57-0SHghHZ=6kmF4G;Abo2u}^v@?~G
zopp*CuE%FbB&O#T>;X%Rt<9q<2u9gkwr=0lx7FT0IJjeA%pU98v~_^oHY?Q+D*qc-
z_y5f^0T&kd#~vt9W~El}oD`u~4l#>fvE;etM6qg4mP?av*{dnD&Pr8t`ONBEMg<C4
zr;=4>rdA|pA$nzHmfgI=OzBPLJ!J+tF{x`#l!)X`#Z4>IbEk;pSyHB(mHEhK$P1{@
zHk_DH6MY)ODdflrxnz$Nf#W&KOjRn%q@_`y8NYA|UKEG-HQpPrpEP^!2hOo?faYqY
zsC~2Nf1oeh4@lKVz29lC!T8uOf&4oyH`v9;zmsIVSHd3qDG>V`Z-ev^t?zr-=XMZ}
z_c+*Ne+C{RS+uc_XuFpPjt%zMUx64u9zVu+#eS1G#eNNh9^3OCkM|EgA2K*5oG0qv
ztMdJda|@{i4RL_xeI{yeh+Q3_D2T6ZU^QmnD*_<&;082scg%i71VHFDvwu2f|DYHE
zKY^iO+~59~J@y$O&V_Ij&);)mkNH=IXbP?p5)ijPL;}t~?7NIm6ZlL>)~}d<FkYt)
zpG9y^!?^&@tHt#}1K%cFJ{!Xx=MaNG*wmmGQZqF8M@YbH5B4}8;Q7OY{N||1F|_Uv
z^H3uhtH2)5OC9ZGE2WI!I&;CeKt4}I?QtHM{)hIT$Lw)lY3mBd#5jl`PJcxL<{^HZ
z?~+aIB^_|5PEwC~qF(32es!G_c3>YxtevQ8uQ;3(A>Yq5`u^Z^KVpyCzp*x?c3~h9
V#z@7tOO5QW>kbW0iya_t{}-!tJ*fZy

literal 0
HcmV?d00001

diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/__init__.py
new file mode 100644
index 00000000..a69151e9
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/__init__.py
@@ -0,0 +1,24 @@
+import sys
+import torch
+
+
+if sys.platform == "win32":
+    from ._utils import _load_dll_libraries
+
+    _load_dll_libraries()
+    del _load_dll_libraries
+
+import torch_openreg._C  # type: ignore[misc]
+import torch_openreg.openreg
+
+
+torch.utils.rename_privateuse1_backend("npu")
+torch._register_device_module("npu", torch_openreg.openreg)
+torch.utils.generate_methods_for_privateuse1_backend(for_storage=True)
+
+torch_openreg.openreg.init()
+sys.modules['torch.npu'] = torch_openreg.openreg
+
+def _autoload():
+    # It is a placeholder function here to be registered as an entry point.
+    pass
\ No newline at end of file
diff --git a/PyTorchSimDevice2/torch_openreg/_utils.py b/PyTorchSimDevice2/torch_openreg/_utils.py
new file mode 100644
index 00000000..1c26f475
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/_utils.py
@@ -0,0 +1,42 @@
+import ctypes
+import glob
+import os
+
+
+def _load_dll_libraries() -> None:
+    openreg_dll_path = os.path.join(os.path.dirname(__file__), "lib")
+
+    kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
+    with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
+    prev_error_mode = kernel32.SetErrorMode(0x0001)
+
+    kernel32.LoadLibraryW.restype = ctypes.c_void_p
+    if with_load_library_flags:
+        kernel32.LoadLibraryExW.restype = ctypes.c_void_p
+
+    os.add_dll_directory(openreg_dll_path)
+
+    dlls = glob.glob(os.path.join(openreg_dll_path, "*.dll"))
+    path_patched = False
+    for dll in dlls:
+        is_loaded = False
+        if with_load_library_flags:
+            res = kernel32.LoadLibraryExW(dll, None, 0x00001100)
+            last_error = ctypes.get_last_error()
+            if res is None and last_error != 126:
+                err = ctypes.WinError(last_error)
+                err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                raise err
+            elif res is not None:
+                is_loaded = True
+        if not is_loaded:
+            if not path_patched:
+                os.environ["PATH"] = ";".join([openreg_dll_path] + [os.environ["PATH"]])
+                path_patched = True
+            res = kernel32.LoadLibraryW(dll)
+            if res is None:
+                err = ctypes.WinError(ctypes.get_last_error())
+                err.strerror += f' Error loading "{dll}" or one of its dependencies.'
+                raise err
+
+    kernel32.SetErrorMode(prev_error_mode)
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt
new file mode 100644
index 00000000..4ff321c4
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LIBRARY_NAME torch_bindings)
+
+file(GLOB_RECURSE SOURCE_FILES
+    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
+)
+
+add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
+
+target_link_libraries(${LIBRARY_NAME} PRIVATE torch_python_library torch_openreg)
+
+if(WIN32)
+    find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+    target_link_libraries(${LIBRARY_NAME} PRIVATE ${Python3_LIBRARIES})
+elseif(APPLE)
+    set_target_properties(${LIBRARY_NAME} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
+endif()
+
+target_link_directories(${LIBRARY_NAME} PRIVATE ${PYTORCH_INSTALL_DIR}/lib)
+
+install(TARGETS ${LIBRARY_NAME}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
new file mode 100644
index 00000000..38c45633
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
@@ -0,0 +1,99 @@
+#include <ATen/Context.h>
+
+#include <torch/csrc/Exceptions.h>
+#include <torch/csrc/utils.h>
+#include <torch/csrc/utils/device_lazy_init.h>
+#include <torch/csrc/utils/object_ptr.h>
+#include <torch/csrc/utils/python_numbers.h>
+
+#include <runtime/OpenRegFunctions.h>
+
+static PyObject* _initExtension(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+
+  at::globalContext().lazyInitDevice(c10::DeviceType::PrivateUse1);
+
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+static PyObject* _getDefaultGenerator(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(
+      THPUtils_checkLong(arg),
+      "_get_default_generator expects an int, but got ",
+      THPUtils_typename(arg));
+  auto idx = static_cast<int>(THPUtils_unpackLong(arg));
+
+  return THPGenerator_initDefaultGenerator(
+      at::globalContext().defaultGenerator(
+          c10::Device(c10::DeviceType::PrivateUse1, idx)));
+
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _setDevice(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
+  auto device = THPUtils_unpackLong(arg);
+
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  c10::openreg::set_device(static_cast<c10::DeviceIndex>(device));
+
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _exchangeDevice(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
+  auto device_index = THPUtils_unpackDeviceIndex(arg);
+  if (device_index < 0) {
+    return THPUtils_packInt32(-1);
+  }
+
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  auto current_device = c10::openreg::ExchangeDevice(device_index);
+
+  return THPUtils_packDeviceIndex(current_device);
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _getDevice(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  auto device = static_cast<int32_t>(c10::openreg::current_device());
+  return THPUtils_packInt32(device);
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _getDeviceCount(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  return THPUtils_packUInt64(c10::openreg::device_count());
+  END_HANDLE_TH_ERRORS
+}
+
+static PyMethodDef methods[] = {
+    {"_init", _initExtension, METH_NOARGS, nullptr},
+    {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr},
+    {"_get_device", _getDevice, METH_NOARGS, nullptr},
+    {"_set_device", _setDevice, METH_O, nullptr},
+    {"_exchangeDevice", _exchangeDevice, METH_O, nullptr},
+    {"_get_device_count", _getDeviceCount, METH_NOARGS, nullptr},
+    {nullptr, nullptr, 0, nullptr}};
+
+/*
+ * When ASAN is enabled, PyTorch modifies the dlopen flag during import,
+ * causing all global and weak symbols in _C.so and its dependent libraries
+ * to be exposed to the global symbol scope, which in turn causes
+ * subsequent symbols with the same name in other libraries to be intercepted.
+ * Therefore, it cannot be named initModule here, otherwise initModule
+ * in torch/csrc/Module.cpp will be called, resulting in failure.
+ */
+extern "C" OPENREG_EXPORT PyObject* initOpenRegModule(void) {
+  static struct PyModuleDef openreg_C_module = {
+      PyModuleDef_HEAD_INIT, "torch_openreg._C", nullptr, -1, methods};
+  PyObject* mod = PyModule_Create(&openreg_C_module);
+
+  return mod;
+}
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/stub.c b/PyTorchSimDevice2/torch_openreg/csrc/stub.c
new file mode 100644
index 00000000..4e02f9fd
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/csrc/stub.c
@@ -0,0 +1,20 @@
+#include <Python.h>
+
+#ifdef _WIN32
+#define OPENREG_EXPORT __declspec(dllexport)
+#else
+#define OPENREG_EXPORT __attribute__((visibility("default")))
+#endif
+
+extern OPENREG_EXPORT PyObject* initOpenRegModule(void);
+
+#ifdef __cplusplus
+extern "C"
+#endif
+
+    OPENREG_EXPORT PyObject*
+    PyInit__C(void);
+
+PyMODINIT_FUNC PyInit__C(void) {
+  return initOpenRegModule();
+}
diff --git a/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so b/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so
new file mode 100644
index 0000000000000000000000000000000000000000..272fb567b8daf1c45b8dc0f7b3a557257a8b68c2
GIT binary patch
literal 59728
zcmeIb31C~rwLg5lC~+1ng{7D+2yog2h_SL^0RbzvlB+~cOq@U<q_Semu^L+vvgA0d
z2H7d95H(v%AJC>P6ezTWmeSC&C?vsQHvt}@Kr#Cj1`<Mnu<HArS+1@X3%c;W?|uJ&
zZX(Z}GiT16IdkUBnRDk#7dzc^3kwR06mb<Prz!<9gS1-QTVBlqtCcEcJW7sItPHm}
zTeR0*_4hbPriz8<sSOj|R`Psq`e_4)%hXtpwbv)=<)rT&$l)?I)*FSQ&>g%ybl>o_
zn=zv(U4%rJ;Ik07tlsY!R0-WbJhc!4E}53_tU^4}XHb;qrdEL`)7*NB9Fd%~T-1{(
zwL#ZteCR%3{|JXJi7(TJHzUn-pUa=jTd5K*EBmV`lFxckZ{R9kkefD(e#=z0cRK1x
z5et%En}}gh7K`@s^{-FJNv5P<bWyuRgNAqB!leq>N)b$omF`JpC{N|LEWdgC&Npvw
z4HO=?_(S#V?_L%;*p4D)5<XS<5I)IZ(wJWzdWdWPU%DK}{>;?<&gx=Ian-_6S1lf>
zElAxqruG=ZWfhUqbrMgMlaW^A;}H2;k<LZx#%H0(%Vd$fL3%nqi}6{4&r*EOz-Jjg
zXW{dGd_4FVuSSvaB5lSexboMBou&5Qzhcir=PtSM*RS6<-rn})>)Rr4&0U(g?VZZe
zTlaWIpS`B~2y5@c8@wl5k1Ow5Kk=3|g}=S(wcpIFEWG3W3g_!rt$)4hvKy99yLilL
zzk2o3BgXA_+;bm4IPQ(xjw#!EK*_}oe?IK42hJ+|!eVcGv-72^JRSFab(nJA`n&Ia
zH#YBsGY(vI_1uql4-LGa4Hh17c;OpQK0N!`tIsd7x79?iT>Of?>HXQie*E*!r)U23
zH=k5|`q(SC9b>uspkP<EwROt#fBRH)Wv`?T%C!GpP(^uVoT5w^0sbc_#z^=sOrVkA
zk3w-qg1<UuWO#8Neya1})3_f!0-yIo(MD?br9AYH%hRtYlyxLJ<$3U@V}gu?e{LT9
z)AGQJ^1yrZz$ah=jx=630REj3<nwpHM&k4SJaYap4}NnVetw@vZpY-Ie{CN6$vp7i
z=YfBcho7=M`dJKqMv~7-dFX_pKO@PxEYE!T6!ITQ&aOQ2S)4~c*XQZ)XL;b?$wTK)
zdEk|K;3awVIh?26NqOd1aUS{8+G52g_c|g^yGQ03m-;+#T5v{^KhQ^P_xL>UD;Ynf
zN<A$exCV#-E(gZ_D6EjFD|vnaPn1<`d`p!LKi~kMvFm0gpHijT%kw`4UAmqFpGPaD
zilvL^59W!|$LJrd>=FJ}nSc+DrFIWi26Nz*fbS1FgAtC{Cg@)+`B}~3djvd)ai@0s
zF5&QV1pIFpF9yGm!!O{8LY^+s57DBDYrT-ek%E4GD~De!;1_}(;a7@<7|pTkaKYyQ
zDH1NTX!kOKUoG&9gkQNsz^$}s<GM=Fj|w>q2suc9CnV|2=lE|yzI6RUz%4<Z-@p@v
zx=(ys$icu>!4qW;qi<2HVth{!@O45ysq;DDaly}tkPpeNehr703;gTExLds(U={FN
z1^sS8e};fh%%cy}gq#Qb9C5qACx3?cPYFk9qJRfLhx9}he)nVn-zfMQY~q0Z1iX>i
z0qAXj0~QN-3HT)X^<un4qQnKAP$NeqyG+;dLe7mU&u<s<d0gP{674<-dC~QmXg4Ls
z<#N=gYZAtv<XI`^#S{)!b_)8bGdbWEL4O`nHjXtM?hyJD6?7T}oh^cX3(yFEP{>V=
z`^(^q=5@Uo7dh`X<Qex>%)XT>R-ymj6@2~ybbbjs8?NF2sh?*F{M0HAUm@@nk7reT
zFyQHkc*7BoNAb9p)OnhHVc#l$N5mIiQdiR!4EUCKSGM_h*~lfHrY^6i#UJpt`Ooty
z!SFm^q}JEzZ}Qayqk)JKfCeLN9iCObh{xO997a)t#8o_JH@DSywj&L!X;(s#*09gp
z>}iiid|jTlVAE<8G(>Dwp2aIYk+9DPKAXb6c3&V;KMWiQHv2p)y&b+B643x7((Y?-
z3awSzL*Zb=*A!tyr&WTj)^Ko*r(-Sp-R|*)!@;m$X>a$2l=f(Vl2EY2-{nDVbaGWd
zX#occu$zK`7WAUd+Xm|B9Z6zU)EjR0c$?0R`ooYLgL%;lKUy;&gney3Vl$^({=heM
zi}0t}bust@9X<@6H_+?}`&N3}yn!a)$~+=zsPrsuz%bK*H?8(GwXXKGc>Qevtm^9W
zgnZ$SAPr(<t*5h6@hlCj@q_AOUq`f^hIKCWnYErB1ieX5OE3(%p+kAvriP`U)L;jx
z1rZdtb$nQWz-}BJYXcDH*$Yn$hI|2Nooy!cWr??A^^#x%M&H}+tgnLdu-k@6RZBF`
z6!8ZGuDP9^PA5p$2mJwPFf`E-iG=+tp+z0kysteNUb{3PM_)qfb!}L3#zGs^p{c9O
zyVBojvz>_hHg89VC$cu=^Y{ZTK_~b+dtpPw9)zSlEp6UaP|EtSFXRpTJiabpQ<ONV
znc>1X*_)sk9zt|2Cf4RVot+TGq9vZ%(4r-k9_P}wPN&D|T--3r<6OuxGdy#bE_8Ns
zwV3H)l|4j;&qCI;&F*OkdD^|JeV&e1G~4WncvsEzG<YI@XQvaDkOhGy4VW1tp(-7b
za8rAT1Wa=WO5E%xrahhBu%FtgwJ{D*I*ZRko6XY@ff9TC?V&c;!Uj*3#}()duJ(mp
z4Uw6VXsFHSnt%2}eN1X*fDzF-r_)(GjnRW}zb%S13Rz;aICr9%Ec`!Z``Z>n<+m%=
zS)MviI2w>dM;gf)YrOsl^ebYsMFakGqdsO!Y0#KuB{9OBkjgLvxDqB;P9~;xq==B`
zNTh9*fndbnvepCTlq}3@0+~Q?y622(wrRqeHj&|-9tf@>Bu|@vWs|4FA6V7qqc&mV
zea)De&Q4hJS|1cMxK?IQkNU!T#@Xf#b@-Z>_?a!@Gau8tKJ167ju0A$NgChGHkLM>
zZO+MzAHZxd81>j-Vx>u5FbtkuYDkE7*qe4Nf)QV*&W!8<Uk<@8=TS1f*c(_yCKUpn
z+3dsMU|e!+S%cTrFbx(WtJ>d)sVl7w(<0<)qD2@cg4RTrd!Zf1J`g1XXPf46V`My`
zV4J^bt#g{w3CjnC;_DK`AV#GDwwOkhFYauhYA|jQ-<keM>s;8sj{G`OL(B*|yb@P9
zeZ_6_1y)5``HEY=ph=fpwgo9H-iWkEGr=i$Tlmm9F*21x!$XlUnJ8ftsANX7uf-c}
zi@<4U^F<(L8$?=97MZaFGm*8k&{1aDu|{;Q)tv|ep@i0V9~Q+XZ=|VJ<X6I73JimU
zeQY-6bcT1}n}!dAR_6&tJ6bVd7#wf3%Qc^AfU}d<437dIq&?V4YJ-VeWt_O)aM-)n
z6Y#C^aH)z#1WOoW2L$1kkfWjx$zoqqFig{r>*er~=1Myp#_`T1n?e&7Ot3}9=BZ<T
z+CQOVR0V1dk^f6oXes+w)mb+6@6;Ra29Sr?;UCEpSkmyVtu~~KOQ5*l(wdt~PEH2}
z7PuqQ+;rS=6&=BfS(IJX1d~iT8|9h=ym@MyKM?Jj+Eq1c>Z}<AoJPgMBST)9S5xDe
zUQwycbGzo$c&1fM&)!YbZ)fOt)3dj>?CrFQnM%#FWsW&6kF8>Qg`J}>TL#KADy9w3
z+H3~YVTh8Ts5f0O#k0dOrwP^srp>ODFIw!H=UR9II(L$|rwvjeo--8Vp9!B)cxF?A
z{22T#W*^Fr!To3k8>@_C<>LX_51&$e2-nQ&jbVQYzXW%qu|pe!e+7u`QJ(Jpr~mN{
zAGKl{&!4ea0jmG*T10#%DAj<`Mde2r$`A1Su~oMys$OnV4iM#y!2iRQKfw;$tSlC=
z0~vm)vO?tVX656RRU&^4%a2pSA|E}}kDYjlav}0`6+jjrgCn_bUHkN%$nB@Z1^(JS
z6u}y+{6yrvEI&rMS>&I&b2WC}qm{cwzL}MeQXUlfYL+)CPl|kG%2kMP6e}-_e3+FN
zDQS`SvwWel2YI^Wb4XNT*9NhV?-KDK2~UZKN!>Z{kihTDfme!n$#psKZV}g6p9Aj`
z@LO}>iikIC$bqjH_#1QJ8w7k?4qWNu{VNlE$$nWxTw!t!yi&lca^Q^uKDdd~lXPT0
zCEBHM%7OUEahLJ6T)0Kh&xKp_z;_tv*KrouUm3?FIutXL?d~z)8{u+%a`80-{^<t#
zy9{ub0j?PM6e_|>OAK%kDq`?z1N>A2oq7X&wgJA(06)b5Z#2MH7~ri2c%uQ{{UFas
zxyYhE0mr1rOwC@`8Q>Hfl-GI#or5G0>8%1j0ry7u1_Pay2K<c%xCqs<(v*SzQ3m{N
z2Kq+$fPv022K*fc_^}4~paJePz%>KhW`OT9z-JlYdkpXs4RCD}&yZeCHNeYKJcF<u
zdtNMytOhvMk=NuDzaxBv0QIXf#cv6Y(5!w{rT8tuMaYr?sts^2M`3?e1H9G%uQ$L+
zC*-v(#qWqdedN`c;&%j-wtyh5Df-KC9sHL44W)R7@DUQ$udWooB{&uc{pwEfTY?Ld
z%>b7f;4=111$_p1DFMWFU5eij9kYN@vOdK#1UJr&TMcm1X?bl(@jJrbUjmVCOz{lC
z58$`#Zz{zz1pf}dWq-G&c!uBy^IP_JAjLBTpTlq2-yJEQA-Ies6J#(&f4Q6w;kWFs
zmf{)0Ka}6Hzq?XAL-1;T%l__3@eIKa<G1|prY!t$1H8llKf(aF7~m5P@G=AZNCVtz
zfG;z^CmY~a1H94zKiUAVGQh=>%u1^b@JR-I)d2sl0bXx_f6oA4W`IvNz#9#4@o0~g
zwi@8)81O>|xO^5t#a#yYaR&Tu1H9P)zsvwX-T?12z{L?RR(hQQUSYsrZ-C2ZQ&fDb
z0bXgq-(Y}GGr%_*;L{E8lmRZEO;PbS1AL|df4~55GQf8j;0p}!eYg80uulT}B(P5c
z`y{YW0{bNJ|2GMIXrB0)8hf`yO_*N&v7)HGsYqdFK#grF+043<nfV&PyEBtsM`Auw
zA^cV<*`0nhlgae4d;!Zpt>+6_ev_W34dm|hU3#82j=R&p)bq4q+?~Es&yQmHOY}T#
z3U;T@)$_E`+nx66c?ultPM@ylX=ArL?a=eIq1&Cd>3Q17?M@%1=V=4CJ3T?qm$H0;
zo;S1n2Vcql(nbya*YmVN+ns(|&(p?icY2earwtkUujeT+h5qY#3Rs~3dY(3FyVIBG
zdD@_%|9YM_VCcV|r;Qi-ujgsQh5qY#+EAhYdY(2?=)az)4HWvX=V{}#J6)jXX~VQT
z{lS;A|5lcNRnH&A@=xn|+8Ck#dj1%eze~@Tv-~ghJZ+57e?3ndBJ^L+(?$sW*YmUi
zLjUzVZDi1YJx>7_^k2`@h6erD^R!_>|MfftO3;5jPa72UU(cVw@*jL5`%fDZjK7|z
zfDOi9&(p>P{nzufA;I|TdD@6z{PjF-K+u0ZKa1rr(eo7GLI3r970Y{N9<y_6f@UWI
zDKsCEpMCl8{LGyE)SUdJocs|v`Ga!u`{m?KIr%Sh`1vR&|6Wf1&7AzpIr-;u@=xUC
zbNhdPPWhj5^1sW;-<*@bF(-d*PCl8F@6E|yn3Ip@<lA%dzMTAuocz+9d|gi7nUkNL
zlb@NBpPG}Ol#@RqCx1{*e!rZ&DJTEskA~~tM>+ZTa`JEH<X_IoKbMn#A}7B!Cx3rV
z{!cmi-{s_Q&dJ}HlfO16pUlbk=HxGwd3XF1Z6gec8sDk)VL@>ty;bD9kyjJ7<yKmC
z_xlU2l4|^YU}gr~@wYWv-}GX$n!KRgB5=OK3Jp5dK&b0|FH%B8I|R|=s21`~$P<;r
zkZ)0wv;X)*MQLeKwHIO4)qXUEEWq~(@Rlr?YMVsumjNb|;)-w4E+Lfo&IdN4sumFl
z_W`IK3`?Z#21KHs0Vp;8)#ua&C%lUb6*W0yE48ZGfvEO7>0Uufi@BoRO~AwjB#TUB
zl$scwsV`Au_uoo|=H9ztA*quWaTc_7NYf$oj2bx$trO3*zLC5vS#i4K@1q)|1u0nw
zr%vEHky70+S+k<f3u^rBtb~UiCHY*ij*IOk#x!xm#MUdo$t>6%&uC9lf6>fu07k>^
z_*>cn(c;y{miS=r#oe8v`YlL_JCCu>Ufk*SzQw#Vr6v!Wa2e#8ocSmv{idgo(1kvz
z4#;qd=`KLhZ6MuJVLq~l6}I%DI1Lq0lcv`wvG0i-1Xu9MUfe?|$7BZ0M|KnHW|0E_
zUWPyO50p&#N{#PQH@$zFx@k|5TJW&?^jDDsLEuI~pd>Tc!uXW+Nkh74_oA*6J#MKQ
zn>`koQ2RF{<JH9Mg@9{QK7}?lV{jiftw6qD1?uYUQ~mTB6bgPQPby>XA5C7TEw!1q
zw~>6(%`MD7rSc;i)cEH#pb7Mx=D{-%L_&!d*;31iz7ao`DOhu414lncpx*@aco`v&
zKtD0soq&#am8eBsm5z@$6-1Cbcwxdl77D4Bsfl{4ng~s<O;lSPnO9qk@+bb65^v;$
z_im4z5c{kkGAH&~VPt9Sv!dw!YT`6vf9wOa=4)R;G~}Cu50VPjdkoyC-6X)#Iq<|N
zBMeOQ5myrx4>D4}2B{CtIC^TH0H+~XXE9GuEx1`~HBTURmsYCrdVGq8t}rm%=Qz`G
zhGVH?33!gZJ6VmxD1kq7&-IWun|$s>sI0E{V{^|cK<W|~A%o&mkZ~pEm8tRC@)Bs4
zxo5nn07<o$VSbu>?nDpWiFQ<YFVb9>@Zsi-$i;ODcRB7JG56esa#!LkAio%yq9)ES
zbH`Dn{Q%0K#$Qnr%PkM6B%~MiVD?ZgAV!}@4Gwq5pVA&c5mooQ;x7#S(Vg(|BAT+l
zMiDi5ygUA8ItrS6oWX~zzEitc;5=LhoU^H2SgaD(s+tH|G!G;HUSzr&Z!0fJ9{~&(
z86<{*2~E8()9i!=F!)PFegVsxE@OxuiI>s7!w9_?DRISLNt0=ICB839KhgGl!h3<z
zzo&o3tR99zu;H<2J4E?dHGWRHCG8`O*p|u5S2)gbtZ;na;n|$kK+?YkHQ~W1z8E=@
zq^cc4_(p+V#_2_0UA~#kC#eIj__IO{dbgXeNvRm5YaS{Yh=Q+=)ZUMyhdCa^AS={%
zZG4ZRdCWI8IS<P9r7QlVGv2S#e<+6b09t}_^ge{OP))3`sEN8VcM|JI?~CT1S24le
zg!%H&gseWtpCjXb2v!+jFD-N1+pV4NU_jOQRun|vR_&jfdmbRwBH^o08H?8cF^vH3
z+-w*yJze(a%$HtA%of#G+1^uQsmTDT<~bTQ-Fp#AxIVb;o6J2=p<Cd@_M&znqeS(e
z4V$ShulX!$?r_91h31~=z@@u2OPw?k;3q3WEMR*=>_Vk5I*w$&Ja%$n^o%~HQQEqX
zu)fi__o?w`sPr;5QG@<9pnpy1pDVsYjc*=$v!xHDi=tsRf9MrZqQ}Wc|3vjR5Y^ig
z&N7;EFAzeiAnq(1TBO?pnt5-5sccPi&l*x~)IJ8aIW==H2`Bx16mU7P77-F)^&&q9
zwz&s~Z4}fn_xu@fcYHUNKU%cY{~&7bMaEJS=AM60CBmZl%)0BeWwdL6l-`c@POdYk
zU@`LaI;0^5hqj(YmZ<bTh8tq0w*$NlbrU1K#J}D?QEwsYzhsyw{w`s@!il95fI=T_
zFQ(nN>r2!yQ4J825LHcCdOG!4UolZ0+f!ifd72P3hEPQaQl5IV+U^g*A@#WTU?A9D
z#2~Pm_lTNKvQOsT5727*BmCiVb=kL>d;S0vrs*(Zw)aSJE!qL-0;wysfJ~dv*_az5
zONW@B5Jl02CCq`*ZU#0@q-GFC!)X&zHPHc#^94psU@RdFbh8vpN(|d}tyvT=7sVD<
z%$q>5Qxw|)Ci>)ca(%xU6>RUN$AG1HzjotpbSx`9u^v;rDgKqy_6TFvmT_!E^~}8#
zHqg!oW!rW&wkM;SYagOTZL@&@vp?z4plVqE<4?y2Vy_m#2st;!A4=Z_!RzC}{DnV%
zFbqFE8W@K94^q4%{Th%sf6~4!RpTFtrMaqEO%zX>3_LPalWi&UJ*HP-6?!nOY*2d)
zrQ<%N3M?RL<29+0hb~)EUJ^NhEdAtW^*CPJ+`FY1*s(2DwedlNMb!CqCuhC@y0mS$
z3%#T<{3&WM7PZsBB4mS|KzV;{c{vG9jc-zu&W!rW2WqmlKsDc))?B+X8OYWkR9i;4
zweM?(qw6l1M#$IPLoY3|#ds8MK$tck{hD92TR`?UG3i)hCpr5tBv>EKJ%1x`{8J%6
zS7J%Ig%-K8I(u8WwZqKl{T`CQ!Z!43pNdiG`4zJ6_^VhihMp4j+{sD5ypW)8Yd_=d
z^=u>^O`84yu<&S4!)?FmM*O8qO@{WsftwG<1>Wo*0Y!2j5_r3pNSwIoQi8-yn0i#u
zEeo=Av{nd+8Xnw<xibf?V*b10kJs71h>T;sFt7V7+Ix_B6X!GJR;w$XQa2JCBz0H3
zda|p(nmw3CN)DCc=&!D(6lBajQ=toWCqp3SOSc1NUUxYFZu_M7E>IM5G2Z5r`U-Hs
zCbiq%YBjI>9z^SkT~J=8(8|B%VJwLSjH_<A8H%}wmQ%K*y5p~52;A|9iA;(1z4sxx
znCYh=Uf@nNmzQ8X%In~RR63w1lR;H+0P%PiKve{)21Ie%-f+i{BTYM)1kr#Qi}i$Q
z@laaYGw1|tIEQLKMms}Ka=$Y%^AI%Ww!fvuBjpxz&mxFL^s7v35&eRlLLuv1?6+uA
zjdhhNZgcIR`C9OuwxW*E2kZoPoDYo1MC=~=aYvOpCXjdZ8AeE-A8O*DpP>|L6*Z^n
z43w-Ne@IQthu=D<%w>N9%kLaZwfU-tp^ecupm)_hADerx209AYv;rVpvVQG$^eO!~
zZvy^wUHpr55h-HgAe-LKwr^^uKif{m+<ODi-S%|Hbu?dkPX@K2J}FOIhN%U3>rc-@
z2}V-2AN1sT=t$3%sHeuCr!j@2f_bG)1amZ?(os>ZDhwPgIN<L}RAO=%TAj79D9<a0
z^oOj$&s6(=N8o%J8~W3GBQ<m)Q}t)lT~G~I{2y$camPPP&j(z)3RPe&{`xN8_QT&w
zn=$MLV{JaNn#CjKjsZ6FcWbGOpOe}A8}*jlhLtoml9OIT5zWfKfG(N6B*4&x+o=Zf
zJz<76^J<3nk(4JELM&?7W7(Hk!-p`ZZ+qluH9lZ0=y)5xDK?n#ZMyw&+z)aXrH>+~
zsP+}sHM3$DPA<gyS!wP`qd<ED-Ezl2h9l{;y{G+VD3js-HF~@jMQUv76fjai48MTF
z<+X85xYu%DHg!`M^>wFqFYi!~nT9QC+72+cZ?iAPTGwy79{{?rg<nWj$jtSd?v&8D
zX#+vyrn@NVH~n4~^qU?Nx3GgZ;TFm7kZ6~IA!=BmlzfbzW8Be=<txA?-vfY2xgU&H
zahD-X-AY8ONC?`^?_gGZfofb%=>x$#y8a}<!v>g-uh<hpMdVg?r1kQyHGI8v(7<B7
zWR|Sje9cyxCfB5@OSaOsZun}sT3;=5_7CplRH8@zWgOikg{jro$%DZrUni|9teurf
zH~z3ISzC^k@?&>0RNyw>`7wMo?S?e3%;s!j(rnOVtK7RVnppKFAx{pM)gAwe*}d#~
z=duGV*1NCJf?>VejVzpB<9cWQ;WSXxt<Fqv$mpsp@Bryzg1O5p;jFsI!Ki~#&01<T
zK8`vX0<Y=6VNlrqX{xuA)MBT07Vm%0ttf$hg#i}pXC_MP=L!Jm66@z!a-OD`Isip-
z0TJmpIV4WpG>0H@Qw=YumIYZl+GOxS4X@Y0&os^t#-6QIOe*Gelh7f)ex6LoBtcq0
z<1k<wi5K4CZ&*av1JQLds+wc>qZM~zzu%xkb@o?j<6mO#Ihrg2UrNQs?tO4XjR|{c
zSFIEXC^5%=N;)SOpK9~9Tc!RJ(_^Z8un8h+BQ36vz6F{w(@AKWFRtkt)@3n&VzaF%
zQ83By7gqW62okJLW74kzfc+cnahR=3s{I4GE)8EV{z^6NADDkg`za_*`Z?HJ=_lDI
z-3yB5o(70abk;(<*b?pZw|Qqv__hF@HLoMh;~s>$hn{ajDVd>T6ByRCowy^+*y+49
z{W%{$=(-(cwEWcqqOZ4X340!pbT#u2pr14%Z2fu$V7`8pG8S*6e(1{zMu8Tw674MT
z$-EM=+YUh*tni`qC@=w;^>=WYnTF2&C*;*LfOcK>r=wTb#UD@8@(L0?g2dRN>D)j0
z8o8%qL;!Nmpi5ZKhW<c{+D!1G-H*YBoqOxe?4mXwI+a~t*?!`C<bodAjqVEj=Za?>
zXSm}3jmTL13C9v{2xIS-Va5Avm|MLV)QcNBijJ{3i3y>x=tOQ|u_Y*jbsoMYsD)UP
zvmXHk?ItirM(#2)h>0Vv1x!us_Y+EJ4ZV>9Cdrwdl#jox#y=S<#Sl{cz2YMm?iC-2
zoJ2kxwq)bWwL8#rH8H-N95;mKD6;W<_)ue1<arqt={XM;jhY%q_K3$R5Qj0wDJYB-
z-wX~Z4)c_`Cx9B73r*Nw<g`bMntX}az*b1h5yRaNUxbjI6r?zmKunn_m~bR*t)qvq
z&eMX;VgZA=k^W$E_Lb+Nu~XiF>iz=bt;45I0i4X&QCy@wEu=w7(@6jeyV)NiyEzE}
zx`f@_j`a}+^JGAg)Dn??(-?^pH;pAo+%%3Cl*ocC9c|a^Xb26rB2DjNbDTO^6g`i&
zj6J8KgDm2~I%)2uy*2A!C2m;%zK0tT%XyEGc`RoE`B{v@2jJE~q309onT+EIs@DN0
z{dW`y{h*ew67?d#V(smrEJbKeKqE4sk<pGK_Oo%EcO@2^_Q}~tLulkHJ^C7YH<k5{
z^iPlD@b-I}(6k&zC@zH`Sl<>fUPT-yeWB=kZv;0w3q<~UB0rIk<DaB&K{Ld1?_oe2
zdR{my3QDDYM_lK(K+^4*jNdR(@%RlrM$^mnwzO$zh=ghGJrW4n_|1P)zfn`fZ$3q1
z=`Znz%ag^AA`uZgdQgXBa=s9NIAG>`{s;=-o>_4<KBHVa5lyLy8JKji>tfQGV?V@`
zBZMN+%}3}NQo<PD*@_xu4k%_t@s|%s5X0g-zZMv$5C-BqKR_oW1_C%eqL|h)D&B=u
zHi2TFC_V(Wi2gXFY(Dl}2SmhoJYWfi^8P_|JZl|A{3QE?pqic#lne`=RM}E><j<c6
z<$`^@zMX{eA?%a`Tk!Z++8R6s0nt=sPb#t=9PMRMg6T&AvDc_+)g0KY#_<3lc|@29
zCHGqnf5ZG8>@L-KiJEMKD?qW}WtG6_!-U6v);yuw$}9F)mk_<$cq5i73auqg#{-87
zx3!qtHc>-XZ3<-Jh;Jd+)QX)*&XZaK!`s#4I~4kWF|Rhncj);N;yQP=(QNueyX{pr
zc0GsEh$Kx90W9p!BHZ?yeuuww3A=MP)*l$0&440#90}~sl@cdzx{4ri)78A7PZnh9
zXkD++9Qs7t^>?W1O&CA$!*}%QQ^A9_4dt}1(Pq01%+Wn<=Pnlac-n(deFFyT{ngag
zhcrfy5r<?ve#LGum4AjC?Bfmv2ed8P4XUJ~PK+}=kJqVca`v-GxV=h6zGK{@t!c;h
zjJDD5s3*^UfUtVV4rzyh3{45u{*3uj3RJTwJ2J5skV(`}RujuA<4bXOe0*uK8ef5@
z;cshBbalBcg)NzQUbme2+1WgM<S@V<^ssdjj88zLRCf#%ixhMsWP|ZJ8elO#cm9CJ
zXFmYw665n1XbcAC7-D&f>3Ad<pAUe^aN?%jxW)K<NJ+nGmn_KA(RRE{<MXywk2FnQ
zJguj?o2<s2N}6JQm~IP!VVG{fqp=dt<M=Z~jFO5rXoyna0R4(1OXR(f2@Qy!56EI-
zC^@?Z3AG2s9zGrzb1@$F*%gG;GZ_`N1XzF_rk!@88l$o~jAxYCH5Tl`ea9%;Mn{gN
z<{o+p;)`u5(Y+c>gK9_u4{!#u@sNbGnq*B%1e*!MacA{VDr-^H<kYo(65vklT+V;b
z6h1CL0$7a8!MN==Me&y|F)l~L&A@;p07Y^Q5{UmSi4!-SO^~>0IWJfy3$k>yYU%+R
z-uNQM<&PK_+8VRXGGA!4(;y3y=zib`9Z8>rRQMU_N%~_{!Jz#N)%Epdr1840b=Y{l
zfGi2{E;LWnDL#S;^eccdURNXe#__s{kl6DW?M1ZyPmS09K>Vl1i#^4-1A@jXC<g37
zy?+?5M^=$wcWRqB|8l(k2Cx{fRk-apZN*=@#CV0_USObJ1Qf{|NHAWvNu0Rpw*-ls
zZs!HJ%7QE%t?zGqyh@O=@q(Oa943;&VCZby#kl<tT_wqy#JEjIih<ho0=Iqap(fVh
zBh|!l-4`}0QTWF@wWC?3=qYOA0A@?p%~?j^7g0$~oJ^8izCw-lPfo{ysLMk;e+QKx
zhgsY@v=NOYfNz+>Odt5ubSqjMwo>A_0Vq=x7#%;QoX!DUpk0s7!AiENi6Rys#?c9P
zvT^dzf!zOK@!GewnXJ_!^F7tedQy>hSo}D846RjV`dWp2Y>Sn49ERt?YJxZPtX2p{
zhn~=GM*_B=1wpnu)*i)Oc7E#5O8!@#2bc`?(jEcBIp+ZgPUiuBPhx_DY&Z`<MKrEg
zqX_%85zhng>N~Zdz{xoez(~+}fJKb_2<HJffw#%&DAT487L5$q;9t-pTn2q~;hFpc
z9517dVuW(f1Mt#MH0^obi=6WSoDG}@*e1&V+4BJGbVvMioJKi|PNTHZX_Pf|8l_rI
zoInnJ6;4i+kZ_L1R`f?+Md>~n=4J~d=RiM7lJ)SHu=9tZh+Oo5MF?~bxDZP+*{`JO
zm)|EzkEXLMP(isrW9L?M7z|&<gG535@V{wt2HANP_yaf+upLLb68nv#=Iu(PG&ZNC
zfR3wN&(`x@lpWgXN)(sFmS*ZLU@<a_*c&g{8&7tucLfu*Kq^ZwM0O)SnOzR^CUk|x
z+6{p5Lys6HFZ!X+-+x+OXE*+H^4cFA9wsmEiho63o1Yt5UN3t7KbIF7vNgh~bSG9Z
zO~PZJBdUot4Abhpv<QLyolKFmFP=3>1^dh41MJ8h;ON6TLfOz?Fka%Em;OB8Ft#N_
z*R=E*^5cgd;QJX?hnSv#bBWxfB3^C2CsmNzOyiHUF6{UV4#8|H*=Ft~*Nq*Dq7zNx
zs1#H+fF-4rrVkFt9kB$;&t}c%z==6a5L#S`@@W(z>ggu}i6bh3Q-_#!2}67hKqpNv
zN3y<v&9B7VbL@_DBIhvvoUkP&n6Bm?+S*bxG&$dc&am2Ta3T^*OOo?5@j8ySKe-jL
zEf&52$o+Dze7e^*a+#TXC~n_<QKJ$WZ@$NYWyRck8X&9!*@DZ_+(l@PaXfUon9qst
zbN1|Kon!88!Nekh(T(ZlxFvsZ7xg`zfv3dRuU(E9`cc5U79GW~f5h6LZuH)dzWqhc
zPfL0{Zh1Xd;=po9U_4Fs#WdMzrki`70KNl{sK=w@4n$4IlZ1LH{4uOT8DXdU0md03
zh3G)xjC5`Uoy1b260hSlHsvC78ovdY`o)<vZ(3TCz5*cKA0TE*YLjy^X$p9>^qKFO
zOQP)g6C*dbm3c_VIa)td&Bvx}sSnNHHyv^Ug&8M|PX7crdgsxn^a)Ie-QX?T-uN_`
zG`&62&;C(_fuoVaUNPN_6+$xlgPSjw!g1RGeZzT&opiEKo4%d<NJ&H%9f_XuyFsEi
z)pde9F}sxvcN3nlnOf-X-rIl6FZYx^I0jmbLkOEx`$OhSzl$blDcv{#qY70~9P4_@
zHz0LkU*1I}r6_SF?<>C*K+TI5T```)P#;DiP2l{9zzB?3>6Afb9paKzt_1e=>?ju%
z;1nWMhDud;(vN*VosWsbq}#R5*>yO1>`MNEe)oZ`3pj*`qizs5eL+1Q>fqF_E7osi
zUVUc3Wq&35vMaW=#BD#me9dNx$Kc80_;S7%Rq;UI5}C^5s&w9r=o|w&?BK*se#RoU
z)r#}RI16Ekz6g1{?TDjBCb<%GIr<yuDV{50JI>Huv4?@~J{d^S7hS^ln#cO$N}PkP
zC0k2W`^%k2%lQe1@B!+K8rxi=?R*LfYkzsoJ5-Y@A>ttpSwD*+%msc}yPxo4TX1Y;
zulO=`9=$gAz7FM*JvaYwD{?HVrJ_I78$50yy;6C%Bii0whHy%vucYajG5Sp%W7Ar$
z>w=EO+)Fbj{RaNfJmL06O%^X!Hw_lM3i|6XKy}F=cJXNzCgS6?dC1J?IgWYKxv9v3
zbPz850fn18ooXD1qc;z|jfeX%#4h`yGIMY2%S?tQ*7-<rmP%{DkvdoZJn|**lp|P%
zj}>`xl9corK%&!i<Wyi69EP<%4@d1*TGw2l#$n&LBPx;IAcs)YXze&l#nER9Sh(z-
zvZw=%M^72LM)o)U3QlLTkb3+{;sD1e5b@irEq?-!l9_7;rYcb6iUlVZu#mg`vFNt+
zV_$$3#LTftzFdtzt;Tn!$!sz?u~*hmZ60W7a~>!Aq5B_Lo=#Si!FoDMj9C)_W$-qO
z1cZYgs=cNXV2AnIE!q^+b{4!rPlE^b_y~^&Nf_FA9ptn2B62Vmxyb(gqe)!06t^g3
zPx$y{_XCU{{e19V+Zj3n`7}tl6G7Tw(NSMIZ$9@-EG<=vx#vBUIcP~x<A;~?oew?U
zFtbVaiuQA`P8&Ch47rm{llcxPwi!FIH;2rmPjdhB7`VTu2z#IR`QB$9HD$#T_2L;~
zMWmp&1Mem<cW^0lR_D-$YYE#T9mo!QnS0~FqN&3t&gMS3KiZ(Zh9*EXdi&5LQVv6#
z`dkSo=J!tFaKayPa({Cvxtq~n=*4A2S93p=^)|g9n>FxfbpOLylJ?WM<HMX$=lG%f
zwTT!M8h<p+H)FGL$6OGE>pp;Rm_9K3PC;gR9QBd#<{#{4J$bi8yA5^jEu|dAa7+(D
z8P(*47I)GIn~JlxNz=uPsnCgm{!qOgWmTF94dbTmVP|VWzq5^)de|Al4&B)WLW=g>
zBNQKgvmkl`_tu?gqs3(oY@t}{6L8B}ICTPvk{w>BQwS^`1S$nXk7f0T>l1Z}HbT?U
zrM>-a+sA3*hskf4zF}``IHiY%=%`#V!p6mn$q<;rfwsG#Y-&@ayg0jZ;Q`{+fV$&@
z>Xd=(8usv-u8uQcn>xuhxs%5i*4f{UINkC0vi!1B%CJXAvZK@Q<FJ@KI!$w2t9ujx
zJpOtWcC^xEe>eJ+J3h6N7$9q%-HIOt!ITLd<Lk?mhY{@g3d*Ya_g9pmr?jI%O3ZgT
zzxFz1!AAacw(NecCb6h60mPrv{)Eb~;>G44(+|8bdw%&8!h?XuLqOx1fXv&VSIm0_
zh*?VQ0t_tqBo9^tL%cUcdV|n;2_T52<Hdn$yfaW;iBGlW20nZ!tin#^LAFpO54;v8
z4eQTqd)!IH?T-b93(GT$vb&S*U8;%jD4Qacc^0bj2&NE{hoD~};><k3i6)%Y#EE$V
zy{k}%=A8BTG-`eje7KkV3WX`3+TVLCWpMTPlG{SsN>35W2&DZMoy2%w&vd-C#GPzL
zSRZ1*>jXH<Mlt@_LzoB_JOx?AQ1iW%bT}C$3?rNk=8mvdkslvtt-_)5ECmLP@!Oz7
zX6tpdkLjh-YC|p2`zVKjkCqlyt3fZ^aR!<!K$T3E2TdSO6ElycBssf?(X?pK^mCe*
zfhN&;=5$WybK;pc5FGrH0KY+CXoN+MVG}vI$y2J;O>Y;gMYFG20?`Eb=lqTaZR#1s
zfcImDN~C=w<49*&xpBm~b(~`rN~^_C2d3$chgotn5^eofZk%W)--5o#S>0~9zg^rT
zE}XvG09T-}68{arrT;@PdKv*A)w1_&r%8>-X+L-YtHm}tzoq>Qyb$L!-2~^Oz`13`
zX6;E7?QLH$iy{|0mbjCv>g5q&xKi3tOyYQ0n^;hdWff19P9Ie51Cev+y#waA_gAw?
zgEwmM?hMX8E``sxJ#rwuUPBNIS6?U87w@*!R9cDZ^$?PpJikjT#7%a;FV@vQ=4ZX4
zZQX)k+yLb37BZ#hGZ~EOF#9*MJva+`iW3R6OYNy+I(sDER)J}#S;ijVtOd-SnEKEn
zEEsfL7U%x~p+(rXeVs*sdIX5s0_r7ADArqjW|^lHmoxroDsN_nUs`>>={?AUCb~~T
z1K$<Y$W6yHyFb5ACgDwp1GD42cQ7WPun9gBr2}LL_#?kxV7SDSIP#{&lQ`pRbe-tP
zrweJKtSAwj{|L<TWj@YUd2`P;Kt$n3Ifc=g@Nj9)VgVN@pD`3##p8St$MX(>QZ_V-
z`%N@iXkru#ui-h=p~;~?1jJg`1DIUj&6|qY*BP|*#nbCb(zh^YxSPuPemDjI_jh@o
z-gYqNJN3NI3uiZS8h1E2tLEN=$n@##vhy~a4)-gFk5NF$u4fVscjE!(pOf+2W!~f*
zpC&u1-LN@3FMo{TkaP6UxF>w3k4v~z@Ot07doRhDT*uP{hVK6u#F#@Be_wlD+-v`}
zi32gvS_Qiwwo;PO!AY4xiWzF9sI>R|b)1RQOs|tVXCFlGJvCtR4~GsPPEywnLPPWd
z=WGxpOMMv>6uS8HPr!?`)aSanh5Z?Enp}c~B5oneyH>z2VLdo#C9*Kr=TnlLy##Ck
zPb+_j6Q*ZqRH<B4a-$OY=UIz=tSFgA<TzM`u|6Dr2#ob0jJ4@)XgIxRrzV$SpY)N-
z{&#cFW^@B~8mjAxZ7I+WM0>PO+c!twRoS3o^vo{%7W0p3fp=|sqlm1}jp#SM4iMk$
zvcDgh*aH7`F3i#=w7pd0HG8zj8N$P=eQOk_#WRbNkx6iq-qYSl!61#p`=i(_Q`P<O
zfJM=+05<03JN(Tb2yVQ#03gf`yxlo$?XYFn4)#WxoF6m@mw`5xohM-KR^S%N8A!CF
zKnapN6eFg6^dMI~GJ`SLMp8SvcNy-_$35R4eZ9ZrqOm69SWPzw)43BBuJ{)eLW2!A
z{dT_Co6t|-kyWJ6H<>{v7$#?C>qusB8BruNNZT5|*<og|TQiu!*WUz%k<H+B5X9G;
z!HZ!Ace_Xdbu)N1Mt!&$+yscQaqs68Mo-m^hBSj8G8BUuJWNL^W1Dks-$u5FTT%I&
z>|rYuw55;RyCL1au{>=JdwT)nr5EE)>Wj|A$oB7W5Y<u2*fuNI{?XR!pR#|WfvDZ^
zK-T6>#T=9NZz=Aj{e#&np*dcya)Z(OATa^eV%s0Q5v@m`xj%1s{KoY4fo_N^y+2yw
z<vd@g=hc&EP6BD>p;#i5)Wk*DOV5059%O9)Byv#qX~#y&v>KrIrf9C?g$w5(wl=ia
z?_)20(5;A5u!$4vuSy>SCSgBVK4H3$h{aNH07s|Kx(|#<d5iKDDCce%!_j5}2d0O`
z=vSoAyq{=MFZg=E%`l8UJtS-#=C&=frCjkhvJ$~ym2h7Q{xK0C_hK@;Np{ui&qBFB
z@ME+?)|!5-3mdT#77lPHJG$J-+i5?5{y5C{IN4zcdOMPcoA2qSB0@*R$ASpKSk>Nu
zWB=@Z12k4mjWHGy^UJ86O1xA8LAc9R@;MHJH+M8EhIy+tPe2jOTeW#gb9rTpIS6$8
zb(dphFb7`1E!GJ-_iAMv6EFpTX0v$;4ttxYETM`hM5GH(1+XIV;{|v*&Ve8zW3k4H
zoux&MX$Zshs|b#nd#)q?QKJ-iB5=uZ`mHS3p~la*L|at6NMwbP8H<j)#X+bHY%sBF
zMRjve-&cqyBgzVc&c7xheGwphe!#)CP|(QSOHpxV&p**#xfhW;if|jpi$5>EQrnKu
z^w5upud$3Mn4><NU&i8<TFqZ+64ao1ye6N-^SgM%WHtFix%KNO(P$9OR`UcLN1K7U
z#sfEqU~zsUdwm543YO_kSM&%Nm_`}jMCg|DwMjdP`1*_XIuhuEwj6!Hic*X$(`AZ*
z<ZA$%hnK*1wzqQs?TUXXp6g(-SOrBo0ekyM6|KMY*0LTC`1>GDi&xmY>7>NkY&1JO
z9^gu}v!^u^U39=NwgrOOxm=y{KKvWUiu8VrYs7`9L~OOP?1=eFVSE=<MlN%Z@giu=
zyh=pb@H$73DKFG?(PZ*v!0kK)%Drev50npf){6OBN0H=_;^jk3PezWHZ$Z4goW;v=
za2T&el$m>8A{}M%@-s2(n74k^-Z5helSeSZ3?cf83^rS$kGUyE5jmRTX(PD$2vl1c
zM)Va|;#3y(ph*qqMa7%)&?_D_KMekfF<$;493c@eKh>4!#3qaad+MgQisYN|=^KDT
zc3h8()A@LKRIG>QAJX@sq?uDq;qJ(#vTu_{?A!35dGtluzmjNwx)vJF+m~&#co^v<
z8CJ5C7^x$UpTokHk$H)E6se4#$pV#QxH`a|g8{K!!85Oj&FK9{|0W7?P^*kr$P7yV
zeqPDYJ;VL`p~v<0i=3gq(g6NNJlQF9oax#qAM-fQKwO%X4Nel@A2U`j#v_;+cnX2{
z)>sUBmpbJ`bqd0GFRPnAD|T)Ave;$5bD++Ar{<oLR_)KNX;$NSQGbuHH%YZb9LL1E
zQ;GP}AQ(W`R1Ke9$nRnK*gFN$XVhdZCJp(*1$$gfrS;h%oJsxrLR5R&d}*9apdBw#
zzoVLKo^uzxW4`nUv>f2w)ZQt$!CN;@+sCftffLnun)BmM&ME1iQ^MPP2>i?$q<eRK
zj)o8TnIrf~6F>H^xK+mK;h{5G{H`{k!Uw=E7SW_i^LMJTWtqRjoxqzH6Xswsp1;c-
zcVK+g$><s1M#tyl?N${!iQeb-8ehv|2Qp%`b8ztCk?1c8>z`rIz9)+FcArp0V?P(Y
zi-~V+(Kh}GDqZj%`KLI!g0GHPAm!;U*hNgU!><nW1bC&x1*>k93V{cFs(IdkJ9)}t
z7a~aHwx3R9nF`>@uPgp=T>&0=>`K!M;jRZ^2<f_b=s7q><I!Vn+TF0M+&_o?awpf6
zIF{mzBgDA?Qh%6Mi{W|r#32**C-E($V(eic9?d=DGS5LA!hF>xEIiTIdLD_M0bpk<
zf4QvA{#LZ}L1uXGMr-V?*$-;tFv*)K5U<h$ni{L7{eh;U6Hj5#TQ$DpFHi|Zt|a)j
z44M~iyRR&A8*Q0>1WM}3%dBgz2QO3uu9a>3Fm%{!dXekobIPp_deL(&dM@p2{2%U<
z*OWy}YGUHh2JsyPXJyN2{%0S{())C5as^d2v1T&aboID-Rj&AZ2vVC|Q$E5|GIi6Z
zMQZFzJTZAxjcw1Un_k1Tz`}c`RXy%Z3+A4QvRFshRfLEYzVLyxKXE2H`#F8VOk7i^
zU?NfEyT~;q<)V`@>#l+jSOiz=`Xdc2o{Mm|2agCS`h^G=pX`RH7ouGUf9Nm=4=k7A
zE@%;r&jc(#4d?^ZH}(*l82EN2_(CG~Yi>hWx2BSTc+9A#I!E<{{M^ZN3l}7dXVC-Z
zAJY53n8Ykrrkb4(xsw+Z7P{kj3ay=jZ&pC*{NyQLBEk1d(HJK>CS#Kb1oxC(>eHG#
z8QBUk)TqoKvG0zIQ>WldD?ZWIpu>QppLnD;gElXU79#@my7c+5bEDA7i$*O-9$(>(
z6V26XHi*50`FC()$ohV0z6g4z2e=1Owgo+{O8*M8g6Y+i-L!>PpMJ-kobfzaM6h80
zxbt8Yo9dqsGw-0`q06QJ4KSW*Vklk73ro}~&$|mg(Y662y$Ue+1FlUwXGcuh(z~!%
zY1{yo@qJC>!TuK9Gk=HF2}(=hf_>LM3G9==J_+oTz&;7=lfXU+?32Jg3G9==|6LL|
z_H5gXVA$=&uidvU@Pz}uHdoz3{EYKF_o6utx5u;4W3x@?zj0sZ4R}}i!p^yN{Q7b@
z91MFR^_}$#I@@j2*pJ>fh1M?c2b!Jp>+xI2iyNlnGt;xUo&6BK6TdfHum6;L&X3hA
zoxY|>FnrQU_?cq)tIFO`2tSIfC{5k~eiGb@U(3d?LtF79vnQRze%9LB<_&`xYoyg1
zu(o@<@LRg)`6f>p37=EXSK%9m`(ON7VFmbh`ibmz?`of?qtzS6&tylutMaz;@B9X_
zk=VER+rEq}^kKUAac%xf&d#N6old-%Aug1kD1X~o|IOwc9EnRQeo%Tk$wU5RIDSam
z;}3+|yiGor%_drMwTIg1SIH|q&aeM9aU*4MUmn48PJA_s{2W$vax)$WEkM5hMm$8o
z{jT+y%&kb3UuH5Xq<uGKGFB6Qdk^UwNH-wefHU&Dkk(_fY`Qs<*@E;Sr1dx`xfAK(
zW}G8KdI3`RZ!(!Ogs^TzdI8d1NN=F@mP}>`=v!{hWa#^Ys*#r9`AP`sWTYFAo`SUT
zHt>bC^7c&TampjrkgmTolbKPB-_%3uM!E}WBT@_Y?h&L`r1bqCl}K+uT8(r7X&(Y_
zwdhCa&za0MNE`7mt_1xFAw3*vH=Y-_BCW@Xf@i6pTQV8TXtakj8*Zf4+cKHU5$fE4
zbR*Ib&Re~Uv>N9nd&VG-!<LUDU60e42aW|DoGq?L8baEP)beyDa}!c4(v3)WJ)6mt
zj|1LwnanvzyOCNTPv!YcCWLel=^IFQQGP$<cc2_{9()mbr1akHAktk(Pl3F<UqOG7
zDhMKOLuz>qaLBv+4afuhZ$P>pX$tAxNC%K^Lpq4`C8U*aW-?zOtww4=zZ#L^ol2z-
zDSZP#qn623p?|xOhQMzMCy;g^tscr`=y!TIyqC$`0X|bWrgRPXqU%7OC<W&&RtmZ-
z1&56vUD5|7EC-y9lx+Ji@Psi{N-c9s%jTQMttshNPCN9ZW2cuNO;xFWEk4yh2TueL
z7s1cLrwZ^ssxL0WiQr?oIg^<vU}Z=z!{>0oDhM{e)DkN^yR@WsGvnbVfGUTTompBk
zZ!=-C-<sTlv>N5p83gx8ci__o7}ZsrrIvMtj?%JN(Y#Wte{5-4O{v9ES~7RsLc{%h
z{eC`VrlJ}hjNAbl57VTaQ)>Bf;hfU4D~jfnT9d_dN+(}#a+FrC8|5giijA%<Z7lj-
zL1`5rj?&4%048e8DaEj`-<fO$jdL)sPC%aci52=;8MQ-a(n2VkUX0A#5!$IWwzG0<
zX%zrD&D4TtlKB8=HDeymz`dfDT2`YKlDP)><$yyy*hR1}0J{vZgAFhX=KD1{c;$fg
z<-n={BYHxO*f<fHg7fO#Op``mzw}uA1g6v{H_FQKX+<3e=63NjV0fgyGMUzrOy`w`
z3Qr%gvlt*NXptVZ{(|dW@ztPL8j6ixH|lazviORk9~Uar<{<d2g?-tMzEXQv&=@9Z
z3@<N+2-ndV#zqysG`4hdtzp#Wjw`$tAV;ayDF@Ouj_7IlJd2>@TNtb2@8#)3cD$$$
zi$?4N2FwatRp9GxF?WjJg3-z81C7&7m_PORWXFzlW+Py3z=|MCE=MqWMUj(q(@DCS
z<)OB;J}@E+^T2*Bs?%3xTJU_Bbf@?S;32P^YSEof$Cg&sGSTQdF8WdoS?|GGRSlZO
zHr|&}>r9syCyTB?ky3?E2y%<So?nHwsGZAcUal*G>|@1+ckwBRSu<~(1NSIr_oD6{
zsJjD4C2kRQlQfo>7h#ar(OAYz^GYiVujC^Ms(k)A$FX)b)O9}yTlO#3r8!j&8Iuh!
z$34v{SSB{7P652+<xGZ#fG!%VdSr?Llk<XL&4gfpkzMTq><iFz;a;gDIfTZRmeh<>
z1?}rlemv^HT(gVl+yU5Rz~p=-yO{!P24GVN%T0D}nQm9EfsUvs+XCSi{|=UDq;Z9w
z#75PY_7xPJQ;3m;93fAAWF1fuWioxR-rfQE`KX<&J;k_%F;EU>PZtBZ62(hsZ3gZ=
zgqs~>(2W(>mR1)=$Cg?#KB$DzT`-Q>+RISqg4BO*YiVwfZ(w}|-+Cn4PZnZs81)2w
zCd)!IzE<kV{Bc~Vb;aRx!;1CxZulMat$MkADLhlx*V=J)Tny-uI?kovlUZhStQB-_
zz}c$*gbvN04WQEl|L6DAw*|xJ59CTK8(5e-E>v2w`QgXwN#yH_E-z%d`7Y}2L0v1=
z#dQk(fm#%A$z*;?6SPUEC&sF>@Mnghn$IX!3W{9c$?s@}|0VUb6=cRk{*8b=3H>Wx
zfXt$NYnsaKd}Gmb1+Y*xd#!5o#uaw6jy&<?Q*%ojhgo;%J=#j4{VnkGR^UE&d^-3r
zILVyM)9)SQXfEwRy?0UXyHpQXGd^3uY8U3q;~4AW8!)k;yJF2}c6Ed`|9Ik?FZy#0
z)5bLp%lnAV0P!5bB0Tta_}tVmn@iBb7}>z@b1vbV)QoG>@8>{?Xsntsey!m7V&YHX
z{Ox4}VeJba<x7{Y${;{C=O)y@?z&9oAlw%}3WHc$A2Y2hzPu<|sMG_t1?89FeTD5*
z{s)v}-1eF~Zp@uaK);45hP1Kue3a(Gm`IA;e=LKX?P!x^UQFMDYQP{^H<pQipiP=5
zbkUlA4Pe!f4}Hs?au$t&TVLDQ7_haC`3PiAE*(kIwa`^4japi5RGOli?06ZSG{-oa
zTjRk*7kGIe_cZn+$hNxhfgIV+9j7z`T}9n5U~|R@J0bK2N+H(ctY0HcV3lhxb$5<i
z4t0VMbDTT0!NwnYvkm<1A$$HkOs<i6Y%Gewl9L(N6~Hl$jX&z&|C3DSZ{U~J$BdGr
zWd&^fc<2}QMtDI-=q23ci^nqCP78qwa3yfcv41&-#wI&IvOclV@2DncVT5S_VUfIN
zV4Rnsy$!f8{v!x7dj?xA{ejiep3!Kkx-Fx=gi!Y}^o8lwI@Hxm`CREk`5mAQ)29@Z
zBIT1+A2fm%0$o-vCaVF|KOgk#h__RTepXhK0jq++dkL>2hjCA|FE5<O`K~Nniiumx
z%rJyTKI7q#UF85CYZEV@K|z?@T>1ABp`vR>)>GFwE9kX>-tpiKuS6-u&w*Z!&q%t`
z2b{;Tk8Pti@8xZdpwEp(-Ff<t`iIE=(^~|F{*%nB;NZAXPWDV*nM}d>hYFr4{L@%v
z!&sOn<cr7gblm!4)8ED@&yC3e`%PPll<fs3xU;}6ITXbwbCd~J6p;<?GmXdJ2TcGH
zhH1-aUQC%$Zxot8EM(1CICRX>NIMV0hD~q#`q>53XPv4jzbG(mGbx)2OwXDMHWiu%
zP0G84rgu%sWyPlZOv+2erVS?L7bd#@4K;=yvnzA`Aq5lc0Gy3av&moZs{+%hq7-e6
z`$>W6rUD{9th?iXTwv-(3k9Zs6f;&%;N{<&h4iNdrca8L-xr#Oij*%4O|fF-$s&S0
zN51@9MFb}9?Qrj?g-+98vGNS~&QcP*?MwDaV4no`NnoD@_DNu$1peQZz=o4}dnr3l
zLn1G^sJx%wb+6%RpGcK1o?kEW8zfv-?2|X&PHLg!uZF8KC*PQpH=us)t@MvRhM!26
z%$u-3Fy;@)$sZx|GCnp<<d5Lhl+hxPAizAJgfbalc$Wor;F}8k(0%;oFp($Ul&(X0
zqI8RL+B49F?I^qOOJza2q32n2!S!MnJtw0Jp#XN>iY+T$*bcDkFu{oEHjC|3fgpo`
z1VdBccC%}lAbg<6V~fqMgGH(x#cw3MTL?n(*(%z{){R|o3fc8lhVx%7I+hW6fv)Tl
z`8FY#PeoqJU6A=dCJBl8mfNAqN)F+_afSG-$Tx~KB+_n?_K9@8NH>TyCDH+r4vKV_
zNEI7z$0AazNGnBJEz)|CHi|SP(r%IViFCb4H;6PP(gBeUigcGqm1&}Vky=GsDbi|@
z){C@Jq#==Zi?mOq>qWXjq$!aOh;&e-yF{vphi?{<T18qZ(rS^`i?mUsA(3{Av`?h#
zMY=(xgPVqr?7W(qldO}Mu8amEQLD{XF{7e#sx8X0wu`1stEik&F>Q({7+!ODj_g*k
z@<C=e6tNHh3YAi&`ea@r*iiZf9B(PH>n5HkKL?=#<zOWx?2Sdh>AT~IPUR^az@8=I
z_O}AAU~5VjdoGCdsXXw1NIC+aJu3vh4Th%x(LL)!L?vtA0s>ww;LCWiavR{pPveOk
zAo={Wgj0|Q7kgfY^br9M<>0?4;N1dVO_gxH4|oAZoBMdamnX_*7`Tz-+zEq4{8*NA
zM4!N?pBE%}w}7)}2`Kvx>?y&k&*6wtp4e@3I^XAT7H7xp`+(E9ScQF%{oM}+p3xCU
zLL~kgzzN@ygMSr+AFiZ?JShf5*Utrfy?{&myiw3k3HeaWm97B+e=bL!?@Ih1aJ=_<
zqLe^xG`<@JeQDnh0G#-LQ_!K99$nJ~oc-hg^0L3@3iwl13{9!yiP9tBhYLUEbOFCn
zz}p2};?ubVYFGL_GXC=<;56>i-;i=h3%K+<#B!z3w=RvO4+j8F^pzZc>L3BH%z;l9
z@Ld1vRKQ7YijW)4bGqg;_;+-_fI}7KTtP?rbrOCd!#@aTP$Yz>Hvm2o|F-}>N<URW
zIQ+`j-$xn!{gv)t52sJLk?8MW_!dR_ZHq*;<KQ5W-d-m76vM1s1USjF@_ddM6ZrQt
zhhl;<aFqavWFQZ|84eH|ml}>J{fOfPT>80~ZtS`Q@B-y9CD(3WA@HT&>k{}G0k>Sl
z0W9{9+eL7+h>j-gKf2GZ)qoc&W<~mg0<4_N@DEm`|1I@!3&Sr}2Ip{0)_vT*B;e9N
zm;EBYw2-{MN=ccgRe%@3>&x{w91MP#vP<wOYt=D2rOJk!cAEt}CEy2(3Kt2u+$TuC
z_wNE868Li7eIVewa_E~-k;W@`T$Te~K;Cgma>&zjC4M*0v*#GN{fB_d{fd;&65Q+$
z`t?gVKAjz)Yo&mP8d+BPJx`Q#8C+xx$<2Tt4LT{IpY-gFuBQN>z;B2B{fN;iRR+Qw
zOfhx3#+Zqpy{|I?C;FByUMA&yOCJ2qdEoS_C-rww*q;i~6rG<Vc(s=U*mG3e9w*@S
ztsG9z?C3g`!TG)MZ&M!l)qo!byV`vv$Cl&%SRQ;0aO!X0B^;kUr@?Ingn6{FL2>T_
zd?dMr^1yFlboQ3#Cx9OX`=g3=bF!d67sLs^N7yAfAFl<R$!!rwWan6L`*Q{tl?}<g
z0^f>n5~PcrA3-{h2miA?@cj<p{0!#s;{+U4_PVZMaP;?NUL?o;zw+SUn+N_x9{8R-
z@EHe=tp7g1zq1#Yit;1CN8<l>2FLt5lUHJAXK;Ie9{gQ^GyS=c<6j`!nh6JHB>MC6
zz#H?xw=y{F;c8Bpo@LTCm<NAP9{5-YVI+Qz%mZHzIL)K_D|vfT57z)r{jFca;pIG0
zUSf1G-<vo9W{_RG^3Wd#!@}%gnB$2=Ig-IeWkYff;2^cvbzvU-D*&f?G$8aA)0|zK
z7(V=YmFK13z723p$-S<3^1vrTAS55Fkk1LC=IH{S67U5Az8-Mu?}i_60G3g9{a(Na
zBRo&fAL&X7_%0#OCyO|IfWi5_@$c(-;1&daSi1p^y;xK_lEIA>_<dC#_?d!^Mffvx
zCXud20Z$1#Kaqo#a|PTg?7UULFB0%dVaKFj5f|{#SsZb_u>U_3@Klh)>G?EWw=g)r
zH~zgZ4}1sUq_>S-9GlK~(KQuI`$+S`!QiEeRjfzf<>*Rd9{lqG|0VQc!&Mwn>f3Jw
zodG|GZx?j#&qL?sJn)b6z^5KIGXM1q4m-ArH@HHy`+dNz;@+5?mxs=kdEmdw1K*Sf
z{yyL&|4JeMDWdUH59j(>J)Z;UoG4uj0JpNc;eXD~1CIbs^mF~9p9}gHp+6fWxprTB
zFub;+DHKvXXX9-O+lijJco`<*4+cDVx1_nv7k16<>~#K*yeA{xk!d)Qy(d%UaXK4j
z<bO}bX~zpP{y=ANwa>YDK|{puaRk<Sn!OS4|9;kOmEgTyyg{_s$UAyc?}6N_JmLi-
zPcY;QduagBttGDi5yAh9FDNn1sPZgc>4}7WK39EbLu5wC>kqpY;GLD6*QI98X$v;3
z#z;oc=LTOJ-fgmVL|k|m3ZNNnzK#y|q6@w60+18k_Mp2RK4(2@Kp}V=o<OkK$1p1V
z9bs=pB@OY{zL!-y&Dr_QNVb0hX%^-LUgmO+^mZ3on0LlEaALEq@<m*Mj)*tV<a2g9
ztNh{~mH+Lkc*V@y2FX@51>2gHV0bPDq5^quo43vH?Er+AtHF+l#KC)O=!c_YZ9r6m
zJ~jNW(Wm^DW~4W;{{6|H?|rRrqG-@}{mnhR*++BN)$9vI{E@XT<g;2$_t0u2-=NjN
zFlhDH+1$Lmf2}>i8)si<Px29^+4FVGj5sX+2_qvB1QYHPS9#i_5nmVGPpe$z>44So
zws&}77x98*r@zSub$51R3>u)?t>It*FF#79XYd;KzT}zyNbB4-@2U=7M$~NnfIs5L
zlsV7m3Hw(0LE7)bZ^5H%)+$d6kU_eEwU?z)ANB{s7;Z^s;fXNO0aUlm^fosy@pi0U
z66AH9*_S*UB2_J7A!Q4wa~OmC<w6OnWt|$KGn}gaqM=|wzMR?-UFmTN>4+7a=64>k
zQc>QaZwf~`B2j#oaFf#P<AQ;(Bki81wqU^5p?EyaLC>nT;7V_srx`D+c6hweE~P1m
zHy3>oUo+X~kuW?hkhjMh4tv*nFkIoaN=w+=?(;N9+uPToiXn%U-5(i-%7<FICOO7`
z#;TfVI1HLxc1{qRyVz0Z^f(vRdOR4<B@N&5yO$t&#<xzA-e2V}=(-vr(>t(k7&S;N
zm(#dmLBb5oETnw3^UPbg)Z<h|&(+$+==Z-(Q7nzRZcoE1<_Y1IXL_}mFPq<P_iI^;
zp5~6A@H9F1-{uRmwi$3Qs2|O6L?U>3oB55AwILtWttI#$hzqM%?%UeVcBS^rim<Px
z!sA)h)kWs2BN%|KiLCW>VuH|IAjQN;f17=zPRK!<$$SK7eSK$p`?oL6+DcLbnx~4V
z_N;}DI#&&*-IBT*ebVWS%xaGY+Pxw8P>c`BUsZc39E`vvqyiEKwDsGR1zU}p{T-no
z97^U6%9dxe6FbA0nb86t3XFXFZ1CAB%=DtUa~qsXJWCvN+<1|kb@@MWEo`%(c<?BF
z;fN<1@Shv?xuDeKbV#S1t>My-_CxEvFxbp5S?R=A4}AOG<x=-yEe)=P)aAD!FvnA_
z)U~{rH$ICk+QI3bGp5<5b*$}(_}D_zw0e3VxW?I8J<si$Q?qQDXF8tG+w8o~O!mD8
zm^(SkD!E)_8HNA9#6<=$(h3Klc^3R2d~Jer7`<uJz!G0H3=1yWfF|1Y?S~LBHo$Nb
zu;`I1zmj$+`p~f*if1KuF4zMtZtxfz5xwMG*vMxD8y-IvOK|6@^0?S07Gn~b8Hr*A
zb<IC}q3&bW%)p+3ed~aeF3~Tzofye>e*lz*35d64t7-}cn&H}D+urF7`@JjMd^6T~
z{ShZNn>HI)6c6pps8Qx!4HwJwN{D5iPb{Z$)SPYJP=~L1iNAd$mx~DvhR}nK2G>9q
z%^qfda`}T~XBv^DU*y5=rEM5;g>43i($WnzZ^(Xa0*Tr64d0F6@icXLJuBhV1wc5`
z)LIP{a?EjgY!%bFkiJQU*whL|J6h3|rq$%(xaKq6a(32+*vw%~z>YHql8?A&TE(=A
znY@AFdPgH?)QSqI9-ICn4VJAE-}B*bS?htjl~c`T8%F&<)hE8rFF7MSPyR!S@V&68
zeT(Fz0a0Ma{aqex$06@kfno9vhU<g=0Hy?V))9dhzY;qJ_^C9FE2Xlv`2wpVt%xt+
z+gj=uG#Lj&;S1C-ntW~kH#CnL%^C*XV0c>aB{3|xk;ii6nhk6{^cl$2c~#UKCKJ@=
z!wfJgG^{3Od`k}h#?A}7$~x~icD;fr)G%A-58xYS+Tl&-VboZiM*EC~u=$t_UEY=c
zP8*qkrZ#U!he!9UoJL-0P;9UOK^Rds=gB%5NfJjPZY}xF4=Wqls$sojT7g*Q(tz+G
z_+VGb!Di-%?KA?uH6A|nIrC2}CI9jWvjt8J0v0$PAJUm4o5c(ROmmk%qQi5|0z^=U
z-I^B*H}5NpkqOg`FNUGDK`vPk*bMnyK-_obfrWaC#SRl4ThH?CSaqqA=yk@XlaURq
zX*bdVq`3`FnPJ>2NL<6zq}lhcsR`Nf5x)+I0%2Ium_pE|8_Q~a*cZar1$lg3zNRRx
zV>L5eXx&b01k)VXV(R;Rr?V5&Cg&@ZoLJ_tEn3Ky%^9A#OBXUfgo_0hLhj%(P#6H%
z_NLa=o))jaO_#|GEO{I~*DEkA`PpfL@UsC<!Aj-|<VX}qHiOz5OG)L)|I``a+CrNH
z)TyKJ2c!5+s~(R8qF-(Pl}-2-sKBZ=pHMGfvrc?Etvorimds>_{~IPQTZ;b`e$Y(A
zx4IM<;yL;d=>ZWPBHc^)piS)|z2Ev29BIj|!Ah7rBvJ`@@JGz+u#Y)7hN+>?E9OZt
zziL>UG8zm1bf2mwh(o6UunwIb^&uF-2Tu)l5FF83Shsq<KazMnPNc|f)38Q_X2BYE
zt<CKR^D~4y@*ghmwBHCv1KjSl2RnzWtn{VPO+J0(R9ox~tnvu~;;YO?a>-b@aU}aN
z)6?LI_~8bV+sHz41})H|dLX{{HrqHzsNB}Uw*Lszg>~5pZ*U})(7iAioxPRADAXB0
zfcvpGiqs!Iz8=>S3KGb88g|MQO<O|CpW;~>Sc9*N^DOptMB5?Lni_0(DizwM>YOt5
zWlIb91H4V=M*ZPoen4J_lA<gBgId920aDGG24ghZvHRc)9Cn%1_mO)}<Ni&)I1dhG
zBOCuxs1@gyI@Y#FyepAL!aQx2ckr`)6{`Z#3QQb?Xd-Ks3LUgE>TheF>TgyULaVo<
zRjFuR8vrq$M#8+L6QLb?K*!KT#)C5W2wtin?m}%5rGmvvDsWq|Du^uIboiQ-3Rrk#
zSx~4V9Au9%D}1dYB-+{xYBGmrnwqdMHu*wu9hml1L{@fmaN0a($|=cvqDIVsFk{@?
z?njGgpPC1Ro|#rK3JBqn-zOgj|EDP)Z!0*ui!+4$8WcA&wF<aQO5}}5>710YzW#!Y
zIQJpX*~n`0oUF_{aPu+VS&{9__jF|1EzVQOdb#b>ITbpSBJ0cdc=S{p)SzP@L`v4j
zu@iPR;4iT%>&y3oWGdeS;@F&qyr*<IKJ-qGtS{fgk?E5HS@JLOWJ<?0=p7%Km+$%L
zspvoPPc$U|f~>Mez-4{;UXe`YdquK+Szq@55>bDSASmBclBs;JiR$Ahhu*%dp8$+l
z#IbgMRSr){zl@jUL&DbK##rAf>dSP!&?AX2&!@<gjw>1Kr>1jEnM%BT_5UlU{(z`2
zQ{#8q5e-RCroYaqFW<M2>E&*!mAzy=3Aqh7M*ihFXPL_LXc9kHes_xca{T4{7&0Bu
z3pfR<BqY*5Bc=8wnmi{i(`H#9@N?VWnp3~(T8=1Fcd4i-5~+U@`V?-A?e~yCammzA
zg@!A){Z~;&G>b%mJii~hgICJM&&|ILG-6SzS#JH@@sj+@^h4A&8|usV5W7YD5)UZs
zl6jfZ3rW;P*}gp2f0?MS@oM^CSx*ASqcY7sSzn%45B-i~(D(e&C62x6^H=tZhRfJ~
z_k#>w=_bX()y@89|H$@bdJrlb>nkaNyMd>JRA{(lJ(*6-sW0Cn7`%&Dl6*=MvYyPB
zqdvi8|K)oiyF`6WB8t?Slb<5mugrnV`cre@vOC5!XNs^I)<5uygQ5dRBD)YD$-lIp
nv^?p&^D6xBW}ZBcb700xwku&1a5GZ<=U?F%({d{0WR?F1mZhy&

literal 0
HcmV?d00001

diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so
new file mode 100644
index 0000000000000000000000000000000000000000..144e6dc6d88bfba08b0424d1a4b975ae430924ab
GIT binary patch
literal 166144
zcmeFa33yaR);Hb_1cCw`6g1;T7~`0TOBx6y0+NsfZc7IPArhC^grtFJHj-`#Dq}DS
za;NQ?;JDzp!MF^L<A{z6;(!TD0^>3PL{Ucs6u51a%@J?`zTY`@@9oN^`*ZYt|IhRP
zo@d&J^!=UMPMtb+>eQ*a?Oi$U@!jn9?l%3?-FA)5ZUB5|7|#p48Bd_4+0Mk@1lu6a
z+rvyb{bBYEf@XZ%;smBPV(4)?{lRBC|24lKyH;>ve9Psyy_GtbmhVI3MLov1T&^dQ
zmVGD6EjvT>$M{a`E6Nq>-$psgMdb>A5akN{h;qhvhJZHXJ62R<BOdAB3O(KWouKD4
zzOCgp>E*27iF!HXo7$jH{$5|{&uK4DFBiW_^w0YJxvm+;x6$5Ylq0$PryqUTZBz92
zqK(6JRtl(r{;iKP%aZCK@`OnRHi)N%crg^0o(Jfd(zg%Xf3T0z@J!uP3+DCmH`=d_
z`~06sw)MxqPw|h$+YA4eBrTiKcJGr3wv{&9AL<tx!@zlvY}<u@pX1*?{F6T#z!&(}
ziGN?>-y!^?&td#q{#n}M-<<o!$w~K}tQ)Z9-Sbnc7S4P7{4Y-C*2Uh@C*g~eFW&o%
z?LhUBu@67;hWk%>TQm2MFF3X5!mO(nojL0MU+uYW!;tMq-(22r*SRmQy+PLkuDZ*g
zcYq9;$_WS2!Lww7{``Ctde-@o>-iT18>#;HFh3&IAA`)1^56W6$oXetU?b(<i$RG*
ze+~H~wR=1I7m4nOlD`Rx5-I;@j>zpk4Spi!e;3LYiO+vWsYi>_?qp1&NcGH(LcbgR
zjg)_6lyN+7VC4KmqU0ZXcI5o+$R9}#-J{51E;7?xvwo_fSCQ&@2Qx2H{_c=Nr2He#
zi%fqo%DDG~;fPev-YEQ!jKb%>DE%4?dZhaMK%SBM^=zNW`3Id7S^f*6@To=N^ScWo
zxBCp{V<i3Qi}8)b=c)L}d?rQVe_<5+(Aqn4{ZB(dBFSwCn2f~d;3)I)U=%r=7p0!#
zQRc;3@Dr)s-$lva7DW!ODDC!-Vh?w~(npfRAES)>m(c%6d^SeW^FvYWk1tBQqoeSD
zBkGUTZqF$7w?^TAZj^bt_sqz0$c&;_=R(gT@$-DY$n~s`BDZ@`PbB$QqF<5b#e^vB
z9)dh0@$*;|{(DE!t2d(9|MxGn*?ws?s=&8Kk#i9`6^Z|+Fkd3k$4AlU=c4GtrBTM~
za`2Q8fzN?a=INX$c4=Z1`P><WpN=T@YDSd&B~jMFeNooIV(=ene4mWM=Oa<{ZFiLX
z?NQo&Fv>V?2R#x$*G8F-|BNF45cDdNK71Qxo$VWCovJ=J@;rJ!O1sZT;s5g}^N9Ky
zNv}SO(%<YTcIMtFdOjZYN19(BMH!c(DC_&0DEeF;#SXj<`9l<^eV&S92VRN7&xj~`
z^#E#)q$k6o*s?8)XR{@75AS0w&#h|=E=qRi7fqVS&`Ma~%Bi2AT8%08knTEC*`
z$(vF7br9`F>hEV!<os2X_2|_odRrDn4vVAM+ma~yRuF}sSWNav`migCe^L}h51))O
zPyZ6d?p+#1-?p3)MV?XS>EI~(kP>Bl--DbZ^|v#M9`1;u=Wj)kvp0%942{yS!YJ#`
zq9}5!isGNVhE7G&!<;B`n2B+Zl)owpKmI8Cwgml(v|cnsnJ@Mza#$IK&tp;O=S1P>
z?kMB(rzrD!U=%-jR+RZTGfKbqN0~2yDC^zzQRdzE7?eo***%K>H%D2oAC98uKSY_Q
ztE2GwW)%B#985;y(-y_f*I=GT(zlW*`?%gw_K8cPjPH&p^W`2i8mZk8s6P_DFp7Ro
zi!$$Si=zK~qUhWGQRMc!D0cP1D0ZnJN<Ho<eph^yd2~^f`MwzRNb=bNeTYPVD@wfU
zj3{=#6_rG?8|}zH*mjOB<4h>3v}QG8ynboxXRCjS$+qjomu(c<W%}!akfqaa!8*(I
zzX`&tIz1KsD$!fc!T{kjMts>O==G=Q^(%UQ74oF~wgDpJ2*H-E8F4D2Tk{`<J`;WA
zN>Rc0dOh1%&(ZEZf&ek{XSU9NhtB^5ou9X0mzHCOO>Yx~&3gVq%y*Jco-QA#2Y)Vt
zFe!gVd)NHqbUs(UDDo}P^T+A(*`@dQVV!<I%OCA-5cOQ8({BYo)UWDi1!2F=X9DvR
zXWOOcAFk)Oc|5Z!swzFTzTz66$7A!jrsjG|y*1uhWwkzU&D7ki@~TSj)Z!WCUXd0#
z#ZyvO?3r0sSzKOryVsUCKd+{$+FRqB@0mKk8U;PG>gqfo)K*arkmsh@JU5n>=gqA^
z=A60`&lG>9udKqGQ&Uq_L!_D-Prlce@2e@RoQ15D@_o)>H6oA4OPR8V=FGKu^5#<m
z9(PsMZT@PMb5)`vl_lO>uWxo$sc2o#BRIKnQmU`2rewBrMBe;q#Wj_vUMa3DE%(-B
zIjKS;J=O0kE3b8qDD}=QEAe{Di*KLrDXT2=<xCn@;!JU-WHX)W%%MU=9EmPhLOjml
zCH|TkZ>0}%q{65+Z@#;#a+YU&O;rUwgqt~?jcjIv#-rpmPs!}tJTr^S${|4#u(z_*
zQ&Q|JnXTg)7^O;saw9}E(rDRRSK_Vql~q+rT^gG1t0}ImEid+YJ!ZPcSLLam4`Eg2
zOqobhPW4EM)xH|%uy9^L?(L6fbEbH5JvH8PZ*eX4U$9?;L8+;lXY&+P&MQM#rg&@p
z70?cg_=XCxc}l7<-d%-PP+40xtI}IaLKK2zxfGX|S3xdPxAYc8d%E;l0rKXLtEwtD
z6oW@4vzERxD@(7N>#dntP7GQ&8V(JeU0gZKn=J;k*vJ9(@L_nI!&#@YW_wF+8xNe5
zH?bstsOMJv({=f}5+0dSDtMYIL-SA|Vv^8?8>hNXE2Q^uW{p?wc7?a1q<TK8s+?Cr
zv%{0^W#QOreKqCYN+h5nr$;HSo$Z-1-{-B(n>e=uIf<*0G)i?&r*r6x;@Ywjs31)~
zR|%Cs_iH>*pE6&qt0X7C7=LooF~lV`UdTOX3fF;*=^hWu+*4jP3svw;rM{uAl8S2Z
zTwXQ<HKX2gFXi+WmwI)fq?>xiQ(GT2F*rXqKq<9ebGBh{(C<mGTCg7e5}%DkWP@NT
zs^*fi<@<)Fm|S`*XZdD_*+eeX^z+40Tn!UYN~3AmHMS6J)Vjzx9n((8H|J+c-#L`T
zUj=(doy@5q{h?7z^T6z|QsN1%(8$$#Fdxe*s>@v!IXQFlN^YI%$)S8@voM~bw-)OP
zm1J}9Q*|Oo*Z1*WvQom1PpYb^;B^7BNEjtbFrLZGPpv5~@y>w3vnFH}`)g@A>5@CE
zs?z7J^Xaw2h+y(zVA&FsdMhwbX|4J>8m(Kc>T<A?Q&GXgh>9lu^tmw{YoXACpE<l~
z7+r4-Sx<dx4IOSMZI$2Wsha75EuZD}%&hQ%&D$Z2TyF&y7^pfeQ`bQcr+8=OR+ai8
z3?%Y{CyS@+P)H}=2W^NtwWC{ush+7Ro~cexPR@1tX`cLacntpwb>&UWPxVlRmT^m!
zEzu1B%;+2oci_gU*UPN`=f!i-@YL&Jz`{km#T8Lnq|9h$1T8w{Ufo3hymtR*g;UsL
z>DuRXA+joAUGNwEh%Pqn3VSre#o9*pYEp59cU&s31v!)QVLQS}IWws5LyNH_6ql4>
zeXl9T>~jvE<@IHTvB(8mR902%0YBa>cGg(JI)6xs#1)UjR9Tikqd)n+A8BMbdvbY^
zha0-xWqftN5Sd4KJS0{K6s}+`+}nsfn1Y<c!h=BK=Sh>3Kf(h&ALfBK$A;Pvj;^}w
zL;i?E@iT3*7&4>op%|>Jg6vEMPIE155Dlo2nvT@GnzFfA)(dLA&d8alirlF#xr(1|
zBgua#t|f;KYfb)ep8FCbPP9>&R}h&2O3cm6kDjQw^8PQ0U8<Eh|7pr*OkSD}!)BFN
z%_uI{*Ct4{T6i~}3O^PrxI<Xnx{~~rOzi5A)XvAQt3o&=Wzdsh!=@^E6TFpTp79PO
z+dH$^U+y!B&<QY*Zem?~e`E@zmcTc!$$_p8^|&xg@y~}LN0a|MD+62f|CQRfCMd{h
zi^G5DrjD%fVy@1`LMrB+)zcaJb3GlN#m-@#adW*TFoJC8TyTwN<m7-#yJ%xyA|@GD
z43X27SAuN$DOg0EBasmng*GsTQMneKrL>&6li*Izs`Rs44o|F-9Vt&4_T^-CU3tic
zEk;gGcJ*~rX{S+8Syob2>h)yS7EB$Vh8<15*o}{ar=M9<Q><^t^$oGBHs3$PNBeVZ
zip(6DwfSPxEfb}BYEgTa9JL-jmG<meq*bWzx*4}ZC>~_&!d6Zt@mFdoZ&Vm&ST8;S
ztBTK8V`g_FCLvm5+5c4MbVS%a2Up{3u*`YTL>5?*vAe9&JIP-$!&?J|`Ei%})(9@O
zRQot`YkwBIWFKKYXnz&<MI~ePAL@U4wVyV7U?a!aWB$k<pAB<qX^nR#D=usV=8)Ld
znR_%W(f>)PfH$rqD&eDhOT%#m$b%BXJ3e^trSmI`E3joODXygr7#AFl9UJe+g+at1
zVI$x%w&~C}#ESH+PT>yohEvaXeQ~*84=CxGhRUw9p37BP>aFWi)lh7d&0V3v>Wt!2
zkJw2ICg5#X)lir!%)*GW%DLF*AYaWa#5F3-ZVWB0swhUFMlc6m_!%3DktJ0X)x|Yl
zjuhn0pWt%mlV*5wDr^1N{eUU7P1-!pUsmpedje8+8K$5f_0VTj7Wy#WUs*z-A&@d_
zW>H4NZcu#6oNQbg6}qq>O)K-w7Jbc|pH*C4i%kW!l~+}Utr$&llb<Of<^$4H-rWl>
zk=LYne6_WvC`BeFiBLk<Y_u5<w>Zf=56Cbrf-}lb5x^;kB+T!U+Qo8=gA=M3)CEGC
zl+UZJ(J_gaSri25+J%D35(@8lOGRH;+?geF{AD#>Q4o!rQb3-mv&(8F5fGn}K)Q6r
zRcnMTEx8P9DwoWLfPLO95n)7rW+cCOuE7N@;OQ8Y3It<)UJ5-LqbpVgb9|+74DYBV
zBJBdGepvNsHCS^BDiQvv6@g%~ir6eu7|BNz7w}KD#_&CcR~MJnxM=T}H<yA+*iK@@
zA%!rozM;D%K4+~D+0(pp{7`aVl?x@P_-wS#&y(`Ud7RiXBjhXW0Hq+vgxbb=5bn`^
zCXra`p;RLYzAl!58ZXg2)m7zXCG$B6E3*O;ch87?xo0?Y+%r<}&xz%k17M<HPQ{do
z<X#}QL{S%ta>C-0C7?yO(mT&1YzjqV$((k{Lu)^WX{DaS1Q%BekK%tOTI@N5XfsK)
z6bFhT(GikF@l1Im3pBbsDJ8NvJwGl@mT6fX!U`jJCjc91hO#2p>l}^+veJu%#TW#H
zN)W*lfkuj#&+-@7U=YMC)gwfLriZ4x(28eN)%?uZAG1>9<-n1r*k6YQ8v(gmUri~W
zO6Znninp3<WtL%p4clx?I^8lE5P`Aq#MG;RN_MepS^7bTuqTEYk%-`-WHzmmKWEW#
zj-)Ua&UI?wcU4wV*okAC>`DKOMa(&Z>nq0+-?h51jYp@$tI9NQ@on%zJm}pN?@WkJ
zmjc^GN(2@0OOp<>ORH8Md!2gT30XL9pa3HU7b!+YOJJ4m31O3@FY8929WkIVQ8$0%
z$?v9xh?hj&+vVr1Dg@hgIt(){m8N~=3}RfS`m3>E(z6In`&@{{ddWp#Uzmc_KW~6U
zm=VF|pI1D!Mha76fnqlG{s{nyQm2eQP7-lfV;CSw@rZQ2u$~|MVE!5rj^QqZB#rYm
zD_uxloa11tT*huwKCGnElgoQ&3Lv@M@?K~t4*YOBrd#WwIUF^jBa)w3<oZdlhqSvY
zo>4Z}NoKhOx>@V-kw;NhIkSo@CSFE(@~g#x6wJ*}VCd(R>&g`!R@9b7VvJ_AWivZ`
z-t1yJG84h@YIug8xn(szteVh}sjjZw#V%kbjzZBPKcC0t*l;+5ePzjPkA9MQWo=cM
zCQb>vkV>lkI>zy*{#5HLEg3Qd;K~tHc2)_Fdk8rRm6&wV&J36rEYp<1HX$p^GwjL~
z+XT03T$X3(mCoVj<FKLTW0*Mf%2YuuEJV5CS31*dS%rm}<6Iu+mBX&oOXz9oa+0N(
zQ%)P!rOKfxa++L;oK1eda;Vdmc-<7&1lOdi(BG@|XF6=R<G*hFw>$p!;HMb8t7ykN
zady1hM(Kh|PvSqy-yNmIf3bRccUx~RBTB?@9{PI*@=;ygY-i$c_`hy=3y%<`_R*ih
zsr1wX*MY(m%6+EZLO+Ra<xX(k-R8i325PsvUdHGL<rjUWzv2I={vOPeLGNvH7|vzX
zLKIOuJ&B%(zeE=jm7hfJ7^F}eLL%0ZLY|`a7}O&E6K(f5IW@`}T#EGQe~nQtx=7r)
zN&q46SMzZLu8-|EpwWkVb(@}^2YmmM2XUjwVY^SKy~Mb$?J*q>C@aHFtytR%9sAEK
z!wr}owwHkELnAdwPa{10_(Qn)(%-gLr`2qE0qw`x-qEpoS2EK3**?;-pVQB@X*w?E
z@@Lq-(Qz%O_qO#IB>Gg%>Ah_Ibv&Qbd)g9oyzPbCa04aAc7=|8$@k%AOLtqEj^}Xs
zZnkV-`WWx?B9r*ki?0>>`@RM};Vwa6X`#31^cD+!rGD>blZ9TrRMgX9p~vYLYj#-Z
z^(RDr%|hR#*MG=Dum3^hKW?EX^wcln=;e%lwLB>D$64sRbR2J?cj);OEOgsLqMjrR
zeV3j;#X^tM^QT$pDY|^BE%bzqg8#)9dWKGKw$Nwmc$tMhUC+P5LO1fSve29L{2dnh
zN}aA*=q)<kroZ=X$l0hr#X_&w%coiBdAfWuEc9xfo@b#a==5C!#l<tDT>~eb)0O_s
zi$b0m7J9vojc=o#o-gV0x8zT_SB(2I3%!q?e}#n}7ZmwhEcA=@{F^NFxO$O)mxX?n
zo?o-jTbe|E+qqr)J6rE>tc9MiP~=ar&`0U@Ct2vrbbhKWbYpz$EcBK7_|{wKOLcmN
z{!P-o0{?JblRt_?Kd95_D^H@g0vq{__NgA3Zs?blZnSTu8~j@7o1(P4D+;|(q8}#4
z_0M#P{;5I4cb!DPL88Yy#5iJ+vdQ!oDL<|jnx7p}=q>%i^~m`*>ElBEmFb71dOFS8
zZMJxQ-cf$t^mD2bg`ObMub1jsD)B>g8J}em9alch&kBitj!CuIR!VeSZ8krvB>I;o
z)n?ly(FaNN4vC&9(RWDnUrF?3lH6qaE-8PKlz*v|U#4qPe&Y@)6+0x+he`Dum+0dq
zx=qp(@@tGwtVEw;5b+%+(F-N|Dv3{-o*?D_Ldu^c(TgN{ibVHF^fZZnn?zqBwJXyz
zr2OSlenp~JOY}U6?vv<+65TJ+r%Uv?5`DHrzg?nNOY}P=dYweSQ=&IZ{gUbRQvQIH
zf2EXPrZ1NA>q`!oYL@8tN%bt1=(<b6smmn#aw&g{)UHflA?1Hk%D+;gKPS;wN%R*a
zdW%HYuUK>HCW-!vl)powzb4UlNc1%leV0UkQ=)4U{T~wjkVM}i(T_{?of3VqBp;bR
zTcTG<`8y=~pCo#GoUp^V=8nBl_>&;f@rsH0Ns{PzRmJ?INOU;-<|j>})0$y?G9)^+
zZG04oZumNsmM77%;W0mj5*@E}n4jqq{Y;aJ{g6cOBhjlRIyPA5r%s~d>b?1?m+1RV
zs?D}oq8q*dB{xg-{!;#>68$WRzD%Oy6&&-kLZai<5c9KAq8slhP~s|yey*7tKfWW;
z<0bkgiT(?T-XYQPii!EzA<^*)g8A7c(Xn+jKbk~OFsWEyB>G^9eq5seQli`XhxPVi
zi5@G_FOleR68%z%9xu_8C3=EHzf7VhN%SESJw>8lF45B@`jrwrL!y5r(G`iFBGL0C
zx>KSTO7x);eY!+9-|ay9Y>A#K<*%0LyCr&^L?0p1>m~X~iN08(r%Ci?iJmUemrC>l
z5`CFOzgnWNkm#c%`bvpDTB5I#=wl>$i$uRhqHmJuV<mcrM9+}uJ0yChMBgRR$4PWe
zq8sl#QSu>)o-O4+F41!&y6vp6{!ftTu@YU8=y4L=CDG$0`W}g%AknXt=t&ZNqC`)T
z=x&LgCed>xdWJ-wB+(U#ew{?mljxHrdZ9$mm*~?a`c#QNTcQ_8^lFJdO`_LH^cy95
zy+prBqA!-{H%s(piGGVjUn<e3OY~(Dy;!2Jkmxfc`bvpjBGFe#^iql5BGJ7PeUn6=
zDbYJ5`YegQL!!@?=({9(nMBtl`mGZEkVLPL=*J~`r9`)#9oGLj5<OO;*GTj@iC!zw
z<0bk$iJl<Q>m+)TM4vCwQzZHViJm6We=X57BznC>S0wsEiJm9X@0RF=61_p9PnYP8
z5`DHrZ<6TM5`B?GuaoHaNc4J%zF49!mguTPZ<gr4k?2b$dQhS-ljy&d=qn`p?<D$4
ziQX*HS4s3G61_#D|6ZbRlIZtK^bU#sfJEOR(I1rPyCnKU5?zz%4@>kz68#Sn{kTMb
zM55aUg!TVXi5@G_ACu^D68&+B9xu_Kkmv~#eVIg0lIVYw=qVEYDT$sY(Vv#+84~>&
ziLOZW6%sv9qCYFq3nluWCHi!U{uhZpTcSTN(W@o;XA-?mqW@K**Gu%35`D2me@UV@
zOZ2}<^raI0Wr@B_qQ5H9S4i|#5`Cpae_f)llIVvddW%F~Ezvhg^fx4WheZFoMBgFN
zLlS+LL|-e>HHqFL(GN-VbrSu!L|-q_ZF*e27ib$KdaOj>DAD62`sWfoUZS^2^aP3C
zF42=D`X-5<BGETX^fZb7mPF5x=zAr)BGKQL=y?+T9f@8j(chKm(<S;pCHicM-XYPe
zCHhv0UMJDFN%VS&{+>i%EYY`1^k#|vzC>Rt(La#r%Ov^^iM~Rje<;ycO7xE;`YMV3
zu|#i?=>L-Fn<V-t68*oA|CPXhCGcMf{8s}1mB9a>B=EK4qVJT3FJqNJ%={PZHl?Y>
z*X>k?(y$>`cuS{J_keul)Fq$c%Q47C`FWpyB((d~sZ-62?dZ{w&=wQZ*5XL0)x_Ny
zzi8qfjF+1@hVc>;_hh`##N?zM3C%HaEaPGm_hvlV#Ah(hG%*G6kA$2i?!$PniTg6{
zZ{mK8?Iw0GK77*XZye*@ChpI8i;2%-+-l;p8NX=a0gRWMcp&2?CO(JpLKC0Mc#etV
z85f)Q7mO#H_&mm$CO)6B)5I4r9&F+Z8TU8wMU3qx9>n<Y38ViBjCY%OFyk#I{w3p9
z6JN~uMH447UT)$`7%wsLuNW^h@uiICm^g`Xv5AuzPd4#oj5AG4fp(0)i7#h7*u+;b
z?r-8N8QV>K730G{82wLSyxYW1##>B0lyR$xhcSN9#KRdcH*qTCB_<xhc%g|$GM;1N
zG{(gyPG>yX#8)%UH1R0LP7{x2JlMoz822~vHH_^hra<<Q(Bbcm{%0`WZQ@METTDET
zajS{57{6%ZY{tt?oWppDiN`ZuXyOTs=a^VwTx?<&<H;tzmT{(uDbS4ZH?f=XU=!yu
z?r-8rjO`}Aj`88|jQ-~_-fiN^jJKG03gcE2=QDoM#8VkBH*o>uB__U}@j??%V?4*i
zg^Y_$d;{ajCccqzripK2>@@Msj0c;zh;e@t-@@2#Vh`iP-x~d&&Um+piy3b*@eIbT
zCN5$8qKQixFE_E5@e&iyWW3PCvl!1YF)ctxLd7O7V?5c!w=&K&@okKqCN5_@*u)i#
z`<u9uvE9T~j1M0-`d`g>w~6O4-eTe!#;qo<W&EOveT<iz*w1*0iRUt2XySQ{=a{&T
zaj}W#GoEbX+Zkt?_zuQS6W_^ru!$Ei?r-8>Gq#)fF2;wC8U3$kyxYVJ8E-N1-HcmJ
z+`#xn6E`wmZsI1!OH90o@j?^d!+4H~7c(w4vC4R|iGRa5)5HPBP7~kDc(93sjQgAT
zw~Xy3{vG4PM~(hBGv00DC5*S2_&&z1CjLF+7fpOW<K-rPfbkL&Kgf8Yi63G-$HYq+
z7n}HD#*<C_2gaEueuS~p#E&u_Y~sfl_c!t5jO`|Vg7M*RjQ%fUyxYXf8E-N19~rlr
z_({eun)oTk%T4?=<0U43hVeoZ|B3M&6R%)gY~p7bPd4$N8E2aKImS*C|Ap~j6F<+m
zzls0K*lyw%7$5$((f^f<cboV{##>DM6601A|BdmBCVrXmaudJ8c!`N$WxUY9@V?hP
zN!~ZsP4c&ai=OI=3oJO-f^#f*j0LA!@D&z(i3MM1!2>P0uLZ|g@Ci$QzOmr1EO@U4
ze`3MgE%+S^-e|#AdA?#vf8K(hw&2Gs_yG$JT5yvEFR)<01y@+G*Mf^IxWIyQEjY)5
z$5?Qx1z%yomss$H7Cg{``&w{}1)un1SN;9Qg1@rhy%zk51#h?DcPx0L1+TH-S1kB>
z3x3*yAG6>GEI4SvO%}Ysg8dd;VZmMtF0$YP3(mFR919*}!KoH}g#}+?!53QaKnw0`
z!7&zmV!37fE%+-7-fO|1SnzfWe#e40TJRbRe#L^Hx8SEO_%RE9z=DGo++@KEEZA?s
z6&CEZ;35kyu;5$^&avPz7MyCqS6J{R7JQ)v547OE793;2Cze^p--5rg;Jp_7i3M-B
z;CC!|qXn<A;8!g8c?*8pf*-Ts2P`;f!A%ytz=Hi2Tw%dp3of$Y0t?Qy;2aAcW5KBw
ze1!#HV!;<$@IVXhYr!!VeBueq_*?K-7QEMjKe6EL7W|F{Z?xbw7W|3@KX1WLTkvBR
z{D1`qEx5^o7g(_0f-5Z8Yr#bpTwuYu7Mx?jV=Oq;g0HaPODy<83m#~}eJwb~f=@hd
z8Gj4@%7XV=@Fy0$-Gbk-;Efi%#)4n5;O7m@0dAMNK~cA9%McJ$)Pve=Y>eINens7(
zG-!5zhIR~|IwjnNh!K7FLgZL!7@cVI#S)AKSt~)|9Ovhuy|Ff%)`kLY*@;PZ?Ou?z
z53th|fq0ZbW1`p=#9y?_k&d>smyOM+&dUIumw$oYg_qlOUcN?YYWxmFaV1{vq)I<P
zWRH0Hlz6$?<i$q3OpM4&4i#I5V#G@lvg8%MnSotOp7sb*UXR105*&CHHug#|wfgsV
z^rFSt;#S+geohI-+;gAZhG&;*a|JU}Txv#wc0V?k+G~*0n^aaSAZ~-Hl_PNkp|&gP
zM(s)JM_}Oahd>FWR^b2ZNWCdSF;U`TQDSts#2itgn<z0#FEIoqShi}rcHP%3_B6XT
z5M2?}^@_S(Yr_b+QT=a`Sy4Y#RE<Y=sCFSrP{PxQ1=05vB=&|o6XP(nj6v5Hqcn;0
zw<OLp&Gu09LG1!Ggc>^=sI|^7#3)mF^ifpt1++=a(KY8C+H)2t>Io%qSDeyx%GXB;
z_!E@mliHI<sG@^wlOV35&;vkP9fsEEnhykM6uNdbsuCiyfjJ0uNmxXa&>aZ%(+60n
z@$>~AW6`K2jMpJq8s>!{OTtjJIJ7|tONIVx3Dh@J82hyZ>d74_?Q991I#cXb6_v#>
zko)&(Xevfr(=bq{31Yvt8)~JEreOt78<5AX9@Bmx9LUlX_2$HCw_4t#8YfjF1><gY
zuhP&Gr>KpIyGV&vqQ9NLR04M<<|(7@Oib|gQv%tE)0JSgT`N5-cz>JgyE8F`awLfy
z8JuGra!||HvKpCnYz^0ufE=CcNCK}4J<(1cGUo9Hw5`j~rM{>6N%W{}3{|Fn>Q;BL
zLZxVLqXZ>PMQ7>zU&K$?RD6PpwKNol2)AHnXn)6u30cXef!fz7UP;V6j4Wcn9aL#B
zby+HnE2i59t%y1m7<d3P60_)1d{K#WjY`$^oeruUL=2Qx#aw@?T`IaY+pc{@LmC&J
zEY!D$K_k|SNtw*PIhrUcqJ4Q#B>GrF2elulO6W+2ZVP{aQfbpr15KF6`@>#fxT=-(
zPyO@V>Su2Cly(_Pgyxb3)l+=^l|W-+J!<E1`U6@()=)mOP-i|uWv)PurUa%YCb-qV
zBsNn{_5L3G9f&!DJAKU16q_AfE`p#!XOS-644v*${U}t=M~yVS-^K8SzA&a4cf&N#
zL#Zc0R76Y+q-uiDIqi;~qE9JGz?YbTnM<=cM*I8#GN8sUX+SvR6lCn&PSVeu>QaB;
z5iH2lCZbxmx=*))kC8Y71FwVu#rVxyA|~uKR6+%LEjXyH(wS)1$zAIEo+pw~Uy-P9
zCsh<kJy)-<SJ(Px>-A-TOm+R64ANw6H&1`mbdXMod*4`;d2{HoEAYQYr-#p>44G4r
z@f(~bCzUFJ9+!}%Qq=6kcqK3a>tJ?bER{QhdLOF||B5=M)8;9oa%}!SBIP60)ag}%
z<%yWt+4Se2){}DMdnZmN42whfucU$)mROsAU8mC!iEff8L(t4%G<*jrG-4$*5y3!w
zsWL?Hicz|SMyX*#5_+_LrWtQ5P=Z%Hzf_3+0?0-QjPW-UJWzy1?+;jLLK{&V=U0Mb
z7E;z=%<c4*e%v<@2J7hW@JvEGwaYPxG}*RZ|3?1FoImwi`hvQshm_U>W0lt9-4%PA
zvgM?204i9ZR}gz@m*C4NPkpW*(-&oJ{vicQ!<hS!6Rho}m1E3F7)tH+PAF<Al)Wux
zFV;qT5z3nFQ~A(5B<lPS#yI9<6f?SvVw$rBwUSI<haS8+0DaH~eTkZHhn6Ugwa2wB
zXb0(JcE+=sVL38|i_vY^kO4b*6UkQl1|_K}R`ZU~%di$Y41J2{nO8an(IJ6YLw<yx
zWbmVHpn)bXo}-y4xM-o##1Kd2qN*zwt0gY>;@L4MgX6841wyk)D1}{wqQJ)?MI~GD
zN0LJz2RA7Oo(j^?cQ~3Tkgkjx_yn+{kxmJepf9lx*lW*D>_gqtz9hb}Mm8Kj<!Gch
zH>_Bnrmc>qb4byG4fP-?V@~0(<DM<xNoo4nv6#*a#0Wh0#i>*3r`r3}Q|f&#Fqok+
zSf|f0B|HXmFb0MC7-W(fWY~_@Q*hkw>#K~4IestLY`Ou;palD#d8OU9+DWYkQ!m9c
z6gBNCXwyv?W$2H#jhbv~aWstpR}gZarWXG>N_#s>pVHbwu$w-5>Vxc|Yjwef=IIam
zus;OYJTpAlsFzGlzM_7u1heAAy_6A3V62kf=7UMdjZ*^ou}b<seQAp7hXn&cEr7)H
z>6`tfs}yyVqE3uc+Oyz|*=%jSU@$hs-c+Q|XD%m~QM-eg&JIK0l=M%0qqvo(t-iCB
zhVSgY3$zKWNN72*08^_VwynpxXna)XA7(Z?Ud@VI+=~k!v2(Yg9_wsq4tGh|;>F-Z
zbE1}+j)N`ux7oS1e)^3kmG+DjViMnq{+&nPGu6Dp_6*oM8~o@ViH`m@NB@jC{7Z23
zSJLo38~^GskeEsq`H`@&;Pg})uCn{Ot#1x}zn^4m&L`sMJu=e}^7jyMTo-+g#Uw|+
zdn-*x9gX$WeDarWH8IiE@PlJsHoA-lyMLJD)g0U6UhY6*B1mqvBz9a|k4qsDnDel*
z<w5RoJljdna+^N)ZSTCGnH|b~&aGN^wiYCOtQCohnuSTU2PRef2$NE;y7NhB-H{>C
zx>v~3v}jmk!22QRR^24Ty0Nsl1DCpv9m)+}{$6e%7Y%K8G?Khsfn;>irEber{{c;b
z+Pv$#ghvqaxm5{V7wg>WQom5tO(^Y3-{!xb-0t*u{U0N@EjN9OFJEce>^nyZWKvGf
z<BQEz2fRn=s3^{*p3%icw?WkGpOcmRdLQOT!)@_)p?;Y+^Mu1Hu-P}POO>xsm2hA%
z!=T3L9li@(?K$8dYJqk-9~Ac9t#)+o#QY%L?%Ww3Pu*VgfHr)Y*JU)>I+c=j+OPFs
zYrxj2SY_+vzY~@z2G73Q)2tHPZ^J6l>8hCEj;1GJu7u^e82xjs{a$+uBclW^ycrdt
z9i^eWT~jdoX>$xyM#g6VdjkGK2qMqa2ifk_FDAP~ixBLN4a2?q66$?0^@U`%J7<yx
zd`9iT?%V_mM|P*N4EaOH$@Z^663HR-HITHb<bYYS-R|J5I9}1P)=Pc~l-;df%Bubt
zwTJnj9)lpYO*C|vcP@1!?6PCtMEte;Qys4s*cOwe$%@Tv>(Rri@6*t~c<LVpIy^qK
z>c7*Oh(Q$VnW94ZF$uKpP-aE#PA;6O|NGsfj|a35NPwWa0~dETYn%511;>1m1Svd^
z6v<%D<nYAWZ@QDX#-#ifp2$s~cb)Ik)RURl^9q))Ed25~oC9|t_1#5wn<R_IM2(z=
zK+I0i;V8$?R2-EpuHg7MdcZ)&thvW-o9U?9gnu0~9p!J4H$AiUV5}p#Pa@@kS5p3-
zBYA3KoX?w4f70$~e5;$yhNnJ`#&mFpr@oHHvEpffqj9fDJ=@XPDV};d8l585;b<Jm
z{lO*;cH(=o6Txh_(r^ZoI>+jM%9~)=rcFYk+OD<{EB<@nj2(oKL~eMQDN3;B2QEf`
zQZyTFLW6%v+=Zg<U@5j8n5Uz7V#Iuoo|AEQ9(D9@Ox#4`Ko{wG2R-XsOlQkXJhcd6
zEhJ&JHzuwkR`5{*g|J1lu}0mYZ&&RY(5o~TX>1_o?j{x?c?-e6C$2;RY7aXeN3y=V
zX=urCB$pFgVna&_w27hbW%$+{=q`;4_;$mQsH1LS891s08xuQ_ftbNp9R5)f@_|_b
zNZ5pL`m~9xHti)=vZ;w_v{6YR8&MCvU!8zFn0kNtmDnnl(_Db~rz5y99a8gQZ;h%G
z=uaMPoR(33xHEJoiF{l~BPK0$Ot79ygcKsAw>a*(6c0+^-bA{;;N0pwiVeOid6PDq
zIf*H5WC0CNR04k&Y_mXoVS&(|Jx*)~sk^)#4D4p<UI6JrmSUF&sc8A6H%8!q1g6Ih
zzDFw}mmW2^9j&|7!>cE9jj11BLK-yYPw2nqz#P@}P5V9Ol@hq0XNqam98GjpLs__i
z=9QTHnYUzmr1`IPqWaD@bN*}3?E!w(#7gjG?kmMQcA<&T*Vt{s&LMsl+JZ-I2nL{Y
zk(pon9{ml~!uElL`vBVcp8=PIaVHib-WzL6kdv$-OibuS_#Vdd8~QUMP@%Kos0)j^
zG4zQ<e+|#({^v*h?`OfG)(AG>$2wlE*D!e-TYPZaE}YN+Sz^0Li^4F@z>5O;<NBg7
z1x7PNUli&gN@&=VEvSIhaX0K9oXX*jR~tl~{`20X{~M>4#K9!^4YpHxNo|~HrveEm
z(79N9co!K3ZHnG~a1~O_uC<5AnUfZYQb5U~QcC7DvY0OQ8}9hRFXLgN+<{9TUT86z
zv7lpOKMOi-S2MV)rt!r5Gpy%bcb{%OPq&=fbnx#A6v54mSwnTEuXi-jRTa4AMZk_m
zx`#t+^=P>Gm~wQD)296a18(jh`a*VCc1P0=LW5<~zJwH@0Oq(j8ZSc@A;(=D?jbok
z)_#2|4|b<ry8!Bme$WPDA&Sxrr*4{VoulbGl!wN%P{ejFvxt?R_2p=Txu3XjMLn(S
z6QP@KXgLL5$nudzBW+(td<SA~L(js7m)9ZP@Y0+j3q}Qn;oU`)KujzN2PTayGksum
zA+wA)S73n_4gkSOx$WpnFKrV~-55LA!TJRQOzjy)6}A9Y*N7U4)(m^f+Z3!ily@Z>
zqVE=bYl~>4U=`t79dietVuMDkl3}CZ4eaqDJQ9!iio-wC9`7Q61P$Nxfjy?IHtm<-
za`jbsg7tlTFk3V2094r>D2a#n>}Z^f?vk1(?<9k@kF1@#PJ0VkXp~yG<fDo8^bO^x
z!?&@@A#{pn>WRV@dLGq;GSnxw&<F6q7J4sbfrVa$Z(*VLlZ9@h$s34y^VgDv-bGni
zcrWvq8bc)+Y3djo>3tt_`FVF?;1JbIG7K=uOM8~po&tDo3g8_J58=H%nyH3B%y=pY
zwJ_SyHi99yx_k9G+(PQ2K}ZjdaX@U^jnLq*|D_Ef`Oqq~7g5Ym3RVTyQ?jpTLo-Qq
zqLDfc>Jpfj(D1HZd;cTkeFo;YIT!wdFh~crJWL1}r0X*suVVZ#4H{eg7l^6!0Cuvx
zl_8Uq31L$*jM70Zfc8W8qHSTX$=>e*(=?EWFp#0A$;gv@F!`SNmoQDC61WVKje1mv
zWXboLN$W!=q~})YxhHx_#VNm0mds!>=#s6YeOcx-nhe52)FXWn!yJl=;3nx+2AIc0
zfImfhpB_p>FFQv)f<^Jdr|$-0peP<hI69ZFcc*tc8n;0jtWGdqNu9sseGIm!>tmIM
zHiQO!iD@qEyRRTm1n$UGn9=}thM0Lu`uqO(C@u&G<%l*G7iFmwGW)kF0pbCfP)UUi
z9%m{V(MIDgw419zN?fjb$UOu(T!^MQ8lMK|3TACmjuV50&>!)ET4Jvaj6P05O3{IO
zM`J%WG;Z>sjyt!y1Fj6$vBTK4fw>+XE_Ew;^~p*AR%Z_OmB$>7bc0k0VBfr%+><%j
zFk+jEGOqNkzMV?og*;T~3Qn*qD)zcfUpbm8iJ-PO%N*vWA9vhy8!07+G#!oAyuf7h
zE~*FbqTp~ajY7+qNa*^8!kdZ^Hl%F_S>ezfJJ^2!v+?dg89IP{O!`LOSly3t1ujo?
zK|ZlqXt2V3<7gx!fCI12empNQ%h01X-ymgFy#I)+VPmX2{qjWrCRandJ#-y-)9a!0
zn551#nib?w(hvCNVZ0#$gicblLa@MXKuVvw(tU|>jz&L8G&ml4(d6YU-s_Jm>N!gA
z`r|Gh#PsbD9X4Rev;|U<KhOrF8bvL#8B92u#-jaD4GD-^7xQ=&SAUc{aS^E#Wp_cP
zm~ltbnN;Q){BfnX`1`sV4&dBwbfTMdj#*aAVRV8O@k%fcqf>xB6vQ6An-(!BvO9o;
zKtY@<c=L7}Y2AgLMPeSi)SlcB8sxg5=Drl(#iKJxZge_>0tr(Q{TAALR9lE^;oKKe
zPvrM^P<o2sVk(WL1`Z%avQ~6O*c}+182XqGdW95NP&6Knrr|Ifp;`FDZICsYGFY#k
z(+5!c1S1_1r7Tx<$#NJqhY~3Z%+!0^0oKAy(S(pfhL;HqGt`R@^Hxw^4-EtCK`a=x
zh32EAmhgdLwKj$h!YLFMY-6YoR(j)kH|SyOEcUN3@O|gQfUEB-f%-V;1_!yr@n0n{
z-k6$-Bl{4t@2s=iPA!W^R<1WNAwiCZOYS%Ck2#L0Vus%pe<}}AMZDW)`qNZ?Z31;+
z9D1eirx_15v4}(VqOximTCc|XsRYufqR^LEDNgH8Fy?t5^`}?pllSFh*c;|I7!yP~
zg*wSSc^!E$1Z(5KMN#OD_v8}Hzj+2jjA&;-%4C1~DuIFT&qXuod9l6(u+<8-_~hsy
z#K>zJ7(_gR&KzJiZ5)Z>lMOQ(IJ3FH*-}6DGH~L!zeTZcBvsgqj^lfEuQCTt;@h+v
zz#j#R>%pIhhfmdmhp>OYj)l824HFt>(AOV_dZ^9US9=;O8J5Z(?qCTEoPy4DrlR);
z$0HMKgaq>4fmx7iFA5M|ggwJ;=u-VCoA3HevdqD}Q<*B}1_Cg?0XKmw@E=;Wjm(hH
z4_5$T5_Lbuh6*{UP#?WeABx(%WoC%)d=qk51+_hODAUo3$~&K(*(~8co$pHgJvS#Z
ze{aE(QB${^#@_`bybHh^@wqmhMib0-Ze@FHvcY_&pT_5UGedmmYgRs&hc&DVzis;b
zX~Nk7Em`_O1p??$6y4tl<m9;nPt(|u(!_u@yxxgWwe}L}QD8x=_N;h_)0Xi=5!Q{0
z1hQv0klw(gH>^+48aeF_O}hgRWC{1tBCqH4pVfJVu`VDpt>qUY3c>;T0~Cq&Yg+9V
zG$#`MpTXwMCU!1R0(J3Pl3)OoQ@%5JGHS0-?!bb0BfgqMS0I*wxRye@B<SA*kk0hO
zEr8HD7*xH#(0M%w$LAZgmvgB{A(=I#&%C4Kz&GrAPX!wX=>G!SWnt}q3**IMxPzqo
zMHo)3<;QBF<~_EfX|xT^QMz|`2hws$?sRwG6}Spfbyonf;@dGN=tQDM_yreYR4^kP
zjZYz)_PY;hRydmIeU98fFE?HD5Vg7BOQa_`8uw8Pa3b6kB6bIE!{S0=z-q9WS$_?&
z>9V)c?kN#$@(cwaMqdUdPX(*}v2R@I@A+^h0T=4>4yswrwXr_p4vuzH2aSV>OY7lE
zefL(6wlUtk0Qq)fLj?xzKv(KWM(6_>2pELXiQ1odbZ8aDTHU6laoetA$Iz~Sg)7k0
zjY?eD_oJu_)hOVfy`+8^CO4uEbfS)QJT`MI9UY!#bU0FbBlY%7lhnq9T=<meEy*i`
znjYm+vHv>}>Vsws``}hjin#jwzR^lM^$?tC)GFoN14fK_0@sDc!j5$UuI*;uCPf|o
zLFYO{uPN@nc4o7P4e+TBpQ5PSFxQgYfq8L(3Gu2sv4?xqyaY$%6&QCVSZ9Zem#ZGh
zd;`$tTEoplAr$qy9QAm^ew^}t*E2WR=Y2T0O7amUIJ^U4^W-+fq<@_y_{t$?fZjjK
zgm>UpKgw2Hm2G=S@rc-P5VRfYx5~C%u8Yrd1$z6gQ+jW5sW9~i?Edwus9E(xq++M2
zoQaV=wpF>fz*c`}BEr{>>~#v<jm(BqV||60@b7Zq<HuuFOOhg4%CQb*uu-G$uf$H%
z9LU$z@C9`9i_BGH(FEAhi6IcRH;EP^M;7vei$JCjo=+PNPvERs8ZhL$x&`_1xywCd
z<h8*bhv@vY@dpxPQ_E`VhvLZTQ0yI!yT2x)nH~8Q$Aoq8TOm!Yr#m=$BQ-}iNm`(n
zxq;z{bk2j7O>-yXoacZq9>PG-6q_@i%i8PjkgdTYlc|2>3i<|OOT5+5NDjJN{Q$~>
z)27#nfzESb4};g*bJe$k<GQ)FeTH)-41h0N-I5!KcdMI7&Vhce!RwXYtt7XLd*(KM
z47OVpwP(Y3WBu!0<Xl4zLLZ!4d3>l2|KH*HB8#K(OX_LhoJ{P6<K4YKavj^|zW5>@
zFP9^G8ysdAIE^DGIXHZaJNbQ56>lb2)HwkBHLSNc93Sf&-_XK3;B%?1ZgpKQdK_>-
zanui8+xC$5yLxY7WjNP8cnh@M<;dBhsOL6(Ki0p7TzconsKvipX=umfXJvYgc7Sm(
zeh5Y!^&!^c6Pxwqqgw3SBtY73*$?KbmnY`ZUeSfDaUjN%2C$QuNrf$=->d|GjTS<^
zG4c?B5DfPwbd)^P;FuNtv8R9LU1;2!$5}y6IYNVwr0b0_ANhPIEgMbXmI+P(U#8&;
zc;eJ;fobtB^&NOP*xq8YC_{R=)D9)sXB~EY2|!BguI@_mCTx-%_xu*6CItsv1`Tip
z$BlJ4T<dTVqo*r4d_4}}@Hmz>t2p>O3|+h0WnY)suxG3*8S3=G{8(4;_6&D$(s7!R
zA7-v4m2tg^KlG`0Cu0s`OfhWhl;n45*m7w9&GY+7>|DkCJ`$eaMCAE>NT1(lV}3gt
zZvt7G<8PAS<J~xP3+9bg94;_PCS~+{%EL<Q7gxK3!{ag=_JPOu!C*sMhO6OVY>s*;
zH+{dO5&zp48?F;H<)KQq`kgy@4-_rdhQ$hmwIpnKRxDDj;4NBhk7kpJ^bLGH16@+u
zdB$cZ!ysv{-n{}jSW<K2EUYNO3$_SWXwQa-zyR<=GTTbCY%#?vS#@BE;<D8O(vLLB
z{4^`847*^AQ8Bo7rDH%hxg1#=6??1W?i)dpGN!k}B5YC`T451FJ+W}VLqkB59gb$Z
z_AHbMw<>Kp&Z98T@JyBDL+<o_x?O-8`VyfrZQAn9ln)?N-7zsZdL&26J_5tet+{Gz
zuDVBPtGcE0?67~<&}w%zgzUbNxq+diZGm&iHV<x7daq-X06+IAOad%#uZC~O`a4_=
z>tGTv%QjOn|7KVb%vBmrg@$vJ()ulI+VFoWu=}mwQim|6ePS~kc0-1nX-r!&rUx*l
z@3_)WIvU?bc!11TIlk2`N^oARK2Eg6z&<4(;W1VMIA%LRp@iOOuotGDKHj8E(9>A0
z1ig2|3c(2gjZW2@{>GsPx_6Ni7bu7)l@1PA=T2USI|IM2;1jvd^YnQMJCxaQVyw@V
z*{~6o=nK2govnrtPI0N<=fd;awu?;BXRbh>L|5+?h{knsPxs&tg(+&m?wU-|YFEPs
zm?Emk{~9eIInJ-BF8|B4=T6AR-aQF3$#)hUjs*0<g@MGt)wR>WVYJ=q0$Y0y8y`2;
zbrqRFMo#dLUAhX5?W%4wo4c%cj>gBaujMtb2oJ7?PAqTVh1Wbmq&4rjSo5YKmsq#a
zJH)!e!v9VZK6M)BBfVQ)DtfQ}JJgI~#yUuRLn$&44S?g86BpV?b(%XigFiZ*{x_YV
z=@ay2^x2r82pplz>64VCJh)dVUohE7lfssnzSGf2zbv7&ucH~yrLvO`D=>H4H1Zwx
zD1$qolddy+VZm@+e6G+RI1euDZ%3je$6?rm`A*o6rj>{+5O1b3odfQ=DpTX%kf$q)
zAvDM=_6$9UL?Oq_^ev9Y0HA)gNat@eG>Q1zqHNo*493uczn;wBITC+8L-d;<r?=wj
z-7_?d>SX6e=znM;dhcv;ZVmmKq!Ao5@k}f{S8T%bDr4a>Oh{-IPM`#Tw4c{c*Bic^
zEuE?3L=+1QwhcGa!O+bJU<Rg(I6iJlz})VqBcl}U<~B4ytCoJ*z;VwRNTv%1j;2F2
zK8{xhBrc|isQMA*OKa=V4YrPk3ZJpc5Ri~N-mwU=Z`<l$pa?ePt*C@HNhfJ#Mmq^u
z9bo%#^g%&O66xoZ5fdE*1u2NU;~};!gD$4iGNmGcPSD#kc>3Uiu6$SeoDHZq5cAqh
zGy@BXylL7VHka@bl=OolFIvUT>o3+*UWh|FLt03!H*A>A{m{2-LZ0yU{PWP)kOSSL
z{U^LMnn<__utgVD3J2ODGLlCIoKOTkw5MpG&<Zvih^g_>DeL9ejw;-oO2(`YosSwZ
zuFbtMt_7$|tEOHA$1Ldu!c;1cd-^LmK%;K#>`e~{pm#zrB#5p87)YIhfG;GTkZ$et
zH<(NqW9`I7dGew^tS@xmmoK@|4=**G?Cn^D`@Imz$r8sRyeN(1wUhg?&Nvo5CxGKp
z0{0VmbK(t)(%#=#F<eoPu#3@oIrmdp`0dzn4&?fnM7!|YyB=H$(2vW1durD=F^)xW
zq##brWq5x3)k6(!J>VgniM&tXBE!73e4d3Zq`IH(oM{(>a}II10>cx73VJ!_P$}N)
zShfC}q?>=Y?wcfiyr7n?Uvxvgu4B7gsJda#53bfwH@E#CSa|Ovt!>@aO~D$W?r^Jb
zWvZLA9j{KEW`|l&8^3r!zwDsvR70rSI6Nw2sGswmSH@81=Z$VykD&jq#R%$ROON0Y
z_cK@>V`21mj3V)<G<;t&F9U44z^3EXY!BMFz_WNfc1Ths!RnrT@!QxR2CI98y0NP1
z_K<W4-Q`WB5}XjnLZ)zjFD*`QDKrBG)?zWVg{tWZ-iIx;20rOp@NEnI4Nnk`sE@{*
zqh$y_D#5HICB2;@cX*(=@h+SjBTyo)?2QG42W#BmgYu0Eqi#2?*M^|TP2u>Zc0O?D
zhh}@)nS@uHnDxzwfw<HY2!LS2n$pEkCDAR`QVN5UA{*{H*$u30duRs@8bv4hV3Buo
zG%Z0)AfmPyO(GZH-#e(C$Ez=vXIhzm!IMLqiyZWnK*1n!&y^sQpCE;Fup6U28QP53
zmO}No;R7_qF5}w42BW|8gkzJ#o7J9n?RDKxV~Z89U53(R1Ry&2ZrqGEQ@24$rVa9^
zK&?^`pfeAz#dv9N!4V2oVpxTKYi|&ygAXkejEMMU0i9)qLrDrQU3L*D-IWfy6Z=3y
z;+F4GXbxDy&EJhpTm1+WwreFU65J_OL*$3H(48<3I!7W%z-61uY_FdIxgesdtg4<*
z|6^096L(n1sevFM`c+rsXP#!_DTziRo-%iy*$fBDpT#zrhvQQ%fOcTUutuEnT`TUX
zl6_C_n0Ker@a-x8yL6uou_9dIOt8@rY<y=6*<G5kZD2}C{|CJ3E!y9BSSfiq?OS+U
z-CZp`?HO>mCEi}k*F{d3J$mrjmH$@|xuRQpvO>YsVLfP_T1Vm<GY-$%i8T=P4%A~<
zU(*a@k-~bWo7HDnc+gf02fs#faNyt8`JwJlQ9VZ|q;&K36!wGOs|vja-lTL)Fh}DB
zu#S8T%GW-yUCbADHB$UErx7Q3^dm1K2I@z|bEm&wXEMo+L!3J!#6SNI)<oiI@lW&$
z!Okx6Ps)8-{1e1*{1brUpSMu_^LkTXQOKr!(8{G_wRgosoVJl4C}7LXIvQ_B2{Dc$
zzWWqsKz#QRJh;=p_U{V~LY8J@FSedcPPT~u>-H=!Y?xfIT1prW1;q5{x7TVjD0I8`
z4dO^xw#$`3ZTwB3hW-hvJ`P$wo;%x{wTbk-(zIVNb*Ar2{IWg6`nko8Lm@1qN?;1K
zw;!B;1XgfP*8?l@VG)&e2c{uR;dG~W%snSJI2pR~zN3-u)8qySbj+Qo?Svw@f(WR%
z)B%a)8j+iXo4;k`nUiPa(p@Apg{=^IMY=;oo)Q)oI@4geh|3cH)E>oHMyS#MS2!X{
zT&ANxT&~W)hWo_+3CJ+8oex2OGFLmmVJMr22lCc&u7SD#;%*Z4UOk2=<aX^GD1#e@
z1ZIg%$w6&EWIlWdR9ku`b&5Nw_l!;Bde}SoPda~FbpHjOU+`wTjn#lJWgymkmx9%;
z&{*9Hd38JVIt_Zv!Xixa;2m&N=>kXhn?kF2`i0j=<Qaqd_&iLmFUP14542h?6ZJW{
zz6E-HIW}!JnHn6q%!cHOLboHAEMM}^UFvoY*(&M>6t&g%t)>#|WIs9we(}0_Mfx&B
z^CiMC;)wSriI2`_!tz7ZY0O!tuwZ@=)_WdLzMrcPXD+;5h>7m!Z_};;pP5(;NZ0ne
zVCIE09;l`5Laqv@$KIwTldUj!1g;^-g<#gaWRA}Bx$iM!tqBYxr&I&H3CgRle(S;3
z9Vfw72G}Bguu8db^o)3duOG$v5VCEvYrjMjydEQ$MZQzvj1v37)K^dsIT&#ErJL3Y
z47_-EXbcL6d8OS8e1PM{jz!xrcB^OL4=P!TN~om_VjIgL?(vG<3zl@Td*NG``lbcK
za?>fgzI%cA63|H^CP}KTolT1qyCA`svD4@U<sOOVP6aJMzcOB<ETa?YjtI;^yV%9V
zaatm$Wy?q7S7A9T6k-hfV?5q`JtkC!ISbJyoPcQeLMb&md?fkOu40h>AXqZiQNhQm
zSEKXsDAH&QpF)L(J*0`lM}7(do_ImXHz+(lw1NlD=d)#PI@R0svA+`m92m_7Hs?{q
zdeP<p4w-A!+{XnS2#)Pg+H*9F20Xr;UHlCe&n8b?Y2DZTrlQaQ)Wz!uNaE$1@O&1h
zBl+M4tz4zxB=Hv0o}*c5-IxO?ts6Cf#=7C>b>sP0NE8d$BOVUE5U|-en--M&DFJsV
zUZ4vt<aCT@O{Jq(CHVjyyw)HG;yIKb`R?g_pY62P`7+j*#U?-6$As@OF|Bvfk|X@R
zX)g5x1yhkf4)Z#F!<Sf3XCXi~5?X-61%79bjxD;;PRls9puqDtba>(1jN=Q;6vut7
zZXEVdM2+>Bj#TIr(fP3x8<}3Yc;zN|=ee<Td9+<odz?El*yB<HxO`vU>PUv#Hf_D@
zQmp-VjYGu8e;MvHV)0Uoa9h2AcGdE!UNf9(Z3ntY+0TXm;XWR7BW6F9y1j8G(#f-?
z(~=p`DAea$*|~%Km^Y4#?%qWjgw;{2LAny4mf98j`p^_4Q$A4n<51E!!BK2Qz?ZeD
zvmLtc{2sa=f~uJEWqZf>Ys~{$VeIH-BzWsMJDiRKfz0~RiQ=?#18MsRj3saMF|XUS
zO+fmgYyju5qdWK@mVWe&PA_xuIuP9?p|eLaFXHggH-|1J^w#>oSmNNhj*i>u1RD32
zqMUPLd!ns{lz0chwzN%~#4Ynl=llNATt}e5mJ4&k2fukR%Nz8wU;poDD4-aI2Zd|l
zkQcb?+=p-Y_53~$$wT;4W65E1!Z%7F{eJl+wA@bnNn5*klMQdQ@gA%_OYFefvl1w^
ztAix`qyNYCH3@yU>g!W4TJ`lMsMY^XU)Pbf`-%Fx@Hq4}9fRIQU&A`y<+O=TeKGkj
zb>osDLgDnnZu%xiBi=xv$<IfIP;KnFG5I$s_DwwR_c>nO%u}C!6A;-PO`qW5bz0Y*
zAJe_WKsT&RBi-qHaUGH-Cf?15ZGp-N`${VypCaPeK)v?3lCJr`bOqfbaog=1-?^@2
z><w{NfO1E)B~Vrz8@TO<XS!fY$b@Vr6GFR&co&<3{bWv1yOy((`dmd(<sx$Ta>$U#
z=fGzBUF`zhZ-X166;N9gguyNshbLqqaDovVx&pQ#EIzA=_&ZEQzhe-28%7)=hrdhn
zAza}9DR~bjWAPu#d;4GiXY$_rx1THT$G?HRug7c(%Uej<9juBoR&Kn1yYS1!(8(@t
zT%EI}Z2|(sfh+M^lhQsxIHS1D+IS`DJT23Kdsd(n{T?Uod{X?H-WY<Wa<nLx77ALB
z?qOFF*Bo%S;d{i<NNYG!KAkR0=25sjPqDAluH#S+UNj<ilrmvMu*1=q#zowT(@CY*
znR@L?KjCQn4xF$jK>nSJu8xT<9BuLW<pP{dqX8d&76k5eAvok$gy9(GnIT?q0dH{V
z^CV(iq&EW?j;2~_dEo|<OjpZp+6(G>tfT227D(zf*J0~J_k-aB!oNn1G~-RVh+plv
z0b_|(nPv^H>|nXl9?C>BaD4M1EOLbJnLrq42#Shx|J1^N10r0M0VSer@K6VqGi@j3
z{*cIGO}~iLBp7p@o8t5cm*B{ToZAG`xlQ1%PWTITm7?58%rFx(IPo$)F^>|bn~Bpo
zu`d$oI#LsD!GZ%%zE5ANmrfL~FT4OwjB_n8*8CiWqQtO&K>`A;5BK3_R7@j}{<?w#
zda?p&lc`P&K$p??T|+Ud@W^k0MO#NB-_-LY)!rkKH>*%N(jmxfIwTc}Lz>WcdcFC4
zdaYZ3A79<6mBIG0a`C2zR7Skt4Sqr&;FO5%67iaV@e};}?jW3^%U`r(@{VgIXj{8a
z#Huh7+NB&^L48m$N7KWoGxSHUIK!shhDt+U0}1{RxK5l-FO);si66d3qqRYDY#H%`
zE%=#?F-H$Womy!xMWY&NNPT}6i7AnWbef(B#!ny8t0__aghurRiUq+noCY-#JHiw4
zjxdgM9@OH|1;O3M&K<@>yUwrpDVN}wT4K3hc!$~fxRId-R>7P<L=;=~2vKSUuHeM)
z!3)?%uQ_f5H;!C6yf;0ZMxPW0YEJ2R!sKTc5J6IN7W>b`(w2hh%lA>S=pnBj;CNdr
zLYKmuC-`a1kN!Rw1<&av+QqE9xWp}4$Hv4uP$`f(A&!kCW}0Ck>5?q?NKk@}i7hY$
z1gaIhF0_&W-UV8PZ~EZ9krw)fCy0O~%&T^S+(2ZXP1(upu+4m=7gYiWJ7xi-N`Z`E
zOd~(FX=h?EDDzUvj67sxALx(g;QfhYo0Z_piOc94?b0`W=$X>!qx~6EntzSr1eFTt
zx$)+sDBYMy?=sMfT+{K57ToIJnMCiP*^g+Cyh%EPIj~bp;VrjFb*G;aFK%)vvaehJ
z1V&K}F{<UD6b)D!tVav%Bf0Nd2UaQSN;7hhb)>EwK|mdiY8x;kNH9r6UEIGfQR!{%
z-j3g<oCVNu9A{~%^u(u0jzwwwEKbr!(X+aaZ{;a#jT5!b%V<BdP(MeTOuaYG(QXt#
zoTH5+pr51dQ+G71?|P1=?xb-}AmN~Qbe(-Qek2Da?i4TVW>A-ithGD~Nq6FQ1m1`$
zAxRVU*Rz#C-$gDM>^F^*Hr{8eU+Cv-IJ~0^OnlC!;hYVFPv>k%*3a2A+^fWu6`ZrV
zC(}9G<nfF9^vn)sp)ERR>uH>`1#@r)a-#QKYzzAiM1wFrWp<pGIbQ8iwiwrD9!*T4
zPQVh?VN`}6q&|E~wk<_#<FON}0JERWX;^%)U&T>`Hf;rs=KYC<T>fRDF))}_WH1A|
zx<E)mgJ`ZI*n;<FbP8$5%TJ4}%mT7zP*tcL@gF@aQz)ydXC>*;%V!y)Dlq~`FQfEe
z%oM!jhATfTZLSK|FJ=;mRBfT7=?DfXbXgCaw2~icP8XVHKQd$LXdLPHTdq2joR1=G
zJmEhFZ?h}unDvgv51^ecyhWm<Z+G0Y7@)03{27$TrB=XY!Z}u3c08@nmbn(V8IrLd
zQPf9Sf?5Uz`XY#x?5`<SoKts|<gHJ^U^ZNyh!yV?CJ(OB*y;5YTs9tq?WGIXR%wW_
z^nOjpMum76>X6n(1E?Dj+R~EOLYH=l*%a+~0#$=!>)+<~<0nkDDfDC<TTc~0Ot$L*
zupL>0(Xn-2YR<2>X(Ai*>mLG$`SmXXtBhI6$JKjOtzl!AY4#dU!JrnqP$^Bb4t$%_
zY&wmoLi`h1yWtsP{AC6WU*USz$&z1tp!0*r+ikdCX3R1^s>Vt7u|0?q)AUkW4|Y@R
z--+oZ9b@B%BGiMJUD;xGWiRe=O>n`fhOfFU9&n95x1J4VVQzgd=GN;uJhyJp=hh9F
zTM*;)%vBeHJH#=9eZTC)!%=NIt*4lFHd15`c|OYTh8j`N+n`>Gdaq8SYa929yoi0$
z^KlY}bUs1hPk7q=!A+4yk)h#uH^(9`8BE9xrq;Haq70bfjo?Cin|OeSl_40omu-^4
zzz;ZNq=tB=@x76Ig`pz`pgeWdF>DC$O@ullC)Qwk!AW9C>+?sd|8&zZmD|-#L-;lS
z=`0FN;-{8VIblyDP4Y%PojOF0JkJ3kNR*zbr;<KUX+@;(;*KMIyq?b12I;Ym#`k`p
z28H3FIm7PmMaU;6Pt$s&yXg43sT4otMqbjAW$@&t0nl$uFdr~^Z)1>2&?~7axPljO
zj8zDp6qs2wOq$8>tEdVVBQ|r@l_*D7sc;LX?;#BXGzwEQv<1Pr%vA#^jekagQa6W?
zdLdF>4GZWJ=bbTVF+fWyUX2!R08!v92<-p2J8(O0B;h4a7rn$8Onn|Tpd~E{m2gPy
zZD@w>zf%Y6VdwBocOwU40l<21YL{yWBFc1WY0=lvk=58Hkw&T5Yf^>th^xT(Y7#^q
zYX+-{a5_i|>{`%qGZu8M>@Xw`Vg}{JycH$L^kYWz2qAGgSHnw3o1^hpFx%$xp}s6~
zA-AYPXd62v_v05bA*Ow_enFee@fP~P*=b1`@dPnk!G_!&^cseI5Sk%Fj+YG__uP$6
zw)IF#B14|5dU1RFUv1e4{-~d1wthM2*e4F96RWFZ(YJ8c8cx7OeusyJC&a~`xv{vO
z`NhH$!bIKz#Lh_P;D+yza~|Zr@H>(74t^H-?!xnFmeJ~ftf4{2%!7zCgTpV-t)f_~
zwOlB0j^owO;dZ#YKgFU2Z*#}P9R3--pV_Q_fwv>dwhn6QoD6^ClG~1yCLdA{sN2<#
z>@9eo0loqLjZV*8JbFMdpFE1p#XSZrJW2e`TMv(d912|e=zBA2vws!L;S!mPv$6cV
z+9Mg6;h~4eLmyr3dN$F~Xa6&gVZ5x$1~Yaa1ZSA9qj-0Ql<f@E1!aRR!Xl>|B%xuN
z1HX1b(t)uu)JoSV@fImvVf`s;^n~oBklC=P(ZNTFmDBuzO=!mS2k_fQUHk!d1zZhR
zV!5^<iRRr>G()Ba9=7i)I^P-T!k+e47$Ce~u?m@NPW+%4Z}|^ukI-@tl?+X%J!LbM
zKmG{1jdP#PO2gm9Uc+~zITf)jVA~W-?Y$Q><56h7F+zrvNUATRTGPic=AU7=rh^Bz
zk0+@m%qy-D0;v`PX<>7Kd9{<%#a_wWkU5%GlhH(2h5M^X0dwO2n3jf8#_yuY{oP57
zJ@=&E-`8l5-;Dj-y}EH|b0pu(79f;?B;Af7T{j+35Bz+Quo0mTzSsBp!NQo(8<4CK
z?+h093_aHc_X<5CVR2kPl-g#$QQ(hJUFIq}$q(D-v7bY{3wZgHjB?X$-$7)?xHuZu
zqNcTUmxuZ`=nv@l6i~DrngCdqXH)FB@jNOMfIg5a^IOAMhGK}Q{!=@H3LXCz0K>!v
zj@<I~+PHIRkPHL-5zKSP9&q$M56S5Z@_Ob}&mMKa`=T+^kRlgmJ(k~gN8@g+P{#7f
zs`1&wLW%L}G8~f7TS(`94eV5k_RKl#N5Z2lB@@dRAt%z4;Za^CfG~q+1DGD=6?$A}
zp}ueX71>C`uS^s`_?5qmWIg`}ex?0U>S9;Fau!VA=27%38AiPR|9k)P2DArnRQ4|o
z2L$fdbPTj(H)H~Ra`0%~O#8Wk4>AqM1)pjwn9Q%D^SZoKo4J&_cDhph6@H0n1u(z$
zA7cv7|A$0+9H{})ty{63dOCYYP#3ar=7x>|t-h=DTpeUKP>a=DTdJp<tGA=^{I98D
zUH_;KAM%J@PtzGlGslDPKRgHp{Pz)>qG(??o@`8R;dLLz^G2lNC_fKZhwo=DLl6C%
zDTSas%1&>n=Qnr-lW*%>jNG`GiqoFZEHs8Yru29-9(h#{-3n!YgM6aUbUfk8DJ72~
zmLUU{m6)PWoQ)1+in94c0OcTuUUjDzSrcS&G2s0#x^ud-xPB)lcfZHc(}xbAbxavs
zXwNY`VHVgzJMg4)+!yPF&T%(>WbTf^##s!A-NM~uojV=|dG1$Xf_E8={SSbR8$jhI
zzYcv2j#vs`hdTHX^7=Zo5s!S^mbZ$b3n5g#WV?nX*N-6azJ@z+3Qi(lyj{aRJJc1X
zgL^6SWdx}fQPuebY2-yieGDiD>l$K%GK&$riXb&91eZXNniLZ452z(!01%WNiL2Mz
z52TyHF+=G2ka!+M&!6Mj2zahVaq+ydbE-i+Y7*Cna6?_sus(!MnUOg}Gej>KI!}6@
zBt7HuGklonuGCHU)8ItkNR86dd5WI4k-CslRa_tuTkc4+^Sj@R2F%%Maf&{obSOE;
z{c<Uc2hfaOY8g6M41fr<mIB<FYUUkG@KdvI(75T;*VCMk`8oqUiy}B7Hy|B96@`+x
zknHPAC!u`7DU-^<O1fw`(ZO&BUhJaHzd*ju$4x%S3_6-_hjw?W`e2He2e9bU%LCDe
zqT@c%5L<u<)^equ&O?Fj((v=*x&ST#vBU&20gI56e+|Uc4&*SNuQZ?6hf4J4&Lak^
z9EJ~hG4l;dAN?V8JvSrq?`X_ok`$-wLoHtsZXw+DUzuHxI#T~!B1t-IK`aXMte(ph
zlcRAn*d+lZkVHrTO~@0XSBi{IylXt984rAuIP?xC`;Rj=A~Iu+#yq{@45Q&b)bQ1y
z>i#9Q#=kfkkD~(qp@GwWlePY?G@I{I+J}UHXT{O9hm^iy1LAPvC$i0Rv%Lx~;6p8I
z_S1qzhKYPiZ1{VAgc3+dro|e^fGMY4le>#2l6bcV(IV-Z9DbJrcT@J8x8&dqCm3-W
z3fd%T>)8dMEBjq9r=8zS0*rh~?$qz_BjZ_V_Rt{A2H}@U_u=Y6)P6v5F(Ml~2k1ki
zzz3ve{Fh?ZN2~iu)J?s8jweMn^%mnE1hsQ>_{4_qYl(}br@!P-e^<KAkFD-BghRH{
zPgixrPgnVIx9li^o&Yck6c3>vwbF)Snz!YogWBcvD2Yywiq>mq(@emn#$f8g9Xt+*
ztZ26ob6|2V{3NY^xU11R_3m#){`a9phP<Tr$>OC0Cf>F7LLA3`K5-OlZlC<+;7Gso
zwcbV7Eb*%{^rI*TwSNZrEJk~cWGDV76GW~fBaZU`1Z>WjJC^TP5g&n=hhXy1p=|09
z-srNiw<ovdyF?p>x@g=eEUa$pTme^dIP#M+mGrn9H{7l{wJXd}s2s-B81L(~cfk;T
zwuajYq-Ica?X3_bgw~D<wcpZ;310{ovk>h$p#Av?GS$W}ywD2?ftVxs6>{Qk2GT-B
zq*JkEu+gvPu)srfp-%IN=8~wLK&tDs<_1}s8x^6sAzV03`}%P-cL2O<y+y5CnUB=J
zomT5+W+&W<y(nqz#74ank5R2lQ7g$PltsO{8qN~+W}Pl*zC)wke=n$kfkmRxi|DJ1
z<fd90t&Y&>A-&NgYV=|>s$C;$?ImjMz%`DpwVq^N!d*EpLapCXDU8R~$587#u!P(d
z`hU}Cp9Yxe6`Uj0c^cDCSgL;mRahlOze7*e*HZPBs9sZ1t2mUPUyK`gEoFet=;FZt
z@8&YRNRn--db(V7eCQHX#C&#8jUxr0Ho<2ZeRUD;G4^`GVmMo_42G4|@@xF$2l4sV
zqu{d*tBkh$H{AO_b3!Wjqf4#JEw!#eNo(&P(`)rpt@KteGkrcWeYdE7BIPjq{z<YB
z2JQDP)!%|D=oJj@!|za6BI?q{QWf+%QM;DGsWA`GFF6HMFM3Z{0E*{?zC|#Dn#BRD
zSlu}QqTQ$4y7Mp{n5!m(y*rJET0DepGsj0Lr}3ME?%=$*%mR0?3a^<Rp+kTCKEnZV
znK|f5!a@5AOk&9dibHL@quZjWo_HLu#~~e^pO8S`Z4;94I@ARGJbo|js7igsAsihU
z-+(n!3FPU&>Q?V)T1BM-*W$;85KCb|cjo9-_uztf?H?@Hf&>b;#Pa(T6hqqp+X622
z*8@fXne~^EFCf=1?kA1IK?uiZ5En!^UJwy;qS~&9?C4DGDP~Rd%QWb?7L5G{;$hT%
zDHtZsB?Aic-K4-)D8a_WdSs#(_i&*AN3Zm!8W;TVYuc!qO#AmF3T6a<uy`GJIPSp<
zuXI9=pwTyY%H&s!@8_Cm?dMAB=#4;}kft_lrTAnwstcD9wYRCdRcM<&{Q7YrYRYgd
zdIvSF&IB_k^8(6H+jo)BP)`PVCT_J9i?)7&zYbd{!y#eMB!e#SE9gfGk<;kxNIo{Y
z1koezKX(-`{E`$C5Tar>!5CWoaEoX)nOF?6hcK{$IoU_{0o{+&*05s0W7oF9zoGN~
zAfm+d*6S&44$lU@nt_{GBrKdD)KiZj6b#!wZ7iEhN&%%h^d4-S%uh3&4w2XdKQ5i0
zhZl$W@iHyU&lM&=Svo&~j4pc9v<Y)vZe&Q;Mn3NlogOYVvcl5HCQ5^k6{k`BMfe%|
zPfRvp_2T};hv@zkcCHQoz`{A63}SGM_ia311{P%o{!CthF5X4l65LZ=ytn8r;iN|w
z9i$#_ZqZnn<hH%vggOJIIHtfmfO-#J;$@ZQ-T$HP&EuphuD^c<85rZzgNly(sH1`!
zGZ-deM08M=ZXJ}kM2to!(I|;aFw6*v+rabyowg&1xNo>5!LO)j4B`?UcK0Ag#Vrsw
z>e>n@3aC+j@6V~a-8~2<-{0$b{&?o4^u2X!Id$sPIj2sYI@Kc8)Ez0E1XdJ2UuB<d
z{;Y5l_2OOJPJKh!E{T!;(er@3|0L2FqLJDIU^G^^Ppve$1bLGHD!CgBozL(yTFcog
zzxpVEdSLii>25>By2B>=yj9VA42-xLqNsRSoUmsiVH#;DGy?V0<+szm;3gqh7(R|J
z;4NO&7T~zUvJeri{tF!-9@5+Ezog1UUVf8)SGg}`&E@w~e%0?w!NjJ4+$0=_7vR_g
zCiV&+3*Aek&xHwSs`xH;Awx8hq`1Ww!b4OY68*9>S^}x*Q~quJ8$A+Ug~|QnB9&G(
z$ZV`%;Rnk%a$_5tlREdenZdl6@A~KmGtsVXfoNVy3vaO`Nn*ugk2{(aL|gRzkfHc6
zq|lE=c9Tu%`tFC&<ueeP0zYu>>7>nzD)ZvhfONwk-};!ja3KuXs<J4xqZ+fs`%gi>
z_^c_O+G;|-G;fJpdm>$C44gCAskSfS*<+ZfJi)WCr!YLbyLQ!xa7zx++c*yJeut@i
z33~)Fd1L2=@_s(SW28xIn{W9e*DW^fFJXoh(+GHI<?Vb<$T!f^z9yh7s~1qLSL1VJ
zvgZ$%JMcN+E_w?v*}t)csix{GH*;y}3YHWWv4fsrzMf+&hI7wePROG*$cYcBL;`_?
zjl)!cv{Cv5yX31*sRe;yGR$pZVjRS7bVT^n@g5rmDlR3i!}*94o(xcB(W%hH6v+pI
zPiQXv0P;2$L%1C@Z@z^{HNsD<b^F2=G?wfb<Fl!_tbtzK#jW7tVG=ywb?54ReDVEM
zy|)+M3+v@-3sV3S{&+qZ`YT+>bpTzM9{KI!hr{ECDvkZdy*;1WmA=W|j?hQOW#ki{
zS&U%nsJ0TX`|%&Jw(G2eJ*gqdmj8u~ORdumZ_HNfL;>;s({{;3i5|1VmSi12UsY`6
zE?qd&>$2jnj3u)hFC?KmIViKh^%HD(Xg5)o_+N&4^FvDCpv^)>GexvC>GY0|+FYGz
zM^XUU!0Y!e!$g9-?%UyZ7tb=q2k$^BgP-C<6TG1wrlQ7N-QG~3Sm@rg7<(CZmkSbV
zJ;=Ec+SKMN%qfw+h$5z4<#<T7b*m|}5Ij<Cn%sDa2Am?)%$^mhVE5m#6tUKthjEvS
z&~!lvR=t~g!N4?#T1SX}ldgk_I+!MBQ&Dt}(GZZGML1!OcpuN4LSGRf_P&gUvb*=C
za&7DW@d+C1;+BZ<EfTtDnK-@WJ^B8`QYNr_NS`uuA0EKBz8z~4mE}FncJ#`cB-gVZ
zS-U3D^T_ICrYp}(sUva1oG{I*IdU|Q8op#^!gT%QCRB<eWF{>4Z%O}FrJvjcZQ!b|
zk^D1Z3OV#EdeN_pmHO=;e#h~v*yG7XM^^BN9)F~)YWmeDekaW2Cp^01V>L;Znwc<?
zU-C7D6t&7DKhyLfv%Pzho;PxB6}E+6F?&*(Nl)sFhG?0HQx5mDmE`7uRH@`o`eeu_
z{?dm!eAK<{78u^L184WvB5TTGiV>FPwabNE)J+<(ihEOo2<nAPne?1cSfGENR2I})
zZUQ-Pj%B+Y71&}ltkUW4Q6lznbh3-vnHeES(hRF+#yGdX8ElYg&g}1`)w=CTv-1NG
zmU~%!W4~^MI+xq#p$`EmH}Ec0xnxGZVMv#>|EQp#@GjR6;J64Vr@;Aw49XWNuN$=C
znp!G}`|=a|Vy-~%uq4spL^=Cvb<)<sIVBDgln<s6h>}+O{z;M<zhGshP8d>o9rdgG
zb5gO3vFM)JD}XeL)UY$<lt&i}&BYVjZ;82&{eVnfVrUrn=~JNQvP(R!o1kVx;+KLY
zSNA?2H8Y;S*rV<>D<l38Yo_mLwThTz2uxS4a!ZsVJ`Se-YUQWR6hJYCwhL`-4mEPx
zG1UBEbEr9kT2rzs=C-M$j{xY=k}F?$DYS2BiI*6;^0&71X-h%L%Riv*fXd<cGYL~*
zXI?25RadQ{OEb;QV?d#r$ya)GEqs!+<3NW-w}7r~J-Qy{qtJCLyB4Cf!?hYS>5;3O
zt%9@6{*5y1X4n&0VirLjjhSh)MIPB(_2X7jK_H*`OpVO44OfYJVlKE<kH9X6qRXVR
zvg6%Rq#n5fl{XT1qcJlEA-;@{A{|jjGQ&SS=E^U*MAfzR_iw*@Q0=ub|Mab4EI%BB
z>`(ubi`u4Z_K3VPIA1PmlQihf|L!gZQkXHNB{y&eJkr$#zzdk^2ZRvPZG6<0c>AwC
zX5J2Ly3*ZYOMao`dV4B=phkztZ|wjUL||0&xnECa%P*u$RQ35ns=^@jo7qg%DgiRu
zI0fXlz`Y~^lk2tePWr=nnJUdf?hj|Y3U+_azmG!>yn^Cre1C(h)%&IKJNMn_{ZRkz
zj$-^ZCE69d(ba{w^gsCQTi1Lt;$PrS_N|Zht>3kiZ@rAxrz97(YJ~I`?eTAvj=ZJ;
z^`?&m6~3nF#hSM9HSOnXIwREd5ZDd<(CsL#R3>uk_(Z*(Q8Ufn-=gEUED*Y}m;YL@
zt#XGWY{UAr1djWGqu@_blK!_Zf$x|YzQY+o4_`$QzVi|H;CbR5YPs%-9eq!CD#GWs
zipyV69+!Vo$P=4b;J)q45B23Q59MD0+*rSRMso~3=tAH26~(e2;l@MT-|}UT3T1Dk
ztXn&lLfP_RY~lje(z#@WQbx>*$7*P?mb-l|^W~0{tNXH-@4-nzSZ?5c^BC)g!y1oS
zSKNI&PUyT<1c$U}a_?&W%TsUnx~+s`ox@ZRHw=78@i@W|x6BQrb}53UN1z|78nFj=
zOISu%xv4zm?+KamdEfafg-egPPvB;Bej5+=&;a%=P;6YrG+4p>ojB#BWcD#54y}SE
z`wNBxd(m3v<`#kSSWT<Gl1r?QT#5AA8}uP0DCZb*MkAC;$F5PwtOswH2}jd-2~Uyx
zst6G0HU_>l2ZWBASWL9*u>&4Ps1`%#HL8WD{_YCZa(hWFr3Lm31^T<6g#xwiR28_8
z0v=_jiF`cD=2RHUUYn-6>$cy)kMy~aN{k>BH5SdEhSzAXVzbe};&Z-tod0|3`wh@i
z)P>J_!BIWX!HZ_%5p6uhQxtvaGn^MPG`R<qyo6*A=wG0b0BDs5w66#B7u$P4=U!9<
zv=j-ugc`@WrqJYQ_e0hFK>-OT7wdiukpbPjbgSU3+cwmFcCqg0uO&S#ZRY_IA7jG#
z1eSi$w(3xB;A2GKb=R^A@V)p@^Twn5c;C{wzNI_2^S$^tWE11}nSaN{W)+L=>WlUB
z#m*1K7L*jbsR&>+IXv6=0M4ZyfKvfDJU=DS{*7|UbGx2b^BlK*iwFNrr7tZ>Zzv4N
z-bI)uNlN8<O}j;P)xFx&ccpI;CV$I&AmfZk&dB)hu-m9#p~l6`&VZacU*oa9#z~>Z
zWz7WuJJtAo)o9b}Qq}m{7lmxmbl;ER>PMyT>xKBgxNiRWXL?@Gx+f;t(h*%Qvao9R
zQ?)}F4!^%Uy9}Wlw$}-<%7~yIWrkI_!#`39Q|QIL&j15N91?pT=C=Elz6}t!xVmHV
zQ59U?v|Zbz$rR-TmIX19{laZ!ta(7^FU3s^cX6T4mKn+H-xdRjd&SE9&6XCrPk7;?
zR@JLNmxpw@Bp8?D0r@M!{Sx6kp~WP!%hg@o7%ff~x=U9}EuSq#RIv@pF{iW;FRT1r
zCW4t7Fdgf9kL##9m|X^)!F4Hx^|Zy|Tjv(gD!pD{udCcFdu^G8Uo?hnU9f{LM>Xgj
zUGfhVOMCc2Jc_pa3^~nPmxh>dH-g62i?{63vb$TnN?~5(Wl~q#SV*XfQwehc9_K+T
z`8RC5RG+&8R@{0q7qyowLl?I_<a?^(OJQ%&z42cvj(*8zkJb;l^5NS;%Waoepe6K3
zr0=8GsmwdlB|Z^dg}^YuRyCflB(b?4M^F4IuXcrmkF9u=3Z4rbBv&VO{0&2yY^hjB
zkQjr0sV}r!uwb;D24(aH<BBJ9B%jZo;_kkHeiiR;bT{h#T9lSF%`@Iv-Xmxv)??Z(
zuk4a&AF5W{F0bNo0FM!ry}_-f#?XIH%=242A%OGtA#752n?*=q8A$-0z=m!+1NYR5
z){Ai`N!mP-EumVr1me=1S>;BH;$iyQi&e);PqHruRxsXLbT{$v`)!{L-jFL?mj$Kv
zI#w}LAM%RaL-cRc`#P*^7%J~*Qry?Eu(G*}QS!sq4j`hUHy}qeWG5m=PlS5ZhmFm>
zGtT$vScbHYi7lH=Hey%u(fm!mqK95@EZEN)^6l<Q#zF|r?`G^}6V(s@rs`|?=$>J2
z7bi$(dM!Nw%&t)1jzF9z!vMu$ONZ-BwM3kv6OATw*zW`Jc|-pE^VH8$;{6$~7Wc>#
z@7cU!xQI#noW~ms)vMfZt)v<&jJ~^Qv@JKCg)(O3OHM!6LDKy#$dT=*x!0Kxp{15<
z@;A6={JWZJxjKKndzAP5AKYNTHza1@DJ$H*@M4kH?wS+@=EJ<UJh}4ozg5=7PirW<
z#}k6p^}UcXxw>;65s8?)$Etq`%%4>}QLVl#8dAwm7q_MIWqbW<5ox*Ivm)NyHs?`c
zW2Olb!kjUJ|3_95>uE@(dZIQlSmbx|04~b!;UDJZ_jez_SOuuIDg^j>kc1l+UcN)!
z?prV_z8F4JnK#^_>hQC1osXCHZm=v^@GH#z#<4K+oMz%+h)hb&-Hjv9^O{DRdZBJE
za8fc6&t=!@r|wYe$tw4UfKfqSh;j*cUd3HG9_Cu)qF|m)&X~Vg`#Od$W&&TudYwDU
zyno=Tz?bAE5~A@{xU1-vfZByUf3S9x@j)&t;WD1*+<QN#j|)AZuR$weJdW^yj=aVI
z{XK4|YCeL%JVI^gj}Fl74OBs%#qsMD`K^Cr_TWxjjA_xB`B0%tE0AW*<*BwINZc%O
z9O4W2Dx3Fn4>B%hnL=(W+g2%mso&N}D5gy!t?<_3en<t``jLVd%#2|4p!IbZw+fK7
z1jY%%wpS`;lCRqI_DTl@1XQ)QSH_Oi)=JBfowrr|jtao^Fr#hr7%|b}R7$i<K59CZ
z?12dOx%oZ?c_3)3{r*8u@K~~cP%Q&aynpa6kPFyq?H}aF7l}@^+kX_J)#mw|=%Y99
z(Fz-@f&Z96zptgVyPYxhBU&NJ;AhzRny0LPr`m`<!N$KvbSFqWjA;H)U*;c%f18zZ
zo~F$4zDzchdHWZIGOA**O=iW&`t9XvZI#<DRGNRY2z{Urn=Oh^*U*VF6ed@eL*h6H
zO%?74I;N_6(3X@{nH{piF}$=|`D&G~x7#V?d%`g4t*&<{A997gd1Zzd*1BKXJeSJk
z-tqmjD&JOCj!|{p@S^hB?h*PK7aJW;0Sz*YJq}v?a|SrRp_uvQ`K18UCYQ(Eb)Y}s
z`^(RH%=OywD*e<Q*WKg$&kRj$FSt=nsQhu17i5QOG>?=%u$(I6Vd)t^>?3TyY_i_R
z!13U^s|-i%Pf)k@w%B<$mT&pLd*Hs1c<{u~<bnIO44=8W2g*Hg-=c!Jv8(iu|Fk%O
zo#oxv^|N~vz_*~Eq2$N<*@>SBEHA)<e)fACp;8g2cz-4O*$L|EGoM;PhI`0H`q^zi
zQ%_d8hxN-KmFQ__1wGA}xRkXP@_707#aZCuPlfsD@_H}E`*W}J9+A7VzV<y0j@Q=?
z<S_!F8(afM1|i5ceNRZ{2(}2n>qOEpJKZY=Rb#Te?09$cEWwJUQ$(K1>}G*V&ipR}
zm81f1@xjl%e&&wZM2#)?a;%L_Qmn!Jo4*%1+$pZf&8|Jhhr$2<czn1DhK#+mPVo%W
zXELmv;u-#ddG$^4;cgcb`CrC|`^o>n0d|HA3iRaZq3;q?;1z_Ji2j6UMYy4ZZ<SoP
zc6e@w7l_x9%5Im;EG8Z$_qb$bvTFI1&~XIUZXT#@L9TUzpcT|`ZHiDfJqR;PgkwU>
z?e%yy)5iVrr=mxHYa2R(U8ZRCt-=R(F8s6#r?Twkoj}Nm#mzlai!hB!BJQ8RH<@`d
zkMq=-mHEiZd-%~ObXP@|Xex+WN9JU<M>0FI|JsIAs+&)B@1dv>(M7vLxpDp7Se7rT
z?1OfeH09j@<V4*Qx=Ri?y`uRy$?W$?X!$s7c22Si%lpG|k{I?a#%9b9vz@EnZPQiH
zvU>5^QN7L6_11zj-U@e##2`Dh_(i{wpQ7`kE8M6;TB059=a*xlJuwj1M4CPT@2T3y
z&{Dc2zV)CJH#MH@fc!7V6Tg8?jHhh{yaWi}Jf6kE|LgJeE|kRc&~>4@{~G-x+b|9(
zz$Iv@b*ptvL-yp+M`w3<T8{AhCbI<WK9<NT&o%cB7$$tev4Vh0mdH;;?6=@W6@5E=
zs8`c7yMuG?OsY>-v}}{FjrCKNuV3B#LbCq!N{#=b(e5q{2c-?uR1ox3smv`M`In2r
zMs@?EN7E<eRU<P;34isT8Dg+-*UQIR$YEJ9zkfgU$okOMTccy{%4%_O^AYhI$;!Yq
z8>J_R-Pcx>^n2r~pax5{IXh2`M_`B#HOCEP+-%oXA>{BEMZI*_n>^AhN-1szfTtjC
zOV*&$<<3$xVyrDI+}zLf&b`3e^v%^>a*tYC=Pss3WotrhxZKiosgK*?CnU264Y{78
z`g7&K{YtZv?y~GbBa)wDyc!;DtKspdlaCAy`Z%KSafE$5D||$*$}=)9i9&_tV;CQ$
zSU#b}Os-e|jDG5V@+A$D8Udk(MGKWPJOrC)eT#2u{k<Q1&^FonQ+!nGgKQ~9d+=AG
zU`|Q9lVbpGiXUdi5A#b2$kWfs3L5jsf@43Jum4Ko_4ZF@aW>9=-(Lo3Av+WI%}qWs
z;QsCPA8{Ut-g_1MU(lw%bTG5V2k^A32miojS8Efqxc+-dqJ2A`Ezv$$5Aho0kx}Ax
zi@MeD1IInAMlKE*B~HOTX^^c_c=<^+|NI;a$e?OvOXa$KpG(6VNbI?2P;Vvisn&io
zqZ|0dYya??<aGq^4feihP%@;CAblk1BlMo=IS2wzK`Nt1;sMZ<=t*#3j;{7>s!sGA
zg^gtmF*9I%!!w@*zPM&k;+Lo>`q^4AzYfVK_+KIP@sv|Kq@0Sosdr8sk>NBsLLJwj
zTuFVdR{Og1&-$Sh>xfdU3rcBF9!e!uY6O*>Q_VlWtkpN`58sIT*GLFFQmjX$5D$;Y
ze>^m$pz=~LwR7l-(vl_KY-fhFYNc_=kBO(ax~H8g(Z#xjWPfR{K2@dK=8xiva|<n9
zPnO2|w-fDJx2JL^<4?yG_VT(DS!IZhZ2WJu#)~LF9izBuZWPbsrt?q3>}2M`r7(cy
zyd<AY^t|vo5|)$T5<P!CooC1MY6eIdSM$6+(UWW5G8e8_EZ6DDhXdJeo9wfcv;CvE
zXmE>U;=VbBps0ItA1AHiRj@TPz1w8n!L}xGS+^QF6nu=Az(Q#~A;9Z8AK-PZJL^YM
zT7IcLcN7h<t>{xjP+&V<w|DlG7!j%ZHxlhDFgr${gN5Q0b9bLY+LcN(Jh&V6epTVU
za7*LmxKzvD*IS&j+Vwj_9Dpae)%Q;6B3Wt0BStl5*o=#oXw6wV4ed_P+zt)g=Q$-c
zXT=uzne39hH4RnhhSN&n5f_N<G3a0>^SDTf=o$E>FklF~e}KZn676_`<L-43A!)>T
zSGn6y7n^xsZVR^T{S_E$%$-!LAlb1kYKU&e8IDDHZSz6N%tr`B#9MYyZF1cjChb4i
zv=w!6bL>vZ3?h^EpsTele85)6(Xz(+SDLp}#B}%0NIIDLF*4$3G8qsvU02V{n?g>E
z8TH8wHz!iKt-ee1*2iSFo{!3ypNxK@-&TGbbLYS>hS^-=>Os5lL-H~?CqJ;c+`Wr_
z=sIAe!Ur94XVG3t3y;i@-fn+{iT|*`y$)9uS3eUfr(VO~$#$`Bd{MHwv}(29JDI%*
zhsMK8B(Rj!4mS)`4c8=+hOj{4u+>tw+hXwhlJG0lfNcvR0WVHwk3rl#N8~=GXLr@Y
z8&n-r8?*~94PM;*&nLtiT!Id8i*jRb9H%=}4+!PrO(J>chE{StLn6HmvpShM0jPcm
z+D?GVe+c{Gf=<@_7I8VXXlR8B4q7<05}p7XA{#sm@r<j8=Si&jt7L1cbiV^JTsxn}
zW+Xd9D!VlZJGVAf_MC-q(=z>TL7|b|!jdwu3Xs6;;|eELg#NI@zN$CTuI&lNwlr+=
z6aknVZ~`ZUyP$A_Osx)dGaiJpr1@djC}sng6=3Y{=#9cWvIV20;G>W!;z~Cio5tZ0
z9R0x5SM((q@lY81QB~1u3ez6_N)e<F^w`E%0vAbeZKac$-Ci&~iK*Wu`Yu(t^G+_1
zh7Jd%2-B$Rsj#x47kuZ3YJ1t^HaL6R3->WG;fwsh(@&|0?L;lChZD2P$Fg6S?r8ol
z8(Gaf3U4x_#dyPCQ+O-A_p`9RYRv6hPZ43_U!V#sJ@SM^gnQej>u4rLLj~o%E1`_j
zN~t6P8QM^#-$f18x`dlo8-9Kpl$_|<Q0umNi9wMKs@Dy7iap5n7{gQUS8HHjjhTaL
z+%vMD<Q^SK8*HYzXEhv%b^PTDj>VORtga_%eD@T6?~M}~5GOO=ZpV#@DcIne;abtZ
z8KEF=z0c+zbfQS<?(fDV*g&+SX1xvnD9L-KGJED{xH&Z7XYssfG93ud`Cf3wH}3!?
zGf@=d#GM4>Mi4fgDA>6;1=BvZOtZwaT+aR%x`JH>5nlvx4lOCkPRA}9Vpxn>BDaFH
zG(D~%5Et2oDm69>R06-LZ2~#jh0DtxuOM4vChy)mfz^4sG5Z~V5ydOK!h)kQP@odk
zfSVf<WmhwndA|x>T_m;QdUfOb1GxRdhpR@^Y~6|e*g|usAjLGOoCn*~P3o^%7x3a<
z$2!uTM@QgH>$HR~-YxL5x=Z;@-YxJ|^H|+2u#KB(1e!=+M{eVY;vY9!(aqd3fXVSx
z8<fIb>L2i`*$P{Hx#g#xuWs&!fs9Rd@!EP#sc|J!>fdVB2deILf=oTYWYDr8Uaz^W
zmIym`2C(TC9;t21I6~ZA+@-m7m-xOm(8oFjF|`7%g|(o+k6t&>T+1906)ZE$B6^`;
z;!L~E6UlW9q-1~pL9_P;E;0U9#;o0C>U{1rEq-*v%V}tJ=J?+{SgHdr-s!)I2Y)vu
ze13#JjVw7J<|?rL#QoteY4Jff^?1#NaQ9<V{&7Dq7I?pTgjRm9IHgWY8kS4pf=fVw
z^VPv$mCpL;>qmL2!bj#k`fgz@#o~sNU*L13nXv_3E67(Z0wi8v4p)V;vK2qc?w3US
z=`Rflbr<t4J*$>*D7BNbOeHUN#dc6tet>X_FYOc%slQ}m@cKlbbWJPI?g+JwrLjp$
zjkZg3{)4UjQN-Qh@NOx9O{N0d(ir`vpX;uI7<9x|RB}{uz=PIABHevgC9&dOSdqjN
zRXI`;U&H-w#cAv{Gbp1|^>4Nu6Wt4>K`;CZ_U+Ot#}%s_)$?mrvO$INGmneZ-b|!N
zQA@b@@DO+IM)h#6CkCxyQ}y7`ZP$z?<iZIA7n?u89qt5)FHOjny{H}gfll{z(3`~X
zrXeomm#DX7?y(R)Y^7U1jUu3MguxT1OLSATL5X}EtzUVd&E0{AY)HU&*D{dNmrmDM
zuXs;Vz+HW8DR$B|)6{gKW7n#x&Nw#&WkanT&I+OCta6cE+A4g^_d!dWdzl4x)~EeW
z&4*8h>(wK;{Z>+f9J1?ka&>jLsKr(8L$G8EU)aD7*Z}qm=`WEE@}}aSO@Q)KI3u$f
zh>a4R9{6ip{?(R?btdT)^6VkhD2u2G70gF!O~1~PYZuOwqN7mOFOmyOXXEd!UtF9>
zU)~LY{l%q;^d9AU*f-IBnf#F;T=bG8BiG0nu#TusW_QDxo?0*3i|=6wMf^8;M9`yW
zKBMCr_y1AIQ!Jdh@+)sv=DNxHVJa;(@OcKx#R3`7l?FbNnHhGbMZKtU4G1@@>hnGL
z%3%0hAvY5#Y;@esExd6h3Y6_BVWBOER8JMYFo02qXb5dgw`OlLgQ~2VKr&NFKsTFT
z{kgd@0pDKjTh|xmwBKUu`_OtLs6g`oinvY-Z*}YKYIsHno6>9QO_I*A{#EXeBA-C|
zI=xElg`YN(l%4A77T(Zg^*vUzT)_O%q*j;`#aZCbPTG_ZQLZ{hRXi5jOZ$jAn4Zx8
z!;5k~6>@oi(>@?poU77WCD$Dx*G9d)v(h8Oa10<eA>zV?*>QF#yi_PFqUvNT%!Gzw
zPqza_X!i@tAk+3{DGfk5MJ2K`3Y;<NCqQOyhwIVte)q(zj<&bTbgvTv7z4xH<bWM&
z676e@KqMTXQqDnidr}mHDT0N@D}i_nkGZ31hB53=GXu`6NhZ>NAQoOS_hXpM5Qg_A
z&vB8Sy*<aJ9u9e}^ahDk76asPhH&<Bm>qwYr~0f+)h|p;S_l=`F~aSMP$oO0__4S?
zOD5_{l`Txwf0#%=roelBKJ2?BmY>MX#$M>gTUYXQVlP$}Cstx5m<wlqspYMsb6Yc+
zF%V&D=opUJ>YdoJo4;|v_VFrYV)-lucX<dQcJ?5EJxxl=IGZ+8nav99o*Xc6_YYva
z>F)p=fjOr0Vz*=*M+UbGC>fOLuwr)gdcBGbDF57wjNnDL{k9%{2GNN0C8lSnIKjqH
zal+}UjGJ#z?>D$!A^^z46LGIumK>j3PP#uCePBRxkD-xkU2PQ6`o#hMMv*~lVQUR7
zY_QR<(~|?b)g;pI1H6s}flv)?pT(FH%k9&|{ky7PEQ#kVFHdIsb;Tx-NN=Y^fYrEb
z%FUI`o$ZHBC+X8Q;-q#gf8xK1)fflS-SZDV{~>4{#yAbM=mZE9{Q?b5{$)h`Ed?ga
zIyL~rl|&Qmynp7I<eaz5IrqJb!Vz#?ND&&Y6uHhf@@;@pe-P@AnP~G?(CledELy{K
zXMw{-w#(dYj5EfkEed!g^1Zeg27|_{&*0m}%tN<sOgs>w&EeJlgLmAJXy1dE!}d?K
z?<h&m@YsZ<0*K#B*v9&=66u+pB<q8nC2MA?NmOC9(zKY(%gCl-0%~jMl~HU;=6+sw
zwkG87@$m<zLty|1)nw)wj-yY$@N}RzLWUFn875!@1XIuuTI9dv07<vRkf|o?8gu8D
zF-;Pa7BL!=cf&s5vo>aYQ40_3NAGH@9y=nhkX8r&u*-2nJ*<YYybG4s_jL<UC&R?i
z$gKiV2OP<v1in#%C^v@Dpgn09QoF<tb!#`5VqbItXV&^f5J({wWpPmzM?L>Pcz<_E
z?DK(t17-B^j1c?$jLF>x&E5j+g%DP6UhSvn*=Q#*#rZS9f!4w?;O{X+3M+2YHv}FZ
zZ9P@ch{Bvw7O!Sin#*A<wJ;ITCV27E%&m|zgj`Z}g^MYyKFOo{A^xjSsE7ar13yvb
zNw8ZGD80xN9d4<+)gT0teda0Ndt&1CqE(Qdn0TLPdi)nYit`Nr;^UXzrI*ZWHp0aV
z-Sc~xJuePj`WK!17$2@~?{58HY_agac6EDV;+qmy<1GN{*vE;n7(GjZ*Ta+95tVC?
ztgPnNlk6yX^oJ6@<u9+G^7<k4Oo8B^s0KOi4-k;nDbnso;c<852!?5GqNfZc7|=*e
zX}27M{Afcar>=9oc7KDGIbw*(MZ=mT8>_bd!KUFJ$M02SI$HZ;Qo^paAhdNuFI4&>
zcYzJW6*a9Yi&j^q`8Y;->Yq^rs4KkOrOk2VfWxuUko+bn;O=XGuQLC12QU=&K><zD
z&30U7_eZbtAGhJ7rH|wmf+2lBRK`(#pYlG?ZdnpAkkG8a$_%`_4Lpuq4$r*n?w4^U
zM~m*y_7qeWbh6%1hW-+>cHe!a8iGlPV^x!mYFw+&Fpk=?pyI-(rTV6k0K<`8$fX+K
zlT^MG!6Ur<oKM+qHR}EWQgyi!q#7=LBQ+HxRa$&Xs#NtGa|8?X{WYd=x{<Pvf=tM7
zh5Bwp;6CvVu<zMQH3T8=9Pj({v-+&htn%@`2j?2fh}zwKsQ%%agPuO(HxJ`Ch?2+7
z4FSI~Hr1U=opHeL?vn?p;K7s2Itw^sP8yDh$1(+C0XGr>CmH<%1phZNfm9EndUH%5
zXqo>d2Y6SqYqK2S`-A+~0tfIKYhm{9#CHv(M({>lw{#k{lOz{`$X-2ZS813*LkX98
z{eq^4+wFVn-09y~SQ<tsRA=zsWRxd1CQ7sT(#ZjGU`q_~RM(grg9JkTyP6f$!fv~K
zOH5078(F#wD2pym>$qZ~lH3Q?4Qf3cLi;06tpktELlM}o?Bz(k#IPsY_faAS5xzH|
zc8R>ZM0=9x`HYvTlBeliwBnKmf`DaDE788C63{1>TZMI7qf@*2$YNyQ&@xkCGV>DY
z%htnEFa1PfS8`DrSJ&pPyi82@(%54;*M5u_@K8+A0JS3%R3vnItD3&aq+cTfTLc%A
zR6&2qEVUpTwr>0cwjlA`0cbwX#v1sTkF)U+-Av5dVzOV-H)hr=<0xv3`+|RH?}AW~
z&~4(Q`>kmin-eZ?94-ihjuLA;G9NUr(T5Td^8h)c(@8aaXdncn+BULeg|_&g!fPlJ
z*9kAGh;CX{<o{;G!aC>%8DOx|=yxksYb(_OzI^Tp?6gn#Iy#!o*Mj{$?rVrJKggUo
z-1o%S0d_exMBA+^$#1dCrHq|9(8wIMah)4`1lcz`NA}mkyz?9xO8u<^JEU(xKAy<Z
zyoi6^)2bf?*$%MZTgVbk`&`}e-z9<A<wy5ob2B@x$~~hgMtpTqKb7fptiCm7MpYS&
zu`@H$(gFiP;D@NzV&i7g>rGcQX3MET3PEqr4p!?`3PXi`mvN%7P@I>nUjTskLviA$
zZC-^NEHJQ<trBbJ7)Y8c^7!bDdj%hqMElsSl}t1U4NR3hQpk7ek6Bk~zi`m$gU*Pq
zKdc1b8Xrw?!@FBKK%Mon<5X14HiQLYC9}XT#(Ekp!tTWy#No_vE!O@#X66p`ib;Q7
zOvvQ>#Q3!PD6VA$;`=CWv}d3HR-XU!eH590M#sVd0-rDf^3=p~VU^1tCFFXJ7NvkW
z=q+L#XTuGmxk@V>F(}a27IZaQS)%8B4V4E;X^IgTV-N46R>loMvI<l2r>hI?fk#bW
z8K+mevo%W*=t|L=hH>h!LNPDk2o?96HNbTka{ELU<qkIqba8Em0^hBLdFaOI{rJLr
zOz?&Ne3$R71D|`BW)khQ&@7F^id9Ds^A?YXO!Lyv{EGU>wL%^%>TKPaZvrY;O}0Nu
zpQ3U;7~fzO??aZ47I4pkP>)~In+Sq*r!DfLhvB_G9Up-i-lhrtaX-Qg(|Mx-`B0zn
zi)3zU%`^aYD@E1xTrbfn<e5Sw@#t7%TJd~5P4jVpyGk<Ewj>F?AtlZH``gp>O=NIl
zJy4Iej<!-6bHLNJJC_gaCPl;STNlEHM=L)r;v+;9|1j>aMKWI#y;*$kq1fWj#qYxV
za@C)!J2k@(P1gfi^eeC?{Ts%@RaYz*;P=s1##ma(PX2=Y+LLmwn+ko&a2Dw4F>fg(
zUx)Hzxmm2=5(A?<;(4KqZysaXB*V^1xLFQFP;Q!))7Vl-FZy!_XjRKs*{<B~yhPvs
zwAg<s=4jWPhUjd_e3IPQ81I}cP{Ku69nX0MtD#JZvPZhy@BDgybfJCfnXPvDQ^D!_
z&6Q^+`q!Dfx^Y%durR8gRc;9N#3aqK;82<kmc?VBYgMkv5iE-p{e_0jSQd|AQpK8?
zgq!GATP$WXv=_15tH1AaRqHFcj916G>_xeL8FF<AxhhNRvKQq_DObTzy39R%tgqY9
zMBO7wOWBJ`ZEvMwOYtJtW~B<Y#MrpaRIQ5s%*>D}xx<i(2pH0(%ym7CMCnWKOZ)_~
zafeY&bgmJ2E4ht+q;u$RVfQXvJ9Z$9p`y!>^oK3GN4Jsc=Pe)3#egucj1P%46pCfx
z@lV+qp1{btizK1cmZCJGad8audCglJ6gZe#FoQ;8{rXgHc$t|rMw94G?e#AtCLPkX
zOBX3T)snE<t0@y4wu@pqQK;=mGja$_0n$De`&KfGVNJo7#9VWAAGHI}A7QW}f4erX
zfvSs{l8ffbC`Qgj%K7N`RD)mFH}-Mzd`jU?GN}sfESY=IMyIh)Bm~-kW#s9Bwrejz
zCu=NQnXI3KOVN|qtG0FP&9c2Y4eii6H5jZ)!IBc){Gsm$`++CcjB;y(#2*!{ey3yt
zb<oF6O?*uxO-Oh6)2hZM)->M#I|k54;L<%~0K4%Leea_p-vRH1(es-l8lEs38orT?
zX>mL?l%g6fdn=5R22VbS{9J)3C`*TO;oyq0F3}x`Skq%nkDMB4c%@q-@?~65=BX^m
zM^tZSzEMpw8Wwq+CB4!pnGa8JpCh<q_@e)z_OEROMQu)5G>{@46%2ak<E3R049u!-
zHPHoXh+9sOIN}9<ipJ#dac48xzgc8P8)((NbqG%{zQ@y2Jsrc7UHcGS`9ZkxA!^rC
zH#!tu9H%|XWbG1NhO{WTBjT5ORX%?yL=&&b(FZ;6%5V>nC)4OS_<*}Xj6)0kYivEc
z%6&$Hr-VfM0p?s_oQJ?T<Kzai8mCXy4;h_ke;$=c+_r-zCBb^oDSaMSV$ASjg(cQW
zrwlse#F|su-a^y;fLSQ5_eWSt)~GNV@^Bbat5sCkJ@gUKMHCow@TcJ`#9Jm#&WN}2
zQZUw7ptDZK8sHhrYSqrxVMako|LhGvFtuVR$d}&y>aQR$V3J?6pU7aMl}Ip{tQjxF
ztaFV3Ax3sHpTgSs*AN23zk(b>#xMM9KD?gFD=@SGLo~_?IkU5|9Lct}3JN@IJ`&<a
zB)R*!TK6g~G60EkX5S%h`xPs<`^Wnj;h}F*I@*mFw@^GoVCkkQf9|SH!z^<+=RfBQ
zz+PoPHaHJOEfAc4_^w_DmW>6q3;pw_7xDave~$EguYbOkzPQ`{^9xG9hG(-2nYJFS
z5_ifCoTX0}^GW8dT-}vGP3rrk;stX@Z734mXVYci4b9~KF@U27L<gW!xTd~E|DnPq
z5E!bcC2*fxv`EY;B-zyPG`g_fU9OCqSRK9p9m*hMTKO(*Sy)SDJ{VsAZbCOsLf3dx
zwau?^djU({#STovVkO|~2HNYa#xTLXf%akz2?JFS&&}kqV_ib^wNg0fLkG>d*H~Ld
z$B^H*w5S*}C*cmJlzW(wiw3-_3O6Ssb}WS;W;v|l+s9F_MTrOTKhdb|hB^~>3EJ|m
z3&}n!Wr;;ju|It<4Y8Z@BFI;RCb!h7jciVLSiV(mSAjnRGK*`qv%~l|<^Q#Lc+Xbh
zZ>W4|_+Aa+YYZzB={__ejM}7?!m~GdBHeo6>HCg;g4^i30)Z{YD**+a@BAZuA-HGV
zanF&UY2*F~_4#t=7RxQ3qjJ4;GZ2~gw#=*CZ^-(!zTq0P$}N^E5VvEq5%Eyvw#3L8
z{YHX>JTu4><#_Q`gox#G_}MQ;kE$50j>h_;y{L;ve49!@lc9lJ*7rD90TDZgTwPQL
zxhhNRvKQ60<{$yI%$+;f*J<NQ?h&QA?M1m4SnhmuK!GmjMPIPu35bZU3_U8`+Y;SX
z^7WG<IYbv0)AM$SH2NMXLe1~Rq;j%nzrvb3N)|Ily?OS-^DMG33~S#G<?nU$Nd45c
zJ)^N)<z9yj0&b(VqLJLdX_r9g-yLYr^Yy$x&%U}#eRUiiac@0g)qQcYRo9Q|<T7?e
zNp;T^y0IW%8~42AW}2wJFs}0qQ?$LqrL_0ZVd7$Vae8kD_h)D<y4(XdKpCzfg9qpX
zO*2onKkxwk!UJ>%{^IK27mTjYI;WU*KcDpopLJZwx|pm1Oz&3~9fe?4N_40sG?diD
zXv4q~>DNA2LLfYi_=^m6G@sPCqq5h;wS6akz*_Gs**wwymWq_?n~C<Ukn5X?_HgBb
zQECO~?$?T)iocn1w+cl}JH1S+;#Hp&Iubz$0h?o()*xP#u=H<grPoI%>8I|1r#vN0
zU<$<DjsrMEzodMecvAfEi~o$27G(5)hW)sZYa`Sp+P7dVqW$SdLGs-RACJn4J<Hz~
zpb-&&->97Ueaf_YeG%mML#F68h%wk>IIrOSR|}`&zjlP!T{PXiP_yG26p@x48oAgH
z!;aEzF=|Hh0eDH;A-)SawD)N<hc~z6E`#RnIRa!Z`Ky&*hxRycnq`N&#J;$%7Mt1f
ztNm=2Z+K_VYhnp`dvo>&*iQ#l$&H|GeMQSdjoAZI1J3W#a<48l#SEW46Y+IgP4%GG
z!)gvlOzlXO&A0gsO6@d(<|F>JyFniEXMM<-aOmcb2cT$C8o`j-V$yOxG-fM@bEYaS
zx*MLweHu@6Tr1H3e?<>*0eFLXgp+L}1(8ih@aB$$pc=_kDP8*8qzum<GMvL6bU3ay
zmHl30eKi{zeAO9_gOkKRB6g+0g4xV{b-efXU6ZV*HG)oLJvf<~#gt8DFT>WxejPiG
zTc@%Y;pFj^+jW2RsNRAZZFE~+Qf>FG=e2Cm+IFpl*i$76BL3-%MLo?Zl<ad}1gUe+
z7$?0h@1GjKXgc}aAx9{UXdM8yD@bSc!k-;7s2^I<Jk+1WY+1=wWg`OwEp|TX`u$Y!
z(uO#3guqIZ`1o~HAZyH6UZSp$oX*2;a3kjdia#SeEkDzbZz_v>iZoV)Co`o1e@;I{
zpas|7BQ&H*ozg)Wa1YVVB(k>BMpnZ*q_-c|wT8Gv`vw{_O`VbBc#7go$w}C}dVZmA
znNOm__%)0Orn622#K^w9#YegH2oW5QD2$K|wyc}(<R7TV{c3cMRoh+b)+q1yt?icD
z;hFas_vlzEYs|co%3Zq}G$c8B^WCs=A%jXQleC)Q*;|(@hX|#yY^fdMS=$U}I_$;#
zoK)r^70=Ii$AiSucwNMNu|qMP{L48FL(Cty@J?ZPo?H367epxiV-6H~p5I>v#Wb<v
zQ585`paY;_rd!Q$#YSKTF#LAc(JHrhjp|r6XakqegX%6`J$zQ|HNLuOh5MUc1$fDn
zwj2N2IvqRlH6+@nL5afl4}IP4pH^Hb)bO*dUI}2Pq5;efMiA}4)?_gVaE%cq-vwhH
zo;}~2M&WQ~CoO@g50z@_kGINiG@V91ne5MR)al0Txf^x*+xvu0zZ7*BA%=Fv2SmuX
z_|I;?{{n7nE|h0Sxr_UV5%33&S|$|cm%-hFzkDeUTR6N_{>VasTeqf#A3PX3Av7w7
zt+@uUsE+pMCI2_EcGl++J_IJB;M7rCBo5p=k|P?x2Qi5)_3~5ojxYaa;zU0QMWmMQ
zqSW`4dV&$~I`^txexsU-<Gz42WxQu!|NB?v`6+n>MQH>#y%*$*1tn<vAlZpziMqm(
zoOjT+@h(dLTZ5D>e^bgg;>H~W)hFxcCE9zNQSLyu>$4>AMcX2K3X1Hx=0<W@EGM92
zf(qfcXf>S=qy4ZYigWD<&`D^=pZi$07v=f<J|~PI!Hc<G2SZ|ijB*PoO#Gg`RsY3{
z+38)%X)Up73<gInx8M3@iS+kGn+zQiGsI`XQ`6{meo|*fif0|7F|>u<%6(C-It{Eu
z`>`Ni$C<U+;rT7+i(Y+J#0LGs<-`#`1g-vz(d8h6!)RYWV}yfCgP%J8#Tlc!rm2=@
zO@0FWXwu`e)a!uJYL$S>I&#E$BllA+Qem>}n66EZUo=uv?{uSkxB;;N3x|2CZPW28
z`ZYXzur@-El+ZAF`|V!zC8_g9-9#Y!(35z7q7&9yj)cCGMQqLU$C0iSb$M$Klx_5`
zlVZ}B;ZkS0Tfr&Tl2eCg4$fy?U*3%Twnh*C<X3?E9b3Z!?nbb#*i&7zgGbG^$?Tq<
z8eULpW?4S6N|xEbr%_eUUsBaOVHKD^IX<Y=#I`N0v(rJP$3ep84=QbFIlORC=`0%4
zAnyjyZXGi+oK0Hgj^;~r`UX|Br$33*pA-R>v13RF@RollrWc!EA^z9@?!4(C#yyJX
zO}hg7e|g@t;(veMw0RHp;s4rs(-z3fjet7~AJT9>*<B(Up`!Dq4>522hXw9jF%&r0
z8oJATH2ekqm-r1PQa7Rh-t6^X4!`RF=yJknz%oq#Ej?n&^>4kCnOBn8;T&9iHj!Rw
z?7`-`tS7^L1Pc{L8+h{xX>IOq@tEAjBkcfEAa?jLvE?`*+Idp*UTv%=dsLIAZFMqx
z!U%X%gZrTrqMQyOWL{3<4=q2<uEsEr>0f$yw9?X;K%BN_jwUm`lWj+hz?YP>K1kmx
z)$U%EVUOv_yx!*gN5c6F+=g+W!Mx!u->prj9D$!5+Yj4Jz@B$@X2j|6kjr@laNO5m
z8}9edKu|D{YKuyLH&uggmJK6@XYXs}O>?*$ZD0es?HN_Wvtz4?1JIJlA7J*-Hk}Rm
zXEPTV`4clMEmWLykM0U|{$tDA)HX`h>?Xsp8_irwWiKa%qYpk#&6ngR2?=2;^Vr_f
zO&y7kPX+?1BCqozrnT*G4ghM}c{yT{+L)l)A-GPn+VD+N=PGj#iw4(xQi|oMx0$q5
zx<P}4w;@%@>`6KtjJoI7Dv3$2sJ8^bL0K$4XZnDWJD)S{+mU}&ipER3Fv)LoUuot`
z2Ha0pZv}XiPKI@B^?f|%deslpPu;JGV<?XDG?ZWTPk^)-j;Gx}z7W4zZbu0lVw#QK
z_>!X8qmm*B4jLx`--d<2HVq=ybz>TmnS)+P=8j6*5#8s>@16o<nl_5RAy)h9`4G)~
zDK`MV7)lfrQ-OkSSL@q!zJ=+R?-xG|jUNsS56K(^XwoP@AS9`xbER@YpYDOc%q!e<
zGlZ;iwQ55IrqS)ucddKkG@nr6+C!FY$WlbcM)RG?KK=vAwIr_OGc_2QX7(&t(}6OA
zOq8f8>EOsrD3$3p;A~P<swBjK4?Fv(j5=a;NY&a2qV%q8S;VA}+ikH{roB;Pn)g5}
zOJ$F6?F@(COlcWHoCgl(<HpnSZir8`f^88QXOgr~qFH>b<ni+bafzsm^`3^2T)N&Q
z!n-$_JxzNr$5iCEWPj3#@rGnQ5sKuC<~zD<gMwCSuO6rplBn!&!O%D+q4}tzsngWa
z#>|~HRs1Th6QL6E-?4?TPHo3sO|%aqgOX7kd|#97OQ^34IbiOtm#HbE=y@%D@LZ>_
z%{9rEcm1t5?FyBuaejLz#NJ1|6#D~vqd$wApH9I#^V@nmK<Lk`&>zf)))RjvRI;8P
zeXTp3oMmst4_nw*Gz$x}+50+hS)wqIYlyp~(Hq7@*1=909&-H{!e)H*2_@9mXjGU&
z`uBH_$|?gQ*{!jtPsb)Xk6*zjeFvN-MPE2g_4u1kh@rtI^%jYC^;HJh_jjb@@9<*%
z)3#Yvbl*mJ_@Uw9k2a*8@$mbA2Y-KgKt+dWx!$8>6WFkAGzcu9B;HQ)8)0In{{bsk
z)U>LB0%F2BO>omsm_U=ZfBL&yF+C||l(5V2qvi3%a05~L4Xc(HWEb-l_VC3GLp>7h
z4}b;tOfS$geItCFZ1}ieG>ZT{wjxicFRwqx^nP^#FCD-d@UoOgwX38};$@UJ!;5&C
zru^(e*FxVL;bq}Q)G`!mxuV7j7+$u9V6O1qD)mod8UPFAuz&iyN$Z4{wBM`tWOIa2
zQ)Q@OQ_uUk$R{Fr;Sl}IYKT$u7ole64xr{Pp5SXRY)H)0$%A5wbmh2``9H0I4<RAo
zW4-!EV@lctJ_3N9Q1YaT0DAv#MoFUmyQXS9=$()h9V6#ave?=TD5(}o_@}>XUh7dZ
z+v>NOG*~F9Hk4EgN|p2|F(5ojs{MR|S{@ZjZr&c0+;1oe^iYDA`-ILZYCx#zJdds_
z;73Ra`01if(z22^i64Wu6MmMfdhpZmpYX$_EWq)gcm68Dk5dIP$*?#X%$aZSbK<8S
zKdY^Nn@xtFTEkDRpj1hZpXJKr@q;;4{098!0Is`qJMg0exP|#*`0?|}@Z-Dm4fqjK
z0)FtvF#ITK6Zk=dO6BRVZW`R6Uq#$l_(F8Jbge1U8C*|Zw}y`z9S^OF=A&j{c&mzD
z7tE(Hrm?c{{rDkP5t2E-A>giJ4W$<mrkfaEG;!Up!;7~0-Fe}qS}$jW7YR6be0ZtU
z%MiT)+H*{?xXEB)Uyy$_PO(VOGMZdfSikF2Z13>viSZw(&y587>ZDJIrq~AG?f0El
z7FgCxRu=udbsCP`X=U+TB7G871}nm5R~A1&uxxT=G3r08EMn)Gu(<dPU6gf&{MI`F
zT5F4Y%_~@Ii|12>my?G2H%o%{wX`7Os9#y2VavlpM5RA>sTL||HXXe9qkOsDx4ah(
zAm^hu`qc%iii3V3-Qz1P5?X46^~Ii+I124-WUV{o2~}0}5s?D;IQDZ<o7$5a{ZEEZ
zvr)|%lz-hyAJ%c6wbK0rSW?-)dA;DDZbBU@cQ<{}Y0_uTP9-jwkGl$SdsD2eDs@j<
zqK~gi$C4Jmuear@n8+nMber8Xk$x91FcqJte_oLCB(7EJ45>@SiX~gncvmO;thdl_
zaCofs--2<n4A?i;gLcE{eUonpb%G3$jr|oi-y5@iv>y4T9K_U8V%E{MYv*)Lq#xv^
z?Ozp%_7{1Oi%o4=%Z`azN7i<@t{m+n`VG4?p;`u3WjCC_Obxg~mWjzGW{rURV4Vz`
z>Fnn$exJUkmjHNnYm~+mtHiCJs~~T(%!5Q+DhmhJ=QCk`slLqlYztUgVgg<YMh?qK
z)SvVtI;82>_w|S+Joi0ikFQ$Wt!vkoJv`8f_8*hX;T=I84dL12#l4;L6KCAGRHmab
zY)LhYPO>GXttjRU0iOW=SeD|?H77$HNuUmKB;zfqV2?`7YMTN?iyh4nGvJ6sdV>JW
z4Ls&2knMYW#O*4YB6Wv2lhK=OUwOS$aV8fw&u$!WHgP6XxpAZ2pcxxBthHJhj~Ub!
z_oxl777y6kBWt@R+RwC<<AKg&EZRz)*0~d@#!nsxp<+qyjSXJ!qo8X2F0P~=fZ190
zVU|EWWAlO8Kkvpz%UZ<G>Sf4Mb&KhgEGQbcBzgRN@u!jng|&+--Vu;AA!lZ5g6Ocy
zGzznnZ|;_A>%l7EWTIdVixuWX`xlTciLwkPbBCe*0B^E>ee<qb0$j6F^HAQ!EeFR-
z0G2zt$1;(Am1PY+pht8`v};>3n#4Z^e(Loat!oxH6Uve$O=r1%zsYUpx``yT`xFNq
ze&iNpg<_)(iO`W+%d%zJi9Y&4AKJee>51g&M0#vDp~*tJa)hiiq5LzRkF1bm4bimz
z;FHWa1hrYwTfk1Th@E&jaiUfUrW+rRKC}Z!(jk3%2B`*>`MeK>DG7bAGMiplP?=es
zX*r>KFeg}5Cf*^TKZ=Ahueck(1q!e8D1?4I{=9rgFZ|j%YFf~HxOz8WaGYl4$mSoq
z@xSea!*QF!Ars7SIIe)hXNow4!)%PhDM6tr#o=i2iJ1_WXt{vH(VM}cPF&J)rpn0$
z9F8jBkmUZZ_Xi$_m^DmM8q-O)W;r4ps-BIt>M5Yj{b)-NIFY9?uT5~YZ;eMvv?u=r
zexcZpl~GTYww^AqL{&xqE`Be?d#hRlQz%v2g!KtQWtic@{Yrft7Zj?=!UE_Y*L;TS
znkhlQUn^~D1c?8~2u$^UuEy;k<i}x4fsl9n+8~5}4c3{S;yA>skvM)wVJKpixf4Yv
z+K;j(!bb+Ax;P+HYg(-zfv)+|-%Wkrk4ckwfUN^dG<#*&m`n}Yxsi6z>>+L4B_`7D
z=)oA&+Q^`1;4i5q&+*0@FnQpay9vF%xDFI<1DdZh@H|eNf9$TC)Ctr2x+zTe3z)Xj
zF{ZD8?FCHlunA1_O=bhs6f2e6=7T?pcfaMDndi}`)|Xb=BcI8Y-F*Dw=zOXT<0QGl
zSzX!hBwJRRY<c&O6dz5RdwrsSpDBG7Ba2*`mp5j!5~+&yob9Gy-<nwaN=QJ07^dZ~
zafBBbB+%8&lgbMpA`tMv+kb?>`xI-8wQn}VMr^avd8C{!8dfcM(Pt|OUR#yuIjnMc
zjxc_iC8_N8gcOhWcS_@m_aLr#BK;IF4F{55@L@A9(&Umjw0MukOqsbcxTCw%p|-oF
zx|b2FZBXl3HMOe<4nkDtgIO<dc#U&w&HIFaOdK?U5b3Ng5_!wrT&}{FrR7aDqeQw~
z_L;AXz8`4Zyy#<p3;A(sW1p241Jlm5Eufx&|4!rbAx(7}mxDTw%kVoHmnzcY^_5J_
z6;u`k+-I!(qd)Oa{9SgY=t1~~d#f9tj$viAsdlwt->KchC{jWXFWo_5dChG^NB3-8
z?mU)OCFO2bxskTq@_g<Ro};VyCmu&w1nPaeLIp-F#Nu=YGLc?S?WKdshA7!m7Vdrf
zWX|7<sxX-{fT#RGHD-H6mE;odQN_DbXN12}!S3gi4^#39#rE8q3d*R_F%X6~h0jFP
z!Q$n@)$2MQbG;5`mkv|-Id^!)=?O?F?gw^o9V>nI9nC+d`8@_ZxL;vKwQDH%-`c^u
zM|&xI=ILl>;A_lSH>2HocK0%d@#1jj_@P?5+MQb~>jw7$iz#iwzhjH4Rqi}d`$nsn
z^+SA~j*urw9#uw1wI)3uwNF*{4ar)72B+Hv`NNXA;4U{orFoHLbuHm#x?V<wmuY%A
zH8k?JW)8K#&ToJ`UX+>F8=T?ElgR%%L=e75*!2s~FpT&)Z_%*@))g@h3h+(RZ0oa9
zi<aMOg?3t3R$Y^dCVCYp+`mR>>~ksxtxeb51*8ov`v|@pR~wvzHcT}ei@(i$XSPu)
z6f}Q|dj{2+Ant|v#RrhZw`hD>xUVl>pJV^ed_i2<q?##NoU@<L%6!83cx8^E9yVy~
zbO=0DVPy~7^ve8MzkK^oEAxjKxm51b|IMBsCiy1mJ+&D5N8q~E;C@O+wdaSX$ethC
zh!*BIn-)Hth56sP9oy#143kh{VU8KGHy0jh%a3hw?fLZvwO!o144<iHVNovSJy-Y3
zTXf+X^NoejHa8#_q}5mXy}!{YH24sNm3ccKwe#n0Voyq&ezmsg_c%pk{b*9F>Y)pG
zybT)xXkqhjI8sELf5YoPZP_mW?O4BRTR_Un=H2lu!zXD&b7B7v*RGWR__h5%NO$A?
zKjr?~{vV0O{Xbsh=2Ow7d3ym~VWDgb^rL;i-Vl)X0jDs#Z43;rpU@qVaeOnB@i=C(
z*%NZ?2lm}pI3-%MHiG&*>6iF0!-*i&!YZXNr;64yYriB1xmv3FeyBC%&bI$aq?xu3
zjC)#OpSKNx3s$$}ler_kpMVA7&RtSo=Kc*)6N^u7TZ#0i!f>veBcZDGm|6z8vulc0
z`OK73M++S<X&YDFH8E)n6PSx3aV|Nkl6=I^`>5^7u9J_d%|ED#dK4Qq{&309x1dQi
z_bRzOC*IGPhjHTUANZQ<b>{*4seAKg&xz{*)8rRqkNM_lH+Y$t82gc_buI8=KcAvO
z2p%XM#a-dR&qtwSwFRYYi>`;tpuF0TOy!2KAK4>1gzWB62GF~UOw;&edzIZrFSB*W
z>xTtEwtUJob(?*#Ob<7si)T~u=4H3zq5d8T3ev~znNs4cuaF6Tw{!&G^4k{qT4D2Y
zSj^WB3J=K~GfA{jl|L*bsiJ$0yED_=C$dCAURCaWvq`OTzfvQbDe>-QSwoy<QOI%}
zS&FRD@aOqs-?baAm4ajvVUgQkN=Rn8_@8}`{dwgdII<IC1Eb}AK*xqDtqh*cMjOgH
zm_%c?slr`xNzC=W=IMY69&a3LOi&_w7d#z?il)6+(`N0Fb405D|2!R7)&*mv$x9g{
z36d{{=eb*FF9!v3u~Utmy&Posi(L%eCm6ytJqX}9aPsxR$w84rD+>(oF2o(u%Coz1
z1f|#znB8^u0cLmK|AY~Xhw$*j#r@91;mC80(>2DT3PbXIkW|Vs{;Ux!<ro{PRo`%W
z3qQbMX1Y#ews#|(76<FUr}`{P&DlUG=fjBiHh}@`%e4rzeptJ9E*7TSdGC0?tGCW2
zEt<%!BsJ!0-sFd1LmTqjE1v!$&a2|O)<&AzIXb&__0idG>yz2OTBjZK#aFmgH7_lI
zk$D<@x{Odo3MM$`a}0Sej_KBzJ!omNez%&IR^c#NVp6)m2mvM!pBdT7sFW5I^!JBV
ziCH`B0+`W>a2#_($(>eH>jjTIxZ}W3J7`Ki3p|{0El2xpRrw@CR4w1Xd2AX$%=2$j
zH*`*p0Gh|@S+aYgaRTLx!|2wUliJG9s7*}Xmj2yp!sZATGzg<Y^_xyZZ-?|<8~qkF
z)g42T{Bm)?K|Do`N;?N$Xd|j+mgj&qHqyocGgv0!fVYeN=jvW(;U`w`T{LLzZ<rqX
zu^aqa;D25Z4N=s^(Ee*Zl#uB)Vu1CZe-&$9M!@-90td?PWgI6a&f5m4$qnx3G8@vL
zkAzx~u`!aKT2DIPU*LOOQ$d-%$M_>NSxW4lripHFJB1o=4JBgcdhIbV(Eq`nqUvTL
zBUJb|CDlF>*MC>6ua)vI?VltwpQ5F`C@l>gr*klEMp`$Yq~+Dr$khOi^@k+VTkzr*
zj*Yn({uYf`*jc#dVqgDPG$SifLi#7;^9#%|`4}|Tf0Rgzl?C@=uBzI!)7DxB6W{qx
z3Y=+)qc^*?LVHbW7zOTR6u7Ys?l$9xMFr0InL;)470}S=l4T!BK1<eqk92sNxj$xE
z7o)`u^IBY0QH$FXSyj~HL=4j6?wVSn#r*@B1`(9%a8-e=&9P6T`@5xoR5c}%>705c
z5M5J){#%iaI%06?0KbT82bxZ0AD8lzlJe7UH@!>w>A7}@Ox*LBT<CIDcAZ6^zbD~i
z{=O63fA?&_>h)st{#<`H$Mz1g$1|UsUuLDhJ~2URMk4(-|FUYUf3f9N?XAq!KhLK!
z2XQ&fd3eb)eYr%0an0}N^$flwb3-aq@x-<}2n0&=LCOsropqynAeU~094%p6Q^X?k
z&liQTb%72heHWUK6Zo>T*1wq0op0!pT!m7<=pyWHbVjMei>YJZZ{DNtlKhfleaaP~
z!NluS>;yzOZ&!p&3w{^kHWa$yJ87GzI=mrO>a&+Lu~thx8rx7Lb8rzoVEWWBbKV-B
z`DN0$ZK*yrwxKa|(90O|QMt<Oa&-4w@ksL2d&YK*4Yx72nUm>WuGdqNQn|XWQ$1t*
z(YmiTxW8h`is!O?!fqbMESzaGewBMngboL2C1;!|+^f#QYK5EprDsG{?s-$dIKQm5
z4JD4H1huV7Qrl0dEiV0f{IG3!Fj~*|>_oNqf6w!~Ux&)ds4VD5sm!{*YYUe|5UXFa
z3u8RT7_!~=)sjdy>2ahYOzOb^9*U@snBVP;5yG|QGHvQDCa3<)IL7-@|Jo(q;N!YF
zHaM>P7WHMzcfN+7T;1_0z4doT3eRy^4h&RktwfMk?yGEwD5B=`rGzFuy=CUXQL5sG
zV?auG8U=6v0whNPN&DH_@X4M=C|p_d0Jx(B;F&xpvu&;Vp=gt}!;0XP#6!oNo_Ofo
z(=fSeo!RkzkLeAG^xnwJ*3a=RaLKmCgsznWY)q>jKh!b35M$NJ1G%_EQE0XKw|Waz
z4@lNFTbfOpalkN&U;XKMbjkC$lIPa&OqR(d$y18S*OesCjGq}qEnlstAx#LGrZgD`
zXXs(hyLe`C-Uae#WE=#Jm<tDKFt^N+aYj(VW;z-(a~nB+yOa4TbR`NgTF#t`WM<HM
z9C}+mkZ}`F)CX$>d1Gdgzw2nB8;7fx-F1{I`#`2hH7|T7divHBYgZlAcCm^QrWZA9
zL1N-IdTCp5L~_oD-D5B>=8$V+_Rbo0y^*$55YfYituTOTxv|g3!|Ok3-c@e<nMDj3
z27(=G+?fEu@YKR^YErr8UR^Rz_k9ZS*|KeZDE(=h!#eQ&vgSjFLvz9zV|94)#pS8J
zI+9y1O0HX++^0MJBX9@X(cRlVJEG;eRNH*|zp#R8DA_`k@CWVCCw@=8sX(_ru><s=
zbr-?GRb}o1?m`d-stwc`9{92kOGlL<Q`g9(N)+Nw=pt3!6M<EWh)|D-pE7jLkiPFE
zGu>*|9(4*D@Er!VcZ5+a2Tckh>vz%eF>KXqpRm6sj>6vv3IV9$)qL;JFG24$280ci
zw`Jzw_@V*JL}mkSs)vTW0*>=k?60<c&ZcoY54j`D0=S09%nO4aBU2ZUDKzfYy>UQI
zO@io`B<bvm<_U3Rm?n+&%M%m#Azjucg=#J%APq}K8tAbXxka&GRjZiYyw15p_S)R%
zt6)#Ix5}l|2kI+DbSCaX7>xNTvn88IzfN8YviP~pL+pYA0K-ZM5})V0XnlI;<aa`H
zYe+_*`%cS|`qq^i&c|%Zs4Dv??CIkK`*>rky3X;d_6Jy`syj_IGsAmmP?eZYBJC9R
zu>R9Td&`ai%0uK%g6wYMI}bj{Xw;twl-{0nOm^@J*-OB5r(rA%-!s<D@D(aeW`?d8
zh1xg@lZTKYF>82L=I?F)Dr;MTv1BuY`*Skd;3}kSad0y)*zjstVT@aKPp(^%+-Jvz
z^oC?&FpRQ$s%?Im4Rw1j#Jh-=Q3-CUFbdn~OgwZ{6Q=<+@RM$!$BhF}vJw-H??Ioz
ztL+(q*IkpjP!KL#>l`P7q<poX%;1PY!;ae@5NprD1%xlndRE7ikJjjUoHNH(3zz(f
z9j^C}_S}UL)1T5SN}emjb0YnokO>T`0cmiW!X<MTnanHV`;ECf{mJs2A5gaz6+y8z
zL^YswGCy*UES)>d$Z3}Rm^o6zA8)ei86v^P`<*ocOupI5C{Cc}<yNV5;%p06p;T}t
z`rKr#DTJJwaciy?(m6r)KJ%kMCiG!sDAilP^thH^{<G))dI--A;h8K+d(@RpK3+nc
zMOEp~O{6hd%d1IHqwrog0ft*ZOlHM~6X3(BJKXBit;W6kq$b0ch~yQ!QpAQR(S8nS
zmVv<7rh3J)6JxpieqQqoh1}hPxm=VnQw<d-?~|G>#mSpUr?eF$?~3*#cybfpXX#qU
z(-e2N&>`ZvW<Rhz#pY3rp0z`&(9~zD?11uJeg{_D@8BhRj;R~g-*!yp3H3XAM)iS_
z>-5H7GB=c2=Gk}jPd=JnpBu6vy@V;Tz-tIn4iXbzWCtvnE%JtIc#oFR%9Ld8F&js>
z&#olRrWQZnlewuLyL=x-gYL{j`TpI)cP-GTaeg;!zIE9gTtENZ!{x>(^HwzJ>vw1B
zeY?VY3`gIm-1Y2`corTFqV(qA=?^@wv)LHFYthz2POaH7qag{FGG8_KbkEU>zI^zp
zzFa)oUGj_`<a(dOBf%d6%FJ~N1Sys=TK6B=zf)^k!Lb~&5`C`lzFvK%)+{GY)}QvC
znxWYdh!d}c%7==Tu>nwi+!cIOP(Mg7nnKeh>koKre=6w`y`%Igm_DKojU8+S4w88W
zEl=#_1O%JHmWsW=uq90jqO-trKZAP<0A=!W6Q0YIv-g<yiiK^K*WN)5#%mYRK#|wZ
z7K!226^l?cVp9NWusH|W`MDPw)9?#Y{zQ_>>n>!O6|D^8pO|&bDIHNaSbt*H9EHuV
zOtg)s-{F1%ZH~P7R%)M<y0jeZ_3bt->((dQenKX>`e)`?*0z74oqg5aGN1Tqm*!Uj
zyNkIWd@S&-!2d{i&=?rr4w^ZE->FB_c-1WSGru#t6UJs+Z#+Bv8vFCiR-wJ7s2awk
zgY1d)7dxrawDD)tyxSIzLKa@9G)*L)r|EgLf1a-AasGLx_?@2h7b~lYo)^rf4>*4C
zA<^@)Csn{8&kS9v=jkd;!PZca$I0PQ$ttEl9@TyQDOokxb7tsr)iRTJrK`4(?vy?|
zr0aV~U#;{Gr3Vo&JVg&O&~0zwUAQ%ywb4384VShg@leTV`g=^4a^fn`sKo1C?-doQ
z`n{+Y&RfW0!O)azdsJD|uEf_yV5$wGb_6(|Y_o%8R`W^92T2YgN`kc<(Yd|P-W7L;
z8u%u*Q*B*|!Sig-J&47rP!W+EI#+QG=SM7b;RV2}exl9z#>CQFwt<}=Gg;KIuOJfx
z+KmU%=x8+tX=<3!q8sFj_Mj1ld@B2_*0C1^7YdDf`Os0PdmAv<5=(z-te+9I<xg0n
z#%;1ahqZc$#iv&N(y}(1@yxr--$kSv_eA=4l(uRAGaic5{y(c&3sJG8GNlzzRfayP
z&0DbQG`(2_R8V>{3%Dl493>O&U&+qeb}2WNq_;Q5Iiyl-LV2UP($+c=#AJ!z@Y#@e
z7nj-m#}Vs_*&td96_!$Zsfg6BP25ml8W2@|xMHKGUaF!gyN4s4s1(A_LViw~z8$H+
zD1d#@-tK`=behOWLwrPlQke?MN$7^@A%V0`!G5ga@uV=-2472l^Gwz)GAu^tB-%!z
zd&eW3%yfG?Zr{sw%1>b^15+!G?E3`Pw~@UE=!+w(X}mE**HkySio9H)cOFE~5!@eT
zJYin;b~o}QVHIuo1axLG(IjRyoHCt>|2G@i@}IU-M=boV!V%T@`Q`>C17&!<#ZSP*
ztfyjCz>iRtjiacIT(KSyCMx=gA)3pNn8Ty!i_iZkClRC1VLu^%w%44r0P2b~_5+~s
zq3PvdH<dYwQ11)yR|xYmFa!4x7WsIxMw3++LWx?0oj|GRKfb*imxCFoA3oGRu7tw;
z*L%ckAddERB97AOVY8wAV=6cN=#XCT#H43wBDzrSF07};n4%j=fq!C2wAG#w4Mb&3
ztDp$16=cN=<CKYws{h0V!~mg{<}pP&`A-}`3n<-&(^$VcG4b(_Im(;)G&Sel^2ROB
z<?1X=@b=@i%Y+ME+pT<#(Z$Ne{RD$g^f8K-#(Q)0a51I7ksdB)?;~m<*T(b^69J*Q
zHM9SM^Ls>#X7d9dTJJWW-I!Su)5XOjN)-Axg~05^Y~vLc=&$*tY9+zQIGM{Ld-r9U
z2mXS=^c2f62Ie=<0`qjy$NGHB9R*WN2RVW%=IwuqP7bTVKDm%V4?N}{oR6?c<SmzH
z<@v&#UQ9V19+-|!`3QEA_z*#&{Yc9ZAB5oYNVcX&)J6xA9P}n&;l6V3=OrHl-(R=p
zO|UNqs__1{-go2Oy{e~!p}FZqp{|?pT_DPp->;v<;MIz$Rx4QpJ+8J%woE43{_YO#
zT50-Y%fnIAU8On0lFW|0=X(7{<ufbkyh~|@Fl|Ptn#^n7PNqRZ2F!C`(0U`4FIMHb
zUj6zhligO=Q{|B5R*v#bR(`Pd&q6q8OHnYjiBw{moI%R$2w8g<vRaC=o*J@RXQ;J{
zvc?tKi*nV4T)yk+nubtevXIqMl=WMd6+{erj7&a$elo*#?0!75tk*OD4A--)8=N_-
zrgl*7yc!d5!j*f(Na_khT)3kpi95xvm)pGmKXyOMGncXK{xCNL{kvOyKZ{J${U@`#
z^)UngT3=zF^Rd<+Ddf3ce>=!V!%Y<NLYE_j21JjBL@U5s(1Qv?MBK<~IU|IT5^h$$
z?!Oxv`JVLO3%Gked>0~0Vux1iRpsxs@8DL`qxpx)Y_}T03@027@ho$DV4Ur{#Qa$G
znn06G=0^2M4wzw}CMJB&h<p=(!>xgC+~figkFIpzej8UMwhyd#(>NaCn`~`WJq)S|
z`oBV|skVP}+tY2>0T|#~yH(*wyo6g7hUhV|KO;D?mPf3_7N|A(_%WwU?EmtD^ZVs@
z3+sFCSGcI3zgyv={^LJanTrm?t=F!4dM~rIZC<zJocFilcBfnMV;Z|Sg_SG%GvY8e
zlH=h#-)PTf{fZPNGB=W738YJ88m=A&ncB(2P4fVCVwoHF2K~w`dl;1znKTz30M_c3
zeMeu>I$Z@wX~gn7La$A!+$mif>#y(#op^=qt@z2Ds*e&AZ!u$=|IDpTiME^)GEb+7
z$=&CDqR^!hLtoGZ@RP0)ugmpgQ*)L3*<0XQ`#yT<|ElmDs>q%v|7}lVa+L1#X0i{y
z4-2q9W73$bCMNY$O=ZR;cB7(B1Dr@7W)PK7;V9~l_ET!E?yG}+n}h$Qa!{5I*RY|g
z12>X~iCH60X?wfOeF7oKnj}44L4PMsq2pS|h&NeouX5bv)i}hNa_Y}zgkC9xpKVWB
zG@(*u(CW*1O)@s;b<zu{NwVE)`VMOzanSwEjQ^Mi20%?sLo8v_>`~@^Ee4|YPC@sR
zl7=@7TM@n<cj^VAgRf8Bw@bO}CZ_Ipm-@ENer1~9ZCJ9jeR#e7BEjaBx#xs_!bP>&
zM9bB2A&w2{yTBAJwt*;~vWicY^}}j|-eCOEJvv_SVK;n0QxT|}gmEx*oql;YJcAV}
zHA0NA72{gZtxDvN$I15S?S@lAva+Zu{kdalEVdDPyWxB8X!xD>X`w_UDIL^2cMm=Y
zSSGI9+YrF0iK801Ftfc;lO>qPB?z0MyRfDC*$p{)$0IWinF@@(mbJE;zT=fJN-LO;
z>DM{Fo2wgtkjK_TVz0Tnmhd{=xUF^-Ktq3Emz}v*zx1Vd=!*<=dX{a3WPR43%(tQ+
zrKzY-)5&0Mw`xMyy6N;-^m3`0y?s72t~BNbex)4^TiPBWCY-&=ZwoOY)dJ@b1@|79
zGBt5GNL1toE~Jcm`(;0T(+voJ+uMU+E*?Tnu+agq!tg0f+&rzV+O{Y&Bd3mDN?5oP
z#e2b>NL?F)f;X+4>Z>>!f1>sNoK3BA-LcSfdR5zsH)=LuCs%>Hu+cYW-cvnl(j8|c
z=3)D(?Du?_sfxg}&8c8tce6gA62eWbZM~b%M4bFw$e?NE)}OYstOQF2rMa|9@M7(Z
zwc;rBHXGg2Uu=2`HkqUi3eF~`;LLvr3N5<!3$+WmZf9P^6~?PK>gJi?rAXD1-1JM+
zh`zg1H`Dmr;!?48wTZ^ROXHgDcbhFd94*JJAoB#pL}Ts@AP@llzEe*~NE(q(`y9z1
z5Hd|5pG_rwmpqJ36I|Eb*2k02>X!h4%r+M;MmUM4?xs$65zGhA(vvSDKcu&q#**1X
zt+QVtwcLK&`^v?!-IfL+XKIy;kzB4{U8%L>a^XEBNS!$2MYvKIr509B8w66Y-ISV8
zJvqdCmFRF$hLwv6i?Hv&5D3pgglJm@+-0)+SId@++B`xuldHQSNZns)7YP>8gy=P0
zV-6%<M9LL#P6_ANHRvFvWrQ0omXYoCgBF1pB%)!Vd5EjoPJP=*Iaa6lTIM_5WUbTb
zUn~Kpt(2Op+Y!vGk)br5ztw#QH0K{KP(Pz(%+_0<db`nt%8LH5ijs`W7*iU;JYmq=
z%NStqvT~h9OH*fo`|@XMr`L{OkwX0M3)S2z_h<SP`x6Gf1RJIE$weJ{h<?k$nGe^2
zOZ|1)*%%}M0Iije><(Paj!bl3q_o%)&<X5X_wZii_?B|CcX;!1V#s2yXiiLcT{zKZ
zKWq@A>>_syOpD{p+U6c1D$ro2GTkaPPPL6(L6x3H5!?_^=EM3w_=(FGA#?P4^0a)A
zJW#pII^;mEW9%JrXTW+&DqakB#}Ayq?{m{?qnL1_51Ll8&qDVMQWE<vZ;jUI7J!xr
zjtY&?ml#U)Q{lj@Qtguvo`m4#4hkMh@Ny<jKheIJTjUR0T^Li^>OH*AyP8>Lnr_r+
z99q-%Q|6;o1Ceg5qa2s3S<&~yK-+#21hducvUlZzKvEa-yO>>9ers(D%G?CL+nv(Y
zI#LU1sx(oGK0xZsD2T8A@eng*OnkL3ks2fG%u079^OLRGyFFPAGKpfIP=CVH3$l9R
z3nQ!b7idgYxDimc_+uqY00_H_OF-5sTu3tWqLJG0fog18j(x#rtaWd)LQv5<F%XIG
zjrYc*mq4{!3n0dA7NTi-X#Qf@Ga|3@(De1t`~ql1<a23+c)J^I&Bg;EM-ed$hQp1#
zkfBPy4Fn~VA)X3jIFa_KH1nt@2DV4Bsdb5Z=m|pX5`D=s-Hl^9MPFj+<Pg6?+cai6
zqE@ln*_74a<uQRryO8Yfh?qx7UG~ep5Ibmw4P3$x-d5S?>1syzMU3_7kNEU4q`PDP
z{?&#~JC^c^XdeRT(dWwwj{T|3%lSRMW7xb@=B15M5KRJdjfKYB9kY|z=vUnm?Gmkw
zg(z>P9s}mwUC#j#gyyMF2-_b@h=#pI2dHHld0j8r)k^b-4!*uGzEbN`vPn*u?xSqA
z?iG@0IhlEr1COo`p3Rb{gMf^yd&`;!^KaMYUuaNw2Nt*f&m3`Th?{H7yp8f|BN>An
zcfo*gTY|LwH7dRAS(;Y><Ff}cY(EhN1qZa~H?J1G>$XwezmV5mMiy6Zl(x#vp`O_9
zu5g}f8@EgNd^?}rHsR&!Lf(1#hA<TQ>VkmSgnU22V3Tt36iXc;)|89K!i%sBe%y@k
zZj$y^y)d&klC*wNZ(>(c_OoZmMUTpW$<Lj!ny6ktym5v`vDc?q<{-I)&s8g{+(d}Y
zXKgHIz29ej+GpJ&WIdIvaR<BWA<tN>at}&u(g~aI>$EE2#gwu4c`-wmFNHUj;^_Go
zrt{+b4$3OQ=kD79;Igs?tUv@qqBo1ycBVh>PN@!SnM1xHQA%{)E>y$9ba9>oznfR(
z-hCQ~vJw-X3Dmk10ISh!QyPT9dWA6^#&4=^C5{9WZ5_SJ-Jqg`8?4rBS>>`I$n7X-
z#>u{#`BLXb!W4??nC>1xKzgx|yoZ=WZr~KEa9Q*#p3{21oac}|x&#Eo&)3s!|2!|5
zc_~t4OaF=igk?l!v};M`D@&fIQIN`yQ{{UXDi>3TPQ~aM*RL?X;W<AdPTW<(IX5uB
zCuQG*k;myL6*EVdt0Uil=F6TEnjiBPONf4Bd(-LRd%@n6NFQWXBKoa)$oEk4L^~D{
z>F$EDN9zmVt2c$|k_)!Y0<Vh3pvq_<$NWj49)`UNLuYhPp*}-KKQj6L+3U~fVxe2G
zE7RR#n`A+eyX{HeIRl<t263FL0RF`x*J{;Ct}g0VoXcJ`M^6g5w3Je=2A?aDK1#M}
zG$Bh+&I8FAecMt?$UD)leF^n5tMyjfhY;)nIwj>YS|kdqo3pzo*w@6TeukD4{k1TC
zVjK!>F%xxzR1>g>)x@N0OgkEitlI>B66veeB+xd+gLf%yN0)D$dX7?$Lomi1qR{4Z
zw({Oon@doy!VpE1sNS=udnB7YC)@5+)z|C5HP+j+&IXZ8&&QyiP)t(?I*#wZ@FihB
zZt0IyGS*&uMIU=<l&jm*DuGM5mO?cJCL^5X`xMY1mYzsoV3;XEz$FXm>PS)<n;40O
z-d@R26dINa7kz-nPz2ZCVgf?}bdBv9%AZAIrZOM8+ikI5B9^1cg~AJ>rA2fJ&LFSm
z<p&AnwnmD@`;S6BUmxZ(s82X_Qb}mGw>#MKh9&l1yhQubwE%EFc%{$FKkZY*^YX#b
z-R9MKs*v_)G8f!iTEOr!T`w<%7cDB>ND(YtLo%SCr}z1ajh{sNHrjLhFQDaXL(Ar-
z{E}~nXOy4lAXCEDrBGonxgFhTH{LzHTxzn=w8pna8ohiWTVAfN-&vksuAuDzy=wZb
zfJXEl=~eMTAbk``x?Q28=tlulkBg|#_j^Yj|6hCW0^UY(^^K1%>r0$C20}<epb8|#
zfy9yId$`%L<=BbisIdd#(ynDmwnc17mE<G_^45?LZV8xj3wKj44U~(aTmqDmmQq5?
z*O~%f1C-VjXu}IFp`|q3tnYVbW~G(9n-JQM|MUL8?qm6^&Yn4Q=FFLM&dkngaZOY?
zK~zbG7U5N<3~c~aqgzq?BYvu#3~eIn7v+9KaYmSSr4ZFCM7@|3zrXtpm==lpPBg=r
zX&#(3u34I9dGB0<Z32RQJk5UBsaz8uyw5dptYm+_WdH6q*g!U;-7g_sd8aYOZI0kZ
zjl6_&OEvNaa21Vwp+jrrF<_0*vKRM!cVCJ`fw$uBv`0@F4O!n=lo*knzzyxb7X$d6
z_fW0qqo?25imnAK4{qn~u92)TVBS6VsUpzpSEqKL$Ab+`Pp}&M%<l84Ul8!II`jVS
zXC{EnJ_6apKP4d!z3(ps%75z<#-hRwr643B&vgnQVdJG(2-|HTSry19Nz#LGpNZfS
zstO{-Am6(WYwY2>zMdXx?>va<l<3DFP~;Fl0#S4d@5i}+<wABqNOBsF|4b7S_C+Qv
zlO%E~?;cDw<#A2)!2K{tOJzPncfRuoOjx7^8{VVG8S|coM*nmCM+5)Sz<)IG9}WE9
z(ZKRG!B}TcWUXUucQEYe3rBta)uC{gqdvGM<PSE51HpB?;!t>vuO}36_+njsy}@w8
zk%%~g>-^oma92>!3i%eSjl@>R!6mrB+us(6`MX;~y}DCD>t2wcOKKz<499|93;gkz
zf5Eaypsy!b;*Unbs>2r#`PFzL24NO1RF|p#NEqMx{0T?6ucyZmjwBphBB>TffD7zM
z^hbmIqA%?5`TfD3V9bZQaMxEH3nJ0P0`Lm>7W9NV7DW3K-H~u<iOaPh9!dn~Q}caY
z!MKobK`89+=?erG)V2k~3+f`_L~vcAq?<UYUBQGJ2zL7VdJ;Gfi)x^fXwaXCJ5YNl
zoS5h6=u0@dASzR{lx3#(o=CWh87Vs+Oh`Se>C=`-ILIJLZMvn6k;v+J6C{mwq7%5B
z^+L?Wnv^U<`6X%{Q$yiU!lX?an(Tp5-DtKr7b>80^iE$m!PcPdk=O|xkx0);ywgJA
zcrcbweSPc3U+WC@^q^iM$m(rh6lM`%3mq*HN2=Q+F?4ZfC>#vT$|8sA<3ZWIY)*Wk
zLsg@ZxTAP&cgWxE2*n+9&hDG*ILVRd4#nr66c3#pES{Tnm9dp!Uq?@ntQs|9Oe7qB
zAIvOe?%0>Xk7J0$%dN%{={O4wb`&2i1ao*#Th!E2?{ca2p0?ViW>3B9S+QdI3U!&M
zwY7G!r`3Tr_#?g15Go3V(VULHE=O-95Oj3%)+KzQp7>m;DXza*5JE)!nwlWN%CIo8
zU?5%HqSM+ut!izn+S<0Fsbw)**9lI2u^_BoPv5_&KM|~r#eDs0ZM>BatqfXfpiU3I
zctUF_x+&oc`-6^1rx;<3)KGWaapK!CCm-`~*8<lzV<$-EIaapd`QfJK+C|MCNBv5V
zqiwmPwq<!+qi2P~v#QSHZEITIauQ<`8bGvig$JDjb%x26^#sGkstSh@jH~M0g$~X+
zMF-XQ`Vt8uOc9*{KKrOQRUdeDz+uFDeDUsBFxo={t}oF!zlx<;u?eYDM=>Nhk<DFb
zBxH)}4E9a1HEQ-r>;%s;Z`)}ODcVVF!i3ty!U^E$V0^@feLap~EEb6^bVU0*Lg9eR
zwQyk|NOrYwA-|w@1{3hRzHk5vLBo468Wt{$1>+dEG%U#IVOl;%JV;K`kqGq$OB~Br
zG%apwscm*}dq!=H)%p6mx)bE*`ogQjk+oq*7)($Uel8eIgd*4(Kb}T+5j;*jE}5)2
zHkllqn@rN1=`O<OJe(hg&m+ajWIH||;76keK$qP<mXe7%R(QP4wRN6)$D-3nG=69;
zffK-JlLMBk07EYj@>5XYaRLsFf&(8i5FchOEC=oafyCD<Y#!&4Sg0#R8us}Up*6vD
zL-`oc0*sDNO2M2}b}3mvM{iPZiVeEOyyk_DWlgQiYTN1>X^f_2MIY{u0X!ySX7(Z?
zFyfu6QNS4?xIhtcbxoPt<@c)yr13-maa;ATTj%Qtxty*FRTcf9l}g(oMU6BI6zYsc
zdR4zK8b|bp3DN-e)Qi%Lf?Jw;A>3zQ549-ai1#7p#p9hw2Kv#Zp+txqGuL$j-YOAv
zw6(T6RtNiM5wE&bLGDrMvS^z0;+zO{y}s2!q#eElvX5vC`AIC%uToM`j8P`+M=~If
zK^H{HeU_8%n4CV6B9u{A1bu=2;<>X#eOeyTi=pa=Gl?5kmbywc^7;(R?V~cSoCsVh
za;)I~+NPIRvsE;MS8v1yRq(^O-$lB`^&3<%3KQx6)(T`RKV_L}M_;H1emV|YM9_q>
zz{XOgs3+3jgk-7jn)CDZAfN{NDTy%&bPRnYL!At8F=Uy2!G&pjw=eGKMb;OJ!m-4H
zoxxZTkrDaEzc~%Su<C><_C=!+WJt&Ye3UN9q4z~cLQljO2*xs4Iog`j!$PMbEl$vg
zBW<Fx&d5TzcP%kO@&*5hE6_z}gy$?rp)Y9cp(j+*Dj#?Fa_l?@iwAo;8S*FQIMKlz
z`4c8hEXYdS;(hQNF*eV!0Hwd(=j$PNi_#_dDo@j1hr&@GYFgt`S@E&p8=vPm_Sn^H
z>1^%<M)@>vY{1dk7lxmVgi9t&U?mP|kD2iUvmFWxQC}>s4P!?<)D`w6kTp4qU30VI
z>3-<lbD7^4Ca>g&fpOOoSh!GR!;T(E7>vO$Ay)SK)~O-^m0}@A@^TR=KB7t}&XbXF
zBs^bhiI!;|#n@4fwV^~e0(ANsQZj_6IE5<Gn&3>{SR+}uQ2&j5i-9-Ph-F=xe41dS
zCZ7W?s5=r6B6LI`hwzk00|My*B8cLYc?(C)EjSoB-tq4lA-zadkUaBjndw{hh1!`K
z-?A;zRd}?t#Sx4n79kU$utZZ1n&Ih#easg|!iV&MJAQ)!M!rG}QxPz@Dh)KgHt1WO
zb|5kUX$~zjUh2jpAAlJ?NGmcQGiewP{87L`u02i_YZq1NtOH_3K}OIP3yFkB;>d9O
zXgaPj(NsmEHPp7%HZOFfh7rxnh%W5!clf)5{?#%#5Q$hR@b?6LF*S-visb@CG@g-Z
zX%f?TJG5vT38y%OgKN{)L`@Ax99lcF8_69KS~{o1?R3UVG<SsRdiyYc=?JD1L9B0(
zHOE2$k?c#y#k(h^r?lQR2^TITkI16{&zrj=J#eWDr6&|N?n}wXHP5F>7?WMd6KNqp
zMwEY3gy0q+?H`UuTNjDMm^OwPbuZQ~FxQ0oX<o}5IGAJpNe(sv9TZuOo+n3w)bDr)
z6J8YO=d3@Gi3t%yMz?SwI|0+0m?P#3#ZkaRQwz+o=twD=-26O8Nl8g+^1vOGwuFH!
zyEhc)UbdY`+f!3&!N*`+B>E__3VW5K!Z3fM{2@h_VjC}Uv_bWq5lo!da<7$Iu|nv=
z$_$)`464BZCQtI1Frgl+Tm6wfifia_nh3NXE!Te>CurukAfD(c={~9bWXJND4ojNu
z39iGHc@xsKT6g%U$UHrHcPtW)a3rQ<C63kze7CPJ5y4C}<ez{pg`LI`!i2b3vcfsd
zJm6+y!Eg74&<3nT#e9x#<TD^8X<_P8+`t6Bm?xGRNCjwCEN!78d6FYhH{X}*9-7ps
z3DGc2t89IIVZ^6RqsRbfaV@7~=2~BT0(Ml7J6aQ14_VfSg)^RRHZ`|Yt7=!ckERMP
zx7r->uf}RkPsrczaeF-DiZgG+8gEOLrv3_H_U>|?i5pF3vw5KY3bhq86I!4O1`b$=
zblt8>nSjz#o|xi7Ufp%4A1fv*mn_|o^mVs0l`LYmGoDhkJx%Yb6php+tzheN%N&|t
zWDM=A*0(Q!6;FhKrUO$NwA@;%dRl!=Ydi<mL&fKafx3P!ITyYjnbC<(r_<$hJ4>Bq
z&T?mkv(j1Rtj7G%<#M}9U1hFvSB0z6RpqL7JKZj~+g<7|bC<g-+?DPscXg?=)K%&(
zEiEl8EibJott_o7tuAwxxysySrDbJh<z*FRm1R|B)#c7|SGl{qw7jgmyu6~kvb?Ii
zy24rEs&H47R+LqgS5#C~R#a6~S2`<QmF~*Y%CgGx%8JU$%BsrhDrc3e%3W1jRaRAA
zRZ&%0RaI484H2tReKlBDqgXXi8I96?`Ttk@22Qr1ZTxe%?fAEBjGcq;d0WSl%+A<F
zw~Zx>0k;Cy0Os94mfWE*b`{_>JnG>X9!pjM1^}-DJokaIBs~Ih8(<<2Pmeq}mRvQ7
zu^B%E9`Js^;zGtsA0A77h94s72K*U3@^--8fO(IMCD&mCZW-XYc*Y^`QP3aB*j~Vk
zQ13<C$C9Ifd5?`H+fdKC$H$T*fQ?UpAK-StX?VtE7hp5s)*p=}2LQJ{IhH&D^6dQa
zSaKrj+4j^}vKx=-?F6(#t~}hqJ{NE=;7fqJaU*|_<ioy{Rip>N9>8sY1AyBBuL9f&
zxD#+UApMTlUckM8dA~+Ikh2PKHsC72Dv}=%zqHGC1I~k7(?-UUKLT6@_!i(^z#_=G
z4I9v|B6+d(?<K%#*z`FY&UPF2-QNyaOdF@rj#YT3jX$4<ZJL;Dv7LbQsN6Jcmc56*
z|7k4w9^h_3hW1X|J(jEo+zNOu!FR@zHxmB8$CA$jb_4DP+zHr>e(HV~bb!V0q2B?w
zVRynFz^cDO9_+2idw(q14!CL$>I3Zl0Q~^C9dH`9N919vQ~+?@hhxcMz}^4Ab^89v
zSkjAU*YZAvod8xHl1%Od+>Y&-4E?_D&}6b0aMcVvqYO9=8~pbI?mar0tim&PTisYY
z1YA{`Ozr^O3-}peV_7okz;5Plz<Pq^$z%d>T4geM4`5YQGWk3DUY$%99SS~xHGstn
zlgTJx;|a;+Fks$E*mnas?c`)~8th;fU_D@QE%@W-UUmaE1MXdvOl}0+UWamk>*`UC
zzBfSrY0%e_WO5zg?xo4(cEIA6WHJvMNw#{E$yI=jt&j_F)j6nV2I~DzGT9Bd{k&xI
zCBSJHK)%CqeG}G10UIyDySxFneHU`f1m9be$<2U`*n_kKFc15dr(hReF`yH0_YWW+
zVB>HyxeJgEG@l=qvsW<7x+2Tr6La$h;Q(d>3l<pMuyNQ7D=ca#oVH|g!P>k5cJhpc
z$Cl2<Y*du52V8XvY?To5Aiihf;{`rS<>f&%34HRf;{&SU2jMQlX9{o*j=?imfH&gf
z08Yy*2sez+OyDqV_`zkwndNM6GjBoe4WBPz>~O6eP&04KUW{R?&mJK~7Z$Y?=J7hn
z*?op<Fk^mD9aDfKdJRWyDF*IC;7Bi2{#04M2Kb4P2j<2Ps;>>WX~4}POmkt;CTkn0
z8FS(j#kG03)<)MF3X3*d>kFrCvMnxjL?#qYt1T>ARG8-}Xw*(w8OYwkryIU=6QKQv
zsDYQOrE;}}jt%zu!glM5355=D5)5h!c;C@0PiyXiPoQ$wqFilZ`v&_a8@)q`_@034
zL*RR^cHMw)qXFLp$|W?u=VbEb?Fau+TqnCgokg{UdEkut@RhdW-Ge%PfOb{1Ws7xD
z;k3=vmQ83&@dl;7aKK{y$%Mk<`a%bqxkze-`g#w_?F8?`sC*sDgLH$k$-ddP#Y%OL
zpza-ak0p=6In}+%>f^dE#@F|7Z3r>vnbb9CY=doSp~JdZC?Aq8E?~rG8e(7lLy(hb
zA;4y<^;^V<?4u0Vw&EIG0zXJD`pwvfur+Ef&h2l)(s<onsebQ9nT;s3gGz9Iyx-Bl
z4fe%_PV2XzXEasxpQnITf$T<<DMBpFL0hTJ7VF}ao@#}joYwC{P0(CQP4xxb-glvF
z8|2~BHLYJake!7Jr`2<XGSUgf^&ZgOOy`^qwztW~<ysA;2-0N*L^}_(qoBRdfHn@=
zCeF68fNX0k>JLD62ldl$;Vd;7B=7pdsC6i-&7{W>lsg9f`v%VKPorF6R5Mv&KagSK
zb7M)eZMK|rGR}6K)=x53Qv3AJpZ{MVZ^p-o^6#O13;N4`CHO<HmlKc6Y@R~L7JFl1
z@n)sA(77q6wy<hL?&3nP&2E`cSOpYRzNpXvQWU``Z7et!{A%Ec-ot*jiy^m;7YMi5
z7ZI;|;#E()8Zb<MZOE#&z<P?|g~kHgd&addf=>;g${ncxGvpa(Bi}KsKiw9n-1c)b
zZZ^NwP=Y$tL$+89{tv-^U&J}pBlQlQZL&*Stg-sCY;kD;s{#4#ka5>bW67gQ=SO4I
zz!voVCd&_6Y)*(*+@P~&j7H(BwxRx?;W3~;Q@?Xt;$s)~Z2LM2$$`^QaBk|nw!i^W
z2g=<Df8fQr{n;t9Y?JhzxDI?hXm)Be_m8KU4VrGylp)_*OMI>cji<0^gVnlZLSbHQ
zK?D9T5pDG%Z`%yI<x)<)ZFy3*1&?jZh1l6P)l@(+;wtdC2=dIsIq6!)2#m>Zq*~`G
zNO=oBF7i+&fZsW|3-#JRMKt0%M#z?amFsvhEMOj~Snn`u3&^MC!MDAH4Wx*U{HVeI
znZS(#cPzC9W9>2;CtGNoY^FZnM7F&l$ChK^3{wMx54T^Thy$M?$o2edW62oK?dL!)
zp|^TyY<a<Hg?Uu>M$}UUzpVGCTY;N`P1>L0y6A7~r6|Mgf%LK+*FVJd_44`_tB2YO
zd)Q>Ru7aAxAf;i#{rPUt*VArtqDM^LU|q|7Gx>Tu5{z}Yc8a{V$qHtmC!1*p{1M{S
z@$eh=r;v1l@8uM;FSDh`czTR2E*!SoR@n@P5=Mj2$wS~d`;M{XHB|Taz!QZw<ZjB@
ztZcDgX1m-<{&N>-He%lMHql%Nnza8+#e6g~9Wxq5%r8P)z6IV_Q@L}9cPixb_Be5U
zCgx9XP<!Y)_(~h)?Md6H^nW9E+o3R-D)cq_T<-uIwDxQS&+nkljbwWpsLqj$_E3Ly
zV}6oAe_e-jVVk=C>Pqz&xf|{?c7f+xpnnA4?W&Md+8<=tjF<`gYa;uzsxWAbjkNs{
z52|w}>ZE7v&n3C$3LcxSEh4@}t-DdH2yYm>H1255G>ivAF2p(6=_cyu4HOHl0nh@2
z{Gh1-NRiZ#UmQRk6EXjK2IuxSF=^3!lg!PmeJHt@2aLr9-`4yTQb^FBi~5F8b}@}F
z-Ve*o?5e)dYrWdgs$k*GD4qvix8q?X`U!KpD&)}jYdWs+ScJG{eZjC-kw4Iwm^Kmq
z1oO!{oQpBLo{txjyA0R1;@ZC|H}M((uN(2q+gYRoZ>BBs_?aGCl;*x+sDq{gu1}h?
z?!cV)U*$#PeieAVg~voMq;YZxyj0dWX-LI(K2DYt?y%Tq7>^TJml*dwSW9rcgI|us
zcVQ=s3u~-rOeidZyMg6Xy(6f1Gic_b6YMWz@|;!gxbeBeW?MDMuqXLgB408a^>$m5
z$v(1;jiN1^$zMtvMOX;y(|ouRo<p9Aa>G{qMk{#OY2wbu1j&3lWX4#)JcDH32)agF
zGWi1OYPsNV5Z`Og8W(x2mt>)k`6AcAXBX9tbtZ}<_WQw;$3etAnk&^<cW5K2nXY4^
zwk<cAd<-$#eub$2aw}rrWj5pzTkK8LS4gsD9=5b_)M8B<8d5{S3E<ZTe$xuEFNyM%
zR|UVztjHO*kk8*tbCgZsH)65=Bg^%83P_%-P=0$+GI<N2{TWRj_$RZNF=(~c8Ft}P
zo=e{jGsyc|GWlch7JY!W@myXvZ}Akg!qW)Hgh)fa>31kMh;mm^IU_sbxoJPzpm*!d
zJa3})nh#NSIhExx+l%nKw7`%5)JYDY&jo*nBbme#o6LS5d=95u4vPAM26&5kzz>0D
zCTJMho=@|SseXh{lRg-coosak^zVWG4x%s5pyzRKap91~`fa_nHWg4klt=`QN+u7X
za+5O3k+0h>bc3`V^*BLa1^PCuAH9Hc`}_E^jPfil7ySJOJDE6ne3XEdi+IulGBz7)
zI8^2j_yV2k?R+JQh_s`Y%XA}1yr92~=${t!vJT!BXeyPrlHENIl3~zKA-lUx(CcF!
zw>$EJf6p>INJzSyh-MFdJ(;B6C$kS`)W_RIMIY7Y42a;QyLyzHhP`%0RPMTrax~U%
zf|z1z2Tw+0vkakhFX*4Zx$t8=PBuc%h{du$rB7}L?Pl!f`&VN{H>!FMa(a;`CdkkH
z1BSwLF+R>=z~kmy^@W&G{D6EiVhC~<<gKY30TZib#uFas(t)~GLGA@Nk0r}-Zod<C
zftfx>BR!lzi$2)1nASrbjmss-7knu_K*!WK-Jrb+v=g9Xp)+o4+zzb0DMM&1u%1^q
z4R!^)<GLO~xgP9){33NcPx8!5CLaV4euDQ4a$@T;)KJHL_TmDXs-p(@Z^%Y=SC4WR
zp<F51-B01Pa8+-2Jl9{02uA}iBh{+o{%#|cFG(iptp)Z6MfuCE`^aUYwkgIbavk@-
zFM(e%)^txKdt5E}>BkvgBcer<+aV4#2u+G%Cv%}GQ&1jj!1N9;B#OK}m($vboa3kG
z<vey{SlBAlW}^3ni+KA2;5Q9x$3EgWQR8Ro&oR$FJ7siwZj=akq3?&luL^6>_Y%K9
zBG5tK`^?plTTC$)5ZM}R3!gkpp02|BH7(TGf2i@~V}q}2WaUzbUN+m@DJe1aW22>j
zPpRdo(Ki5z0mV1G%!XC@lF9q1zfZ>4$dW%@7tb|mspq^@O<DX%_Md^@)<80OC+UM$
zT}}CE{bjw*kY8<qt-)ANJg=a-oLKVtT_@JtN%p7qTi0S*iF_NCdB#nD;6i2^S4IJY
z#?_6eYa}G?_#9E!f!OiysQg*U<eMbxsV1`W7%ue^wMERF8ui;eEK5ei$>gbd;CGq{
zzjVI6xNy*Beb7+Riwj7X=YrqB8jLfTfZb=pFD)NDh}AY^H)PmQV0$+QS1_tEy7e|9
z=VsfG54Yj|2j5l<`;c)IzAf!M7ZtQ=(T9(-nJ6=WG7nI^_=bsm`t}Z3Y<tWUi1vaV
zb7Fzyy^BT8Lw35^x{A+>Xk2dw!5&=OPW6@Uw>}=9&M~eJ(Np+}QSdzTGPIj`o@m08
z>sZd~(1uRp30&5Eov($E9h9LB-MDA8zzv?28c*&Yp?Mic^?KQ%SZ_AzHquEw?oI95
zI+i>R=OXT$&UHdxlSvCp-Z0&gH}8<MEgoexUKa_k&R1<%vkTX6IKncqObIbnu_%FJ
z#&p-`@#`q-RHByS6`#Ir(SFMAaT4!YmV?x`=d6~;EXr@Jwn6K$I8RsbJf|tQ%wbB%
zGE;fQVsYc7Mfr=x62QsX%6F6pEy~|5>?KQ`Ua+gE9B&u+uXJ8ywOnaYuCUs6TPEQ=
z&A*59pKqDT6vZ-CS*Ey^TP>DGd>ODPPg>Y^%UnHMc7nz?X_<MGMR~!-F19MK+Sq?v
zv4HxBjS$b;&g6CH9jR2avYF$~Nv<j7AU{*tZDo5ct3R-^3vBq#<xN^jcUZiPoo`bv
z&12Wum1HjafqnXIdF(btxj&EnC`Y+AkL}D^#@G*X39%y=`45WP`@jPFoNJkSjPeU=
z!~eF}uCx^X*g~oWBHd`vB^D4Z`t?v{23j3sy_x;m#;>1dX?(!adS1Te6034azU@Vu
z@?^f{W4m%kzGaJ2{6N0t>Kx^{e9KRAmFw~?Be@DFFUV7Xe<ZK)<$TM#d35!&Ji_a0
z-&;a;&$k?Ym~y;jzT&o=s`!*M0k>Kd%;&aSF#vSrY$wqtEfW^C3|iO&7NMybC4grr
z!&bJ<qTu&)@3hkSKdBp}9tl<?KC|r1V^^VL^4RS*<!^cH37hh19{Y_=c{Pu{Zd2~g
zWxun(X$8^Kc_{mrJO!sHIcE7U<%+qu7p8n%%s#Nx|D~9HYNhjQZO>WWC}wE+@B;Qw
zj`Cm$dp<|GtAyQ_*Zbjoc3Zyk?tJ!-d<8UDPEdgPX@T<WeD>}{<@sXv{v_pz`Rvz)
z%Kh`%p9_^;#q6n~Q^54TgPyaXwwDi4ez}1C>QJir?WxLLbJ+ujIdFdCH07qbY{T?#
z<NU%I$^|$-e2fZgI)dtXj{hG$LV0Wf+k1ra(*^A9BdO-=9m<RI+4HlMKh9@w&Qe~1
zT(gu%=dmqEDfi7|e>s|H-kW`!1#P>(_#m)*wwT2G+i^mPQ%(bnDYskMPK$y>XjCe9
za#J;97g!Yhs?fu{H*OfVSbS~D9hT{rT9lV8?AL_dVx6_a%ATMypIK*JW{1jW?Zt4h
z1M#eV*3$~RNtyLXKKo3W^==OPZO-JOe0F>8tf4&iN$#wj`RtXv!fW%{Kl2WMDxW=&
zk3th!!TJg7^VjFC&t0FhURiHnZ(DC&Z&@!r73Nj*5YO>hhff<m-T0h|PZXbKd<Z`q
zpJIH-o_xxHh210V=_*Swh@Xmn%BFlWk-cYAwoYV^*p+)Hv0vI1;Qwe>ZlA=)?8*(3
z*hPx+-AU|vr2+WADayqY*=0Ej7Rk2eD3?!UJ93Erw>iou1?=xR%5U@8=Q+x+^4T4^
zgnuBHIAg%z{PkSruLbP7yc6CoU~lCqpG{yN<tcxkz#h+6fcfhLv>9R^Qwaz)UaYW}
ztjc#4w#}vtDC`p(VLr0c|JM{jdW$WJlM5{hb~apXQJy)B{mP>J=rH!NW#+~i>~X7d
z`(f-c+viq%`9+R${d9JIuJYq)?BU!>U<UHOXTgqzC-aqGAI>hGpa6Blghwn}4`=Tc
zD1Sec-8+#`PffhXa^DoTsZhCfCVQcf>iAQkvU4W;jGx~ynTWPeJ_P^Ynyg%X82f}{
z28)!(rn38rl%GyzKjY`G6)E8Jk0RyWDeUti<&RU?@IlH&Q`xT$Qm&ZF-s0zb4pR2c
zWS1SRoOcAf{a`vDK3Ku-kzX9FTs(!1^7Bs*rdACeqC9#CyYmn_fAA2Z**FENcy@~N
z;S~1z6q0|#RAT?hROM!nA4X!{HcfeU8hdOS=^!~xxqmvldb)DwboSD8y86y^<=Gi5
zIbC^T1{<28yf}kBGK27M&LF|xpK%bBb>-m-^z+c+%C5uNFAgWgZJMdvJ(JxylhpIt
zOuG8#nbh>Z&m{5Do!V#OLl)%?E4$FD;Kzo6FQi>ZRmx`;b{QXLh>O!^C<!)C!EcH$
zQr25;Aw#{#GGV8MqB9*RRWupMQl*78DW|eH&Vid)40xupmc>NunY(zm<uM!EXjAb2
zW}EUC8+(YK!v|tm?&jxwSfcQW^H8{K4qWM(<W(=UOvC92EXvCMYEkY78}4TZty6K8
zJA_MZ%AGcLtBpv&fIibPsD76RH*m3ScOLui9AzYrJ(pXIb6m7jLjmWN0X}C@uC=jO
zEXv(>Y#UH+wUa`&+Szxl%H{Y^QSlOT7@$|K&0UTY<S7SL|JE{SwcKu1?xJXc%R(U_
zI%qR#>2u3A%SJnUK$bjeo$|hw{n&EMppCt6Q9iY@C#?$3pSRXuX=j&7Du){-N32co
zA-0*nu(SVVQ+{G+SK1Y%irZ*_q3#cDmUnDhECbdT8YzyvS<eP?m3JE1FLRf@-pHQI
zYrK64yE9*TxrsfNubjV_{qKC`;|6xoB;|%W%L9cqpxiWBdAfz&F<E(GIlF(da$gI(
zt_ZJvZ)EQttUOlBc26O8cT82r8rfBcDL-svcTFeE=hNr>qmjKaqvqx&_Bjk{3A_79
z<(Vbyny)FZ*0U=d3xT=zs8&$^<tXKsjqHP?mD}ssYqLqjOXeI6abKQ8eD=<H+YZu)
zk5is%WZUKuqu1vv8|$$)FdOIJTd)vBo1Mz@E$mX)G^{MX=$iIu3mb4Nk2bR_-O7E<
zY}l>*a~Zp?O!*!#mCEQ+_Hw0y3bs@!!?kQzHC1!P!VMN&xZ;HQAo|6L%16uDA5S_O
z6}){i728mwJg|seS3~D_)Eor-lQmTAxf)W`%QYm{o*HW3MYUAH&9x+MvX1cE>d_T%
zdXz^R*ro;rTEC$|d8&au&_FUg-lV*+gpD*QP}c6IgTV6QCCcy;cFPh4J@eKQ1?szC
zDOGhn{x4<2OG$)hmMVW+%3kB=SGFjxE@yYOkjQeFk@4lvK{||)BE}tZtXXp~fB&}a
zPP^rNt1_gpKUr<p+bxgVlvnI*lYI+{rfhQd^H$|?1;#w@A%(2!eLMS+UBUS;?Ngsu
z*t3d)|NF7)mO1T=J!4TG&7q;OA(y>sRX)gJx7n3Xa@a4(f^Slk-8t+DMY$%Iy}-}^
ztPuILIheV>nZqyUg6{Y94=rlj1dsBgBK9|{g5~cQZL{!y#IEd^%zm%T#{c(nlt(AC
zk8)?@|HgdfS4C{ogxUCiZGrN`B6jD*+4%qHB;~ou?B|7b_z&OC`Ce*S08xKqQFhH_
zH-q;P>>aCe=}fl8rkp<$(dSW+T|HsV)*}&0lubvnCnin7{|%Fs&yQe_79EEF*B-3A
zkFtkcW&zkU<qY`Ev8l@CN0O(;BxZ*4_7Uu(8OjGou+4|l`7Ln2Guiz!aq-2OaCzI0
zK-VA{!X>n`WiG^lo52n2Ypmpm{$N#pj4aTuJPX%kKZ~)S<|q&6vC$mm%{=xIKff@S
z$dkFpVy!oiE?!Le$365998W)H9b<p6AgN{Fw<`b4#hxVvw}R2YUn4!YJ%AOG3v!eT
z^4LfY3cQu0+?h{-_WpdfHJ8r6PkA!CJ8upWBT6~1$S3Trl6#{k$k=~dlqYi8kKk(a
z*an;O{XFuoH|Jq$0s04X=;C>~%4jaT0q!RkNyXQ2{u=e{mOL!LzDjJ#Up=H)1{F7^
zBiGsQvHa=G-1GAOYI#e|eR>j%KlklI<!@)^?w(xu>NoRWE}H)O()^E$Y`^+u{=mVb
zMD*a4<M4lM%6!JYH?t4)FCX!y72xJsRQQ!y%2;dej@jinADweF{*M&T$Nx8v#X`s~
zN+5S1r##-q&Y!2e+RBFJDO=ju<MWhTTiLB8%Fo-_0}GTtwX$axDA%>I*B2=FwzAiq
ze&DYuYkA^yc5OM)JX)@dp27ZBuH62uNq?+Re)6qJ1C`1jzA<Sl{LMEe{l4-DTpg+^
z-1?3DE!ER6{YL)1)wZ3j`4=sG+wyJ;d-Mcl)W`OmFl(oeZ8%B!u!Y@llJb<Fy?T-|
z?4LAnvhv|+leV6$Jk~ntcPA^LAFeqT>@Hpe$**5@q2+sR?1?&R;?6o{?-}f`b;|2!
zvW@l1OJ}l?`eV@0n>@;UXXNf~I0iKLG%B~9k^4-ea{n2*ukrJTo0OY<?8PSK`#$y#
zKfiJbY5T<`)TE)MN1%~EYbMOo%aolh?1C2M#uj!-3u?c&MR}@)J>D{#u|LuGPnP4~
z<zD4wFT2C5pf!(smEU{W2tR-J8_M5S<c_RRo><9VUO~JETb1ii$-S{vd46T?$F0iX
zs@#X$luJ*^J#Urr<Y{cvDr)O1rzs!#C%u0fsr1Uzm7krS`^f3ksTY4s`Qf)F-SRC`
z_-Efzey!%-q7wcgRoSKH{z6q?5LdP<_qFHV)lTPcv@2KpayR;jX3$41e$A)gC*3aY
zP(JTq*K{Z!cChC=s2^VQD;oooKJb(7ZA@hCcDPhz@3{ZuAZx`skOS*8`26Sij|Tpu
zf&W1o7!a~Lz9Zlc3AGjs7a%0!h;9{_BEG>0<Nb5-UHiXN(hYnjFb=+{7W+l6(I^BQ
zm2@K-{#uEb-;3mTr>rlt^!~p&o)NiQHf4>(cgy;<>+L4^K@&d1CiopD`0V<$dd=W~
zl+P>m5S37)cgXKrD<>T(u4vz9WqmKp`d**)y(R1WH?zL4mfx|8&5u6${RsKJl{P2f
zm?!`?gozWRar_v(TS$f_aefR<5cYJq{9Ys>mbm#bEUzOM<Ogo0@B@#sMCfFXD9#X9
z`QeoXu{DJsLr7uhm>>Y_lJe1dKOMB5PKPAJ25`YpvSr0c!RWv(M1Ej<B|i?6u%=O*
zX!wy6#CJ_j?O6<Luj7YSAAYBp%Rh9yz$fK*O%A-%gkRr|<vBX=Tb=wnmaPAC{14W^
zs)?FqWj4auF22`D=#{Wt!l;A;5)Mi@B;l}xJ0u*Da8$xQ5;7m-N0EdM37rzwNa&TY
zUBak@0}>8OI3(e)ggYc0k#JPPJrc4GS-*r137rzwNa&TYUBak@0}>8OI3(e)ggYc0
zk#JPPJrXj%tY1QhgiZ--B=kzyE@4!{0SN~s9FlNY!W|NhNH{9t9trtF1Q4MpAmngJ
z=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&p#t`z3Tp=#;QVLa&7F5=JE)kZ@4K
zAqj^i+#%tJgrgGfk&t!D`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&t!C
z`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&tg5fe1z2LJo(7P6=xy^h($+
zVN}8a2?r${l5kkU9TJX6I4a>D30X+CUqXk3P6=xy^h($+VN}8a2?r${l5kkU9TJX6
zI4a>D2^kh?=qQrVA)!;k8VS7;wo4e5a6rOA35O&cmT-rJBNC2ExJN>^TGlV2Lqey7
zH4=IyY?m-9;edpL5(;kV|D%Um(_akEECx6u)IT1`q(pwekAbQC=5NGHkIQ*b@O5k!
zSiAlr&L_(2MO(y$ysYa-Yu5+Fh3xVR<n<jF7?<ys?Q)!FeEoP?e#nHsC}sZNV#(B-
zmTznGmaSR&zB#7@yRx{hZlR-iWe0xsqR-)Sm6Vk@=ezp&SJ!{J-6hVl68BtrVPBm8
z4u#v<2gz(2YL%50GRInRMRp&XCGq!U;Tt5rYMr337l4IL@Ha{Pxsslq_oU;e_)xu9
z_4BXn7y(!(t`mOne1UjI;wil+eESB0@Jakz5<epGSjgrFbqCS2t>XJ!iC-Y`dnEl1
ziN9Imhc^m@oK~{m+XSB{q?~U^`Z0+gy<Q;bIYc_>))>|6q=w_*&l3WkCGk6k2+J6M
z&JXZzi67m>(d<P5aK~Hlar~D+%#-+%9J*_O$%q`Un)_V^Jk>jTp&+E^oanem;%_`#
zeAlMKO0MKDJN#{Nnaz=S_c;P_wZt!xcy0W?A^Aro{#i-C5&c2O4HDlh<^PpHvqNEr
zykEX65VQxFj<Y3x>3Z=U`&Ibymc(o82FFSK81NQ$ID6$Vp?Fy>o0Ko~qxrK@Nk1KU
zBm18XJjvtDT95d;#1CZQt0g{r{bD)rB+sradCuVYLzy<;l;W`SB_B4RVFbL4(@$aA
z{8hVZCjY&h{$O_IpEM0<C5+qq1gD?DMx>ljBR?E-NdBoz^Jm(85?d3vexiRAbo6Wn
z9rSw}ydUK{fi`d7DDj$~c1W>qk@!8i0>Sr@6Q2U&Gnr}gcg`ID4hEj|xjRcvY<1@N
z(JVYYgGKU;GztE+FO-hEIQ<N^GmHKwl3rUsqCN9;{8r+zTZMnWjQ_`;kcmP++vU1J
zvBWP1p6YG?fj~%i$<CJentZ|M2}yrF7A$D|4rR&n2=K&b$HRi~LP>uy6iaw*UF>Fw
zzgOZJ--wQN2r0ihS;|B2Ov6EY1L?R%;+@0ddrScKB=FSl+Io=I@2^PuXjZ?|^Zz6d
zqaP2zLHil$Xp?yFBjWor0<g=0x8P~}nx6~$LnQtftbcI4T+h<#t>bvC;>h(9NzUk<
z*i`R`^bcNn;U0+}{HZ`lx6bkoCjN)v`7hxgG(HCbPkh?X67tZVQaWlSe)K-^9TM;(
zF7aM)qvHkspWy~1@t?uAW%c939ACt=^}eGd{cDm>c0VR1pJ*1Jxlj~u@9zYI$0eT*
z;Hlo>Ec){${qQG(@W+z=`<(u8wkM1J50c(_uOOu7jp?Yuf)v&3l=i<)pxLPs?~ntR
z_IT5Ap~P$Jh+xZ)+kq!JFUqR-DM?>r6&2DmGIZo4aP#(Njo%81FOuuM%LO^}NxVZ2
zEZLoG7w}Z?U{<{!N_yuqQ7=7{K*zy{iF%o2pb9iQR^lCx3WVmbwgFH4v-9zXCH>%!
z1idB?{ay^U%ln$Z)3bYYL?oWwFTNiu@efLT_Bee-<8!T`*Yy06#1|C^2AZFliWGsy
zwYFXkGJebjp5$ql`uv*2w{ZF*HuN*Wr&QuEFrmL&;<fd4q$~V5>TucKEI;r~iH~j<
zbo6{H9f6sGUR#GhNub$3fhRp{@l>Lj?Fd1?<2r%izaN3q`y@X4slZ<)3t(@7uq&DP
zXmXx-q`(hP6N1n)9CXAaUb{a4PsfiBHN31>x(PN73M74M^A4YESCz!K(<>Hn@E_g)
zd;xfB*Y+&?{6M46iX*cflK<1hg)w@bl#V`!=-)g!@2KKf><R#$<QbeO>MfP+dVQ9_
zYxhYI*7@;Ii7$Fje5Yp}>6m$xppUi)K5xo++M?m59nKNtY&Gy?Z+oRYPKm!v(mQ3`
z*X;A`qh-4!pV^XrNa9BdMZNSa5*-c@P`gH@e)#V)0N(&S>2n}UpK(b)&Tngcq&;BR
z@?$z0LVVU`@u}nZBBtGU`L?9rF6nn<(T{NaR5qHGC%g$f$)D|?^S&<dHQyJEkG2YU
z1n?9ewfjH_$NX5v+dGwM^G$7BJ0&0Oz8DJgqe0>aUl-r$*<Lyh<9H-x?fh%%-&K;{
z5fKbvmi*Wz@kImT`z8U{(-J?hULdr%{HDYYN&0goz3mv$-e{fRf1m8%=@MV_kRYUI
z{pr{!@!9d{HVvN?bcaZKI}$U}v-2H+AC>xCAn`S_UG!`R9g8G>NZMhWK(o^%p8Z%L
z==pIv{wnd>eaSmyy;J9kdUu>I5cDh!9reI7b`YC}Et&e`RN$$99WwvX^nA9&SIK-S
zPZlaXmgr$ua(rq1zCq%PWuBvryIX)KJq%>o`41(&eT9&Ro*|;6;5cFbTeIx>7>RF}
zc-dX-REh7-qR&BKB00775lznXK~M73OcnL|WWU@i@!5Irz&t6xj8mFDUjsbpO}lTZ
z>FqH|KO*DcgQ8jNV~HP<dT>bmr1^q>yIsgb&)U+_3w)6{&G^3oc%*9Mj)zU~IVB8h
zC`D{k#?vxc@+9Dm__qRo6xpp5yiw8*n$X_}ypcR_nb3c1LO*eVu{<+P@biH;s&|PA
zUNymAWrBYQcq9Eh13c+xcvvV#hIw|7(^#H5;Enj4Zh~KJ!sk0C^p~38Z#Tg|YJz{-
z1pfyU{JSRj&w;-cEv|W181i${e#%|O`neGJB1xkMHv)f<9wE;kGvV{534Mv%xZX=m
z@b8%5?MOt8?Cmh%jpUzeLf;C!QM(c*^iP`5|K5cDQ{au{IS2#a2tNmSqjpsRZzN~P
zgwKEp{Y}90@$!J^KpCFdyTBXOn^$g(|GEkO8^9aMIRre_JMf^W_Y`SAubc3(Rv7E&
z5a2nV#{`2q$)^T*Bl)i|!9Q%m|K}$7*MO&bN2UBS-Dh(vjpeK|!8ZeMBxlHkzR!gI
z*CzNsneh401aH9v!APD{P4MTL;I{&AWY2e);2-9Eu&#BX=)gl|KR#>1e@3-2|6_nR
zl7BJq+)jQd7(XKUUt>c5kO}^I;Eme#I}`eu3ytYrz#GZA1bCzI(qqEsToe37Cj6f=
zp&v29{~mZu&Bh&{nb1!@-dO%}6a1;bQ#@*yd8`(XE(hL7KQ9Asr2jWe=ubR>F~>OR
z89NntBmVus8}Yxy1i#G$zr%$8izf7cF~NUqf}eUK<&9IBH!II?0p3XdZ<^ryP4JhP
z;BPg-KLosy-Tu~uemC$&`akF-WBk#;8}V;5p-%vBq=!u=^gl4cKWW0}1rz-1z#G}y
z$0qzIoNV0Qc_#QK;El#(0C=PJMu9id+Z87Cw*YU{FFQ=|@0jpmHOBHBY=S?=1Yc=_
zUuuFs6L=%}*O=h%Ho-q<g5M3ik)A&T-bl~0YK`r|3A_=X<4y3(fH!Ja+ysBE3I0(N
z{!g3GI~Ez&dlK+Q@_S9_?=_)+!UX?I6Z{({{693IpM-?aNY1$?_!<*@Gw??GzsUsu
zIPgaH@LS-?Z`Vk_t;sf_-dN7Bncz!+H<D+W3H}-rK0h#_|D_530~0=<ncxrd7|U61
zg1-QGO(#ruYz3b3w;cytI99tabAN*||Brw-;xl=%F?|W}M*X$kg#IQI{I7sFs`n2j
z_zz6@6k=h=h|dwg8})mY34Sy1q_^l@LYaRS>SK4B&_8B^|BVUW)g<_jJSO;S_rI<J
z-bl{JfH!K_b0+v#P52~%H|np_CC2={CioEWMsi+kLVpABq|fa2w_PUmyG{64Ej6z9
z5)=Guz#Fx9O0#i$=K@dbitTb;5zAuyI1_l1pTG0~m<p@Fhn5-hSr5FCey%maKV*V`
z(FBjjzYW_px5c>Le&CJlXV3(HhY6p*n$Ul0LjSep#`RX3;Ol`m(%W}U@b{VU`MC-H
zMc|F>fvKmvob<AK@$z_YrF;=!t5YrU<4tySy*zy#uVV`*42i1S8oetMctN?^7mnhk
z*UflQeTuLIZ_-SZIKj+akwqu!Z3+j1>kP<IH{Pa&S7UYs%>`^}O;o9P>pZ>j5pVNq
z#M`zL^ioolzdNuAZ>?%+RnbbifY(|Ff=#W7vZyZ<Yg+2X+sEj2W#T1Ot%(ZpQci={
zV^tFwf9EN^2CKDJ^>{qZ9ZvlBcwANLGSsiVIhfbs@wR%@l2AP6<L!owl`4O!8D4FL
zw=d%L!Q%bGO@4X>8k!hW@xr!HBHrYu3j7|Auhp}>wM@?=^`0}ATmN25x0AnaS9>v=
z8pFG}V*$?^UF}O(q{>!6n}MESWgzGWWoIT?Ya-pwYP?#p8!tu+1T$z3ydijx+cNOM
zYYt<5{zQ{^DH^w?O>M1GTN7?4-lvC`v<B1=UQ0%=%~TV<E}i7ij_5hwrA^r#Ci}6~
z1zumc^Qx%xs23Nho($u0s;ziGnm^%+Cp<7Jyh;x*)lEd2z#iYrNFn^ZrL|tAPLBjs
zyza9rsL~a@Ne|rWRku^E5AxT7ii-h$RlDSk;g#u7JXfxKvmIwwsWV(@31na5<03O7
zD}#>WLCE5&q?d?x#ULNotS_8s>S*o&-3q*^Ev-K&5AWUG=h(^6FR5EvI!0t_oWLE7
ze`8$QRD?Dwzo<5yI&I>-rM22jo7Lvp<d=-K33mTJZMt>Zgt2Hw)&cwJz|b#6r{E2q
zcn`Sh>sv=XQ>re)dx$l2?nw8ALFqD-?hN(No12Nn_~N~p<qy<{)VZ7J#eE)(xfM&v
zFu1YMi_3*qN_z$2KCAhwJn1Eco|HF6*>oFDC&LRRmPdo(6~QhH0K8}0<Ep@m(CN*v
zYMp%bHavZc+n1=Ui@-|PCEQhrFtvCgdyg37jgiReIDc1cr_WF3T&2CkvZ<xikN5vM
zg~hhP=XyNp+W+l5wHVjzDnl295}tK<Wi7o~Gu3)nJ{q&n>nSD}`8<;H!M`{d7LCnj
zwSOv&Vftl{+KVkqh4WU!LA+Zxkbcu7BATlr8`&J9@YJzlX`8xYDU1)nG+jcCCStDA
z)T=7#t6tIBhT(Q2x~1RyjdyZJ`|y_IpeNHIuSgFNZwpkXa}nfk%VCPT-u%B_iB#m`
z=H-iOoACznz7DmCyf%!s1Q8S++nQ)b?huMl=Tkpd%C}Q9e++Lc@9XR=@iV+ZRlf9C
zP4ue%o(L3;7u*ISYFAIB10Fbluob7_kGJj8i(b(+G`x%`FxtZ*)fbET`c=I5IM&ZP
zWAs+@KwocfKZ@wSaf8etRn>+SwaYy4nDuxKdi|*-h~Idnao4(aY7}pIrFX!F68-8L
zO0>q?j`>Sb>s;g@s1S_gD}7yPz0<P>^_oQ~A(G~DGfj=`31Qg-SHBGLlqX-(jW#Xe
zS3GO<FO!Aq3`b(U=q}AQH+d;-Aoo}wUEYRaPJM$a$Lr2jjd88^MEq1*zPvl`S>vhT
zC>X=(i%<D7rN(uVtJ>g&;q`7PN$c&?+c4IvQ{$F2l=;qAsnV9E*`uq<2L~LFM`B2a
zorsXySMh$`P%vJF1WolN;0Qdty?!6w+|B!^?Ub~nhVPv{Fng<aajs--`rJ;-HoQx<
zSqP<yt$05#y1F7Wxu{l~0vJCqcZweAM2*O@8}2HV@AVdW8uE=QvwWk<Jm27#jPnh!
zGt4(!$lk;Uei*}#Qa7p`5ugQWwaDDk=7=_?=`%ar<&N|v)JUhuD8;bI&|Nynz@){<
zfO=EYA~SW3%WF%`G*fD>8GgxFGhk<^8Om!@n$gT5)o%tSCTVnoEOQRza*_uR!2zBf
zY=R$CQxP*WZqYavcAe=>JS%&skO;3Hk5gT<gre0Fble*F25&<fvifKn^i>~4Ky|~+
zR`#rDX?3fv($vHY?x6<`Ni9<wR<?Lrmr&-Aw#ICM1EDxxzm6fM%eBA{e;LJOL$a#j
zK0$>u(DBcB!L+_+O&9x!0B$V;q&h(ss?uIh?{<>zvfp!$Vp(y%Ls+NgwA1n#H-yi_
zpiYl$(LNORRrhdp>h<MlQd?`fC9X_O%k*M-Lqn^lO>L`P)a=RZLRYEwf^xMJYZM-y
zjU8wz@5!)=GI~Wg9QgsSDtJUzC{n_d!85h~w3gasO?4TSWSfAioJU!teQJ*nFO5$K
zH*RV&E-W(ayM)0=uNxQbTxI0HJ9{E)#oN=fdc*9+>G~01P)~ML)pf7Yo80x>_CKB~
zV@S{=F^ZlUf!kFP3S$U`0?6yS`g#$;G8FEr>I(I!aX+S0pzwdYFBA)MeHbPs6*M4I
z>4+MQA|r9R^@t4SsqyFU3Hrh^mk)U0#17nuG8mU4Z$JvE^Pd@)HEoq4+od(GaGWL%
z>6r);sSJAk=*sRSWDG8`;v(K}k6uaVIV6XF&6jCTmIRun43jaLyQGZXsL#@iYt*vz
zoG5K$TBO-$7Mlv{6on2fm(8>~cOcU13x!j|2O?!=4_V8g|1YhoRrBcsEysC!S1j#i
zDOWAlgtc|u%#PgWVi@%)%_<CMCVUy@3-xEI%IC{+ZlVU_5iBIq(ru<F2WV5MpJq)5
zYEKyhO&ihX`Gm{O$ERH5UR3U1x6U<g;AAXfR|ViPX?@s(g##^nbCsuZE4ir6vy0T&
z(*|m03N}8U)WvJ<V)h7Fx<;FRkb=hdZz>NGYn@m^#vn@N*BDJ-bmh3#EW@rf&#?I=
z;|v?@Ofzh;dW_Ca%{@3H!?{Nl<{lb746Lmwb5=<<mr}ct%hM9lR&T?a@!E5nn~Enh
zu`%SGo-cm2s}reu4?LSq70qz8WOe^<EU%YgF$KwmoXw!!Uun-_Id>VY#lm~ja$F24
zxBk`^7TVgD)nN$JI$B0<l{HDjsMltdV21_WkRLbY(20HF(6{@7v}#H>7d1aL&Rx;m
zsI;_6V}e^m4SivM0t?J5_)WTcDIU$fyi2_qLiW-W)4Rr_O(!v_NW~_sQu^q|4z1Qi
zL_GLv6)cB)+9zl#C?&0@@0F6?^(HT8hhMF*7=m<v7V#z0pOxwvO$_9k<i2h_XS&0T
zK<4`Y)Mm1$8`<{+G6dg$9M`%zon-I7jN6S{y1hRAS^=_te^8$Vw4L%*dxVUJm-S*s
z`K4yh9t!)_!7$%n5bA7N6;8xsX%A2p4tDuyc+(9Tqg%AAA`zm;ilK{Sz!rtIzW#Vq
zo4N*3nlG-1%ebdcmqVS{PJvZ^+<^%DgMp^jx-=o;x-8-W+EYe@&a<3k5t%EGt4lM(
zc-h)+pUJG<#Yb*x_LwzJOon&nB3g!yGp2E3NM$Y~cl3pN5;Q4U=MP3nXPJq>mtWx+
z*GE3w9goMz5ryI^HeUp>lB##x?r6VE_ObVXI@nba3v&IZRF58#8rF3a-GbhKH|?Wy
z(ixR5L#>8i7c(`{2Zqx}x!WZ(@lCCz8O}Tqioa0ak0r=5T8|U=MD%H&Q`{Zyj754g
zZV&67PC6od$#og5B=Gfsui8O~Ab~a62v+GcmTl5aGKdyrv~8_lwXk}P!yk3{uw2xG
zn^7dpDVak)b5I&BFp<JjnS~q(l=twKjc1mL(dJVrQ`S61wZT#kA6xso3q4MjaBwX|
zLH{7Ns1L3Q`Gcv25pj23*Fjk;t>U)kK6kLCtK&(PT(Qj1T4^P%wu&(cf6RS<B0{%s
zur5jW6L4!B>v{i<bzQ!e=7yehOSAeRSSjY!z8>814D?e9=q~M5JA-r|0s8@BF>IjH
za$em;dSCOejHYV6XEJ|JnF-ewH9Sjw=%Q3knr#w%IgRIrUpn;SzNL#Nm((Fac_*_B
zK`Vya6xdTo*V3*UF_o6uGgO(*WDExLzQSV}Et|&Z4s*hXSe$l%m`p5gS*dy&<=y$l
z`W1ZS=&aIsL0R9gnJMH~elx3_6i4^5>F)AZo19an`rFI8Na#r$7nGhr|H8bC%+>9J
zv%y&8k-d}RCq?<V$Gc|S?J2%3CB2yAZRns)0Jv?O=~7+po?y5u(Vbc{)pb^-*sK(H
zx#j*35)LuLlVYpq)dI&w8$W#=p*1erf`jZBsT};sTACpnHOFMT0g<A2^ta5@j_`lr
zZC|?4gC=Fg6M@aR-8SCo8LG`)rrW8bjZ-y*9XVtOG!;*E(0I#&(Lfk|FZ+GbI5rBE
zmh094)`nmbYa+ORgPFgWQew$6f;+fNVQPL_`-0P<dAqiSAvJxkR++cINqUcE!9+JU
zkG5gd$zI)0RHRjEUm(CY?1~ZxdjGIG+K27D9oU-f@z*SFZdz2gYL!}wAAWVaz>udf
z*l7_(RKpf;*e8vkre^(J2e;ZB@vj!!MfyE%EZlkf>qGv8idBrJM6kD&yl_{RbmjWZ
z&K-T7=KGz+T+`bRc3Lh&b1hLm6v4)57=PAIX-_%#%Cv9Ntu{sC^r6A1Zy=;x)Sn2}
z@;z6zaVaGh0|I?fL8rP)+$H6pqfP5-1JYYM!FRD$Oszw(O$6O=^!{}|wF8^M!U*~Z
zxi!#X?V={tRZ@y(2e4Yxt3#DklP=0(9{%p~KH3}aLU^V81-DPn?nfxWhQsmu9fbnZ
zTYN|_rtOgy)S8I)80iPMBCy#SYBaGH+^%-2w5tKDTG);a8>KCA?lM(PbjNVNRln;*
z?d(m!#?RKvQsQ<(P0KsZ!lr4gfG29P9wim*?I+pPrg&>#2Spdw8|<aYQkKd+v6$LG
z=?nOHw3#5WYNIs+GFNdyT>-Pi)@jg3VnA`jtBXXjX%01YQ##a2J+H0|BNg(m4h9xu
zL`%9>><W==mSJKS>la*Tyz3Y7v7Aq79rohK>3IS298G6=JZrrEvoQUm#r{wicIZ<F
zVOh&Wr3O|idZ9eVmqek$-rh_dx**%iaEP|rbL&eDq|Ciu`a;4;T%|Fg6%nSyK0ZB}
z9NeeqE#QlfNY8jyU^KqcFSnn;fcclus;h#uiUEZLJshDuQnZ#jUc=b^nSe2&cUSXG
z7@~HJ{4z33c>feaxPnD+f6}fsFHJj>wfwk@kLo(Iv<BMJfjTk2K1(cj^CT)`$CkdW
ze2BTQLqH6%)ba<Zc0W<TqxUZ4azst~R3)n>+A*!k>_Yam%-hO0UnBHlykYE}8pC$T
zm0_CM)4T>)F__duJg)k=<74oZs3Jo_JZa0wY-l*37vj3bU!AnzE4A6^i|f1u@zGCU
zv&8?`{TkNP%T;o2(tCIH(ZeRbjz}z_7Y{>PrztmSs;#M3Qx$iLNaIA0#rrY$=oLdQ
zq!+lv#Y1VvXp`|P0v#ea;(yTBYwoFyM_kI&eDR^H?E)P)Tuq`$TuntrsDetVi^Q`~
zA`yTK)1G!n@97cw+m{%$;^vU?I7R%yZUk5fk`;vQI5I1Y=5(OtfjS+~rE9Mf$(jnM
zFP}N!!|dM?6}biRC`0kA8ILaXMN`8>#GE>s8j?%a*$|Dk#wM?3LSnR~2oF4b@LO83
zV6+Eekw0qE+uJ)nk>eQ(H&dO;Ag!jK7ObTpuYvj2>5{CJfqlV8(=wmv@)*cLSww2U
z==~EqT|Z<P;x)~K)D^)v_CwX+y#z=~DXE|?j;p%JO>S7kZAzRno<JehAGk&-rxqU0
zvtoR}kon6BT4y2tB9C)bc>7QF#ljfNjeIjkOfG=^n>-cnmyxT=uq5(Ebt~(2-iYiB
z>-GHU18MHup||zn=>r-Al#YKnrO?i`U~Em$gYqe#4UUv=c@r>lZed8Y)`c=drmFUY
zI`C8@Gy@r6f>2yAkO~^5ZW=ROJ|uTsKJ;-U(xXj+N=xx%XzHOEo*c`^=u$}J8EANQ
z>N`z9bkPcG0)m1XjEQ#NNnxXXv7ovp6ieV~zRXdfE3AzM9#O|VN~0?V|0CmWX7ZJ3
z)n6d9>^06TznmQ-@P@j=bep@3=9$eQq(gk{Q0T%#u_^^1gdSSI!kVQ~1j5aF7?qfA
zNop*mVSYec@ZkhY)gFBtX?D}pMvq=}p|W5MH`M9Lm^Pk8z<?=R!b%B$F3eSi)E=9c
zw5NDIKFC<7-3k{L;O$@B)GXX7%&o0E6c@e2Q6LxoK#Nkf;Q((QayNf(_B2k*rEBA!
zFeTNP8St5@W`M#8%6x?Upe6}WwXv?$I@p((50mvR_xt-|v0wl~>zY8bWNbyKqf6NK
z2{#h-t;T#_g)}RIoj|0;UfI@Am6F+m+k!Nv7m532De1Cy#p?S8%%E`f6kkuDbeY@?
zs<jNJC(?z*v>;8Tvi&hBG}@mUwyrX6?fY58(hm<zlgSyU?@5q@%o0wEMrc(Fo_<0J
zm>z;qf=@bAcQZ1Jsi%k6W=x!9E0@d7!6G)uNxaC}$v)F-LKzm?84G4kMaP*e@5C@|
zA)zU|$aS*f`zr5BExrrCh`9)V1R<qAF`3|#RXL3iDZ73{X0~$V;t*MB6dHjaReSs3
zaIs{FTRf{XctdL1VUg*I<T7>^-;CL{mL%y$ohKZJqa>x0!PSc0$HIUK3B!itPFbaH
z^oUc?2x2~Sz=TSA-$1Yz1HUJzvk_q-`<v`wc0t|+sTIE=m#K>Jir{fcX7@sNeT}J5
z))d#);J9IvbIjh7ouM~8Ue_B5)8r9%ddRg>?$iL|NwqiSv!oeoc9WXet<ecW3q>M=
z((O80uSk*Us7OI(`3Ao~3SCn&DoqN*Ez1Jx++Xr$l)mj^@gUQYJJGS)h#)5}x?HG$
z3|5XJZOUmfP5yEV2B=P2-HKq8)_}Ztjv2wMUke}&{836cmF7@5*wWYA5scB?)!!Y<
z7&F3{Tlm9(SQtu;3vf&=0ItOCq^@u<KnS@Qs98Ma`sBOewBsnk$94-Ia;CQLcfAt#
zs<7ziMTe#5atO;}VHE3ddi@i_EQf>N*ho1XKKW0(*{qV>&A8NM+;JeNe}T1Jomcka
zfmp<JN&)exS~!Sdk9BTox?+&Yk(p(+2cEsiVb13@p59)X6hf=oAl3OoqX%8HYNxc=
zMb9<L=geJYT^Y}vdpgn`M)M%8#c*t?7E>oyVY!*ld&`>8UE=~6@8|=aGgbJaNQMHm
zQl#%H8e}>ryh8XpVM|aA&4a}+UqGw_trE-P+D`s*6*nemV|w)$-MEaW-%y<P#q;O3
zTkvBUCO>(x$b926zht~|8SD%<F4JRy;_*Q2R>|I+|D`lfOb|ffY&H85W}RrFhqdV;
z9G)R^1JCIFacd<={M+>X@UJiKWMs*1k!pxrFe2fg>#nkTo`R6Kr*ui%NQXPy89$Uj
z#x0!`3|daH;N8?#zF;NtBd$U5+;WgYiQX1<BSJ(L9;sloMt3{(dia9N2K**gY7}JT
zax&JWZM+&Q@$lW4O=2cWPCP5i7g1C$qGq{)FR>!!s^rVaouSzN){)6VwC&(tKMg6&
zkW8QTple^?SqnTqz@LHC_FRn%=;M5lw?7r_7Nu{JG|}%Ur8-&{Jm`ULjKeh;1S{h~
zmoA8$l9AifqD?%spOsu5C2QkaY{Xnl%fVqmx{LY?GCxM*WSsHHw6hhr@nz1LIo8wR
z5+<?l`-rrkNvEDV%m9TyP5I9Q8@JiIm$v#PtgMoLAHs*n0u6#dW^=#bh)Vw?(D(%`
zv9xZKXN<p%YT^apMzo0nxX~`R1IZ|Oi&BrsVDS5*VlnwkxDH(?!(xO8W%vy)?gGso
zy}_oAR0QKcA7B>1%;OhbF%Dl}AbRniV+yUqt<?&eI7rvLxDHF6baQhx(y2gy*w>4x
zIBjRja$@i@G(+eKW}Ba|<y4A-R6o;+3$<rtFykD(unjp3Kx()ilwlO$DrO{Zqm6xZ
z-!$5fJq@%3h3?(o+>Tc6G<{^<HBP1DG~KN8^~L$%#b8TWl5wB#_3>~JI~T-{Ya!NW
zMi^IBx>Dq`P$Sl;>6UML(SdIf2nW}yV#S)=2v6@ZrJ!(nDt3Qps|++BZs7aesXxAi
zFA(=Y{FyTHy%?B8AouRYQbHn_*|phgMA|AC=3!!GmF{z+Z)p>k{yEu<o@6EQ{$Avi
zfQguZ-P#%Mpax62!hI$56Hzp^VkIdQ?bi;>4+R(}pu~`sz<$9@K&X^op?z4`&7JW@
z#WhSAd{jW5MSBvggl0pm1g9lk5qT01`dJD6XcQ|!NDhHI67VH_tR&bie;ubg0B+hh
zRO81|7T-C5pN^wjn4GKl;T*w3J10IQ23T^DY3QMRBta4Y!>j3t#~FS%sFy#SEq?6s
zU-*AG%I4xDe>RhClP4N<Wbw|+`d)(X`tmzvc@56aqPJ&#$ImpgkCMrVR{tJ(qQPNF
zua(QLA3r0?3h>d&57|Tk4GvEg-xUD*saUQ2G=XNVfK)^)@0}qoXt3*Rl28KqbF(Sr
z892j>#8dKb$9og#(BNCLfF{3Ij|RIzJEki?gx~$ALxWCPp5!MvHTj8#j<xs@Z;jsw
zeqWys4Ms^hIJEk;@>=`Plh+n;D#mIW#fb)s<&Sbw`RVv%m%kJ^5-|rKeza#ovM>Fi
z$)Vvk;Y45F(JroNa6rnh(QChzs=+I=$`7p;lp56N%*tP%Reo5O*Py09v+}oPmDk=U
zuE9vD>_EvNTb{eJ%4_cf*5D+qVhOYLcfTyJ^*<Yy^+`BQ>u6ctA)y8y#R-+r%4_dy
z)8JcVbh;zE{%3GaU!FZL>zA-yjwg+UW<T2ZUuTtX|C1=M!NDwrW!L`($`|6JiK)Fm
zdgv3uC#!hMel<D%g0ho!<+b-!56kiz9Z2}0eb?Z}z*0?G{n~qyx6ATFLJUzD|Fv=&
zA_v!L>}ch+_x7&(Ocbn<9Yf=v4z0Y_uB(C5*T1z;6xcc@%6ALE%j(N(d?w?%zI^)q
z!-CI<R<Y!umD3=-MpR#3dq3=`BFgWji@L+1JEzklah-T+?bqIWx+_<dcMy^8(CXDt
zN6Y%18bU&?{QNAuCXOD8nlcIV@&tC*F#>-iz;=AJ`n6$|51b~upcMZ{7l`tI$L|{J
Y53ODeNAA_A{EOca<>zG;$ok6u59JP$EdT%j

literal 0
HcmV?d00001

diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so
new file mode 100644
index 0000000000000000000000000000000000000000..dbfd3478e7e06650efbe50b2bd6de36f52cd3986
GIT binary patch
literal 569736
zcmeF4349b)w*M<E8jvNZNL<hcL4yi*XCqP31QO`T){vlafliY&B$A|?qys@ggMbDR
z4UP-W$QZ_Pj0+l6G~<W{5seEP+y)(y;6_Jf42%ovDF1WrJ$3u^O%>qs-sipdpVp@P
z_pMXs?n~Xe)ww2n!q_e`F<lLMbv4d6VoU*_8T8$Kf2^VzMw-zJjtnv!!nT__()mkW
zPwCD4G~%QU((d>*jzZHuk3T_rH9z@z;)qk6Z5Z0;t>nl2<mVj$!Bt;~|H9{`(VxQ{
z0vl;A`Dvcl1teN#Tqf-^=4hX2fXC_6%q5H?zT1fFpAI;NFNgToF!b{bvqZGdA!KiU
zVi@>365jaTE`4Z&FUo!uUhXHMB0NuzpQ#;q61*JZ-$*<?bY9#YvK;i!$B&UGn4e}m
zr@(nK2g%_fKFs(|rSR<1SKD7YGC!$Si9A6c7|U^!rWsHi%@`LmTk-p`r15yq^H1HE
z`utIEcG)!K_jlZV)>}E}eY6P<8%Kh}uJCri_oLt)|5;pIOgG1{xW#c@3LfurJT!=I
z&pG~_8Mip*=%t>FK1R2tgO@ChS=GIpQ4n)h7h`bGSg=1f)-m)$&*qpr?;3oR5xX*W
zNZm;_N5pvg+|b-@X+}(!Zds1FE~D!*3hEmM_cSt%yY?S(eN9hCW_8k~#!}<xBix2@
zQgVJmhBJ0c!?j@EBj(7Sj<lGpmLoRZS`*vjT4&~IJ)AR*(V6l2PNV<Qm=g-R_dDj;
zgo}-%3Szq%&Wwz1U6%Cg(({BRwFNz)1}r`*)9BXqv14P5W8+5G4z3xr^xBMWy^Oe;
zK@W6!V{k^7x|qbdHAa`kHOWU9r^F}J^@z`Uv|sZqqoJmzCuD)hkM58s_&NeUd(d}m
z@<+kDH|f6giFW<y`_b?T0UQ0{eH^iP_&gEbCzD3|Q{cM;-h<#h7~VtReHy&Y*Xi^<
z0X~Pp`wV!WN$f2891fpn!`lgO7qMjeOrcNomrCE$;qx5E&xP+7z<V^j$HIFYyv<h*
zeZP=C$HV7D`aYRHr_d*!lMCPT;5`lA7sLA!cwY+d%iuj7-d=d)%Lku@@Gc^aY!-Z<
zO}Yd==fJxR-gDty3GWcRtKq!>-uSu_KChzh3*qw``d&ky*TUx#`Yz8fjHRTP!RK}G
zz8>DoiQNF7H^O@*X*}m<_+H2KD)|08c;621JK((<-uU`GeBK4`HSqo;vAgNB9zNI7
z_j~B`&-97s-V5LFgLeb52jFuZydNRm2%nF^`*G46=o8Ovg6~hkdn3G?;k^mo_<9aL
zH`Dj$;qwLhE|1E8TY$X;@0a2ID!jMC8(*)(XDht7!TT*@@4)BZ>HBv2d>20dLErJb
z9rS%CeD0#}AHwHH@ctCuyNRJ~7{0fY{tQ09V8`${zV>MBOVE4ay-(v`X?j2C@8JDC
zynlfAzv2BOybr)zylBko3ZLERJM!-I{RsFxlD_wV&z|t^MY=b99)14K*SCJ`J7#Xl
zmf6AX4fj01{`RIhG2hIeQ(v09Hs`$L(aHZRjG6rE8}n}X{Da5##BF|gtZT%Xmk%6W
zdBmyTWo2Yt@Z>4)uJ$`ij^CR(^xY5E?o6AW-S*kO@?(}YH7s9#!Q|(|@9bK&=E|c=
zPaM_!b=H^@JeOYcd)HN`j;_9aXODlx4;gvmslJKLH}t&p?^Unw^u`X~vS!1QX?bf-
zxu?hGn9UC@Jn9c$7u}ir_;ru<Xt-z6;K2u`efs>UJ&qBN{r&#NlKNYG>-O!9+x*=%
z>ubvj2993zmpcX|?EWMp|I#-<N||1D+YheG{+Yh?hJxggvBuMVOHXWi<d#L(JpXmp
zX)g`=(^Gdoy6w!jzj^kw4<|qW#Chk`=eL#J^@lIo-<o{OPt{M)&bt5g>cPWT4EXG)
zmv+Tpo^bN}XKssIzOHR$;vZJTUO)fCfn(0zU48Z6J{>#lKuYeHkNm55!#y!w2Hkhu
zl*Fl1CUu;X_x0&(HZA&c-P<P>optWg^|cRN^2Nhb?wGjwp`wAK)7$%;9Zb3GoULCU
zbwhaQ^$VXoI$`#`4?nZv(W!qM@!%=%eEjU`uMWQMyk5@?d-c7_(7Cb350Bll=!~nD
zJp99Vcb0!O_{y_p-S}+#3E$t}_mqkEzxl-tf1G|_)hnMKn3Hw?vD>G&&bZ+7W0R{N
z?zt*8;mtwrkI&wE;%7h2xpZ910>`lrFYJHntoOffdUb2B>*s&_{ry|t`SZY2%2s4H
zPYYdhan5m`i?X{fJm#v}^|gQNwdI=R3(uG_C$QjywFhQx{pPrY+0)*6ZtU(ypV{!m
zsqcLJ!;ROz{`NI(^)F@gdgh(n{Ego=4j5E0<^DJSQul28rtFt*UDxs8zA1^1{PB!4
zW~Ak=c>m0+pa1Hn-j2_HikT5O^~>!sb$=al?8C2YK66n2+b%yY<MNAN{_?%9&rVOu
zZ2ar<@nagFxunao9t)4@K78!%OZ}nc>+Wtix%9<PpBQ}0D;Galx!{AJ=2VvWjyb>X
znuls{Kkdn9Hr#a4hOMVAyyccrtFC?h>({={==N6E`1qUWZd&_MTJ1Nd_nffe(VfH3
z*)U+x)m_^@e{=o^gTr?X+HuB%o4S5Guy@UuUp=^Q(6UK)jXdJ9NrR@OO?oaj@q!=P
z(ms5-<d%(r>b5_XUG>D^yRRPbO~uf=?|!oN-+y0o$Ckd|x5SUB>UwVRvoi*Nv-|yz
z?wPgZ!CB28&z>;nqtm@#+|zLKj+@qQUbA)0IXyNnoImB6rN=J3?b_EjopRYbsc(Lr
zF>&FBbI%B#_tdWCMH`;Ja^V;0XZ4)$?2&DspZ(o~`;t%3d#+$d%enm_eAn1Z-YG1-
z^{1}+i5IM2*!HJc#g!-in0{OB4}V?r^80_-()T{k8<{g=;_sMy!A<GEJGPtmh!@j`
z4S!|ix50L3mp8dyD1T-7ssBhiuxI3%GwORRz2m!4f!Dt4dS2EA@9w#I^`EYKD){%a
zcfL7n@imYA`@-kLR}WtBnXAX_o9144$vf>kPTIX|RQ2s|{^R`-^?lj~+*ew8;+>DC
zjeUGquXDFu5cANODS2xu@A~t4+2M5l_k<kiM>=C;)&Oh1JKmZvyU?2NKh>JwJH?ux
zeY!QDHO`u^pJB~s*~D}AKx_N?)2w-FqBZ{r^2{pVhNM{Y@qMlN72~aWZ-+JS2@~pm
z7UgvUj44*}xhv0__rBbk|6#JV|5Y#<xAI@t!<x6+l<!>-d8_!JVdMXG8(v^jUdKTG
zTls%%l65?%Lp-hQ@43X<eluKDS=nD<Q?K5zsV7gug``z}9++s&&ppAKPqN9MVdJgi
zS=P(i&jmqio(thxrMKo%Yx|^tHLr&YL#y<@In0{hpKZ;%gA=Rx4Dnd=t8MD%P@F$m
zw4+II#bw2NjkM-ZKz>@qXS7Xycr?@6PrXgL_9a>Sf6yl1#^hN0cbsI+H{0a%yEggo
zb{}g$Z8rV?%V$~JzhYBQ`oOifRsQdW_*><3o!8p_`!lThl{Wq11Ls@Y-)A$9Y_Vyl
zi)`9y!v)s<lc7JiN^d4Cm{{@erdacDZ2FVI>DKnibFF!nO+R^@O?z2tQ@)FB#-lkl
z{wtvUTBY~oTx&jTv^B4{>6d!i*!QrRZ}f$6&@w*I|6B3<Y{vahj<>dd!6seVHuZDR
zY1V$SPPFFzp#Qgu=Ng;#GTo-1e9&gz^`6c6cmdQCt9I(L>6doe_<zMF-#&r)<Y0?_
z`(+#dTWrexbenPT1e<(#)n>jic$jrQZ-f5dDj&YIspmJ_^s8HXTl<G*XE-eKp^Hs@
z9&A&uuD9t=CPL(`;{Vr+t>ZK1WNY3r)tYy)=_gm)lnbuMS=Hy$Z1U{_n|$`$q-&?m
zxZH4=b-3g5t@-md^Xd|te&YihKQnFYTWr#Gp3OM^n9cmK#^(Ai&Zb@kZ0gmIW32Px
zK$<ll3;wOzZ-Y&}S^~ya_3%WS`Nkco*5O`cGcJ5<<7b48J+2>G)x#XPv1G-6vhlOj
zCZ0#w^glP)^kc`_jK2^~!vSyob+Ju9_O?xV?XsCieUN3H-oZBQdX`N;dCCxLKe<6`
z{yUp-^Pe{TfYZj$9GiS7vgrqw+wjE>>v&GKsm}qM{(ON=KQqmyUB7HI?q6Y3e=f4g
z|A{vB|1q0+X-pUE^rk=~wW|Nmz&ydK{XPl#Z^a+7X%{s%<+a8p|2@zEtm1QwO+9%K
z>c3Td&W7@`;`iIMyVVKS;jXjk#}?#R+dpnoPxji_A89lG{@bP;Pq3L!UNhA?KHu3~
z&z^5H&m3zre{QnzbBRqmeaj|Y-E7MDzV6oXKNBj0RXGl@@qf}3Yd<+Q{cw3-Yx@H*
zj$4Jh-DX^Pc%-$TZ*1cEh)uZTZ0hH$rPlt>I?|f&vT2XqY{va>dRhA!2$^8j9(&ld
zuNgM|{9kP9|8+LkBLSQK^UF!r@p;0gUZvUa56`sr6K@ml=Qic?M25AWr+QlRVi;$w
z>eZ1p_39&Nk5=~Yr&#+>u&IYHz&voU(Z>kRgN_AWkWBHKkqmnWi~&Z%Y^d<?f^Ja0
z;0xNvRZIJQgx6mrx#xA<8DbcJh5W_WD^Q=&Ps`=f56(yNwI048FP<LFt6_dC{NoL1
zc)db?HbQ?p2zKGLO@$5tUh(2z!>E9M9_=0LWPDzR@eE&=k0sti#%C1yndXta?S9Fn
zH^bP{U-HI8>A>VYARzSLm@2us$Kn#mXXG8LWw=Ahe;DRJ$XkPwLo*bwMR5I#ydYEh
z=}Gn@G9-7rDed2-fD4b4ygo_Vn|{7MR&wKP$)}M2OrPYf`(?TyTg2-)N>>5dn|_X=
z{Eyf2f9PPD&&}gyL?Afv3W3D<x8cSGc&!ot!d{6W*0(-}^CW4XPCR~uv~OH0)Aa%I
za{`jr43hScE#lRM>QBwq=yLh;9BFSnE&Yt6e5-+R3gc7bmiA5L=Us|_O|JAmfVk%r
zS>LJ)qU+n@L6X<ch~{HqeG>h5_+&o!l))HVr%PTuMMh*8#pjkQB=_`^;hObr2DCf$
z6W1W)f46jJoCoz2>;Hb*)dkf|yc*_6`?}uJ4^=zkB(iUvCFAKN`<sSH`$np7@+`x+
zoAR@s^7DR*=P9smiSem_RmP`>{4k2)I#;Ck1DQ~>p6tF@mapec$w7)&06Kq+e~{X>
z8ULj-q@PdoqVwk}oAxyf`WXzj@pm#JFbx*3G-_Y1cSw%=o$&P!81F<saH~w|a*2)I
za3cCKX2}fxg!q#aBzI1i@yGok_}UDeC;BN!mwsRxBVKPn1x7ywkIHZzWPjngl6y8v
zem(J%T$0x>m)vX@Zw;4RumAV^C3k!;<8wWQ8)MThJ|*6Gx{T-LRDb$XKk2{$4qhKq
ze0Dozx(sT+XN#k7?MU&-xK_sVPU3@L{SeDB0~a*l1>J&p-9B5!v-JlV&-L;&WAH?Z
zzgO}SiqBHYHzyg6AwOwU|J&}74$hXR8ONO}<Jp`i(`DA5E1}|H{s(W734&@YUbXNq
z^12$y&3-8XPDHNv&o@)Oa^5DxJ(cq3a_X1rmPvjZ`9FJ<^i#iCa<g2n0=X7)rTzjL
z&zH!48pWrE;v<j3-l9P=KO3n(G1E0x<a3<ixJ6omnRu<VX{VbZVrz|)jJhSVTw=-o
z=@QC^12Q8Y>m?7AXGv~+B;!xpfDJG8S8;99ei->VJzuu#xVI#KkNAZ&E@WuqLT}2?
z`a0>qm;5lknk>t)V~I?cSuXDsP`S`}Wah&@@}sxw`-`Q$Uf=#=ldiL1{w(GXKAAst
z6n{76XC3u>Fx?Wb(n9G!?mL;@O5&fxr>JjBWCej9M7+|6NM1+%Y7gSCl}p~{km>ak
zUk9I}9%}XTPeUc|AV21KmkJR>`)1sL2d@HZM_r&}N3QoLTZ<&u`;Ak<A=(?Gq=P2m
z4ffexCV9aQnO?Kpo1uWvzHNlGH}j_kqzLyq8J`=;&pdO!w?GznJ@FpUu?YL?CBK*W
zCt_TH_HwSwA2?mS5>J)&q+pulX8oU$O7YS9hkJvveg<h?v5Ne-kCA@rM@c_Wtm5?z
zBoN~hw@}7s1@Zn4$qVj~={5WR_0(V0P=5v4D_*0h|8Y?N<0gIy^?P+bnV)Skn~c|B
z-j3n6UMBsYLH7M9o;BymcqS3=L*r4~rP9w>d73fxJn2X8f9^U{^0pe8E*e&i<7qyV
zpv`AuGo*do8`8cH#q(|$M{u6j(O1UDOji#W&_%v&lks_z;<*Si1bH2`7qdTn2NHr@
z?*~@X_}f9_Z#Jdt5y}S#^`FJ$|5h50+Gsp_koebSvV4QL$#|Om$%8W~o?81&p?vUY
z`7n^$my_C;^kx`;uxY<bX};$vlM&xf@kz>(;kJDs^Yb0z7Y>zNZ`TVcA2Oy(KRd|2
zC*@ll^`9y~M^Juh?dT#nUF5UY9^1~ByyhhtpOfW>@h<hN9-1#*MdA7=T^Te`A0>;~
z_*l$;1{iMby6nT#rT=ES9-K_|r*yFFZ`){GOC$f+LHCRC)Z5FIG=HmEBO^%D7h@Fp
zY0<{V<u>i;MCz}C)L+4{C|<49Kg11`C0j|nD-6syud7)uxoMx)U&b>?*Fk1~eg=(?
zZ98SgnEm9bmyrE-$s5W42I#nif0|!Gwu{&BB5AL;mod=LME(qve$49voIWA1yFq5~
zeDae`<?DQ2=9{_KyEoKZ47ZKi#SLV?yRR&l4y_!^ZQ9q><iB-`^lx5I?WB2H;{%eH
z%j#=f3mJ&>kXmgX(#@uQy+r+g+c`4-&2fL=Op50fP+{QZ5dXqneQGZSG_if3csh*>
zjenByNuYXILG8U>YwztDGCz%XrM-C_eS#>j0Y)9ww>a{DH;rGOT4@j6j(Gi!t~VQ-
zBscTt{6y(rZ@<qCkh~^EW|%o{J_hl`@{PM)a?{V|$+8?9X}$!_Q@rMQW&U_*9&YB_
za8aKJ7#>=OG2=gv+I#b@GM;|&vnf`F+qhO{*h1pJzg%*I>bWc(!?=_BAALUj9z+}C
z@1zSIGhKb=%5kKQ+TBw!dBy~4FLm$AdOm>odo-RJcgpxQLj3Uc6xGkVB{KfUQu&@R
zLxy|HR0+)M>%Sf)^QT^`=id*N`I({RCob=Z{9hscbSMAc=Sv>c+I2gXV;o&@*Na%f
zK7S}6oM+b2b-LN_Et@6%IOzK5VX|)?B6%~-_aOP=^>0X*Xuqpu&JQA9O5<`Xjmyw{
z#H*K0Kd|Lung0e|kLQs6P1G)0w01FVFojF~6%5DX)j{>4QS0}Pv1yNekCW-Dq4g-3
zc8b@rHtlpOmE+U7(d9T15{~(hLGf{tpI#oB&k26nKbZCZ2I^0o)SsC7bDmB6{ew+=
z%(iJSr%`)veL~hlb6%H6;nvgo0Js&e$yEP6H2*g_j&E4s95>4JJ}ccB<7u9$uTMT;
z)4rauX)nJ!L;A0$`E3=&C$50Xg|4&A{^8<NWj+^NEbH4Cvahyj7dOGUzZS~Hu~gc>
zME2{cJ~(E`2HlU+^+(|!)`#i(%Pg;xhsyZ0Y4tyw#_@U@$1}+P8fagbZ><&5|9Qk)
zsJ_)FOWue0?KECzXxFn}4VLM3(0anjWPcXKU#xFXJQIj-bjWaxM(KZ|?6!=}7fBwc
z&4)LhOX+$>X21>dG~>_E5HSAwx^6zrH)?3U0m%`s4@5s7XLQgy0GNr_<E1j5%``rm
z`S~8@TL<Nv*`M4%^-W)o9S`HRh|gRZPjD+<GpS#yr*i2<{5`4<dcEzQFZ-EB>Sv(4
z5U&@<OaJ=1>(SKi8nyBYLA}EGctSEh{V3nEZR+#4G!DjV`Oux}Nrq24F#FqakWr%k
zjFfyB`M++Y<odd6vADh%U^J3{v%MHz@9eTbxwkUps|b0$hBqf~qPNIj;h$Yn8S+=;
zO&n7iDEH_2W|sQpG3z7V!fKy)R!O<9wB$-KymV4hq05=*EGZ9FR8^MD^LvA#3NHjP
z+aGd`tniomeU*N1g}*XTRZ-}#%${eS;&SCKD4P^0^5>KVOI?YDfpUmgdB_WnY>mo%
z3ugMgzDm;w(^5FcTUb2DJIhy63a*PvJ@d*egUE75+>XpC59KWg`ja4%5$a0LE%cT8
zD)30>qv>Ny13rth(nbdYrJc=PPDoy-U{XSXio)VFUsWhDyTVtL<TSrc3WP8p{AK>~
zkiTfGue5X~q)f(bsyu7tgp!#RzKR9ukXAZ8c~Y*|<r)(Vj;$&$43z}RvoYz?Jc?(z
z(3N*~b+tF>uc!=Q>4g?}=Q$1UwDS2S<wf49{>rK{Rh}+qp1-^@AhN>DByVZS%tCKv
zN%`zjzc*A|;rA8!i!i+0kjq(F?5pq>VcF-13c(#@XJ;kKlpKboh^VYo$(XB&$+;rL
zP@sHrMXm@1u?GUmb$XR&tb~VU&-}a6r<E7^p(^`}WNJlCbW#!_K#`ldp^;vw^VKew
zD{-c;vIMdqQ~^=SDTLIx3L!?`P=&7~RGCwlo$G^tvR#SZiC$5kE2;`Z*=2>0NtjD%
zUTAJ1sHZOY779SM^;J}QT~2Qalzh$=+1c|vg;(Txv(c_(Hh7eo`?G_0Lhu#-V5zUr
zpIuhw8DA(G#LrJhI{5JggZ}cOY;uwtN_0Zhiz@>2y`}#0*`Z>uzoH^g0f#ZJSO}>(
z7x_x7{3DC}kWGOFs@z@Pyeg>QGOZKJT-g^+O7w+NWyWXcy1ejzR7E+=6R13~ETThj
zT7>|C$XLq8nVp-2;T}5lU*e2n7TtKwmY`9zh5Uj%#e9rDIL4Jd395SXcxa~Z?PpiV
zY^ZsGAodQvQfi#8tdh#0FH~4O-oGFl3-^j&R@JbE6hcQ3@@G#iD>KUcWre{7;0j73
zUDcDoY-lsQny@6K@D??*YL<PYqKwPCi08oJpN>sA?;?m-<P^WJ%qB!?_o<};7!Uli
z@5BbIT4KZrbObg|WMA;>PR1b%GPTkl!t%l9<w`cY`rN`|f0@s$PnO4`YGBe8CD2BK
z(B=5$so7J<^E4f1=axMaqoNH-$<VRQ%ARZ)!)(taXkV_x$uRKvVECKlD}&lHJ~xy)
z&tC{S2m67fJl|~WJ#*8b6Tm@jBn(J$UNX7jB7a56tOZmku2gSs5PH`t94Ea{ef&8S
zCP8JJU0x-6=|t~@089yjfzpz~1=)$v8_Vuc%tNp@67!eYRlbTMudi@!RY`?k<t@Zf
zHWM|RLsSkqs$joGZlNJmmq1yuFlYu8#e6dZ6{<wjCcyMd1^CO#<Zz;yE}}`bh|j&E
zs!-Gcsz2jdzYbj!OloAWF0kxKMIYsLCBft^G7HQRvlBMU!;(#vLts*J#8gfeu?qRB
zN<-fAs?t(#2*&uF+(d{oX6~hvWE^B*>9X{|0d`$D>yv%D^qiO^=2#F*e+Be-nU$6P
ziqKdXb<r3+UIB0TKjKslSZSd^R-g*zzA|khyfg?;%=%m|8RRUS0$K&?N)~~RDfSo6
z(ZbRLN(}{Y#>j#h3Be$8Ib|p_@i3*Vd_E4|c{yXFMoXCZXsLE3mXy!(S9mK$uaM|0
z^Ud*lXVI*XC}w|JW<`Z>!Bqb&C>&^&uFQ(rFzqW3WkLJR&dyC1`J%P%6ftOHj5^OZ
zv50NjDz7$)NiZRSN}m^q%qz2T=D@OnW)3cANhRh6HVbhyy30?OBSt_@!r))%#_dN9
zkxelEl>{n9Uol2bL~$e_M{vaIbUZOHkN1ziDBwc`0*#;q>XY@b1pNvYGIW%39D+86
z7f)cE>h)s!A?0u(gw@+J1yjK?DUw^5PkG)vs5-f+Ue9=#`G}UoGKY5r)JzYjG5xYG
zOD0W?sE0ZFh$(4hWy$Pv971Gm%?%)>j3~>_o|+rT6V+JsPc-AUX`jD1?l2QhHfK12
zs*pD@%Uj_qpABuUECdbZN-cRmukx6ahNw<26ri3|Hj13FQ+byFx#1L3Xb4#AEySUs
zQaLy~BbT_MV+Gq%$OJwW7u_Qc(2-dqUTGE0gesF$ls!{4jhwN$p_CBL=yNP4^3b)&
zE?IWi+B6${!<-w&@l>xC3{?4|nKN@oVmOBv3>3HUmh6SvK>vqQeAvn5%$!h`Ec!Hc
zsI(MPA?;Y|WLVJ9R5y{F1MQO9TPJRo4(3_8F9b^-X}+12a8+0(vPR^k<Vk6Bt6)h3
z)=9h}aXF0_-{x1}EWGlxIxR5~dhSY?-BcFAOs6<dQW!DI%M}@Gz1||T>`(?5O@b*A
z&Xs&KOXj(t_x2W+`YJ2MkPFRd7VJO9xpI^hUiMX%SQ^R*xsuKK-FO&w{iQ4_mWOh!
z1~iL6)ewdgf=KLg4^sQ1T|@P7dTnce?k}tTx&J-2|B%7_hP7W@K)7;Y8snQSm*ngg
z??kU8X2q^_UwL^T1Z_YpD^|#Pf@srbxnVyM=|`X~{oEd3BoVqFtWa_mD1uI}Tu@f#
zhlQ9zHiM9na+Oq0D~A~2oVwFNn1_ftf29}K-#Q&mE%sH8t?>IV@>P`JLMW|QvLq+N
z!VawA;F=Nm?|gh@)bU(s`!Wz5<N1{fu9SI1Q&krL*$2OnF^A{hz!OdEf4wJhaa9D1
z34zMrES;`o7|vl~7A^=%{V<6a7mMkt{mYT{S7#=z2LD!Bz*oCOQ*utKDw_$b>QG6{
z1yptIB^z1vA&5w)Mbt>mQ0ttBGUJzQqB*3YRPLKUM^#3H7wjUtNLQUc74CqrL}
z`D=ODr4F;^kQ7zjv-Oq6pEvI@M_ibDsBYr$ms*k_k!D*v#JoGD5S9ZfU||fFYIESy
z0N&7x`J=lSSGru<y0|1%JjQJFaE@BHfXq6?>a<wWNTSY*or(FIYR-*d)gyZ8Qw3mY
zMkOqY$T`8c0A`=jHAx&v%JR>HdBfzHS6CcOo#-zMR4f=9s3?oRqaoHU;N}9XdxiX7
ze|4c>EQnB(gE@MUcfQ{Tt1F=jng!+MU?%bdU@R8eG$>q)m1FYetST>=TP1E6(W<0Y
zAF+6n%|!v(7w9?V`V~%iKJsyox>=Zuh>YZi#2AD$8CUFGiJk?Jzp#jvTN((Nv%*d%
znFnYxFYYj<i^;sU43H|P=AG9EA_pb}%4f3%oFXr(y@7fD3Rs$$kC*7G4%1mKrwB_d
zW5`o^L}1l}s-e0sNA=Q`9;^tI1)v(hor-e*Y%$TbbkJ$p#0(kMipB)+rdBmK>6}54
zuJ~uK?WMp_A!;Novxx{&(M5Vsvr7KT0T{DZ#6C3yy*e((=LK@by&$o&f(=cq(NhIa
zs(>}hk}_yVxX=N8+b_P@glUs2+8b|u(MM>ZaVD&umV2QY7Zww#@Kfs(EfH?&L>-+4
zlb)ysZ`OZj<er<T{_@^L)Xham8jToo?r*M8N8fmWPL7%l+^@`qD|lb1s=^P~XE5;b
zt^n-xMQ+QU09T4tWmijRu=ocR+PtmI>T6^z&%(s&iW$}j%}ZT=c`U9c;24IDr(uPl
z3XcWME~LpK>M|V?Cmh%u6W&JBBwQvhGHR5<`gkd<XODzB4{JZzOPi}67Xr_o$Ctrj
z{4B@rz0_L*Qz%?_&+(MXaV9t!mPg>OLK$=$vZ@|*z_AQZcsg{5U<Qi@a9dA=X-?p2
zp)b9p5*BIIG9NAR%GFp{&a9LjQ{?z$k=62eA}m99%9-r!JTG=*dAU%hT;7RQuo{0j
zcL+pD{`^xQl*A-|IbJ4s>Gt<5=rZB<C?rfa2;O*Pk^h2oFf~87V>FZWOU|PUdVS!_
zGw&U;YebPIvgc_P!FSFA#M-)o%j=z2X1%Z=!xNKO8BWxaf!Pja8@0IRH`^#2Iz$`Q
z789b{Xy;>{+o-5-zkX~HQ>kC=M)dAM=vlq<N-9ECKC#HlChW=3u&QR_05*A6c6A6Z
zuAE*^ZUD-LUtX6~ddmVuRZ+J?QpAdVV&eQ_U&uQjX2rPhiQ80Q{{w6_5HkolJg5yW
z$=DOSQq9#bh*fbQFo&+1t!%-23EciM_YKX9z7DrK1G@^`fl=tkmS`V~#MB_Ha`|zg
z4kx#8MmgLAuYh|WrL;?7yrugTaPJGqgwBV%hB5?w)Y48(O!UAdMP^}PRasRj1UC*A
z8{Cj`%aIK4a<Pm1sL}9O`<2t&yj;)Nc^=wW5M4Sj%)o_HVObDs01RKr-axQ&<h&C9
zeAsA}4Aa$8F>&##tr~LG-7>QhRY^nY;r42wSY=^lKJFr@Wm=j<9Xa@!*(v_AU<h{>
z{HoJ&e}YV>Xb|QeC9&<tYGH_7h92hqiR8-pz98IzfIPw(6=rF~3C#NOLL9aXWMiv5
z^p+sFnPIghXgrjX8TZj}13aY?R+-F|DDButx%0z3ianuRB}<N8y2E}Tl=fe^Y6tt_
zMAF6ODC9!aa)ep!TuxEWu+wag->T~Ir9?~$I8{6LU?!R3g}a)m-c)&2%s`g77?xnB
zvU4T!>tH$EN-czSd4FOWMsp%;)=Eqjvv+S*d1X})?_k2RJa(S2_UOyT)sjGkbmFOi
zYo$VQS07SCHwIkk>IOh$<v+SPSuIUdOC%>F@$Le2wbAEIg<*>9M2k%VD~l`xliU#O
zF0<XN#_FfGzfIgM%)_M#s2gc`%P>WR7@2q3pKBYOSP@By>hRR;GtyGDm>h;Tx|)x=
z7YLJLbuUoWuyk3&peukC5brFp1S0cI4D9^=c%s)_qrs~bzKWd)yY`}LUMki+uvazD
z1gho%fSO0xs^)d-lG2NOA)nkYHxK#>C+wjqs|tzfCLYP2mp!d?UN&T<Cl4+v;KnXi
zhfZD+N6PJb!WpdFV6W_lyFH>>%dt#!3b>*Hi+S^1INrgh*E~U1yOCKCBy76GnV85Z
zaV>`@BqqTk670|s`+>!om2!PEdn#@Y(5I1;^cLny!uEu14TGSL3Zs<hg5kD23zjea
zxExhv#z3u9B)UY>Cc%CY^O&}!AS(1j=ftmXDU)p)3U&$RR!95wJ}WD#*yE>Sme7?G
zED=R6i&%3M1&KY^!ImYlLE}y<ab2eN<*G*b^L?024_52oCZXI1EcRy|Vq+GpH$%rF
zcjCZw>Svye%PHz)Gxkxfl;!3@5`5@WRQe*JM6XzA^BGr)xd7s+@Y_!MF#E9>QZ4(!
zLrgD!K@chxaLdN+t1K2BNlsaAC^_gWsmK`*-+qHsVb3m-1-s2)bcpPDhRq3ZH>WV<
zf=xZZ;U*j`O~6Hb8K`8q=~fKiCgqOy!q(`Cfg;*VfgEQ`cmVD|!yYBEK}>A4DDvV_
z^N^<k7uqWP$jrTe!rjP7V$E&Z6J|QI<=X|3jhcB!jQ^7}kTY0kXr&3eeXJoYhdlFO
zOu{M*cV~H7r^D_IxT=EZ3!+LI&CIfvr@$y`UD}EEr7aFwmo~VwEo~Q+_`yp1e`i{#
zO>6y+xSopK#jrXmMsceH=FJMrW9$K%)RK9)%Rfcz>5qgZ#wdPBO&6k1P@R$4rOGMA
z&l$R4cLMBun+H!gm@|^-AXsEw>CsQuh-HlENXw-S5nI?9sBV0jSLK*4YcG@%djpDx
ze$AHXs;rtl8}1r=qh_LVceR|v;hOeL+{O$m@N_Za!u^6?oS+rM<(t2>(hECxXg99y
zNn(>>r<1%%w349}wJXuw`;Y@$fy@jzxRG@|bS35b=J|_EcfwT=W^C}>kQXMhur>=T
z*mI0%R8=k@^fr2s$#A<8*IzQTCiv(1OJT<?%-?aFz{~~SlA`KJJ#eMu!n$|qMPhe3
zULV7wHEFZ)k(FQopJbT_Pb%P5QRGpU$_4NMT$!8_mxyEOFrzJlXWT-5t5q#MWj5(g
z$*hu(dEwn-e|5+%GdtO_N=5rHUFq`CxA_&mAUqsY=B<R=eRww@5BD;||6*4@n+pDl
zYbomme=)Kj@)iSJBEs%Hw&1UoqUf7o;#W4A#}yuK$u64hSF<#1(b|cyQ@K0<#kv4y
z2mUm#Ty`%K4Ho8kuH>oy!azllIq`C(6joK>(q5jZZm^rssu)FN((p>YNL)`@1S?Ej
zNw9DTH>|vJxzOGh+z2~#!eA6l@{X3zKFGThFow>A`#MQ*U#Cdk*MWK`Ly6?lOlg%K
ztb!SicsB4iEb{2iG!iJkorNPkUOQM;515PNv0rdUC=%Ee5CS{|XX78%!Vc~I3K0u+
zx9Ko_|2o%V=yE0I`eymf{Q#mO+ve4;&#7Ot?qZX@&~OhYgFCfveNHHIgPtD6h1G2A
zpP=f%BKP4;62VMO62&RWc9X<p`$?iWWIag)cfWj+_<tjDG`d6@N2k6vY6>7KbM!3=
zHEYv5X`2%}hsdtY&3+}i57Io#odZd@XR`#lb*~?Xi|E_!*n9u%G(`u>PXF1<AeQY)
zEV3NB+X6Gu{#3^$=F#%W1+j|-8xjmIEPn7HmAn(K22EIO6D#`%Tl>MqhUk2vVMPo+
zbg74f5AEaonFVc`2T}VU4y%uW4kH;Ns&YfC3(+%F$d>4Opbv`&JzG_7%c%dC#8%8i
zp#G<en=SZr0EtfZHE%*L+%$ysS@X##oZO^T!g7hf9G(Iw7kf%wBXQd6gB?62<*-E&
zAK$W^lenC6*Bsu1#iuY~nkBws4Zs?fhIb`#K8255sVS9OUX2`f!2_u3xX53KmbmMX
zjQ0sGZaYZ->{+uEww8|J>o);i`XtzuW+rlGRT<8CD_}iA#O;u$CMM#k<*+Bcs0422
z;5<-l=C;nHB(IoM!wr^TRj3kI6(c)5wQdU6?C`iNsR}mN;7@13GaCL7?c+9Y$;nZV
zR<&T|DtZ+JH4KOFk)3;?cT#RTdnAvqMar&2gch|gSe2rP3BO|pQ*ZIVa11Brd13QC
z^eh-_%yPXUbjtDMO1NJ0YPZ}WwTa#=xxLxztE$eyrL$~!;Fccnr^H9@l+1%m5_zLY
z^kTA?8k2|*gPKMYvdW=pu}c6r1@Z(Ny;(P@cN^pS8RvDX7)d$hP^Nfm0VnvbbXc5*
zhe+V(L+}bp&cH>TfNMegMF#P+2Dp_~JU#@Q9?c&)z-kBi0DG#K4eq~EZ6)gNCe%Kv
zscZ!lt~M=ZB(Ul|5dwmGB-bwC_ZlMW88Yj{>L?a=WNWrut$@oYyup&3D}GsJ0`A$x
zUjX?flS4?Kx!=!ftprQO-2P*C9D<Er9UR}O!bPtK;5<(*Jww4mjmas4c{o-B*x1Ks
zis03H=@{#ank&NhNx7MJbHz;ixuQ5^Q8Ln<?OYL;ki~)`K6_}E-PG}-@H*GSsD+T!
zs`4QIwhLw$-7dk0K_dM#Yptn83(9?E0kva8bZwpmxUIIN)L$VUa-O9&-#~YjC^K2!
zYoY~nya!R~5}i2ilfe~r(TVFFLUj3b9$0B)jV|W@jf=o~nZXa31mJ0Mf5kX|C^H1x
z6lTKmRHeGLKdl_LLYp`DaUm~F7{kNmiAj@#J{X(T?*@pc{NU;8bm#-<exDq(;4D{~
zBsmZeZpg%?DR^QY@3z7KHM=TMW$x0jF;0QuAD*l`sKYeu>2QC(*kYpg^Q068f(yJf
zP-9O_k%Ml9){g9!4_%O0T62t!W^Smo5;y;AKY;_c+&evXT$l$fH3fDMM1CbkE)u(v
zVQCEZW0i;G=8#nSF_B-`hhc2Q4GLHjgaR~|`mk$C#Jy_fL%LA4DK7dEzOOp!6m47@
z3BOl?tJOF<$sLN>;whh~ZhB;9`2sYGJpU7!TJWL1b4M99ZpmH|2S;-O@z=WPd_DHt
zT<$>XJNIekv<PzoM*YKB#p!f86iM%4FG&3ci!9Iz&1Pa=ql&8p^N|$#X(atp$?i0|
zx>8%VaRbj`<%Yh88Uhe=Z<iDH7JB726pJZ1Tos84m^2V$N~eJ->UurpN`FNFCctu)
zBi-!Nvc%r0arlENmQKVqMKX+1{-`Zo`rx9^HgR~tjz}BeTuu*+js7C|U5ZM};5uI@
zU|;LSscVrwcNKja%_MP~sO<D2TPL(QCW-3?S~<p@i@{>}A-l>s@LYr+x7x##(W38_
zr*(?CxL_&487JJ+p*3Y}5$YluetpCIxwpuAH@0gz{ep27Ht*6?*)YKl$}vfME}M2+
z+fBo0nkHw6!m&Id>gJf>a(lHE*RmyY(J1m$x=zMTz%o0q!qcxTILiZpV3v4nGutEb
z3J=PM;c=-2pM;00EE|Bd3q`o25!Exo9M5X6RK%+s*ko0TjA<@;KO|yCvmf~oerId!
z-I;MR%;@M^p|k|HBpoWosp_Fgu}+l;*?|w=dFgj*VDSbV;;$K1$X_wSc3BE;a|4hQ
zE~ji^qJK1pnIsohYO|*)3PInA*DaM|w<9W<_2|fzh_e>?%pR;7h`4t0iHV~!N|GDk
zX8X%w(^jE;awoFu4GKW~UQ>F+qxhkiWcG_;xo~|WAm0NQU2OP+Pn|bcMq-iZlCgj+
zDS=-Rg!_1Qv4O$hmqjSLOp|d5BdXv<%~Q|XkabdFu<B5zrIR2|Q8@;ic%nl~gdbPI
z-wvY!gMx&gY4L*Z=&BTbl2}WLHt5_sSUg=Lu|`k^RxW_smD(0j?d(a=n9M71Xl5h5
zyvT4L68UzyWJ8(=KOt1%a*1CO3W=6OQ%^Y0{3S!1t6F&Y2CizW;oj^V`29ol8>#ZL
zf=2%wSae3fkAViOB0q8oCy08keqq85OJAk1qQ$#bSJZkau5-?mKXAnAE$a?tJ56R|
z8?M?!E~g+u)RP|4QfvUmpS=rI;4cTb1C=583T;w4$Eygq8y!$nU$|Ao+pFFKKxOx`
z#s^RDnK5A$^GkHb5eD;gs0}5M3je{jZo&ku(<3HX<p=-a0T`Iz&ZPE`|IdB0NIKCU
zlb3^uIj_ZHCc<uk6H!lzsz<{Qb5B03J&83inDWDLoJ4z=f3_D_XD^ZX!fXZ$@;6zF
zVU=eL{>&ZJAgD1^o2V(lwtjy(j47^UbGlL_x44Tz7vrTYoY?cxQ6E;EsH4f*)wq{r
zwqN?s?rf1L&y_#97a4X&UJRRaVJQJp4tov6qG5RmehUbesa&pUur-H{L%sk>Gq>@=
z3YB>P*HVA0OcP5Jn6PBnB7(mS1&+-aihhmt6)*~=kHhf>FT%x&6Hdg<I(TTq&R0*?
z9&Ilcz}%HO#xqStLfqbgO%Yf+&_JUf=v5c1&>>l#srv0sK3%C;_GS&Kgk?q<hNT-=
zi~{4ptXc3O&BUIi&Q6^;H6gpmT102N#HfJH*<k0C!*~l3vp=4W@e&VhVYy-<o0pVg
zrCs!emMM}K2KcmBq}sZYqH^7pE~fdFaPuBl@*}}OW7dAm5UMd$Dw;(d-p?1}&(_E#
zZMnK)A2w#GOya?E)LDqX&@F#uN7{)hkDY#^?O5g-S-GIR5Z1QKVfL-Ja_iAe{`4mN
z)--i%e1__QriH(IAw7x9LM(=;p>q<}oWvBolLd>E*v!Ndq_T0jWS=6sH=0z4GhiKd
z9uyaJAFk*#<PYUV>Imc)4zz5ND(i)em6}Q)DjsrIl~osbs56sck6@>M^H3+HLbS)o
zjY(7xs<aQ5TiA-DPZPbg49zyn5)<VG7c&@*t72v&Qd3{KW_|GdcVX0JoX5g5&iEjR
zyuLd`BwX-gOquiG$2TL-W**{<k?`w}RVAg7Ys;i0^Os;*N}_8AUb1)-Q|ymJ9|1p{
zcJN9mW{0ASJxE#56!PGi#XTJ)PUgf=%ow5P2dARLK2*iV#JZBAt;94@q)<PCNtXkM
zb_xwPBfaLPIoME-^XiJ6vGbsxh1i&@Ut&c!a>ujG7tDcS>gP_NvS+HBHJREyQPgnb
zN8Jy~gkN5OmnV0E7w$CTZ#&Kl%)$L0us7EWi>hLR3p2Ug@$8u=*JxZgDlfv@eenA}
z&{4{1z;D`*!|41!J;TIAyUbS(KjQ|g0X)Yb|0cslA8xvrhv7+I?4DsxCTa;hBK%)m
z@bFX!Ts*3WKj7wfbit>~f|pg3e#J5BDwV1TFtKF<V*X!WTeGxCqlVh*^5%@s&5{w$
zhK$Y5o=}zuw*(^-$^WehP!n8zp94+4*dVK#G)$qRN*1-qB%UF^k-td%_~R%Rk%N_U
z83*XD)RKkW<0&Eq@=+vQna4#Qo5xd<?C&OvL(u12Jd6bHEbk`EM^g^|c*_5cwAqx~
zFBnQ9Lq!TSKrH0L-ZTLFTvero@BeL^2C$gbrh#Pm>DMCYoJZ1P1oYW%cydiXllOn3
zuwl^!9vFvqgik2sc*|hyNyDE$5<mG18~k7#h4o55Tz!?|{cmv(7=Ex9_f*it<o{ll
zxPDcZ$h$K5R~!;k<N__5@X!_80{S5?e!+&brvJv#5*|p88VFEB`&4tgY;(lSwXFO9
zA9e=%ZWOTy#D=Sh@W>DRj63c%`1zCWNXDm9Pbr?v#lJdav#Sc$&Q%8q-|=F@`YBEO
zQ4o(<4}u371I^A+^Z_N+usu-RRiR~g7oXlk7vUB6jo}Z=6v6{=ID5_gA2fM|--Y|1
zGn1xY4A(Oda_oQFe7^{{IYU>W=FO?FbW~PS2u}s!AB+<HNc683q#X7sZ0^B+Y%tSX
z3L{ac9~s4UJDSE9`Q$nWnDO6bI@pO66fQ`ai|~nfI~TS%!QWuO<rDbbE%CQWvOQ_A
zwl8p}b$5Ax7#Ff(`5d;H`OBe?m$qVrME4OJxUnbP(BOI4`2sg8@lM@*FD%sIK63b}
zEBQ-TxB~2j9bV!IOJ7k@1^tR7J--Qybm9-)TnhU>B1hm6ORMd(IG{)625yL=y@}BY
z>y!(LXi95ehct~WlSP{zEDOyN5_{?+CFH?B!BXPGTQ4GC$3W4A{P;`2(60q5;MsOP
zQC1FS`V9D+c>#H=W<+HG6)c7p7UQj8(ol@_tuj<pc;=ZPMx>(QZ20x8N>naXg~W@7
zmzI=QRS&ODOC6q?j4(0cm4!5JLeA(h-oz0}#<($Kyh$URhDlt>>T6P>`Wk_WBT|ep
z`T3cnbG)t*Nh8vvNq#;!Ngm-!<Hu5>j-^K_my27Dj6N29Vxp6qrZW+BR-%hbW60#G
zIpcCBoei0D4t>Wz-x&k{brJu@7+vYJ8+>**jsSk7_=Yz4AM(!sb%pb~8nJLjjL`$$
z=&dJxLnhCY=N?62M0@IG^cHs9H^v36x~kaW(LSB*kec!CD~@9LGTb<_jt)(xwwJQv
zq3Yk!wjaE^P%NZh{1<)Vrx`XX9uE4KT84KtX!Acw<Z=AoEh>#N2P1x^MOP7;>G2ry
z)s4c4&Lz_y#@Y-^=8YKx{KT@t|4a*h>_6ZClLMFo&%@ug#=nQ_fcG&aCB5-40Pg{Y
zFFgJbJMKAc0sKYwKE}7|csJuIIu3uz+qiYr?eK@Z`xys_Jt}mZ(X&E^ajftkYxE~w
zbwWuG18V&#pz#%B^fK1axu~nRJp$n!W1LC$#ryAozx3PJNHfnD$9o&&NH4zz{}OO7
zBagIC*dJxgBt2i)_cZ2`_KV{^jH^ipg+9_)NqT{>Kf<_+^qY^&fxp1q&A6ZRTw&kU
zc${=-*e&pv?z<S9L5uibD*vuIK=E?;<lhOe(Re|=<h2^FyHxTzjW=E<`D%@~5wF*H
z+~v}~LE|3cjT&zz-lTD3y7bemaS!npjn@%x)p-33>1Vsf+X^HPYrK*EK75D93!ayM
z_<6s^>xdhdMdyEzc&x@7iN|TYmAFIWEfk*wjXTNSsquQ^X&TQUo}uw(;%<$1yd=}>
z(Rg!({5#wE8gHwVyjbG}vt_;nHD33i{DbJ#8t-^S@*0iDJs^3l#@il}yiVhB6wlQf
zHy)Mt^%}2ll)ORXZI4UdsBs7JCXE+7CheOwUVp#jEgG+RLh@FPJ1HNwYrK)--==XV
z<wIEGp7qjyhsNvHNxom>1rJMZTpnGYn<@Webx!Gu(|G*`=_g*}9c1s&xIz9CG=BRk
z=mz2C)c88$X&Qfqc!tJbCSIWNnsn*EPUFqQn>1c<j<j#nxN)xJvD2gTt&O--<IU$u
z`+SW%&X>GK<M|X`t;U<Lm;G3s#+}P0Z_s!x`DxU+u|(Q8X*`a2tHvA2&vuP>Tr2&w
zX}o~^?AQ2q@?*@<@{Rn&YTQG992z&sZ-T~K$&XXx8RW;U@dWbY(RefY$=A4(`~)@b
zAwSg`ZzMl88h4oK@<!)_F+tXcG>tomdo*4^yjbTGrJrhzw-K+?c=IG_U$60u$&xo|
z-0(==qVWRaZ5nq>k@g)LuOS{=5S<T>snR}P;|0VsG;ZWd`(lmP6R*>FMxL~9(s
zHjUR$llJ>Ho^g@nvA*c|*ARDT+;OqAH)clL$D8@0@%ppnxE!bPx@5@{H0~rnX&P_5
zRNB|-JW298jW-i-(zu)aH)}jd_AMH(p?J1wJV<`R8ZRI}9U2dkpV-3ad|OR^;xz6d
zKk*tbAU{rxH=FsU@eJ~lp>YrS$=7(8{1j;1Nq&koo<V+UG#*FwtXAU=@>8dAC;4g6
zcpCX>)OZ~EY0|iZ{IqJkfc$LNxIuo}G#*EO_G`SB{1`>i<=BxZ>tU?M4f5mAcq92q
z(0Cj9acaDS;_24-cJkxVcq{qI*LWNG32NM+c3!RVX7W>`@mBKFs&Qk9>>t`RUO>D<
z<1M7)>AJzZ9&9Chx5n#BrT+qr8)cGLYrKGXt<KA({c4S81SD_Jc~J5ujXUN_-m397
z;$e+vknYfU-50X{?AN&GTgl^RMb|gaN0O&$Ja~${uF24N!9>YD8h5xPFVJ{C#iLl`
zaUC+;pvLR|C3&63n<<`+8gD&ShTEj^davXy8aEOpkEQD~v!29JJmWN8aJ#gR*Ldsi
zBrn!@<8oP!H5#wKLGslauO?lu@%k2d9nqli4&sd(H(rwV%^I&G`xcEi65p=znw2u1
z9U5;Xy<g*=?R4E#99^#ph!<<zaWmzQ#tTT-YTWt0^k1j(`VS;ut??S-^%{2)Z_s$#
z4(X>+;~5lglg2&7n>AiQyhY<d;;kC5A--MXb;R2=UQax%@mAs;8gC=MU*mO@UZW(s
zygJA}R^tZQ$7wu{xI^O(;t3je5_f7mgLs<8J;Z|=FSte4pIVI<ldjXaV<*)UjXQ}q
zYCMj3SmOo6J2dVgZd?(apBcnsHSYLC#wShZzmw_mXxvFUU*m0`N<Re}ckY(FSmPPQ
zt2G`ZKQ$WfAYQBS8sc>tuOq%%<MqVrHQr3TLF28&+caK(o6Mj1InnvNo$MSMuiG#4
zAwlEy#4{E|`!RKa#*ORbb$zkMJ;Z|=H|?u6Zrayq+_bONxM^RfanpXa#!dTrjhprj
z8aM45b$%bMhiKgSpybUOH~qJ0-1Oh7anpXg#!dS+jhpsijhprz8aM6tYuvOqu8c1C
z8d`sd)p#TEIE|b3@ftVn9U8A+D)TKt<3Zw1jkgj{)3~ur`pM8a)kC+&&GdRSZst$E
z#!dSIjXT!MaEmn__lV>*8aL-Lt2J)+Uv66W!n2d$jrYayRY&VWI<L3kL7K1W_BA%V
z&W6|9a1YIMbpILD|Lfet(v^k>>1D28B5H%Tncg6?=iFSU)a}i6N1dDNh&ngd1$Az&
z^Xc4Nx6^qe%YPov78`D^-{^kK^%$L->nJ)m*F|)0u5;+zT({7<xelRob6r8_<~o7S
z&Fg-ho7eF=H?PZeZeC~W+`Mkqc>-Ne>wG_5@9NyVj?sAntLMBN&Fe_r-n?$pd83V=
z78_o`+7S=eyzbHco7WjSH?I$L?qThb`#0x<x_vy&7j$mUvvl5UV{gtobo*k~u2<th
zdezwQ?W`Sf`vh7)Ll>MoSUz)}X2V-q{JDKKi_ga_o~s#mGTv;%!!|sQ?gO9;?kCNL
z7ufK8mJi%djSX+G;jK1&zYTZLdYB%6w+#>4@YOcF*@lO0cpU2ocs|s#{67!!+8ke6
zY<Pzak7xPM{bbniVjEt-@{{|iwc(96e7g-dSpIPT2{zng!>et0y$x@%;T<+So|PAm
zr@2mv^@{UG)=oKZVCA)wwZ~S*X^I9O4P!s!bD6z^mDfVX-HcZ;9%P(uwF>{M8TT;z
zW*Z*1;c=|r<9^a?c!3SCwc+t}-vpz`{kPii{Wjb|>-M@Iw+#>4@YOcF*@lO0cpMw|
zdAibUc!3SCwc!pnUUUC$8y-vROM3n|>AG9zMnGPF;lJGfYF3V!coMxD7~jEo6XW+X
z-pu%ujJGm=JmYPQuYoYl{xi(@-<kb>##_xu!e=b&uTEw58I1p)@gVCTICrxCq$jh_
zXM7OjHH?45cmv~Z##<S$VBE>#^9AE+j89>_nawvi-)_U*%+DI;$HVwzjOR0c6XTxp
zNd4iwfZ5;8?3<W9=he)9AG3F|et`2DW?#w5z21iBGe6wEmf4rFa5I=a=M80%^m1Ou
z{K%Ct87VjO!})4vzn;b4!|XY4VD_)Ecor~w&KsHi0cKy!>^W~@_WhZCklAzI%IpU)
z`|XVXiSahZkCL~PM5<Ra|D1=J{lA%?dS=gg2eY5U>>HRp=c`$N+rjMX8Sle*1LI#Y
z-pKfSj5jfU1WRu-<Hs`l?TinAoH6^o4mJ<r++gi`Bl8o>IBYIfuQ2Q9Ij^O4Az{kw
zoy-qyu@$~Nj3+DahEZ(84Z7clt}kZx)iykq@h&XiYHWBM<1x&>)`mM6Ka-_*wGB^T
zd>gZ`x8Y94`!V|l8}4R&FSBp9;oBKcVD^oyU*f!t*(Wo5C#w&fhnamGv+rR123C&y
z882e?PBuUJfblfOn;Fkw{C39k8DGnIknv|ZXY(J<YnlBEEdB|seR1Bz?5ml5klAzI
z#_YE+`#2V#a>nBsKft(y@jo%1z<535PR2(tp2oPB@eIa4W!%m9y^MPpf12@p##b^P
z&(?oRS-H3we}nOS#-}n~!1x%(s~LZW@p{Hxj5jd;KI2V{KgW18<0BYvVf<^xTN%HY
z@$HP?!+0Cx+Zo@__)Cn(vHCWYaR=i+GM>SB57r+xF`mNgTNtOUx}r40jQ_;!3s}Fw
z`F>{q0JD!}*B!lCe9{=doY~h<KW2`9X4R@={o64ngwNHCpT+#tGkziCO^lz&{4_I8
zkF1EJEsRfQep(sNV|+W~r!fC*jF&U}Fyn)meFx*GGQOYj0P|z8d36G_k7fJ}#^V?t
z&UifI>5MxVAH{eA<7cq=I2n&;JdN>LjAt<JX8zrbzs~IQ89$Qo0>%e3Ud;GAj0YLN
zgz;*|XEI*H_-w{&8Q;Ko9pk+jU(I+S<MoWkGTy-W6)b-m84obt%=ndzSF`yZ=Pk_s
zpDf%~#;<06wlhw*NX5}M#{JAsnDOHn?_j)!@%@Zn%fgLi{m;wHK92Df%s!s+zcG6U
z<2N$<1jg@RJdN=`FrLBq1B|;FAIQS>F#bH_`HcUW@nXhLW<1FFw~SXazJl=@#y@Ag
zmhtCV{OcHhmGRY#Z)Lom@mHAt2F4#`ypi#4j5jg<59Ys_@d9Sw!uTi5zLoL4jBjWB
zQRb(O@t>G|2jg^$T^tRvcC?w<#|P#5z>&b1d&9vv8wQO8#^nx1>CDNv+&Lq88sjn~
z$uk(ou+5j7@l+Ne598*ZOFSo^ad=Qwy$TrbsVH3cF}|Pq4>Ar9w5nG%<GmC$j2gz_
zfn4>fWxS7~us?zEzKpMCoX;WZ8SlsJ8yG*D@kYjXvvf5vek`+ZX1qV+EsP(>cq`+d
zG5^~cuV%cB@o|iY8INcFI~YHn@%@aSz_>9tQg2UUJeKi+jK?v4GUM@#?_u$DFy6#?
z0^<(m$I1A8%s!3r!OT8`@ga=686V2Hhw;;x|9r;(#drbZr!hapjGxYUknwAnpK8X3
zG5Z?EH!=HK#?NH-b&Q|I_-e+_X1tzpC*uu_yBKd|JdyDx#y?{5Y-T)#*|#vB%6Kc|
zBN^Y$_>rumY-2o)*@qcFhw%=^&t-f+<L5JOR7C3k1&qfsp22t=<6p9Th-Z8(;||6%
znV$s4M>FnZ`~v1Djqx$eK7;Wr#@&o(Gwxx0EaUl%k7K-m@o6kR#f*Q?c#!cN=BJwR
z(agSv@e7%KE#u=EuVZ`(^Rt@q3CzBp@t&++H84(F`ovj{j89~Kni!wNcr)XZ8E;|S
z!+0y>QyAaQ_*BN*7|&%q%y=H-9gI(7d_UtCF>X{w>OWtnjb%Ka*~c+{3FGmMU&_kG
z!8x-}VEi)1os3`3cpBr=8P8yRGYi+v_zY(6VZ4Cxe8yKXKLw2Yn0+zhGZ_yuUdVVg
z<3)_uFz#o(mhoAP*D*eu@zsnMGhWYl3F8fncV+2qWPA>@Z(@8nvu|enT*g}%znSq?
z#!H$1?TnW(-o|*4@i60a8Sh{mex5|V_A^dDSt5^xBK1F{tl>T~<5i5uF+Pv+c*f^5
z?qGZY;|Yvk#kiC4g^Z^$zKHP*#=Eifx*5NQ*?SnD$I2z2@y8i2V7!L;DQ5g6W*=mH
zF|)5`d<o+<j4x%pmhnZ*e;wn?nEh(TuVcKP@#`6HV7!*`M#h&j-o*GFEI!SQ-^A=&
z7{8hER>tcX-_H0d#@iUbh4C=sEi67AjQ@_=?`Qm0#*L~-{lAUzSjKN>JdW|zjK?!R
ziN(jk_}h#pFy6|zlkq#5|1`#b&v*vo-Ff|I`~~LU!}t)!^BI4d@dC#0V&N7u{sglR
zGX7V_s~P_z^Han4-Hg{VzLxPi#?zVq)r|LI`B2aJI%eO%_&v;jBjbN&yovF@Fy74g
zeT=s-p25OxWqdK?+Zn%(@ixXAnEx>2_cPwX_=Ak^XZ#_?jd_v!zmD-(#vf)pj`2qr
zk7vA*aR=k4uyiFb{wT9|GJZC*Ph<QsW}m_MFlO&&{BdUQVZ4^v=QF;6*%vUL&FqUA
ze}dTu8SlaDs~LZi+1D`M#CR>^e`UOm@uwJH&G_FKuV?%&7M}*jH!}N1#-CxliScI{
zZ)Uui@fOB6G2Y7fbBu3id<KhW8{@Yy9%j53;~k7QFutGh7g@N*{7C)p%KD91#*3Ms
zIL2F;pLoV!V%)*_%Zw*5{tDwx#=R`uG{#?J_8E-7&bXWLeCEf)_$bEn8Q;eI6fi!X
z*%veZ2D1+`{wCwqjK9Tr4dZV!Ud#A9jMp*#cg9yUzMb)U#@}VUf$=RYU5$(%!*~<p
z?=e5kjCW`DEsVd<>{}V%!T5H@KVZC#@tur^8Q;Zt2jd?yzMt`PSUipDNc~^Hcr4>D
zG9JhHN6ddb;~z8bV0<^@35<sscQSq_3pb7Nc4nW!_-Bl}8ULJd591S<|9r;3VD<%!
z?_s={@qaQNWc*9Us~NwFg<HdT2eYqbd>`X=jQ@-A)r>#P{MR%76|-+({A<P=8NZGB
zX=3~vX5Y;CNM_%{__xfymGK{#{dUIpGy68izhgYi`1g!=F#ZGM`x&3l!Zqmm+#|vE
zO2%Uu|2Oj!$M}zo$1{FE^W$JVhVcZ(yD;u#Jjgf;>OWuqlLP<Bf&b*de{$fzFbDqC
z_vG*0OTLJ8ujuabb~oJ1nnPU<w7Qo(AG=wEdLZRP!21pi{Rlq#o?@VVF&^31zU#n&
z19d`U|Glq$o6>lFv9EoT(p`mKuXHz|*DBpz=v7MNRBd1TVx@7=+Sfi;=~$tCO7{?Y
ziqg1&VPAWu(nksHQo5JWgOu(qbU&s02pywzU!nK@WTrPx=v_+p6MCD{M+?14>0^Xm
zuk^7(uT{Fg(5sX_PUyu-4-k5;((yw3ls;bQDN3Ipbf(fL3hh$*B%uc>Jy7U=N}nur
zjMAqFz4u2m{SKjbDLqK&ZAuRodXv(p3cX(GAwsWJdZ^H=ls--9#Y&$p^jxJAg!U;t
zOz0^}pCNRn(q{_oQu-{R2Pr*V=zdC%5IRQbvxVOKZ!`T)p?4|m5_+4`i9&BuI!WmD
zN+%1wR_PR>S1Fw;^kSt)3O!fpG@*S;rwctr>2rk6RQg<@T}q!P^dO~23EfZW^M#I4
z`U0W%{$Qp*L+D*fX9~Sd>Cr-OQhJQg>y^$DdacsgLa$PKtk8><9w+o%rQJgNl+F=)
ziqaPfovHMAp<PN(5PFc(6NT=l^dzBUl%6c~-tW!ydxYMl^c11DDLqx_O-knqy<X`&
zq1P%sP3TohUnKNmr7sqGuG0BJ`;@*!=qXBHDs-mOmkI4s`f{NMDLq~2eoD^}I!0-)
z(0ji#(_bL;E~R}!Z&P}v(3_Ml6neeVMMAGt+As7frDqAfSn1h9&s7=+pndH=rAvgK
zqVyF)XDU5MXqVEZLJv~9Oz3_}mkS-EbU^66`_1$Rh2Ev~T%orqT_N-)r7MMAuXIT0
zwMthBy-MkMLN8W&zR+`(t`^#-^a7!$D1D{SnMz+Jv`guQLJv}Uk<k5=zFO!QrLPfs
z@3&_9YlPmV^kSj6DSfTbo0MK6^m?V23cXh8WkRn~`Z}Q(D}BAtbCs?Y+Nbn#p{FQ)
zgV32uuMpa$^o>FfQhKG({gl2*=oqDM7JBbDX8P-d-lg;^p|>f0i_n{t{+-b4mA+Nz
zwMyS6^eUxq7kaVMcL+UK>D5B}l)h8wDN6rd=uD;mAhb*AyM!L3^ctc2Dg8&GW0byI
z=)GT?>8}@hm(pv6-lp`Qgx;j|JwmTn`p-hIRr+3`S1J7$p%*KCpU`uaZV=k1^!-9l
zQThR)GnIZ&XqVCt2|Y;Zbwc-3`eC7Clzv3$y<eH>Zxni$((8rZru3siZ&La(q1P+@
zxX^2r-XQcUrJoRbvC>ZpJy+=_p?ymKRp=>7KP7ag(ti`$rS#K64^n!g(EXHtM(7x&
zpA~xVzs&SE3%yI}O+s%|`Z=LDDZN?f^-4c4^jf922)#<_7ldA{^ov5zRk}rJpVBW0
zJw@r4h0av^6`@^9zbf<~rMC**PwCf$j#2t`q4(}H)88ueE~U2#y-n#igx;j|n?kQw
z`YoZ?D*d+5tCW66=*3F^UFf+=Zx`C9^t(b&QTjchGnM{_&@QFl7kZG=JB041^an!6
zD7{nYy?f2{w+X#V>0Lr^Q~E=pH!1y*(Cd}{Sm?D%e<JiMr9TyVvC_MRo~v|NXrI#U
zLQhfpGodq;{#<C6(q9NYNa;O7_fz_xLdPimrO<mj%=C8%y-Vr6LT^)gpU|6>{+H0}
zmHtZTwMu_2^eUyl5qh!G-wHif>HR|cl>ScWDN27Ybf(fj2<=k(-$D;k`bVMrDgBes
zF-jj0dheHJ`tivJxc*lf7kuIRUuj(Eh3kK%apT#(_Vr5R10nm`*D8%Sl=ii+Qu+v?
z7b|_F&~ufJ722mX-hkfMK1J!CLT4&{l+Z4v@kZ*t_CZSHgIoLB`zhT==oqE@3cdHA
zX8Q5LX-L1)_#il>U+JTT-lR0%V1x84eXP)HmF_R}Dy8uODp>zedVtV#m5vwMr}Xhc
zPf_{=p)-{}QD~RaxRDFeuQWcu1nE~AA4J;M9-}mFXxP`jcaNEVhtRu}9whWOr3VYW
zN$FFCUT<nxO}ZEUd`RC@vJA1F1bUsO7r~zk;m4~qU8ZTjrl)Iqnx-deI$P7DG@YX9
z;hG+*>4BQ=uj$^J?yl({wfy*6(|a`iiKcgG`fW{b)$|rkZ`AY#P3z^Qm**PI{x(go
z)buh<FVb|Crpq+#*YtEvPt){7O=oL*l%`WOJzUd6H9b(%{WaZN)7>@w<M`<M|Fx#~
zX!;XP@6hzyn%=7EEt=k_=?$7*r|Ek&y++fwX?mrmmuY&DrmHkvrfI*Xr)zqerYCAT
zThpU7oucXCnjWg@ftv2G>E4>|uIV2y)XHDedo=xtrgv!iZB1|0^cGET)bs{TuhaCs
znqH&n+cdpW)5|oyNYhoCF4MGM)6+FQP16%KovrCnnoiO5a7_=@^gvDb*K}`9ch~ff
zIa>K^dXJ_*(ew^Yzpd%5n%<)6jhfz|>2;dESJP`WeVe9NYI>Qb7iqdm(`B0WYkIn-
zr)hekrn5CYO4BKt9<J%3njWa>{+jNs>F%2T(XExgruS(26HV{X^xK-=s_8A7-l*vf
znqH^rdo{gA)3<4QrKXo@dXc89G+m}?zow^adYYyuYC2ofqcokO>EW6ls_B86?yu?I
zn(nUYAIE9sujxIS{zTI|H2t=ww`zKerZ;MOgQnMM`d&@1(e!PaUa9G2nqH*oDovMZ
z+OO&9nx3ZViJH#V^e9cIXnMG&hiZDDru%EUx2C&m`p2<a`D=QQra#g24o$zU>8+aH
zqUnvA-k|Aqn!Z=lYczeErdMiunWh(Mx=Pb!n)YjYx~8XTdZMPYH9bnxDViRx>7kk)
zsOkQi?yc$Wn*K3cD}PPz(ex*p-l6HYHN92STQt2<(;GCsPSf{ldX1)U)AUMBFVplQ
zO;>5UOw)c%PuKJ`O;6Nxwx&mEIz`jNH9b_*12x@W)4es_UDH2iY2~l!J(~VR(>pZ%
zwx+jgdW)tvYI=jF*J=7*O|Q}PZJJ)G>1CQ;r0FV6mucE>>Iuug3~z@UmlKxn4!6Rm
z8+0u!<hxfa9O87R?+o>EugDsb;a(Yx2?t?;3wfM7{Y%Lmg69Lr^F4Up@-N-XcZ9QH
zFNic<5gOuf5Btfz>_Dh{cm#TY5KiME{5=vvA2|N`ixZZA8}4T9U=OT`p@UCgtzAWt
zIoJxvrGw3|8^a<750QgE+c>yUIKUwG!ChEP)s<)o`a{xJkbZ;orJym!Q!&7m3x}kI
z_rv{GJR^gSRKtx?)J~yyhZB_LYh-yG9hpRyM}USiFeiV2T@~myKfIQV+hLs@-QoRM
zjNrRcqw#CXI7G%z!$B<Lui)k+p7anJuY7t4-VzH>CZjvZ<Lh9A9<Qg9o~4uK)A2{>
z_-yW^CpnqQ;n#2r8iN}Juuc^et4l+^fOuHcrCz`>3f<7#k}u+)>IxApa<Aw%Gy%T2
zmuC%$bv3(J9O>Tl;rTC&i#6Qot&0Y`pMtGt_&UYCe7Ad0tNXxnZda>&SgU(^w;|!>
zSHX!;iDQf0XG%9v6Ng|uSn_=QX7`FkHQ`xkvV3RwCRoC+S=3~NdI)?KxI{c3Fn&G@
zJJ~UEvxWXL{0QlbK_fo^H{!7>oKD3M7v2K|^C5*S%HSR;7Ka}`kq;!kh0fdqdq}Xf
z<HC>8nLog)I{HqcGtZ<mKLU;g*@6Xm9T~rdP}ZBxup0wC6^Bdc;Cgv5+z$6C(c@a~
zA&ZP|1c*s^9*i)83nLzJKZ*1>fQ-Fld<Pj%ju>Yt<IiBX0(we}96Vhe+)4-glc$N~
zsXOUWq`$_F!%XdJI;o9%lXx(~1hsObN;3Kz8T|{o95i~68%-jk+sWuPGP({l=E5_w
zM+?uB$FYgc!A?@-Yd<>q5;}Pmot%T(jO!OmW$Kg3sFaL`aHAK=s4p3fBclUQN$W80
zj-wv>pIB>GoHxY1G7Ed*7iBLTCwk#G!f*_1Ulul<d+$f_xJ8TmJ#fsJTkt%n<lkS7
zmH0q-pZNSX{26?r-wHS`T4N5hrFd+A(DwS?i1)&CM2+19hWITu{G>Q__k@*2LxSN4
z@O;P)+yoF_L!tpi9f^7rOGvCnF^@zIiV`6f4yg`%;TuNvVj}tBiS(@?JVqQl5I$FY
zej82_pI?U0L<>l^BWL;h;geBdeu{&;!+*qHDAIXgv10eJ<hdBL|Hy&{<eqOB?eWkT
z)M4Q7Ei#P0Pi!oP-JanO@cfk<tMU7Dz}+i0y7Bw%6T0B{eEj}0_T?)!*1~r=2p-=K
z-yow0$&-WN_=RQF@UIcRTKZ^EwtGb390>P^e!{&XyAk&r_i?XWR4+=xz5FNGd%PmM
z4shS4mDpRYTvQV|)LsAx%kl__57g&p3ZOR)-~XvB@U@>q>`I0BHhk+RARd8+BsMrN
z-yAL!EpKP|DzL()@YrWCvi=SGuD`;#(|(cgygU35iUlAbZw`e2IG%-*N#UepINd0m
zvoM@-=zGP+49uQ6IONGBh?KGDn>ky&Fp{(9K?rcF$k`x{5j<y~-3<V8_C<)RnT|g|
zTSA8M$JqaXW7>+1ku1IePQti9ij7ZZao?p^$h>6P+<vi0@wedzp!nJch~pc1**yni
z$mjQBRf`RO12IFN4+FzMPa6fXhR+cY32AXGkk8-8V=rSW;PYs4?CBwRv(SC+!XfdY
ze(n`Zhd5E<>}%i0c<$3f@Mf}m<<cQ|Qx&@aVI>4wy-HSX&~nfUZ@q(6-(~m07Yy!P
zQIntxfxhf)@f{;P6u#BSS%DF%!eFq4x>xo;3I6`|3rodx!RY(s<DwPg@Vg?zfz5x}
zq%L5#;u_cwKft|WDzNAJF3VIbVt1Ni5xe6RJMw_XfqBpen+7CW#2S<H<42I$*_hc7
zC6P0bCXpqOrlrS-11yhTL-QY@T2bk@?}hK!Xop^D7~r=V6161UeaZeV^C22bhqQ`B
z!ki5!7HQ#A;6^WYRW*RaN5c@*{sEl-Y#hoZP(3Cr|Hi%iMfdV8;kQLL_y5b;@Cd+)
zl>0`&2Si`gzs?a2of%b-!yuBM$5RYH?<>E+TyJyuby1<89?}K@^1@p>9-<MIFQ0q}
zkxNmL`xN@-NaWIF<a)@+^`yuRhI-DzDul4w-=JA`e7M^J=;Jzsw6B5z2-Ttr_FX!T
z3PM%GY>c0amzzb6c5rv;Usi2wKL9B@(AvHi;&I@O_RmCgtVg?#cVUXZ48IDU3ugUW
z(7)J?j7?-)Pqqzc3&o7vg2Hp=VqQNzM2tH#P(TZIhX0It2)GVy=0oN{Anuh9$x&=1
zjnmhNt{vv)*qAENJDeK~uO1^?dxnS;R41(GP;cw-(`>OqU==?BVuzVpPgafGN(i+2
zstw~K!<RAa0x@i#M_GLElb^(sAqXuZRT-E<Yy;t;qUK@3&%|khned;=vA4xZWT+a`
zK%+QbAGCKBK~t(Qic=tz&(|F~DZ{K|(|75eRLfP?9M}orECGs5_*#*_luvMBD*8Q@
zKW~BQgyo;Nx1kS+rzqQ-15l}lig;quJkSE#hodwN%E16dJ?4QAt-poy<p2uZ17dTc
zvvLP|j?5LJrwOtv!Dya{Ue<FjShJLu8=}2X*xIB+h8%7!2UnNi35Tp6Z+{@Iv=Z*S
z^n7tTE2Rg)vONK*nF%Er%n{V|4_=YV?Sg3*r?X(PFqj9Bz9@$($ONcL-G;`)Cm`IC
zgD*Uc{W`wjYXZaA&Eu=@QP{vYmq9Eih+yQzp+bxm8w;>J&)pb?BUPp5%-}khWx)41
zNM`s3G4LM<cgeuH35>U*`fW7u*a-50-#@$nTkX#94cIF!*|;B=f%9we4XzkxLJf1T
zT-FFYXZeet-|Akm42OR)Hea*@s{XZb%~+31CcVWp0-MD7!0}2FBIt(m#F092#t=9%
zVfnwny-2`H=?e$7e00iufB!z@<QM_vr1KKdeVGwycd$c$RC5!{Kpz2DG{wQ@voRjx
z6qzjoC<YMD5nZ41FJ?+~=zR3=76C$1SHgT0y0b`fCky|O+?6mTr6V%ADdLFq{~h!&
z?N>rb5MVV182bSLd^!t@=O3c~+1QE)4phRFss-s5JZCI0I7dtyL(rDtyJhLu!T<}$
zMCsoMC*t(A+xfEeUxl#Xm?(WG`M{t)7BjS+;Ty4bSd@MdmZVwwFGQDq{y(fsKLfaR
z>7NEi4zKk8{T?Kom;PnEbY6%m{d(-*#LU#P^w)!{Lzn(|to|yY?Y#6e(7)TZ^rr~_
zR;52u9Fe8}1JvHA((j4&+bsPZqHfF5kEt+A|3y4Umj0rqzE4&)!wgi+7<Y!>7v-tb
z7mz}=>2sP~uz3PBR->PWO8pn1cRwR$c6YujjaLbHAe@eI0OQ5zWF@Y}%y!8EJwO&0
zV;x;FChT5**N}SXp(ZSUs2hI5L!l21?GAFcdr5Pgd%5iS!qp;C3x{NcPbNpFp=%tF
z@g$fTCHz0$-UU9&;(8xXB#}jlZ&1*9L81mtR5VmkBBEI#@J2T^0ztf>pny<8C~h>0
zU~qReuj|@qu~M(`(u&pgi^?S;mvBo!MG<dUMB)wJ*HB5d6$CZ^=Q%U)?z_35^w<9L
z`M}Qg%$ak}oH;XdW?nU$(by2}aIiv&X2r2+t7&C&;g@L(#tVP686}>i@ng@T-UI6O
zqIYC;*!NB((3#ctQ~654J&(-FnN=x4M^AJ`mGwnc?i4li-Q-EmS}D&Hn=2a!ntfNA
zc`J*pw}b!J9Gc`yEuR<lcMZhGb`9C7!4u4ADpm!V=799gC#9zZ(@pE`mW|E+(CIm;
zMq~!0rdXB@J`}hV1<Fqr>T{Kpg$_7ZC^bg$vKLdvp^94|8YlpZaDe14mH7dyXd^<G
zQBpxHPArNK=XA!UQ9_@Q;e9Z-F^7b<YtB?`oF`y+*qO*|`jn*Qw8O=D=axBYmEnP0
z*A{Zqi=64-!Hm@ONE|xtG1C5GQFZ%;X@n=TKW#8}E=%rad-xdP)7!7x1IZeF=d+&x
z9xLM?LCn91OgvsHNs!s-<CRE;=%%Zr$b3el%WC-}VjHPLS$h?<D7$&PXt8DatUo|N
z1~V}~Q45Q#UipDA$(}ClY_l(x@`Lgvb09L!=m0Y|h}%`=W}skSaF`itDm0@rvXPq4
z)OVUbM*AH$@csL$fo{41N3;n5a)$jFnN825dZxm!0=*FYvR(X^pv3;bKcezCvik#;
zkr-0Hk#V+gyULff(ST*7o#dyu@UKKZ>9p!Y{_eIfR?1Fje+Fej6ot?~U@iC=qL)DX
z5?Hzd(jiU0d=G#zLbg&-?1y&Cbh(8n$HiaS1k<?bE&kWAf@5NWF?pqFed!<RFAK!V
zYm2Q7{t?5icd_$6#<S^Ow7GAckylh{&)P<!lsy@+zD8Gztu^*!j9nwQtkN1i9}7^R
zUK)r^S=UlN-^eSgX(^xAQeI=5oKvGSYq8&+9l+5M4Tg0>+vAB|w8w?V%QCtuTXd&g
z0hoGu%xcM1n%<@T^+{7_mMFIBNv(&4YP;>XQPG4lG9tX;U{|APAT~KCKM)IIyEK+j
ze100VL8(tL#fm!s-F_M_KbW!fF9t9x<Ef+5Qi}T41*~sKqji1j__H!#ISW#_n^{VF
zH4M(lJ_GQjZ6j|mfYd|giOAO?`&a{VxmSrYw1Q-zA`yzyX+X-1a(gz$b2a39PDKtw
zrtN@See0b3a3}Hh?y&Hun^1Tu3fo6P%!FWoo76N;H#8n>dOj*N&1b=;1{1yD?3W_D
z8;sE~Pq81{3iM+A29g|B13CP6LAb|jibqR(DjvPZfo<(tUU&#PiD_yCV4Jal|M*pU
zid`#0hx0Ft2*t;>YMTQjGO>ALQyyeuee0f1pgQ)t>@_-+ZC`wEr<A4ztXC!?O7Sf7
zkAPU!4u@P$Y$ccWSJbz$k}Ak0_8@lwWw7(d3&_s$xu#WAiRi}{rh$J5T16<A-WgE@
zs}Vd~1?Sqozn`e2k3hVkXg)mcF^xmWcYJHiN<Z_E^c0bYQZqK1!c8&2fN%x;9uebo
z-*FUUCFj(n#a3F5yB<*Te#Se%HQHl`6zn(VSVhY!Xb|?d*jg*q^%6o=*|(9R)uiMd
zn?w||=Qs>8`nRE?yeRR@C{;)4Y)z?m2PRQ!JqYKa)K=yyN*zLyT19j1Nim_+sfdG8
zeUm7a(PjUX8ZE+<{>d>vN2xS)8<biG5@Cbw9F%9ruK}Suq|`fax24p~PZXsVqT&Hi
z>JL=Etts`qXpgu|%?D{blzN4^ic*a#IM=@LKA}_!N`g|ok|@>GdH<9;ivuc}zToJe
zqtxHu5=#AENZPL}%9B!OgU}sP>N!+tEmOle@F7!CR6GDmP38pDno@U)GL2K}LD0iP
zDNA&VjJ`!GIM?nG6-s@8I4E^&5~bE3ynjk{kwKMyZ`RLIYW@bH)FdHkzt4_RlsW{2
z?vPS5QKdDd@~Ch?sj;H+sJtG2me_Np5Mu~NV3Ba|{G33+4yE5SWxhLu7$E!P_OlJ0
z-C43R&G5J<LoC{WwH<8bX%A<Oc(Q4I4R|dZY3sfNIzXktFoH!}0}^smdz3lg{2U|l
z2&2*TgGD;DnTJe!Ao{}dJeY;SoZNtgDbBQlX{9Lp8R!YH?n<T(5bVE^+h}Tnkerj4
z>j_dQ{rmNs%->2g=z1y}6dlk((UVY=f_kjvht<IXXQ@Mw;grIbwJSLa0m9Z6Rx@=1
z;+u865pny4H}Kzgz{`G$p`F^$;XG5Lv8_S|IyC7l3dq1K`4c;PE&7A=NM<#pUxYON
zA$Xw<+E05Rjnt4XP>`O)EC)!{{Ba=-SCA@@rXiK<pgqb1sjG&R1KcTf#B&eL7W6Aq
zr};+)V*PH;Nl&3s)+c3irxZ?qy;s6KDl=n!F2fQN#pxYkZxB0i;MYfzSD#TDVtrmk
zO?!lpHN6NKDClGPSjM(4=whtjJp9xT&N&iL%zO~0KMTb&UP5vpmT~bX6y_eGBPc+U
zU)P<`9n+eY-}k$Tyo^5>B6FNQ?OYaNlS5ADbcyz{e}hh-FhpQE7F8)UGcLfoEPxuB
zfWC99K#v@Uth5}8rJV;bs4V9fyX;3%-(UNCR*Cid`b4xG%UCRxGVZWDq2O*HcQQ~_
z;L}9X?Jv*q!nm9`#QNo<HVqrM{Vaf*3{V0iWBv!76b!(tKVH!JGtqe@2~<}FYWxX8
z0#NOE)p@`GwBuE87K2vHv2D*TEhdbSNif#Q#N>dNPk@DJZYUHShL-dIP&R)I4ojDT
z?Q!fn;4CGa!vF_XGX^He!;wDPf3<(K{~D(pAmjJ_L|!^UDnwp3zpv%x0a$F5mrp(N
z@;F5;I&isgRLe^-^6eY|q@W0&cm3vOpmoA_qDA5X<+OTDar^duF4s#>jdT4=%qYpE
z7W#bhCApok8jxIe!F7c2o?VUZoe5A0&Oary63Mg}`vWL5W@QqY9sn?f56BZEd3K%M
z0eMCO)Py*FNQ5<~03^ioG1#Oaz_KG4{dzQ7`3)>ur#C7jmG7FX%1>71V~O(TvHZWG
zsguf&qj?<b(G!_6BnI9*xl@Y$mMHYE`)*p%9r%uRE`S*_;S8e}{5|kz^ePk!Uq<)m
zunCFiy9!bA!rmpT=Yg4MTt?T|Hs_-W#;!U>8`2*|S^l9zOAm(wWM2AA)()-lVU$lt
z5aQxo%qsCVMdtwa3Lom(_k-)pi5g7ztEAyouz_n1WhV|I9Os96Z|w=nG60%qNa<a6
zK%qDV%}c*~9E*O~A3%qz7EY&J;mkCaKCCN?bRO-jU&}sd8W10L`+O2E)?+K}J*t#}
zH(Q|J{W70;=!bZsFLC-|c|aJCp3KTrjALlE7j#Qc_8c39g&h`gnZ}Xn3z28!!8z<c
z%q1Y-UlCXMUquYKXtcYrF=Hxhc<U=nzv#k~uzZD+Trc$(%43SnN6x?JLeMz?+{8>S
z3=8>Rwr#It8we%l?Vp*>S$b65U9J2Ouwsl1{nhr3V2CzXoXxwF$$?c1v-~3{A}b1k
zOWAOU$+*+wbJAD@$$5y%bqM0;M)ea^KQr-TS5RuBJqi$B(Fi4OTIszJHe=l%`T_H@
z8TxFG89Of1E*0tSBr78`oh2E3m;Zs#y}Q98#X@O4{;i_C#`<kLjZJ3Q_yLDcSlNlO
zy^@&zx%&A5Kla0~lk!T+WBqPZ#WzXu^cxv#NS}ru{@FL`veI2->^?Q#*?%FZJNs`0
z(b=EWPiIs{EE7L=l%kr#GuLRJ24thNweqvto<`}5b^j|ntvVSU`0-)Hz28GD0y?mN
zX6n1>9(qzO^*Z5d3c#_#maBvo;2d%vL@tW2M#Hxvh#!`BURduM0u9cY;HL~wtHJ4m
zFy0RYo%+(UkENf=pMpKX9+<J8AQmtJyR<YKy(&qE)%q9ZQ>A>ry)<*-cxyGAE5E+b
zT)C%{nObkY^HZ=p0HkXGzLrL%C*AsNzH;FEsGCxLTA3Ldco9m1C|?G<VwhY2o6x>(
zCG4zmph<oDUr{b~H0s9NXMN{ZWNLhrJQiC&U{q#$mhN!J>|56o%&GAHGZqcbsVMcP
zc*7VI6{U8x67vy;1BXb$qf>@RX25FnU%lEkSF?GgV^#Y_#-jBKhpZ$VG6jeJXr({f
z?q6yL?h^3X`vLE)qI4#~@1TG7*LYHRKaidzq_+CZIi(^MlY{V;_!BiBx=;sY<8!_O
z%tRfb_YNd^$~)&SOp0^+q|CH_G-vPJ2vPA+AS~{qed~&?FKCOefM$-S<$P;Qg(k!Q
zft*}3IwGeK?J9a3-M|AbD<-m6nbde3V33c&*my!xf1}%dIaQJp7MBV$rcHYshSV?}
zyI!RU^;xhgIMcM|s4O&v)yEv|V>BYi<mTNNYIJ}vgaHKes}ya`QO-$d+yulFdz6@*
zu%Dp{*bN_kx$(6QnF%G@+nl|I-B)_l<sTG0S~c=it4&q*p^$D86^6O94S=hY7?k3_
zoO`AI5#V^)aGc*JZ%9FF_pJ-<JI1)D6Gu&~$7OF~UbsH(5i@n;1stnL8=6@^G;6x~
z>d^5wjh{X?I2CiEX<Z6h4)wvRO{pJxG<Ydelb8XF$coN@7g{qBB7T+hz<-VZ+SL=V
z-gDtyRzCKo;H2Bfw*#-~rMSGZe8bSpVZ+8xU_msRCXIGaX}XW|sm>oXcG}IzuPvVp
zO=b=4UO)86@hl81fZ<Uo&ebx+-SSzJ+3>4nH%+=}QgG5(jY9p<<HoU4hE(dA;+&Gu
z3v7pZ6dx9^Exqm1>wYu-`k-k^hf<uq*kD)vTXKDnt#ZB68Mw3D1nctdtbQ=6I~!5b
z<tO33Iu<80iSomP(^3A?MEROl<(FSS-dld~_**7Tux?G1pVq28aPZ$cb`r3uAKE=g
zG{9^$*dxU$lA@T0H64eP!<)|Pyka_9nBceW$mw3pF+;TdvIKLEkQ!3nUpwh~!Z{|W
zrDM|dgGn1m$ArPw&@8resI+u!in9d=5n7)7uTH{2c~ZSVqTZ#+^`1%8Lq9H>JoP#j
z1!PwaoY^RmBrpCYH{IYR5C}R62zfa0YgHxD?-AwGCIc?$BVrS$Pj6{GQSPFtQ&}#l
zeZC`_-2Q7*;Zy^LvM7)qe%q9b(Ehc;p}0i8j%ixS3!%m33WbMEo;o%mS5LEC+#V*M
zUb`ttzEGohf`%^gH4Qbq^0llO2Z!}T4+qT76RfEs$DM>PN2EA|q_F0H*3|OrCY!Gg
znmT3L<na|QGchiND)Dh)NS#W6P;#(ra<V`I3i>-WA(5Z#6B21U`j?MY{NdOqeS%Y`
zqr8&fIDP)YI-)Q9E2iRTJuyH=jKAq-l)p0pe^wHFfc*8;n?$jA<>U$!5_rfjx~Y6h
zYd#hr-*0X9ue5tbwis2n!zc>vyWY5)pD~@iZwDx5+`R!|zqQpbt3z}7C0E?t|AVzt
z&JI{#&E996l9F<3zO$6ZRGeR-y@2orOwHMQ6@)p2wwGm%yB|Q<v^JX7JN{68nz?(s
zx$>(+&Dq~Si7zMJx;DTax<RqC^Oy7ip$B75=}JRC8z;>E(ATM^H5nUt$1QJZ;qFFP
zzp-eY*$F2^*nWN++Pujp>pdV_Jn8*jk*3S6&4Awf#=Yb6_E_7jcL!VFq}C0N6{Z@C
z(heTN{{~m3b@RtY68J#~&Tf$oEuY;Yh~F~|od~VYuR7Xa%R$|DQ{V5fRDNtN_a|EH
zIhcyg*wwswRSHjntFg+6`pklNj4;1Oz@sZ9n8*pBNYwPz7iLN-JXs-+g%tl3vljZ6
ze;1ksoSMUTvMQ*MzXSn<`n~oO7FiZn6veR-IXJKAp2JqyB3sHvsbZ@u(oEzDDNO7n
zfg;@b(~U4gwT@#8&r#MOG|qwmwaW<i`JOQ@fw;wY3i6y|kfivo852A>RX76@yL?eh
z62^p<wq^{4>7Hw#KEa_r2f?E5h4GNOB{m2wIPQ3Ipvs@>Jf`&P?k%BxU5tBXK(|P9
ze&Eqo4wMXJwQ0&3E>Hj;R#Bl^+c^=WVLd$6%=8=GKplr3f~rRp=}`Saj)mqB=;ZL&
z*q%?4#0IWi4Wh4k&0WT=apqxtNRtCcS56pVkHsPDZGRaTLwz@Cs{>}-X0WdvGfoaW
zmDFIC$wnbCIhcb@9hguQA<jJ~AXS}OPD4Lw-+_JE(@FAD*jO-#((ic}Rtv^QzSv$c
z0F7=~x=c;Klj*3CZ*+sB2iOT|fMC_*1B-n(+1=13B^t?+P_U?_rIt^enK4iar>lK8
zVW|y6Z31k<Rlvr+p$;{%Ib%@hMP}fOfm332b<p0kkjbs>xx--3kq*inT5S(o>>6&X
zom(2*@LFe%Cp-yZ4|OqrlJ5S?HGRdhqoq82e>5J)1Leu+dxFLor|Ps&2%Dfaw(nBx
zE<zWOnIsHs(xk->z^hclJew{xEf5)7iki;nqEJI&KJ_vw&3b1eX@X^*jd-@AJo(@y
z|AVi#PyQtf;N(a0i`>iM_IDFtLzD6W>w7X|m;L2(n$ElJ7?w`$aNrieP&88AOmsai
zR@@FzVs-Qq!a#pTgk+~<H@k4zJLvkW=+o5cxnuLB%euceFb8|hZf6<qxTDW&&4R;+
zB#raVD_YK^kLZ}%QI1Pls_x_SSar42#|Kfpt&|;}BxPcB+cv1-E)hy0vTpcBPq@$%
zp5h5tdBUT0*d5ncA)C>jr~a7c9orma(SuEkN6DLm>CK@+AEv4=F?=wH><y1_fvNI3
zH-{$rQiDB3JQC}k3$bkA%9!J1bu|k|3%iE)q;ewxzV-B^L$hf)sNFTSe3c|(_`NDQ
zzNVM#??D(>J$=3=X#lDHz-0V?^TPq)Kk(20d;AML_@DLj`2S_!0pK6QH{kx)^uO)p
z1pPx8pWYQ|wYwVq7iRM@IED}hpL*&qdd4Nq5qt#_`$+eq6qX*pQXUQ1ky>d+HW}gT
z#di8}1$Qt_>s`OKf%~I@*aCG0VjQ&#^fxAO_|o{5Phf1|<6rUAdsL<}D6IZGT5+up
zP7Mg7Rgg%+@<BeJ(79Al7z07(X=W^a`w-Z8##C;#Vo^`+f)cEjJ_p>;N)?N1hOnPH
zA6s@6$hY5;8I|&TpYn%<aT3$J3AA9YVV5lop!;%a2+xAWgz#Jm&(SWFrBI`|euR2;
zs*IGM&oDgFS(bme1XaUdv5bxvxMnti)iIF=Wzeatq&6;h8;j?tt6C^hi)8BgeDraS
zx^#qoPDJdnoJLl$N~f{BOp=zFi;QRG4wTWYv=Euojc%h#jc#Mc5FrBN9}ueJuaT!N
zAMm$P$D8qMuUrO;i6?R2KnI(#a84ysIirKT6$m0cIA<EbR5D^ejaimc!0V!3!#QMJ
zLW@j+9;8IMn-A`OAy9o$2xD}cnCUzT3kXX$*hxrVAov;Grsd1643!>;rGLKy+CG+X
zOkJmxC47k#lDiyXSVl)-i`TyRC74Q)|4XRVCt)49D)v!N_$h?dc|_m!+}(=xn2Hq#
zpfv-U{W;cD=xpEhJmdMpUdb4PqjX2siShLW0sBXw(n*=_-~lW(3TE%*Ers^R&=3@s
zF%*bBD^eue!yHR;sBkVU9y5l);@a7_?7%RI4g4MPv+}(|$f|&D-th~@%)5bBW9H>S
z9gUeWd)khf1%GwN%p<g>XyQXKtHF30^xiRZC0X)67&FHR-2IK29c{-9Y%NIA|9@j9
z2HU0YdO2*vfw>+sv3`SUA)oCJn;W5>x#{?OZKNg-oAWSiC`6ol-ErX=FVA!^UcNxO
zJ^RIk+;6~GcE`Z0o^S)g9)niT$B8{k5o8<!#d;3HZUd!sHNwW15Xad&`?r?vT(T&Z
zJ`qJB-NO)87tWHU`wALLv5aB33qnbT;=}3!iy%EGQ0xz)T*c>sEGn8&o~Zy~cPiL9
zlqI2usDoH;3=p^hn-*ZZnhvnloc_tb(x~q$Q6`mhfEd0N3>B&=;r+k3;{6)<LGk9y
zRfqDjUR9boQ1Fcr{yaUNrm{H_=7?4`V@e&vO;_zfd$cX42ri7(RDtpv0ov8TX2IWa
z`&!^)9zkZKw2Q(BZ$_9?f{1amWb8utF_|2=jwnU!S>?x$^-z_nn#-ZU^|zh@WO_JJ
z;-gB{jDXny*AIvXX9YIHKj75JU(N<>jlU_ZhhOS%`vokMDB(EL+(8MiK_RHW$qOmr
ziRq540N=u*A^?=da6cVt8H>q4Y-x_o_!A(uh!{$|EG5o|_J(l0{~27qEGRn=Eo1X`
zq{XFyn-*#oi%WzXb%o-dVX9N?9RFCMj=!<=$wFqe7KC8*P=e8jOc)c24EE;xSXA|!
zQ-kVizXukeix!1SUob{@_n;#GfIq1O|JT0(S1gFupdZOSyAD<dI&+e@XV)BP&jw!H
zVb1_pI=T*Mpl7T9DI(v9SfXcV+|RtU9O)Ifw&@k_7VU7cZ?9+e>(Kn{(Koc3vH(to
zw*7ms5)khAlm;9IFlMEGZ^Mr`!Ki6qREOjCJuF?@jN5Ca3$Sc}{~(2brNSQ;hyCdT
z@HVbM+&)U$#=zrVl4opQ3%#M!>QD}2E<QH@qcM4k!0UKSqT?8RFUmkcV1Y8&ARZir
zNLZ?eWvOR4PQ+%&G!8+}kgOQPA^9H-$K$Y;G@}@6GA6-;;h1W~BJhsIR>N@j3o;C$
z$-mr|mSSgtoXN7f2t1eJmxVI7!8<;|)lok`{OpnBt+FaX2$9M<{>IY(Ary@dzx#K!
zBcxqg3F)hge_lv?qem3d5)MCX^>8Li7@tARqMWQ;;}a)e?tZ+mtr5tHof(Cl7q>rO
z6rCY<=`r5)r8>PL(|O`YPx+<FeC}3}?|g`KJM~ZQoV(hY3X9I2e{a%ZZr26l>8qWJ
zczCUIt0z3u6P~TZtb4B~{eDmSgP!z1c*0L2EcWi|#CEvEF`%2F7Q~BlL{76OTiF9Y
zmtSn{F|D^u>qCFbn9Kxhu~(FRTx@;ABW~-hfVFC{wZ6Txe1FR@3|8z`B-lo`757Yj
z!81HVQT|Rm&(6liscbpppp{JrEX+x(2Fp>mkvAg8{>I1*<k*2|Ag38x$X6V_C=<R4
zYCjaGuUONpLQP<Wn!u;O$4rNP(6>&Gug4JHD5G1~oKk=e<cy*lVyU1ST7#(?<1QK9
zZ0$mjF6hdBFV_cc+sg-hwWv&LI2RXX&|1p^SvTq*W04xmrOC<ri(v@Ng;19UV5Bn;
zKteTsHR@ybgZjl!L#C6~Hy(j#SAPCq^Otj0afa2PU4gIS7qyE2s#W~NR`J7I#Scov
zGYS8jCtdim-(=qfRZ$Gf(T7YNQ(&(`ygw3MPm<ps>|qC3{=@?;{|$8Rf#7o&`g0)V
zAB2`akn-0ZVEI*0!3P5W!2>Km{m%ykAB&dgwJ;>6jxh_Gf(5R=H$yWsQ-a4~u4pOC
z!acIggijs)_vj06hSp}IPWgMqQM!*r{t?V9#!<pFWSV(j+kbf+XEK$zbDI)8wHSvD
zzTz0v08is{vANR_?7_lWoNCNaeX279#c@s`yQl7X0lh!$&%i$@ffVELbyy$GT1P~n
z0ejqQr=?V$mlEv3om$GI(Jd_p%Re<Q$^NY;)zz;**bjC9@4N$s2h!2yJ37_J&L4w4
z<?pcnP&ptacv-Z_7b|VC%6!}aJ{0=dcNN{Y3Nd{Y(GJ93Fv}H0i@HiVZWNbq)|t58
zo29DIl_kt^mx9frmX>B?Ws$GxMZ$^4yEd(H^}WU?bf+&R2&FsKCpcZFaVkl|$%x~m
z;3wnNHwmxz+QBQ{zOQM1D}3H`@$p{{OXjur(^9y3T5SCY?<hsF(4F}pLg!#EkjxLB
z2*NGfE`&3}`~-~`Q6s#GY9~56t2jCZXPiyJ!RqKw`(Z^(e4+1CgIxz(&F?lo|K5*V
zI|q*pM8~HD2J|leG7S$5;$}3?kWz!^KwMeBe1iZ2!$uTG%d<%a@Z%~-Y~0<)+4Trz
zv66K6WA5r&g0o!N$}cF!q5xJS*ax_0rk4LDp5SYGg8f!xRo>%t9CG<r68XXi-v%Bw
zZ4iB#Oz(Uzy^Tl)%Eb$0IX_{SB0+CwyEocZ!xQ-?y)Sw1m(aTew1EH`VSe%`ncl1Z
zQ+j{&xtHD-w4t{l^j69t_DA#(q^Ea&e>9k|IGyRpnoylg_Y>k|)^vY5p5SZx1L+R>
z24hW5oZfNr$vA68-=4|zJq%F~eZT97zDjzdmx{z4CfhKs%pGAy&ol@0`2dT{AkK%T
zK;Sa;2vhn9491;C5u%&~Zq1?7Qp@qIs^|m^d!MS)(ukMl{t;Z>^kuw1<S*K{_9>ME
zr><z9iD=C%u-PCFq+lQog6o1EJXGRFn1+lQ?c^!a#l@)_>H2sIs1I;Yr#h*_sgIb1
zv#EEG?!}6JP|=~8y5iAzj|^z2bFD#{Qq_piCl!~@F<-Rk=iOMs=Uj_)sXxe<s6PYR
zQ`h(D`j@EsP^qYYfkdSKSqLU<4sD;h6agAT3qg^QTxEMGfKxAEJgC1nZIepFsb+#w
zKdH1_Z`!F`g4KeA{Hv3D^msPrzIPEzO3uP$c87^ppj{FWC$n}deq>3S@WW+Mgh{a8
z^|V)*@FnSGp3q>1>hM5fgPGS5e6l$-6)!=|jSj_25I2Y%Wh?IRC5X4sZzEC#VD3W@
zlNIHJdjrykNPZ>q)l(dY{Q-_4#iXdKpW;3&$?Y29Oq7zgjYv9z?YIAHfxjD6bld+3
zgUFSh0qOOP#P56LtM>1I2<<OIkwp7oU_vhLER4%Vp_U6i!UiVbP9EeUS3##-^+Zs~
zRsTf&8U5q+`|J8$RQ>)^{|Br)rG!u3KvVVeJ@xbB_49T8tx{!1zSNgT43u22MY^h=
z>#2{KP0MwzuK$v%pDXpBO|<{Vc>Vp4i`A;W8W)Qa^<RkB_l!#nKev5%TuxQ>)wsMa
zQU7wczN-)LU|%bJU_`Ev^3(}NxJ1<z$<mVB(rCiv(#Knq_;L2Igg4;^B;aw8wKyum
z&pY_h<5#?U|3~)cslPuU`*Yue!p9{0Q}6GK-k(TE3LiI;3+XMD?BC8CFYum%aI81N
z{Hh?&$*i~i*1t^ajS}l^Ih3^5%^_W2{}x+yW@wLL+=Cq)41bt@6~>|?s>C<tE0*Yw
z<#<+!wP&!kolZZ4u{y0mQ5-SCR<To_qV&Qyj4w8LT*3OI3$isp0~5T!l032Mz7c9>
zVq%(g&lO=!myg9RbyJm^r+Bc8!-ibCs|<#p-hzj*!v#pPN11Qph7lsU%qx_gflut|
zFqTvzd{Oe9f#9)y2566SxKp0Q0pYKZk7>SXBd0LaTHDlx0c^}PL4mdH#LW+&Dmr;1
zR1|;Dn#<}t?5hxOrT?+}xpqrhikcPq4iF}*J5Aio9mVwp{F)G;8(cO)M$>=06_Y~@
zsv*R(ovw1`Vdd-UKS3^<UJiKT!Y^esydoDK0))4*THu`^@MdawMevUYJfOUMB3L!r
z`3p)V5GH)rr(NU;8tPJCAbJW13g`*vzCgjJMmT_dcU}~(D8Zs?n(!b{U>o6IVavBT
zT7=|sNzDorG~<byPD~wAQ5?NVQn4OG+3qsWjIIboXHLUIFiIk|JzE^RbDBLBJSrxs
zA}!@znobbAm7MJ?I|`<S`&v7H$39{cAM){eF_bgP_owG%_qFI<ihT!Y%B14h(22zj
zLra;6UsHagw-%BEUCfo+JK0YI5&%H))CEsY=zF%8l4$RaR_$RU>OxO*8PeQ^Xf3}u
zdaDmDaW8}|a?b*DJ|1-QgG6kxcy<FVA3Ki?7i=>kK6IC~D1k|HxEWnoiCpa32J(Kg
z)1}$bpv0Eo(TWb+%;>R#mu=q<8br2~H^D%|gfug${~`lmeynVw*-#|?$FKBXiWhtc
z33M+lKS4;*(V)SW;4#Ynw-W7CdfI_OG&9B1@$RY}2yPkN3X7!I_*K1z;cgauXoRnZ
z7`Sbjv)4-RPb1I)aJw}kd{Rdw1UBX2#nDCVKjf5)^mLRej?NNEFq(QL<P{h1&1f%0
zhsKzu14IpSFw0Aapm28<<X{NoV2IEGzgiA%@yNj=fP%`LfXcfz9jE2d+fIe29q|7G
z4;eC~nG4Vg_<sxdKScAN9s-*GQz-Vr|9YW79>4_(u*YCTHe<@+$SH|@M+#^O*d56G
z&h8{=DGBHy*l6y5bzi*KMd)=B_h$*&W}??c=(XZLes!<$-e+9;p9K`GUWM~X;G-V!
z=xh?xdnxdE<+~^!V0Bj&)3JyG(;@B;ahj3tDu$m2fPsR^xON$N7yb{TKT5#yBZ{L7
zaw<V29w>t4hYLnm!~F-sn1s@vXuQWz@7%NTa{68&6w<hlgI;UjAqI56gYYqmcl-qa
zD~?``iKBHh4|tnF*U!)KFgizCIX{PupzoMyfjP!GW*`Y*&q#IuWkgl?*9f-<0BWFM
zT#nC(EX65=%t(P~wye7!XIai$Jn)I)ye+h*|5c*{dE4D}y7S|`oC*e`C8a#5HX|DW
z$5~BZ5;dkwP#44LDJH)Ni7K8gf~I9G$mEAP)KqEBi=jvsUN}=>ffmmeY&D`su*xuK
zCxwuU4d!O;FPpq^f_=E5V*=3|MsZrebkS&ppM%URy!9TO5&kFTBI?op|3L4nMepDG
zJ#o3!<S;lkKHrx_&r#!ikA>{WcjZm(===MGoxeoizf366TQk|n@o{|*yCU9BwWpm{
zdcL<bvwuCG$D?VFp7#Ts@GGlhJ>L^$K$Q0Nd<ru=(DQR37*zegOwTW>l4d&4^K+T2
z+i9)mPgm{iPtSYilV1bUa*91IBtIg!MFDFsTxUd{p`j4MFii>Oabvc>?5~Y}SMT?1
z1M!mgd+w*jfeP5Cl?u61;dHmclWi)H{v5R&qFD1n72P0a2x7&dHPggI`s+RM(a5z%
zNp$=u?$B@x2v_TNVCdQRq6jP+6M2ORf&Q}ybRim0d&>Kj=jXoLuRQ0Xg^uKTDTwyV
z?3d5({yBNBV+lQOTFG;nG_!wsmS@~O^88zX6T4+U@?3&49m;b)GdqyyHK55am*?#f
zX{G~tUdP-H<oRmV&M%bbS%4Im=iefdkmpjUv$paa+o<Ka4@56no<~B0+RF3QZUvsK
zCs&}tOqS<6F|ufcC**l+xV1bNy7G+693sz4Pz3XMHYaUPgBcKRID25dP@{4<ovZP3
z4^fPka~{G02#-Hp)$asRmUbvdlAb00iK(V3=GyOx^e&t|c*{=8d*-c1q&eI#z&zre
z&-Ot5J6ll8kL6`4Phv);aP#~Kai4tmv)P|<|J(@2I8kBznb^@xdkZ@^;TcGgT}>mx
zOPbL#wVAm$m5=VZ2O8YWH0IP{=h6sYj8f1>;fwGaFX3~}LR8H=KZC+~m~~wgW^W)0
zxk%9{^hEIIQP_=+5`|s(^`Ov*sO;CcXv;SE8IXbGJ$UsfzAls{*FfV?xTkD{n;<30
zbt0g7yik^Ni#Mw3%xhEUWiNVJ&NJSq>|IYzZsRg<fh^}DL_K@gS0zV}O^#*&Ay45A
zkP{_>uK2Bv=XPy+AQ2WD#S=9!(R!k9f~#ELa^b9x#Cq<0QwC^oXmqHYgk3HtVcz{`
z5B-D5=S_dtG{hED{2a_}Zm6$m5#`;D7j<n)oHyzF;@Be3PVyGWg15k;^yfcLO9|JN
zAB7i8o<NXO=PfWd@JW}*1h5|^ZjcheoADgO%?QGbN*{;c;4mJ0BXTVw#k9S+mc{c9
zJq~6s&x0(?1OT})0<Dp7buc2YARQ#m#z~|cFU9?72$5E;4+4R=x2u7agS`hL^(jmN
zerM(uP#FiG>8WVdd-%zQV{hn&b9BRa&F59Azg?~q0L;arfd_U1itx<diDNk2DYBpC
z+<@H4Tz+8I_W}6~TAJ|ldPm{5Jr0C<1pOb{i^{q7`v8TrIJrZaW1*V+j7VsEi5
zdEgr3&OYKUHirF49ClbedJE3!C-X<2>%RQDbS*eg(R1LY_0oaU&;ht@{p;^=#M;J*
z2c38h>~jWFr&P;%uQQWjv4<tz1aHt@VNy6n<$n;*kIRowZp+7`U7ZU-0((lO5@~Kz
z#rLan6v6mH2MC9!>KCIF@i=ofW(Vw1Ee=6be=Y6He$W2p@uQF8^7s~l1hgIIf=3=7
zmJ%Y5&mc&7oP*!EJpK-mBze3AwEC~f<I#Xlq%KoP#pSUt(p$^pkL%jXqqCXy<MPP)
z5l@@ktk`)q>`TgHcc!9|k7<K<D34D7{QpQELw6D_S02j*=k$^M(eikr>RMbLFKR$u
ztjD99_|q>3CGAVWcXr|^bmD&_k6UN{5_vpjmcrvn{8AoIfm}9u<T2r|jxEU9JF74*
zlJsPS=*gEph|A@l5oE`H4Xr`Bq-_N~IaNvoXFyNRK#+1d8Nb0RwVu2IktDhNq+W|h
zKLBv`WJP>)+K4=gbT}UpBdE%xZE7`sll5ht=u2t@Aomiq0rllg7`VV>FEyR`%g1nC
zwC^Ul^3H8-xk5=r?y6n;Vc|VC8n~^WU7vC`uT$b~6*f_LDHhh-Md`4Ta&AM7?6K3Q
zk7ZRn%_F|Oy}5I5+yaa46L@$Gpo{$7oJZ6Hm5*W25Q{P5BX9BUslHc!7t3J37YTIH
z4xk9`l?P%2^F+g;q<ECViH<J17TzCIx6FC--0nSteH?fO=m{A-3Px?f`pJG<>!F8b
zRd5F&6zo<9185!wXs_E@3wOlh2L{=a-dU%^UQN2;x$8ER*MpJYuF#W1h6H<dTMK5=
z09pq^Bu@=ZZ7m+|(<ECk_1;2R&SFF-RNF78iOICm3*f<EzX_%~4<Jk1y-I)cuua>K
zwPXWc(c^u7UZ;8k71||u<aa<D^QejI<RG?lKce)X)A!5A$fQvnp8$N$uOLV9Xjl0V
zgFPCQqDtp{aDp4aXHeSG`jv|IY5k~9Z%ykE(}mWFdF5L21(N+CemRb>oyl?RHfy<o
zb&2g-b)NRi#`jauT-px$1C<yWx(SoAXMA5PC1iZxh#<%J)%cB%@5>QM8s9&y`X7w%
zkKR$py#p=g8Q)(a`4^7w#WGs{C*yl5=B;+cE!^SwKAQc3O4#4{E`s|6#`mv5PmJ$#
zVY_jB@47?D!(y?W+REI8FcsSy->2yG)-q=>y~FX{cjf<Le7`~5c>y9l4_eFK38o&<
z_->+&)t*BAuZ-`RpolxH_CLPg4Yp6~*K~SoT0g_|R^$6l(jUWaH-0(3?}WbnCF6Vl
z_GI4M@o{}Tf^6Uqv(&gAf;k8C_6{uO!*ya$ev2T-^%wY!+mqW6NgCHDuh8SVw_av&
z7u~zQ>Bq_J7p`w60XmUNRY-|F2@GyWGFC5FN$&Re<c!k_Ifx9&95wKKV1g0R8`dMu
z#;u&80P9bj!L=lxEc3rCV~#~I5bblra&0ySE0X7_eV7B{3w5=G364#kr8awuWI5{*
z<+{_pS=oA!<CWN#uycTb^OR&(VnWjEjQ5eC=BfBZ;%kb;!(E#mvk!$_xDwa#3ZsnD
zgMMznFD0c6`k@IHojM=bzdRHJKdt94K#*YWh9aS!*YeO=N{Bohfgt4}4Zm@DXo3Do
zl7}Og{TJoI;>r|A{S#W6`W=G-Oi6!)N-x%DhUB)Dhnr;{9e6r0fjsnpF;02dK)nfA
z4`HV4SRO84`rns_>!Fv>GUVY?ESl}^K;S=;hkd{OCGt=-UG;Mrekl({<sNxxeZK$)
z`dq|l_rP@VKZdtA9y;5nU6tUod?6Fm_k^lmOG_y~2hLu2pW+nl0@zDs^J_w>%<d)*
zTvs`7PF3UKs?}G?dwcXZ8PJsQccp0aHN@68Msl$q6xx%HZ;Dc*>^azoyh;@U90sJ%
zeU%%1aycJp!y%AQa1dQs-+WLun{Vf2@sd3_y#@55Hp*rfU3a<<Fanp{%e6$T-{0@x
zHAH+5o5t}Hj!pd4ahin3-iLP_Pg_41OOWopbCF?0PFJd|Kd%tu{jMdDM*hJc?nCvV
z;t$Qd4}<HKAKt3y<=6t9+BJkN&|}p%S>cD*W5^QpAo2>}VXHDFwfs?)1?u%!&CW$?
zktGxq-F6R_EOfGPrlBFFFVJ7-I>e|4vr#z3If!}H(pAm1Y<2?wxqRCQ(B<%R1K*6u
zkD|AjPiL)N5@SJf@In^st6cC7MceYCpYqzuZdxISFs5~WPRk~?h7+pubDXo$I()XC
zBOI)qm<4?65VeaT0(fTNQqnn=@!AYLGjN|G{j~_g{pTX&*q@_s6o$Cx4+FTZw8Ivs
zGB9X1WLbY93zLifg5%6Dy&P!usc0Zekd&^%qy=n$k*-cgcq=Fu+SA#H@b#}iv~yrU
z`gw~fU0)!h1QJ8PaNZjr5!0vQQz!vQ8P10rWQj+@i`W%MQ(8)MxsAMqN$>;DUUH1;
z<n6bkcMGSGgY<|3%&BnogOBD!NzlAHj=aiE@G7SkESRpw*o7(&UMc-h;2S>r38D+9
z9wM@lyV0iN*67JCEx4(L0!>|He+dy_4o|W8BD#D$CgQ@;`u+pn=5bC0>)ib>1zqtv
zS3#1Gn!5Sm;2+VUB}Yj&`}I~F+=m}~CHmO1QMVqohan2KJ5nMefS)DDDT++IP1wGT
zm3BhB+OYjN5-`^uvkyc=D9awn5JX@Wghb^H)+v*bl33oT`%WG4yNeOt0JKAUaQ0si
z=l8`6$?x}&k<4$f6l?lyFuXx{9%nedbVo+NE)*c6pH|&^paT7Q>_##g8Uv5G1jjdr
zegj5dBMH%Aq6J(=^RvrT1p^j=(UTOT!Cndh7>zF!A5}Ft$3P(}g}(vE@)?T7xBmv*
z!YQuNbTH<3V-eVk&4{>yD#tky*{j#X<S!ShF8>IWgA?s2AQ7!kVi{MEuXv1xO7TmC
zF)}$=Z3kUy*`!MU-kyON*!sL;YX;f+tnUA<Qs;5j=>g_<#8wN(ONy;`G6c3hNBt%&
zpQ2M9LP~spl%rxm`qQt#;?Q$cm-zm8@;n6}bqp5eQLBB_kJJ+s{2x1C#Zkj)s4(R!
zE&oK4_t%=0iz!Y7aLeReG?k<;;8BsvNd@~XN6WRQt1$wh2r~WhSd*M@Zw_Y3xDH~u
z5987-*y?)tpTq&^wT5FPa!7XjT|ZJ5@o8OvhoA@_9{JWb-XICxh>7+!@(c$GEy0(h
zhrXug*$>up9eOdr@O?ksf23=dV(0|Oq+}5>>icUXNgU>fBWV9=QrxN_L++*;2Sa@J
z9SBhgr-7ktPnZgRK#e8Y3uskiPeSX4ccGt2&WvzmJQKb~MtH4=S$h3bFdRcO^I<&}
zr37<v<3?Tc`QT-}wCs*ynq-o<x#m;kUh@e9Bno$Y&u8pQ5TN)ypKJDe@1(~KAV8?y
zdnbD*cr^|SdA1>|-Eo(EFXh@|%!VLi{65fK@dTf9v+$ybMhNn;V-_5cyu`@x7HIdH
zk9aSghGI?6c@__=wEYk1Hu=;B-<$AXUX158M;2Qj$Sc%5hRcGh5H8p>nt2-o1)mvV
z3uG*|))ue)F%4{<h4-NUjR&Y)KRk%_a82aot1Tz$>r{LH4`lA4(C`~8A=YqQHp1VM
zIk<NcJ;Ex&*eSz}b{xs$6CHzm{4qnTam>9IC+Yu4Eq|m$VHC(~E{UC<Qd00`a1g%w
zqd2YfRrHcJ_qj0mPs2&{?Q9<s$h$Rp1RE&$ynJJch1++bHLwcnJnVwKWHpPuYsPT#
z#==`{Wz^C}_(kFrEtBaU=<br+)M+_l?csV|CVicKC|UkE*M|Yr6!t;D({h@u#Eis;
zJOQ|S4iB=s@)Z3P#d=g%0%1nG=eSHeNPz{t=F|yKU&rv{WU~sts-Bi3X~=m2$y$GN
zR2>Eha5&l%?RPX@ll5k@eLoKrEI(OIN#f-XrGuF$p*{!WB9_*l%AS80NT~5A?Y#;V
z_}4jU@aG0_)exS{0m}~LHR1vMG_znGoPr>gkTaV15IbzYe`^{=>)`4K*Z9<RcoDJC
zc(6W@_o*5A&WO-Q4TX)!Q7A6+VNu`j`nE(b0^ozp)b9gUs^#~Y4VaN2S{I3EmEfCC
z7hro#UJ=>@mm)Lur#QaRi&2_|A%SrB4z0?x%spoG;$}RdgHh95{t4VJGE3k<e`<+k
zLt0Am_D~N-heMQcVUto@muZ#a#=hAcz_r2cfz*1w&(pyF-r^HfbO6BwQch!54qf5Y
zv=FvCo=PAy%QKB`82q6%xn99^=~=wIS<&)NIhq*3M_{_<*yF{_;8U=Y(ol>~3B;yl
zI^RMd%;l;^OK0G38bj;xt=KetnXzdZX7meh!R-7YRG1#mGXLQ~eviI%+24%bh9?7}
zYPG|=3~=!e-3f^llPPqkjfwEqW+O6yY_m3)p<UU~T7hWqgI>f+I|}qLcVWl{4`~h!
z;<`+xvNDb#Gd8`^z7d8SziS2ZmHT*1LcX&SjG^+(;JFP$_+k_2QIE;?8n6LwtQc!t
zfu-pcn0#wiTr=s9fZ+>y<jD*w?7|fVo8KJ8Xv@NAGa_FR7$#(>fiHs#m1js;?o)6C
z!#Ofoqs;L?dZ%FASrfdyQ?L<Hni|uh%%S5!=48~$XJC|wWgf@Q{hkBplxeR3*Ihjd
zy4nAih-*I;xh%-VVh8>*L(wMn!G+Sco-=$-KBD%gV{slq+Ud)NMkEc%&T@!^j7n=Q
z_R8!-F_EjO{i*K|LP{Pw5+4sD59-}j&JC@WO=i^Z3sq1U2GiyP`=b~o)?QK{Z0h;b
z^W2jehh;dp-B8#JWbkn69{$L3oLt69RyO&ar5~us<~y1v2oH?#kH9~```*PkE$fQ4
z*G)bs<A+h7l+I*PcMZQv5?#S+whyQ2NS;!#L<%8Ye76L6!j7kO8Q~s?CrQ|lxP-xC
zO920qZZWZJG9u5RE{18LJ51HQfflp0<#UX}JbEyOs+#F<;!67>;T5+NXnAp=3@PE2
zv^)tW>^G^8yYa2+AEoQVyAJicB<l0NuUBZr{zLbb{R>KHGwdu4K#6Vd1b|}@)bgmF
zShEN2m7$X>Lx;W^96I38RwK-fPIsOvk~uxH7D*7?Ky17$Z7>0AiRA5DerH;Hc|DGD
zl-h>`VO_@OC@Zs~*?#WYI3Dazwx*}_Cp*!fY~3F|wNHCCTj(uR*&s<m)l~@E?_bk;
zERYCJcOan0f^yg3CIbyKd(O9jDQnGpbtu!WWC#PqhhfSx8(}1K2u;-K%@}Zm2KD(R
z?W(uk$3K7I(3xmjB~_kP1U|8g(PPFcAaVF~Z)hU=yl69AsD05=+bWwUW(U`&7*ma?
zTY~}Ah$se-%^y4p8u=Q(nDB5(rHT~+Y@+}>2f+L=*#y|S&|d6p%z(6p_F`w_g>-_f
zs51u)y!Ovr>i&~M7gKtd=Fm_oXk@`(gllpO-%4#LszD<N7tI&S%cjP`gb|xL-`&&r
z^CZk7pI%Ki6xDJq;10^sykQc>M_sYda(*P7Xel~V*19uTL2kT}%f*bd@Z)X_Tnez5
zqw0}lzY7k+ewDt^Qbb?M*TW;VCJ^oS@SmV&3U(Xex7eJgD>G}*mGxMM>#odA?g~gm
zF}yr2xi_09qP-(@Z?FLpso)Ut^y)b@6D^r1gMvPgpnpvXj?Sr|aNE`hZwE=0b?S_P
zhyibGgA~?9OG(f9M8Ybo1u5o(6!RMjxx1GVMM^`VWRkVqH>xk1mlCR&2c7Y6a<5`8
z=GxL4Bf<-_p^7TRrd1n}N#q|Mlo(u9UW-qCf*>WeXf8&q@v>k=B|c5F7TqYB?{C4A
zK9w`)fgGzxU1>z9M+4Dv$j(R@pSZv`z0%$SVQwgrQ*z#zG4bK(;sLinRepoQW&=OC
zK#tDmgxoM>4+H_wpf$;4YZ|mo(K^r6TtJ4z#&5pL^%QjXnJC?np~O5|HXr@O8){`W
z4jlxw{TeN*!6JVd+|o$}BRm3GYEp<4lO4#{Y5{XI`K+{D5od2|*&Hc^3@S;bGDRJ9
zMLPJ=x5YUUiXnM^JR7`HemL>>d7#T6_cN=Y5AFPaL%o&M3frSV9i*Ed7|`!%;E20&
zfUDFY7CA$dKIAT{KRU82Xj>e+Msy;)dk^MZpnP88{cVJKL_{%MiC<UQ`<z!01D@bt
zjs5VI3|9*2V-?l*J)SrQoTdY|6);}K;NacvZ-`dpO;pl5yulwxHWVR6ZjcB!4~r_r
zLK%>Lg;baX*-{%zCybK4!=g&(JQPl}UukzvwC~5o?vmK8a<LpYq&mX~MNM^Qr236H
zYjJfBl7s3+VXQ|S1l%${XA5H05Y)07HB^AhLzG#k4@^SW-UXiuP%dtd2*O-s%%qG1
zw_b{$Sg&C)H9)ZKNmqEvLryxN&!(qIA8wUu`kab(;`He$5oNp$NWTDO%O@$a9CA5O
z{7F!Royq;r@z7ZA{^tbISo`o}uLS*&eY+?7V3qwbv!6z`Y>%Q7A7PqRopP`_&W-cO
z(SY3xMW7iMNv({d@#AX7e&eK_7YXQ9^fy_KuZLti)6oMpx2<;Pw$)9?Kx^09M{Or&
zGi&U_7{I4;&$&v?Xg^_QWBg>q+tfWt>TdZ`>dv({sX&drURU}jq(X|+BB$w#r26!K
z=_Z+vvKpFL&C>Xd>dEvK#N8_MbbzYL2y?rU*isMJKQy6h=&&n~EUBhh%(@%&Ll=c+
z^tW8Js09Mez5adcqC@t`0TLfSl>?-kq07uDRM&v?kxz06L+x`h$%#--kZ23S`k?m!
z{*xmlxD4&G=RyL+F7eI@*dmM{Aw7H@&johbyT-F1EIoAJ^Y#OugE$+*jD~4+h%H#1
z?P9f6rspAzz^d*EVuhX8WUMfMwujY0?O_%1V3n$|x*~y9eJiZw-E6^Xb7&8YhQD#5
z;c>_m!D`k<7!9WiRwJmoJu@w`;OG2X&a~XJ=0Z-?m6+Pp&0;sql9|?X7&2un1b7~F
zJL#G0rDqcx04mOzz#*}J(cya&{|9}EfsENV@+^Y#-UQCs@QhM+LMMJg+#}}$3(V9w
zXfq-eD68^$&^1!>1ISln{rKUKwlWln>z@wWr|qlTU(#Xw*9iO#;HcX^y~+~pfABc*
z!&f|X`%er*`!{;qhr9%$6PhVAkvB2SqeC_aqPH}XTajTX2&Fh=V`$bobnP+_PQ=y>
zeT}yVK9z^YceM=9#FUG9oy+e!dnd*dX^#zwo~4iB>)!4fNiUkPe%FhoV6cOQ5{^X^
z3#FalI^lU2N+T)3$qOY|5&%mHaUQ37kI%Qz_phK^aeB+(lXd(z_WXaKzxd{zOuz>d
zXnZjeghr^vKlRiVHy9%@#(&XbqyXhO{r1H#W(x=~wq1St5X^&Tlz75_bXJ|+8Dini
zhoqf%eF8!%O9biK6xk^}b{Blg4PI=Kcah<NldIui7l&ggK9lEyFGhr~3+tKc3z?kM
z(U1}TKw`<~fU!qeh3*<kkjqNlb3o_0BzlV;^S0|5E%OD)zriHk?s1ruWT7P{#9<P1
zr^sWaFyF9HEq;C}M7Ig^nrqL2-yO7LO4Zqkb}mk8=lrC0UU`H)dm4pRJ2TkM(I}$Y
zfqj65F6UB^2ZI3-r(Vb$Miq!mEW^Q~#GS9puaSB5l53%cx?Khdkc!!|2k(YkP~APp
zbS4oQsJQr}C9kwjI1%*41m66;{M-o&Al6O&)Zhmb`UqeEMCx$=-aYyrhcK2pLy=?K
zKm?^$2yPiC;>Vr#FHofxBFSEbQudGNee&FIqj_gM{!?@4+NQ(ZfcU3zfqNqCQTvFi
zIOXrMU-*I@!h0EzX{PVnO3#ob7+o@s<LByI_Is|r)@?*wlGunovwacZ)H?cUhS1RK
z&}mG`wX1bXEmBzDxfZmNKnVhhpNzV@#>Li~;(uPtsR2B|5n6nT59iF234@!xiI;Js
zrF-zzht24VtfoK2yX9(!>OTE2chZ&=mtAk82#$F(Q}A?()We=(_*^6=Q@J-%N_eN-
zADU%D+4R89GJ=~CJO;sHYKfO%y~zWHIAhI@4y~0L^*vOS8TFkWTtE!P0wR6$A0X1<
z+X)iZ$03*+#KU0dd`zibe?wD3J)A9G2KuWKWDk-z;y2-Iu=sGK#+FK|yEH%aO32C(
z5R}jMvPUvPOz%`X7cC2-*pwV&p9<@WO24oes*PI;)%H<H;+=fV8gPmY&8R{*2eBis
zRe({543P*n9T<64BIrCLOA(1iO+>8w<b$7PG*pNfk*{MX8YGHPN<)an7@~p?PZ#q{
zl|o&j?)KQr3w}>W(E;=K{SV=+rz`u010N7^LN%X4Cy}r&u`40s;DwLc{%qhCh`y-u
z2J|UIMX%sMhS(tgjPP|}6>ex^C$0$RMe50YGY>obYfYYsh4vI0k@xUzP`SZ41)uA-
z7mbvbO5Iv49omy=+}#hkW~d?4o})^PNiLCX-2FXH+)*Oio{kdCH5S=ySb@kl?q046
z<lC2_0H(t+E5FR_yXoE$Ii>ihcE){k(o$A7W>vjqEZ&5}hV@;(>axmupVkH}qwrx}
z!Hu|J2a^x%dV1(l1TeClA=REdOscH|HNcwos?{MYFrd$2N!9jbkZvPzF1Dn<l7>Gc
zCA1`0MV^;PmWsS65g#LF?A}cL8j*RDwg>7rH4u9+Mbg4Rg{}vg(Cs5~rqMOOG!VNh
z8<}S8<z|!(#O~$xHy9qUHdTd1=<xnWV{z!~e8|4~iVWWNWpDIPsedG47M9`-ph|pf
zcq0CdK_oN{fcQIx;Ab~+bEb578RyIjv=5y_7^M&lbP_v)FmlFdEbQNZMiqun^g;-r
zCm>LBQ5e%BG?0?l5`f7>I)N}9!CJQnMn|<SN;yOp(UscBs?k}E_%-shE<-x;BXnD*
zGv6I(fr5JMOW{j~t*tn~u0uKg>1G%##?=#?hM>-L)B#MQ=5GIB>&F_hhb>8q-2OpT
zhWUpe3-eE)6!XQQoree%_O&gP8m4L}4D%0ASeSnaC^so6bCK8<iV#dgVVHk_!ovI$
zC=h-UvaJ-u4M)94juT_^0SG^KFh%xnR<h3?bx=!aFrd%j-dg{L=%Ow5po3a9$U<8m
zS9{vJRn;QK9n_LO4(Rh|Z>{e!s9ml@2i~jm;8JFU%XD6&w3dF8H_riMS#Pt-8|KZM
zsq;3fymP#H-E<x&T+deD-3GwTUc<@RJrTh}rX;HZ(&wMQ<1|;lr;Vll3X*>QUUm$n
zsJSEf_1NjRAj*BRXen0f`a?9WPMSPZzy{b$oZeGq7M$iBDYXZ47roA53m{-I@N{Py
z$PdZGfoFv0!=#gGMXOLJhy-oFLmgZ@A3*Uwj<ZmJDti-_K%@QPka#)7H`)z}I35<O
zv=@8huTkmP&q`8ZoI%=jKE;eKRSb)-4h{ita2N|lYb(7kyWtk&{#&*rxQ6<4vLUho
z<&^<_-e4GvKkg=mpo1BEjmpl>aodl*R?2vH9gi+h&4Z@RAc}oo7^*hf@A3P2*xU0a
z`~+h7fDQ=8PcP;3*V+a3i-{B8CF2=*9h9sDnql7b0P@h1jreC{x`Kx;m!dd9H}b~e
z(?TeYtjRl#Je)e&V=#ZCj)M;H_Lh9L1uW<Dl&lATj=D@Tj;*7d#g2Bv$0C>?ortYz
zd>It+xr~Fc75L>J_WW!U(>jqA!zYls18>6Fe#i<iF9dFAJrG-vGYuv5=fm7~j67A*
z$P4EbBAdUZ`gaU|?MFdA_84T~_iOqijdj)UCuR(a9!W$8Kbb@aUCCyVN)|;%6@K}L
zuZXZH_Dg;Z$Jb*Jf@9tvvAE`0)O*+z=eIicFd0lDbC|xA>1ZSyH35J>sP>rtaK>P~
z)qs{VkxETKgxO*iOR*r`>pw$)&)c(Cuut(Cu@4Cab;do@5a3s;Mj=N&k85o!&Rb{S
z2J|>Z^1GP)5G_Pex)Q1@H1d{eGHWgkz+Mo~wpG_~o{?(*#jmPEVs^V6CAI9nSny!r
z+q_~~NOD>(&laTRqX64Xk^%ckU{Z|l0LL;mlwlFoh4{w$jRncD)Vdu>b`0dva=$=f
zi7m+r*rCpOkb3A9$o9&p$o7Ny`Ngum8S+;U6WMMC8TLSkzxBvAE^|?~ku~`zBM&sN
z!xWQfdWO{5Rp*t@xD2<J@xFEZcBY7bH2u;0l(dU<M|luam}y35L9!QUl7nhEDx_?~
zh^K&$3B&~)D0C>`ie()N_!6!y{<{L+a0vvwCj~Ip?~~z5z`un8QlF)@?Te6|tFpfs
zq2(Xh%hM>T5c&ob_<s}n_p$g$68Z&9pwPb{%aS%i9|p><(2wICU9Zq9MaYp>Lpg0n
z=oem~s<#pP^YvgVl#9h8^<l-&Xd!vqf#m;*=vW^u8H;5Oh<Fo-2$7#Z3?lzJL_V?K
z#-TSKWYhBtj_g%jZ(Y6pE!De9lEJ?m33x5MX9XQKE9j_+XZlN`6Z3=EwMsw3f}du8
zOqH8KCBJgi^sCqGjqq5c6-P_6N^pNUQ1E*2O89+QKbhA1CDsmWRk8JPz<Q%v`|%~O
zOyH4t8dewc@9{25MsiD<{#FHB%hwA`>vO{T0ZwBjk@dlx;^?&O0NzlAxR=Tm-ay`4
zc3_}3%g4~Qu2dco$GTe95%!sGW~uYRHyr+jpxTRKi3hBD8u->%RGxK|Gt37hoTvo+
z+J}Dxt??L+2(|x_`iz{a2QQ@7^<2RA-#2re2Wwl_15S5=kN}TH-4^jt70#__EMEEX
z<jN2H%iT9gRNCQUYFdA9Q-5@F{X;$Vm*jKaKMsPa=&*!qEW2wWp2_$RAeqijVSJ~K
z7spEA=E&!QVA*kTMdwI96jp>O5{p(yG8RrI)~}_tsK<TVR47X1aPyYljyNw|@lR#)
zEyk}~6SrWwKPzQRoyU<Q^!~53m!;dw?6AG>nss|-YzY^e_7CUl{$r4A1ZVU(I)Ms+
zad$UZ<?2zj9Au6cpoR8VNsv9^rx6r;R#~22cw(sQ-DQ=l;SJO9qyXXh6Y$PSz-w%U
zAL9x97}M~!bC9D0Qh@N@mjuD@bp$njHfnhBV>|(mF%9o_4NnRX-lPP)Yc#xA#${$=
zUv?|PP*|@NLt))EK!jS`(^j2RCB~|X!!z1_AK&Y4B**JC2Pj6{FqZK_5k6V-9)h05
z!&*euKBHcL#3i)@f4gf?fs2rl$#8}qD4a$uX`Auu!Dl_92ZYac8qSr;a4w0%!IRNW
zi7ZnTy-cVrvBqHMR9^^<H58fL=~l}Mrxxt+jHQ`+?6#x^FXITLHBkk|Vi3~umJl{W
zWJq@~o~dUmV41QoEmKzajPU|&iIbvz@$0t0qHJMAP6f617}3rc^sLT2Y`5T%Vq|D7
z@NemZ(;sJ91R)w9^%Gd7KXez2H!6IkV=zYrP0UaHhzjN>9updoF@Vxe^(<9IYCGqn
zjM7)|@+k5-Cm|kM*V4>b*QQ$R+IJaB$<3HNB(ZB>HCTkX1xa>S*p+m3UM?H#v4JB3
z$g@`Ut+O|sqjR_{f$X0b&#85DU_3A<JD&3da>VkE4LlTyfN~#F>>cO2+yAQ_UToqf
zI*y>7jh=8fG{bb$TJ7*QwhFIx&WeXuJM;x{%besX9|In4`g5M}{hsjq@w#i9&UFiO
zb>cQeDRk3!A#7qVKMVQBp}IuVWbqHc@|1>Ep&7lJW~<lwc}!V979Lv0SO>@5^#aW-
z`zf&km&z<{kJunvg7$Gbu?{WS1|t{<=aMN{(Ncffd)F{=mwh4=F)U<p1e48F>y&Jw
z%5};?{RnY~eGCw6dN^MFHC6{KY2a@f)&~XqhqdTz1?#Ors$4S=p#hu1#^jA^3{skA
zlEBvKH|m_mRNy}pOD3M(TqwW0>~pz<LKn{DFL0sWwO75aDDWGN_eH>oyW)rF!oMPp
z0CtSvfG2QdtrcHmDOv}t&)cPSKLBw+&18xh;WIVfajLy}778BSw%~9~rtDz6;4>^Z
z5UB}Um39VYL-|+6-?-(IL?#Jk#nz#`=TJzyST+<@f&eKz3@WRVgP>U~F0RTYm96}e
zc?fh1vWxnD5iP00Hk=VA`D&4raHQsa;Xrg5o6~-mjrOVw7{S}@mZZa%(~RA=5lGEP
zDf?OE`uTQwgq84^Fg!oj$%)XZfGj6MhaO3!X4S#RqW%sn=E8;CX!H75)WO8wcPSOd
z%qp`XG7V$|B8bw53$wHMSX|r#DY2QA?g7vtg-|<1peFcPWqQ0APsn6@aeSOV15`C!
z87l(=1{Mhd<WpSR@%}ZmWIEFPxW!x?yH*_9u=(veeYrkunBcdelho=T;@2(U(~luk
z3eK`$SJs;PaJQ@Epy~~?YC^FwtKPqqgm-=mIAZ5vpKS<SdFV$ea4h<k4w+99Gz_UF
zWcIRQ$b1HQF{BAApnCD(ObN}JhYlQqeag@*x>EkS#)x#47+onx!j*Cs_b}l~8L$@0
zmVc<8&XaEsuKLqNbUP4RNRkuBmW50xv6_Kwi<BMiZz+R4+w;&F*ewXH$*x>hffDjb
z=Qps-UvK1zAQ+L`eI&cch#9>c2W)V<oPe<j_Eci3d7R>sxOd(Vp`SHrU|Nxl%tHSf
zB6cFNQ6*M4MC7DIm85Nol5<UKbTuS}ra1}HAaB|;WaN#mGz&Hwu}io@T!MW5;nMm>
zb!mMgZhzs@7v9=8qt!Or$F0-C+qrYNR~EpPdn7G=jik+zgU9l@P(rE+lu5A*FC-Eo
zra+!JB>Dy9YJeEm5JaaTlBGj})Gch|mm&4SZA7Yf5>jUhQY*W0b@ta%wi_$2QB4pr
zOcXL6ftzGnf(9^}XGS05@N9_Wq7sP?hCD=uG(`IA6f-&<cZ)8{M31tO)DXdDk>U@W
zVjCi(+?W-~M-V`3jc#T0P`Xe>@%>3S`dZSmoUeR^QQqcEMzR%Q2$3Pvn(mXMlsr_3
zO0%ucKvWw%0@1^|K~fYP;;G<FQ*bWhVT$yt4IDu|2@X>f91<c4&TTmQj0(ltz#)Z{
zcd$}34MH;oha^sd^ArX~ROFxyoXj?Gn4;ihdEf{^*F&PCBIs@4hzut6izy0@&jUy3
z>_AqcGN{_Xk>QmDhbamUM`2RGENoOnWyH3DBV#uS4pS5y4)G*79|;_)gEnwBGBF7b
zQxqIlaN$H_l%}?4OWByrN|A%-ql4XR5fF=3Y5Z_so=U|iKJ8XqNWtdyoJ2+C{v0lp
zij<djD=wo<wcFxQsYn57Tk)hobRorxEgl7Z6USCJ+6f06(O6cyb<xd*6r6UkJzN^4
z6tr!W6)7q0Ry<lNlI?9P5?hK!N7z2Zu^c0-+t$Ud*FuT|>&CA>$vJ9_JZ)QnSd#DU
zVtF%0JG4=q*I93yOhve{;c?)Ls<ipuwB2-WpnCL_dxmK;;X=6TaE4MtL-oGnI4yXS
z=Sni~F91Td7s3e&YLz!_gK8a9Q^KW?=MAXqy;iV=$C}C43iLUg*eK^R8`G(p_i?pA
z<{;+>>>EJ$<GBM%9L@C<t+S2rtEi@C35+z(x0Ds<i|@TQ=>F<Yllqh(U7sT3_>$^(
z=s(&3#%cf2r3jj_p<QuD6K<^NQLq=xJK4}5kbZ2C^W9>|4V*g8fIXA!K|1GVX8#r0
z>U8gFj3V(Mjr$o2Ewn`hG8^HkAPY8Xa9kW2hoGLZ^~MfbUN#NgaQAhjw%*n`M@p+~
z?&_Q@aoHt07D08cBI6SPT7Mtz>mTF&{T#s#`+J&n=w(Q+?(agD7IM&I_u1*}?;2#Q
z{!Tsx{e2KcbbsMb+JGG}Y^u2XA2KDvl;hCC?yPHyS&?cXyG+}3mDQGm6`8BjsdNId
zWgN{AjT)6m2|;Y0ijm)l)u|YVB4YFLWB+><!U<bDfWsNiF_F{vx^#+Z13?T!;uU*H
z1E^Nu@}+%c&ip)5x`VB86JtiBXpaF>jsx-Jex7Pycd%o+;A&^PC<r+L_yFm$b!8j6
zI1PK+^2_B?>>*9#{|-?Ug&z`zX--|z4o3eFX|9x6v>iTaAvj=^*p&pIlBDJ#ktmH)
zZCzU;9WE#px_s00QF4>5#BhK35aon0d&IXRe1xwWpWJrvIqy4)QTzB%av>TTA7Ymf
zB9FWr5I!;}H9m7!w588=a*QJQNb_pJrRY<r@JSliu2BD{_(<n8KE8JFxmUK~<MfFW
z;KKxAh$jw;5@bk1@IM@2cPwv9n?=~;P3Xf}HTJnVOwjT@tJ*%r6UTB>PX)YRhQ+o!
z5j{ACnirS+UB8q0yVw+pa4vX8B6La@1&GOu+WNc6tb<pcwYFY<g1Dti5oa2z(7(RY
ze(t0MK8gEVyysWz&O=4DdJ-y?Nk`vm8zd2#f%?k;p&3x8@n`+Ic>OwE{|5-JWDx6J
z5|R2}h(L=O;i+F4uV1O_U#HrylwOx5+P^s7zDqy0Px{F|c~O<Ff0(MTsFs?j|6RQO
zYES#s@%nhVUhX+AQTSI&`+rT~|8%_mJWqYBaCQ6hbp8IS{yeFFTB3e-y#9Po{rU0w
z^L70%FwoGk`BMKwi3t7PlmUoCV7dNS5}P2qMfi}9Tz~ZI>)>($ay*K1vuHGa6Bi&a
zMU3Bb9)kOYsEoa*@b9n(?G|valt;eC$Z`{XPNN=YY<ItnZpI*BqZ{#?sJ=>u#$X;^
z|4jWMXo&SMNNPS`nwR>Q<JxCS3BS3EgHQCEbcYF}uEEQ_$mjH7(v$d2Y?3{|v!wWa
zm|sG_xf)Kjz)`OS_?){42lV@SJlG<;Vo80qo9A<8==wj;zj|GNxzu;tpZ|;Nuh8`u
zC)Iym>eJrFCThF(datixzg!{|=bny~n+PRZxs3nQfFFZj_;3FM<*rxdT!q9VubEZY
zO;oz40Jk`ly&A(_s=7ydLNC!r8Lys}IF~Fs9lNiwfyd(WLH4J|BVPpgMxg5Ol@Yln
zq98u~6~=|y3T9hk);R@`={AXH12bF^Ar<jM#{>IhfW}$UGHER%8$a$1kn>L!1YSXs
zT?YhtVC78NsRQ35pwCnicRr_qHg--Phv_m-Q+AdLb9%A6sxYTLJ4J<Q4cPy}x&h_u
z7`74SNkOCh?zT6py!p&~Q_^<WUoZ`)2Rkvno3WWy_A4)_ge^!&LJ0>^O+BSZHH*Y#
z{a}Z6@fTSiaDp1{W1i}{8V+pLjc7YDQ4$kvxH!W>MZK`o-w{_W=oOFs+DqWoFceAv
zOF=9Fj9rem;Lf^i!QnkFU5k6v?_jAxRKEus;_D0M`$xd#=H{8;iS+}7#kswurHt-g
z^?o@;K<0NdPz5P(A$%{wOE4qCd3GlcpgiXn%In*VrQZTzd~dGQv_56MjYzvo!@w~^
z`XV%eP3Bej_XhJDk+unGd|rsBpxI1(oBzRQXctPoY2?8{k6Gem2|VbXX-wY0q(b}T
zV_RBQQ~(*fE2d(8ia<OyOng`%HUb|WD0oM|w`%#AF$RwvNr_Q>AP5fgrTL~+nr+7P
zsjp>9v1z>YHwsXB!)8p1KzS1})Xt_KVWkS^6J!NH+(2JT!FTEc<qi+%(~QN+g&(zY
zu|2yqrv}xGZqo`8nIngN6Rc7xwMsH7N)exKbSoXh=GiGaLRRvZK5f<dw?_Z+;J`jv
zBwUd{?&4hD5AMz6koqW(dS?iYUX26lZlf|CJ`P=t__H~cC}eaSlP@nV+E*N-#0G-T
zL%G)>HWZ&#J*l7L@-hhQRPr+E;kNQ}TS8uzAiM-yB9s?CXOko^S=?udr4Iox{XSt_
zUOpoX$_S)@GV|ac70gDL3P?Lcm3f0nHE2Rf4CLz>%93&eV_E~m`sF{V`gCGkgSx+}
zV43^GX@q6#bbYL3`)$8fm8Bo#S{C9tCJSxUg>Z47Vp-3$=0rusc4WXo3m+H$9k%t9
zSe8>wXtBgryohDC2vieln@TLRU9rrfSfY$8mLqVusbvx$$A!l+>Zr#(1w^fYpiE*c
zpdf@9U#MoRe?lT5=Zg0fRLW!}Y9VDDzrJkdLf#5pIlK!wkPt10V?+-7BaKZKf(+uH
zCT<RcbEesU>5Z=8B$TX7$lf`SJ#t&iUR>XxMK~uL-6m!_heM4~{@~|{;gIjZBeKQ*
z_^2e|D*<o{Ul9hl%y)75TMhO>{$4#1c)8k2v{I~}4_5mU9Hvm>codW*ahs6<?!SlX
zl(<6o?E72uPv9x^6+KSsP?>VG4y;_Ml#-~Q!9X3|&$kyI>2mhb;}e_(@rARc!j|ue
zf#Phb%UP4GL7NHAlKNUHfwLy+kj$k@+q@+p6lYmbm^v|8)wntfdX8+RJ|IIu*vFKz
zAZIzsYmnoz_+^A!vv?xX*!DD$^h%b1bXV)&8vNSJsa?Tq5;4K+^T2D;xs{%Q`WHT@
zfv;Zv2cZ+mZ>}HhojsEHJsJ&=-(^Q9@f(cx*JAq>oa^!WEhKox@5}kqZ|8$*q>7q&
z2iaSYPS$+_=JRl#;ny+z&q8{VAZXR|hJfoa=P817b^3PJYJ`|f2`Xqd!j};;bO-aA
z5l?+c;+TS<`H{8<X^*qKG5G_Tt`OKJfQ&quWzu^+si7Hp@C{+sHguM`V;JEDxQ|6u
zBJe&zK-%Egzyi2ArY3>bJYk$Xr<SVIzZ>N@99M14I!-NS?>$2CFnPRYfQKX(c!+nw
z?XvD3`kQ>!@GU15%NeczsrW-XGd=W#KMTBh4!hwXtUwV?2*)8fT*hSrf!czmCnVMi
zu78BHW7I+Tx^E@06n_c6XWKsfzg@py_)oSEzhiiBUA|v<mv#itShT()6yLaczaW2r
z*7K0yXXtr62(ur0{$Db%I6a?3x7wkn_x>@|89suw7!Mzm(t&^>=zq)%aDBZ9mrf5-
zdG^&;0t2`Q><5cHV&U%3DE+L6lO3=E6ovSuXUtq~qY6A*JK9);tI>mvMS+}S@Y18t
z?6RkL<);vt9mQQ%+HayOqpoD@!2Yx}c|oz9G6mowQ!jMmERtA%;#(1c*bSMuc#W4~
zliG%W2CP%~3}^UpNE*HlAcFmGI}05!(Rca};8Rz(UV{Hq%ljFN%2KNE`h+#e=dVxW
zor)yEz8py4tETdVW1@W%4l5g)U<tbW_ty5lIKSCu5AVT7I?<?#rs1(w-muw0KWlu|
z;V9S%ObtX&YO1zB%0w(SaMN*EGHu0lSnZXU#NF~A%7){qQM}WZ)X7nO>q7gEG46Tj
zAW@bXXJGktecB^t>c$J?`vp*$U+|sp_h9_V{wF#$Do$A=as!Tn(6Nc2uIE;qr+=}2
zd$^XHgZfRYCR797=ZuFLe_7&hFdk(5X^H=p@o|hlDDgRrk6`>RiQmomV8rDuq4>UD
zzZtWUemc`H$4rq(Uw{{8?CwmzFe&{U!ubKKTKUppqWn=z|AOg8Qu%MOPPX|biL)EP
z66N1!`U0lYD=3lvGSeSpI?uim>Az=snCX1eC6Rs}u6o#$nf`E6dR{N2k6`+pN$H0%
z{d}h1kd%H+Po(EC{nDiLd-9RqmFedurB7%2zwn}h(<>=G?M$S9%=FGl=|>a(8m8|8
zumt|A*e`zn-T5#n{ZOXQVfu=s^t-Xtx9?#3b4lr&m_Cl_zeBp;+U#Fx_lj&WswyC(
zp?%jIcc&vTd*2SYm>PG#fw14&s-D;~mtS(l-TgmUJLPPQl-c`?Q&LiHJ;ULPNbdLw
z?M3}JFmcV^tLo1oti3F0-2DK;rnS+u-tmX()6Cu5&6QsrYR>-t$-7fiZe1IQ-I0?y
zD0X)Kl0HxegRQ}M=s3okb9i7n^fgXl@#w03H3s8gjFGPRQu8{qQvhapw4X6DEv3mP
z_9##-p7egNNYiDwx3%)C-Z$<Ym$%2-X1zPu`X;q*aI7%ZSd@0~5dJr~Dy^G8Hj=;(
zLU4AAbZ7bO7D4<Te!(HMI=|{@e=SZ!)djU!#>YqTzEF$(9u$EYqn9IB5qN$SrXXJY
z#Jcu&)Z@(sB&hG~6(Zo{SM-?LpC+mPvR?cw_d>G(ir)y|$*Q12zI+%NLwR2*ShXJ7
zh|fM}7F*qO*a}-@OSBJ)Ek4?7B2P#G(-rWe0O2;m4Apw_xAsDZP{)DdX2C98irvQ;
z3><Myy#aa7bR;RRYsLf*P8H68#4cYHlY}v=4{OaBiqbvTK(xW3DI>6P&-<aNTVjL2
zg5!=S2dez3&SN6`DZ96X_H{AtnF)!M=KOF{Z7T;#21d1M%AxzGUXM)0Elrqeg-!%%
zn9uP6KEKfo)N!^VLG_3N9jZUbu{Z!i^_`qb1ng##BsMVpNDw^@ZE)_S+w<8y6X8sR
zz51VJls{<+-|H-~>iuQK)&`n5@9EdXu&GgAg{P>z<{2KOuB)8o!`p<lDdjhse0&bG
zJ_=RAC$O}{!o$;_;Uj~XsOXt!pFHlMl**a4n0bSEK)VPJ9@fIY4N}R!I=H;uaXFH`
z9u8k2gB}j1mGpLEwHZCweL^xBzz{S<xT(%f(8`JVatUJm>xC`1c}ebls4&m&Iv4Xs
z$U7g=X%NeCU|q>E5-qPp%^Ld|5;uV|IzhBy#}w6IC$*BT+bbT1++iW;*}wL9@wMCj
zeW=p3#O~Kcv54O2ha8s(8wh*Y<$lit@2ifWL6XE0o3yct8cmoC>2gmS!c&9=<mg5g
zHLXvKZrDUPr0gJ!I&8&k8+D}}8xAr@i|qJ}T@IZ!zL~SN|LW1J?Z<ry(Rl=GW^G^c
ziz2OzA!!_jNW5{=`jWQDjA1bNa0S|0V`ocaNk$J|4Cfp4{K)faJKxjTKv|$Ge`o(l
zZac{P{3aej?_k>p))+SNd{}Vi!tHUK`Z#GR9|X@9Kh^Erz{tg#>b7sWCd7Npuvsa8
zCNy;xW873qVIbd4?k^+J?u@|%>&jk%MO`2|4p-Iign199{D^hU@(L7Zq#3@!RDYTF
zA5ak5s1ytc<V?i>xUj+7$<-)nsr41L)hMHzeiR0Yvg?9G*&Z3CF!=w-d-M3Hs{4;S
z0Rpi?Cn#vFThv&CYf?c;5lw`^9i3=YP^=qup}3<aqE$ANNXBuLT5Yvri(0F;+R|1L
zQ8et}3a+^0euf}K6j@aAyg%pMJ98%x(f<77d0tOmFPMAJJ?nQr-}61^I4_cB$W0{Z
z@Q&sDY7>CbZ_*El=Ur1Wy-(AxHcj)ZKd*eQe_k~ERct1@F-;$M*ZfW0x}Ep3C16Q=
z)~D-ZvfpL<VChk@x~#k#_|>7v)UE9L!;LsizouJv2cJVQh~#YFkKZj_IQ#o99BblM
zo|*O^PPMf}`b@R`vA3(3eY*xcupo6bb9ad;FTNl(g6A#x;YX&AQ`tQ9!|3DF?cMnQ
z@)v%y=;Ixx0%`hK+Bu+)E6E6||KCC%H-kjs_`B#MO&W4)IkMmg^MC{b3JH^k8Od~4
zd{NlL0@M`nyCqp1(2gt+-RN2CZW48+>Ea6Hs_B9AC!mFM5zoQ%Z+Z6hXZXCVZ=Nos
z3g&Lf-S4er#^p9jgeExdAcC@H#xa+i2#$v=rJG(Br3|%@vSXo$z#j49BM_y|HI5ww
z2f;$YYj^vK|9L8@aLNkR$)5@=)o(Iyd(KnQ>vH${&_KGytN58t2m`rh(Yea%&SNQC
z0ti)?ZORs!>YrY~p4xt|V&qd!n{=qhv5*;yAcdRFUl@diKrlQr(3%K6o~oeLWbZ>-
zEn+f5!}^jn7!0R_(apS#wzfXH=X+$0?yDoVGP*lPIccNr?@#wNKELPTaR#F@4|a2-
zSw`QpM{_A!6W3j#8%^2z|7kRDUhhUDgy=7XtF~j@>k1_&^J=y>n$t#SMsvX)+aAqZ
zNH*c7^A@AITUbl>xcn%KoBGxN(`eqluEWv%TT!9l)*sE^2W$m5cb%UZ&Hc>dwuPG>
z{%B;sWaH*#VJ+F?xg+gp&SMGM%G8Q{U`aQ=AEiAtzQfTd&N(!iE?XPTdlzIz^FHp6
zZH?wADR6M}8!(YQnmjj}pS#fvWtsVZ7|p~p-DrdW{e>t^anO3TQX!G|^C-uD`_e$k
zk`5;KBJw1A|7@fHn|BUBRyN4f`;7iG+MVw+5)R+zHUHWbB_5I;0Z&ExaA(R1$Wp8G
znQfU`2Iz)7xcF@DD-&LmJ<gPLO7`y7VsnBuw}Q(hG!hh=H&cCp(!s@}(*n^x2h#9?
z5z6MTIoffvD8LX}iCUHMg=v)cq-PHc=~YY@WpXfGc1DbsWG>_p4=O87o@?vA=0F3M
zq|A73ZFCD_=Yl%og{|qn+Ux{##n`HQyKN1#hV#tK2}nahY^}%uQ)#+BCi{KxGY6vw
zc-5a8DpKpyc%e7ShQ9>etu2Nz026}KXM!PcLNeI198eg{Z+l#yQgZLJkcR%6Hw(nV
z(Ofq(-Zsnz%U0eB%(??JbYovUehzRK4gp12wuJ%UB7JI^J*T0U_jedYO>6CuWkjH9
zhJF)(?I247t1Z|toXtCuL0Bdw5Qgdy08OEk28l%$dVp-fE{>PzUj}OJ0QN<$1K6%0
zCIql2tqs&;xY#w>`#3|*YUzI?tea2SI*<-$X$K(f766I+Qy{VPt1MGl>ayz~0g&_p
zLE!^22eF9l2x>=lG@gOXJV}7|klE~mD-D^a3%3oKH;A_oDdhtD5M=J@L<7kl=ls$_
z=GL_VWJnu}bO`9E#?sLo%X{L(ZIjZ29Y879vrsAypp;iKSfm87GNg2_crReYyr)Hc
zA|-b^l}}GKX8pzhYVUycFw~NgBkiHK*au}AYE!LK%33)c($l?dn=RCy1xyZVzR70J
zSDVS+CzcCvJ>F_>(_qiYkF^b(rdQmN(h6}qe=4w4zsbB^hPYQNo0EAr^rIcV8omOk
zR(^8L@X;Af^XuN*)-<njSB(fix%<5S+<yEOLkN+=A1#G!bn;c~AtUsqkMQ-(baoyl
zXma<xhN{a>`U)%aBoaCh!FhGc>xX|RpX6_8S2wVUZk%uQwIW!eC5`df$B;!Np$1Gq
z7f2GK7_`ogIDvBzIOYqS-i~wL08>s5<IOM$D`6PJR3}WHm5vPylmGhQ`YvJeH{`rn
z<^=#FYr8*S$ha(aU<Tol5?Wv|AJLs)a!`lh51!+pXPL)JmyfJIls+~6JTEmZ{k$L*
z<kv*>zI6J$)Pw2g1*r$p&+}6MOg}G3Jsf)eZ|M24^z*#b2guo=pa1gH)wh}HJPT4!
zhSC>=(wBtNUkatamVTa>dNcjJAoWHl|9heI4@2pzLg}rc^o=|R{&Mx#`Uw*gio8XG
ziXbJsXS;LTN%&L-73iJ2@zU#Q%HP5IDIO%YaEmrr;|nj%1<Sn!UpbswCJ5bETGp(W
z1~ujO$ICT_Gm?nYk&qxaBX{wV2kz)`8(q(qpS$^`e{z!CkppbbiiYOem3X7Ly%Nua
z)~s0H<2lpy^!&1Ydu7dKFHvS<(0-_OYGJ`3REfqPdiDKj(fC>|amHU59LGSJD|cOJ
zk_tfFGH1ysI5$s=ocpt++Rb0{w2Q7v+(|2+z3s~9&RY2#tPG`D>z|ceGUk>)JAm;+
zUv3V0FN63j`loKiUvLYt*{(l-;n%bNH60UDnqC4X<i{-mpBO9wdo}6;&@K87MDIr@
zLhy{n@%_Bu#b0;OscbvDAPe83r%eB1@g-Y>pE||{L0o{g107S@-@gb$AfQ(tKR|(n
zo2@pmjcQM#TIedM?{kOm4v<~hb^@y);C<_i(9PDBrZ<gW_w@=$TcMobPTmeGo3*z6
zc%$(R4&Ie*BMY`d{}$fq{&xWHZGdAitp@Q0?fK#V>+fD-g6^iP-ClxaWQ~;@VfoNr
zevlt-AnORnZ(ed}k)0>ZbMH?TB3V+maGO&2{knGVm$!RAyWRVV?cR@W_x_7^?|Zj<
z-=W=mr*`jaS7-O{gUox8`u|eq-HvB&yZ8U3->rTyFMYPTEgE>`6{6+>G&-QiZ_GVE
zF#pk52tTC$9lm#d8!LkT;fCGD&uY0#cK4mIO?8p+3E{h&p%2u9RW}tFSyfn-IulO$
z%EK2P*Dd!ydWj0ohE)-TUO0U>6R)Vd^rPh=@l*P}d&~TQzbc1m{RD>9vQ7Ui#Zpq%
zY!_}mR?t~!MP#L(g{GV~D~O=y=Gq?aRs>CowAldpRFaWT8chFSC1(`4l%&%-6WO*>
znmM>>{@l0GlE-79w{mZ$7w2Bx!@ML0d-)5To<jRx{BTZ-;rm!R!m*y3UmufR9oC@`
zZfW`$SZGWUMN4hbM{Lo{l~_uqQhu4!D)P~*z-TVf&`2*?SW(LFF%_hHq*u7~V^C&Z
z4ey@+Uc>tc-fR5#5xkG&eT4r$lK0WPkF@WLDwqza{$oZ*9`lIBuDE<u{FpJL;zwVe
zkpo)i?K!j{^>YxIA!@DnmPUFWQ<^$P>CQeN2FZ%s^{mvsiW7UfHGk7#P|U(fu;$Z9
zul&_3uJPu4+(k=Id3evYypAFf{su88)(U*@$%l2xVNJPL@x))|)K<C6o?Inqd`242
zQk|3Lt1X9Cu9h7&CCxEjLIF-(?hKjXuea-sEi<`qfR(ps{8iz1%F4}cvx)?WWWT{b
zv6CWmAab7xqCW2hk<3zP0-yirf9}EOnETv`&$_;csZMP%l(ojJAcc8+nI>QyOf}IK
zt*PM9D?2@D=G$k?VYC)8P2cL9?Od5)Z|75D+A>cw7ARzqf9S5Qg5#w6(N&6H4`8%{
zH^v>LMNorzg1IeUj<8=`u)UYiQEVJ98@FN6CmJSHy`ZqX<CTe4HGxtO%pS-LmceIz
zV%zEWo9%ElB?tZ@p9T6s>o&KUBU^k9;KGpf%^~}`u!f}EP;`|0<MpINuWn8Wcx%jL
zfd&ZvWrKSk%X`RPaxq=e_&R%s@uXF<m%aEA#ahBqw-o&VQ*f0rm4w7WKu38m9x5<+
za+=fvY|Z^x9b_9t0Mey^@)>QFC@{~e4~e{C%7>IG`zq3t*`rzn5PH7dw(bDppV+Gu
z&aoD*nDt%7tNy&i2nW{0H9!<^jOMBNUM_RTZsrk8Ldx``XHYQ>E6QSwS|pmqdbQ-_
z>rbOhN)^1~qrjh`vaD))GJT+mXMQ1H9T%B>s`;=cDC(&{wFOQfJs=ppSY%FAF1p&p
zul~%wbf-FTF$Y3(zTyT_M4v?v5yY!`eSEHweiiBvJ8K-pl$wKW8kmTGWp-dY4c$41
z?r?F1P#n-SV+guWhM$okfcx?c33DP_K|3NI^Qo<I4ln{JDq}}uHrRm=GBVIP+KJT`
z3*1;2(d?bVb(m%^u#j#i4(bl|5-13cw+C6FdG-{IkZP_n;3+Y-sQ!gq)8~`TZ7IAo
zZRRYKRs)%9cy^IuU<%!GfJpgm*g&NG2%Ml9{#4%ma3pou-nnwR?dV#IH}04S@z+xr
z!m8y;O1pWTwXczvXu8oM(q^sQYC0=WTy%hxt-=BJ=O!&}JC;+J_yK_|-#kiqSuFnk
ztZH1`h^OPo1LOXYA9I7Rwa{%X#c3sn58ZjH1CFh*9Q)44LW!c3w!+TMeAL)yLN|dv
zxBWlGo$VIz?_$eA9N?K=gN1=15u&jyHjzT1p%O{p_-nh;G#CG~EX2hn0X2oVI8R_t
z<lUrvX)d;$3%JF_R>Qzk)1h_8#Zz1>`%uKP2Cw>S&&BjpwC%Wfb*pXW(nA6+whWwf
zvfO86t~06SSazS0Ehnf^G4e?LH^z9<A&I<PYVjG_ab}2@r`fL1vL{~Fo4R3nd5T`(
z<=d5oKXWRbO7rp&3`4v;3?k0(vgpfFdMr_5S$?3O?Vzk|$NSm(2icDFvvtkNR${XW
zpNN6&;SmON`oX>!D3KVjee}hE22X|^$q*neEe0xdmbNbjrYWDa#Q7iE8fRJXq{V<|
zKDS_lt6={M5f0o{#7mb7s=t-o0c)v6bB;xoBc_m5APFc)<Rc42mmyhT?ErnFzOX#l
z<%4YCHO?cBZ7cacf8XT`pw~Ee>-m<DFQqeLOdBKhIj|%7axL5{9vg;ilzcgYcMAY7
z;p9si0=EEQLB*FZ76@lbUPBE#Q92}F;Lm7$vkgeHVc}llEHu|4IG7?0Is-KNB>(BJ
zIr=nmP=9+yXS<}FqL%K~zqEjG%a+=+WZTEeSBzU7+iOFt%*6Vb+7H=Uo}XLycETeV
ztERgSI@(EblZhzz+V=R_LEOc9R%`wKxplL=M1hw$k&^{_Pd$*-v+Wr$_+HI<z^LU;
z+KDC()+D5eyNu;^DP%OKd3{#<F*ucHr3R4G`Mdn!^U5=IZ3)%^qL4VaF2!xj6huYy
z7<)~mwY`)&k`aaAak4<+&o8`?C_<IuxSEmoIt%m-&2>9_$%CHLT0Ic$3lHdc!ULfp
za@=Nf)L!P*54@X45&SLu*4D_x<R!^V6b`m_Y%fgM`3Eqg`eja}VS6fY7A&oW2=!aQ
z&49hYcC`UV6;y)R8q6wNo|!_IcA;){>2AcZ?srfCA5HF0vP2Q0yWf9uv_?qt!XacN
zu$cE>L`NhKTB-GVMLXKNQJ-8PA|AMa?y)BEPpl%Oh#3X{tBHfH&hJ1S{QEsJ)w^?=
z(ZN*1GnWR7(i=tdd~Y)c$Ox8&kr$_7^EPlyJ;4Xx{<C&?3bJi8@6(km?vTc^+E)(;
zsXzauT5-o=AF0<J^lO)NziOYgeX5;r)=_uM?7*M3E&Silvbm4{saio7{ux#X|L55p
z7XO#=DEyzzZyNt+@RE)HK0g}%Q#J$#{3D4&_}@fw8vmOKV%!4%&s+S@vc=Q*=Xy)w
zpY93&))WK(g0Jvzrx_i<`QOC%c8Gs);NxE;WbuD04V$-tV@LR>E8D_9)mr>p5#2$*
zc1iaugn#PJ;6KCtMOoah-8nPzu4ZxE$NnWYf@Y0540$T-Z-$-7ak$M<dj;G-h(}?+
zFTb@X`rLmAFIm`MxSx;xcc>uXe!SBDssYy+k`DJ51>DcHyMZLdn}_*-vH1UStwn>J
zyJ1q{pACGVFz*?XVefy~ub+gl5AVVK`&cbI@NIemq8DHViP6UikknNrt?u)(Cip%t
z6V%#f9tWN&b{WC46**?tx6}BfCDWaPDY1MOKNGawW_nUCz^9!oS_{;*!li$b7PWci
zG+~P#yv(mqdzv_Pb+*6!yYY1gE6_f^Zqi~jm0gde8p78pHiyNRmaB!YVf?1?bv!Rw
z_}b~`KO(-~&_XYr`W&VT;cF$y;OklYbxV9*fvAHq52PpHt2ef>@b$dJHFe!40o?(<
zwgW>MeEmb1wfuKAEt%6P*a5zN{nq!v*FVUm9_&La!q-2<Sw9G0kF$d8<LjS1s<PK0
z&Je!F+8h>N^*jn+wfv^>btx}d_!?FGBjRi0;jUAau2TVj{e(urSBw3+CBEiK>m&wV
zOHaVpdDb{vZFW~G1N>M5wF7*e`eqnk16&F8<sGzS9-?3e`1<!7-v?h`lS@6gf>wmD
zuff|7!`B;liF|y$Acql^{T#6o!q*>d4vVjQc@(~G<u{G5n|aB?*NnY?M11W+?dsI;
z;Fu79?N2iJ+TMQM5??zy#rGUN0bh?|j*7o7_N7rjNy`rK_2BX_zK+$5+49%p7*FOK
zAkzW9aw)fMd3Y?jG@5z*3SY;<Z$AuQ8}U>5_*!k<i})K58zFo>W^-74&Erw{dW7FJ
zzW&8a7QVXf^&{e|oZ2<Q_o4I<zK$Uod>vrFZi%mRq<0bnccUlZD-U~B_?qoYBiWlB
z;cMe-VSKf~2M%9vVC<L}1v|jk36$G5zV0NKMziDV4qtb|Z$AuQyU-6GUq6*|i^>*4
zoFRO@jw^}fe%Tuz@+f@0!fzU1FY=Owuao!u5%F~@wX0KQu2X?Lyntlzb*%lmCBFVE
zI41^PL{GriaO_LrtA~TWc~w%b1OAG>8phWEC%=$~8?jl;R0?*0ue)FQKKN=Omqv3A
zt%$!`z}pYQS2_Lg@pZ7ANmRBIVk3mF^=NA!U!BPze0@n$8eeO8$->v)cK;Fabv3oC
zQ<LDB5P#i3GWfdCe%%sZodxH_z$fVm___}>QTRH;mqz<aT6Tc1J6;as>sjTr@^BYg
zGOK`02l)E@rSF5U0o1P1{Fh(hYXJQA!|-(~t|A{_$H{9%Wup)qA$;v@b6Eb`hezS7
z7r$wIMR>`=*H^p#i1@lgo-#VM4BiOg>+dolk_RR1*DdjN5_Ai`y6<N3wQgUBuZMkU
zR3~ZK0lrep!uZ<NX*~F=oR-WH6zl+Bm6Y4IJWP;Fqv;HGg|7ts_QUXXH4Y;mUzhNx
z%4R^EA$%Qfb69*0;!*hO%WoQAhwzexuOUDC5%Kjnwt_mfm+Mp@51)|<!Tig9-4b8-
zB5%Og>GTBtsw#H)`qr053nUjiz*qUwFuqQ67zSUb(vrE6f*s)NcQ1Y)e7!|3jpig;
z5x(96Z$AuQcc6=XeBC5B5|wpDY=rQ2ip^p1bq<fh*D!w5_&T1KEPVZ;@JGbg8~eLX
zT?WU5_-iG};OklYbxVAGA-R(n_z*n-U$<c<N*-4G(x|tjWe50*zYxaP0nS1JUsuzT
zc^k-dfUozT|33ISfZ7G(d-)Z<4uIc&7{0DX7YF<$w-1$_3{i#fb-2yp_=`u&U;L)=
zbqFt6`1)k09}!=Vm$**Nfn!4WdWK~2m;JgWz6L?J;OiTD0=_;ja`?K-mqzDFT6Tc1
zH=YaQYgdP1@fR(b11Z=6zRD=KZFzVNxdh@b_!Yjcf!}@@zV1L52mB@P4VC>3u@S=8
zDK>}WFCHy_@teli@w{Z=>#*KGBEH^`i-Jz|be#(1;YyOlU-s*k`1(Ea27JZn3Ha)V
znJE7H%$G)wN?LY+uRhO)@%5}Q>--zEWUi%P2l$%w%=f|9v*glf2GNS}^(<Vu6?_f1
zF}wBS^Sjpe!f}_+^;4$}w)?^fJJI??;*vspnc+Wh;t@rq({_@>^F_~hkdpLrTO<Ge
zNgXTgc@OT$*b}1)bjjVA`l@`k60I?V>4tV~A63o%T;f14WJN?FWuFppg=yK}U3~Wm
zOMkj&FTSRd{ptnkI(?xZZkdbt7uB_42{1XuI`CLuQ7wvRp<K&CP9>(nmSjN32S2SP
z1M`Vxrru}m0CE!jM(c;Q74D1e#gkod>Sp%#3HwjfE<oMGyNmxBuC*l=aduVX38$~@
zrs6YF;}&W4W^hgpu{riv4HHYKaG!2=8L2i8@KxlTq{R5w{!zwU6?v2*4%zIP=xh%%
zu49F1fPxkYH|cg7(JlFkF{{Xm-45c(z!lKv1OP(ZMr1jBStQC^snNMHsna`799NW6
zchyrRz(M_gb|FAbUgMdy8gF7yY3NV|4pi9BOI%Z=e%Ju=1WPVyuWsG1Kn9Ccc08eT
z6<D*|YiZ`zChMK;cQVH+JYwl)9Q;sKu&}C#z3_`74cbQRC8}7}VX5>;I+OafoU4ym
zK>Ftmq@%62IOTxTkdEv|M-t;%OFn|r4%%wBYpov&jU{+XbmL&BVnhS3SZ@(e{I%5-
zL0=Uf7@am;O=k^hsD|W6%~|}JW8vfU0##)bYi`d8?(A$?@@l7?Svv}h$$rCMV?iQ0
z@POAlv3^b0&6kYC;y%70N@PsFIaH0&=mGW@djw+k*J~68SdoP{mgy4R8k_4*;6UJ^
z1Xmy2{M+$7_BeTCC*lDPSv;0Vn?v$ycpE!vtrt14Iqgr@(x8^zdfAh-B8__}V4{A0
zML>_b*=_PqZ4^cC(?6lp=XANw9UkTA)i%g~#}}&ANa9+p2ioxt9P%-PTLn1mdtzvT
zJ@1KMi2LfvE$>)(5!F#Yp&%!6TXX6%@Q6R36L&mat4zGuJOa#abO7s4WlB3@fmU*9
z>4r|Uq%mH;!HqF8xs|LM=V($QlRx0ij#U9+B`z%F-III_rr6cu`bB#zs@nF77H8U<
z`jKmIl?>9ysY-v%f>xnd{!An}9~Zy(IF;ycPc=cWAq{V@XT173xn9{bUY}>Q3R`5(
zPd9pVW?djW&O#(cFwSDX-C`Pu%xDhveeZNNo~@?Ab_pVS_e0>^fT9qbflZGo>mf9t
zlN0$<bEIl+0}C}5Qv_i3uHq9QS{j*r8Yu$S08%28tL@vtOCmQ6a_P$=lgr&xk6mnz
z19(d{>zJA-U4xwG(rnP{OL(Ryr>iG7@Y$#277j>ou|i$-(6Y(}pu$3hZ0b0Y16aoW
zJR}G#5VJ)Ap(m!#iQ<Gc<^=e}!o%$BryNB}CiF5bKQqymTjl?*b;oe!9{kVBbojIr
zgtiVm`K^XWbN@NPr<IHV>zl>x6NLcttg@%im9Yz63`>`7n`wM%dD83x8Eg0~_o&`q
zQLnyU2ge8cp(B2-4A`s3)OA8!2_ZkjdOu*F4_-F*Y@zf9`*z6ED{H~Puw`K2DpYi^
zUS|1s2Tsz$sCe>&E&11T{1|EaFFdYj9$04oW)a5z)I0Ve$-D5fJR%@VK9F<Hm2b8*
z1ntoyD++pSPw~hN=ffS5#&h`Jj)XwD-YdWCB);AK8Gaej3P$t+RQuuB-ALW{Nt=4e
zipb=HDMan_B9n{l)39aadO0%r0#{k@TU<^ERlH2jXCsq0iAbaNbx+41zI@ThkTGi+
zTdGg^dzcT$Je}dgsVf}+Ki9ljUGK4*>#c=<iG4npmjCQzQ17m;FH6V%MDU1Q&v|0v
zhlMYKjg=Tq86s7CfxMX>l5Yq4?Olt!v}mBs>0baTA03M$lRt;ts6Ov^Bu6HH$XkZ)
zgeC-&?n49w?Rbha<E>=MQnKQ#7mu@e;bW>7#hm<r1Zcaj-?p^LAv8V>&QE>JfBw2=
z7;Yn=^FR`u`<{c_xxf<n6%6I$wV|Pm{iTNSjFN26#_sRfx+2q^M%&V03&n{q0L8yr
z^;aRNj?w$A!Fzme{R(bgU(@U@Tj^~#SMZv<HZr0))t@SKo*=^CHDgv(r-!>QACY03
z-_6Ldzw=D(yQubs^xpiM{{s2-qX@E*v@Z$JG&c}r$N4E&l9I9iwjsar>dx-Se#yJ*
zO&tyuGGE*ZJeDGNtJWSZgW}ED7IH1R6}ff=;wf#%tR>nUOJ=d)+2eeU<eRm#$S>J%
z9nS#m+2;V-@16o^2lAf}G-t<LsOD()9Q%v?J15KzHa@;%d!|f?h3$tHg!EZ^ljTw|
zn>AUmkF2~q+p=HpRqxQbcJ@kOZL~b@;2oypDReEM<Ed{uI-dHZk1IC7Ma-X%g#p;9
z7fFpphy?s%JNP(lI)yt&?}d>+&5cwo+iqzJ6u0C5z`n_OBGcZ1etW0W9_90%TjT3+
zOTbQ%+b-;y?!uR_thJW{c``C2zutb!_5VO?rORq*Y|hv*h(c?{{9D2z+lo;mTQOq(
z%60}f@^O?gRTeBgo}!fiVi%~z<iJZr3fJ>FV4X<tFjNmM>;5MjU|*US>r^vDCoFdK
zQwETd5wB@_s>!mCJ>Ndg=+oHhj6NM<p96jROZ(>Q(<Kbbl)wssezWw&j%SA2`xTA2
zJ$(iG^yT*Y^v%ztPa)_f64r#-rGs5CZAHK}+Mgl)(O#bxpf>aVOK}8q@89!_+I$1G
zNf_zDZ(6mvd!tP~<c-MWp;o&c@=Rp%K%OXZ7M6L*`($T^+3Q1!%#HLZF1E$;j&sG(
zu^ekD9r|WuGCTgk-S{rVMvo`nJcUpW0r>mq)6&46o|&QV*c*->uYKI<)8|%Y;J@5H
zOP`K`i8RO=Hsz4CJ{@5hdf))0oB0#kCWCPCFUGyK-$z1q0X4q8+^2tR>p%&Oq}Gr=
zP5Jt?lH|zbw<OR&3<sryp|nBQfle(Q6?u&_J3MEt@`5c{raOl<2`DvJf`tqW3sXOV
zXPmqV`tnMP>r0?tr=XBhukv3=zXovJ5E@q(e_UOFZAia1`qd#U4_SQW-tb`rfSXIO
zFakL@@LP(S(@Al1>ySK;LStXC<3&sMXA~F_{Lk4kj}EZ|=q@54F%`8vlRoZH-xiVu
z?eLG+Ga}&>p6SXRYWNB|Y<|jrfjBtMi39&NEfq>Sm>HD%unvPcLN6s14o8qmD)cws
zR=JdYNeRsd*mw1<93HX6Z?<%JY|KqNJmg{8rE(&ZxtvHNCrad4w}-6tR~}Up<2>`V
z+DcTlb9?mX3`yVZF&wu_mXF|4X;-@W)JQ5Ma5vRNU4c9anQB|}gB-+oTJA-!Me$EW
z*@I9}$pMK)&J}VNoEfF90_zL;1+}J7vTf#p8JYS8UPjM}klOyOy}q11d9t-I<gxSA
z?{j1RR<u~WYIzlhsa4G+s_(4)hPNV(vZ5DOErvDNFO}r}G2JeX`z4mBE<A-GYGBj0
zbmF+hIvR9W;6h<CXR*(XQ>dfit&UC(t)|rSkTgt?n=>p8UKP%;tUS~Ci(>x0ZHv0`
zS;v;<(%I(VC{Y^CO`XBmr)1!%FyHLSWL2|yXAtnbNq(hmoGbbW8B>y_=2yT~P-Ob1
zan=&`zuwGY=E;z@10>U}?@2T=-KTE5ZEK5-+;ZS$yn6V_!bHLhT_Uc2et|rdp$*h3
zm3I3bDMz9<nhF^M@|rexhD_xLvDCMysz|VL&NrxVQB{F`4?KiPFL8i8dX0V+y3p>O
zZ&d}S-BCqeEA_<^=Z$dQzbZCIeN=n6<A&P3w<N&YpJs^f(f@32OY_PDyjWg&5rEKG
zJ6<{SJ9y;+fgs=&fg*L9DpUM=+DTPc<!93OwiUj7)0Wnhv{(qUTjQv=DnTq)3$>X^
z|K04)L*>9+&ck}$vWPvH)t$J?wx}|8VOE7>Vyg_tlqRYQqlrq6Q3mn=E?&37OH|YA
zcLDiwCCOOUZC>v|GaS-ttYjRw+`OlU?>Mb>4P*65jlyPnP)_G(i@=VDu`{YKB;V|g
z|JP+lmt@&Jh@xC+$aG(W7-)OOkEDrQ$nJF4qit()J!<A8@+?eqWu0~hm-v28<AuAq
z#%z}^*k*TKeQNHv0&b!}Va0nQ@71q6=g6O5#D&1gnzr~U_FCXh^QJ)GZ6gaag9`RP
z5A7}oVQ_wIus=Z}YfRP%N81qPPkLxcsP=5omA8`T$PP={^f-R07!DVC?~~O$du7WM
zYi#E-8sF&j_#%x-XM>HA)Pbxj&+zvfxP7tDL1dDIY#vA+bTtiVi&5DuEjraE%!bD}
zI;WJ6QvMkC^Av0>e|e5ciUU};DdF1!wOJHtSSE(m7CzYm%GzI9wKb}U?Q93E?u>oM
z2TJwNSyYwJAwOiVvb#8m-^aX&^sFj~G<={!J*$gUu+-!H^oEa>Gm*D(W9J-u7?Br0
z=$-E9wncew@}s|iv&}lXtd1t~PLk^`F%AWFA(SvkD*(+@;Kg@;P-X2xKxbr1q@j#{
zCx+(ZW<Ju3UyK(Q^{|)5f)l-jVxLnC97h`WB{74?dhSlxPH3(vl-!M<0Z|S-XcJ9v
zepnZpNcKCNmP{vzMqqsgu%-^hV6yb>#jA>7D+^7q8EI4q_R|t<<^)X1rv)9j%zQar
z1A=B1@?YU4v0MS>iA!aGF#EB4J`lL}o-jT{^?~dK=GevJEo~<?BmX5)bFGSVEK+*!
zZjDv1FF&l0y?d$SiGIBqtBM>1EBO5s798p=K2xQ#`KDM%LH`>Z{qI^osC%x3<`;ZU
zGee3+JMOU7H&BuKJ8$NLe+IVFBJ*?GID%ixTQ%l4*dUg%;`4%S$Maf_GX*kDEU~$8
z@)E+yyE7(1wLxP;rgXRBw1Y9OdB`nY-ES)zSCXS6Fvh8x1#|`!?;*+%u~@R8KcGI(
zjL=6*uC;9chiqN)??`^ivBes5D^;h?0(XAD?F2e_J}s1ft^eHeNk+b<_osBk{|mRv
z@c%PBZ^8fTe?<O2l?E*TBaJQpkAPx3;{QhilzeagU-^IL|8*JupPg^{e-=Ob3-D&v
zq5FLPm&2|d|5E|{A6MBQ!v7Te-u(ZS<NqAX|2xx?>4Y=F@;_kxe*BNX$nZb(wH5w<
zt|R`}2gm>OGyMM#>gbUFAJoU*z0`5|pRrp0hsM7b|9|<z`5!6Z7zF;ejc52D>Af}n
zmr!rd|BT7<zs4s1w~X8o|3BdKzpW^X|LKhQUzD*W|Lfyc_@Cqq|5Nq<jsIocjP??j
zVF)JNYNeLfMwGo&Kc2gF8*xN}(dWJR2D?4nt6$BrV=U^U{)a0dz)N1<YM#RQlLnR6
zwZ<{Og<_2Q4P7G*)6iS>8}cFzhf|jvJq_C5q!5L!HlcH*q0T;Zi8TC`FVwH{c|4l=
zf`JH1t%nX>;{E~b>^k)IJ!GS8>~iTWIBQ`+e8>hVboB0!)-Ae(UCC7AQ?oxJM18I|
zdG#AQMXvw53X{M@Htj(=hHhb@jSeiAj6LP|kpnez%yc*m_Z|?TW_8nctnoKU+>Ct_
z{{Uc_Jk66Pp|7>!zKDauHBf~@h?YiMUgNSN;=kZQFM;zzZgy9v<*K^Fu(CPqkB{2w
zu2U8~-dgJ~&IeZmdE|tgSn@n;1`@j9lXh^yu+{SE)cjO;*TuoIziPGb+V)3q)MRWl
zu=o3E_i$={ZEY>rq0D4dUDF44?y^$yYqV@;vcSOFO&77p%)|W{M}IP=?LUWe9Dq5d
z2Ayh`797%31>_6EOH)<BtMESh1AyrFYV)i#g6%^)kR7e2p2wDU@+3LF)tq&|q{~FA
zp&sU!SOHHL@iHnJ89!hDy<iT6pLx1Oua~^Wja4}L=3?MH`5^*IIF4dEQn)hGr>dY5
za}pZQ;n$w+YhJK2m-&IvD;1H7(E;<DzXkGTLF#v;I$5)zB^Er?>7iwX|C;vQ<OjZ+
zeKK&x_ume#Y`#0WFQ8}q1tvxhGSoemt-;!V&Zi=AQ;}KqHaUUeFeIR0up9YXxb78l
zGX>PrGBZQ#>G=Zt<0LIgw~!VkfwXv@QPqz(OpZ=&^>-VlWk)`;qlmV$WXB3~9aQTO
zm+1Y;XPt8dG6B<axZ7?m@`BfEf91uyioZn+1&fAku-V9;C2|~puQA79xoTt|l)L(w
z!TEe$Hwd0CwLDGCOapC+{xzH9rib`SGx8|L|0JvnUwERILm;%Y|8V>YvnTK7Pk&7#
zQ^fd|H3&a7SExDOzlO##AhiB-PTz=cumN2_@ak-ND+o@IynUI@)K3ru#{$7ud6V?5
zUsD9wVu^j{uac%<R&hFHU;bh3Pq1zkWy5PmAg?e79U<*|t`h6=(-5VQ1LUb*d=Cdm
zDEYeq`Kig3i3?%mn?Ls70+ba3rSq*=c=@slIDTLXj|7gheK^*yDKMYjn}H)EGOzP)
zPUJtI;DxT=OOe{v`lvM^`ER(!!S7^_!b+bmFvf>r7aL(Cs%Flb7`vf<b0;@dVhO&L
zGsbRd0b=pzxyNJC&{m<^D__27=mz@~89c|rI?ic~jZw%cBWeZmAXW*vC^aHEox>@O
zbW`M#&$-&>+|LB)HRg1Fs}qOC;)u~``Mb5-bEW%`{M;C4#G)lXHAnD0jG!BTsli#S
zWe}v&4nZRpQCNKywa2gQSc{#D>{9#RpcuqOqtjF#X`Ey$m_H^o@kV4>Jv0vbXnES|
zp>q)lNq6#v>H0fr(cXTW?LFjO)7-$In>9lxiF0UFkK0L`^|UHa_B-%VS%L$vA<2xu
z776%C;4|y*2EUi{Um)N51`h**hf9KoA^wACAhJr`5XI%`iIi_!^t;B~2h(+EN;dt)
zePlecML2QhIf10q&NDCMW6P7Oz0A(W)|e$I2Gw~2b^7!b_(Px<O^E_Qmm#qdNNmW4
z`VHIFj<4U4A8EWBB(Z4R>J1pUe+rjhL$MM<(QkZ8bi=i*Tf(@0F+)ZQe~UEeyc9c9
zyQALxf#8f;_sHl-_PgMrPGD=jxm=?H7=;`fd?nw-(JfsfGtO+4ppDFUHqxl`;OfUO
zm(*PmX?%k>OP|QwQfQ;HjXTCNw71k0_-u@D=v`~RQ{s*e)1f`|p<|P3y?I>*0}#Db
zY6!@OMDCS=oUk6CQB%s=QXit$Gxm|I9J#U9tU!de{3|VgSD1eRf{grKYW@;_@3G`z
z$T5ij`==$x1O5d$o=PtB<)7(NGH-7Zpxo>EJnOFxzCZbg&AOAa-ZU*kqdNo?xo_~$
z*Mi?{JdjaYs$8Qd#<}x78t;1Zo>ii2%-BO+dK)C~$}I46env`08^Cv}+ujHN@)Ke!
z-3Q-i-WK<J&v+Vou#T!pt0b{PF1iML&^8=;%WQS)qT(_B9q@}uJ9v&jd(j-7P_~r!
z@l3|#&$8#fEmSIe+^=w<%_XyD_9B;|NZ%IezZLZ8lqEL9JTiRqjLT4}Z?*<Q4S#sf
zX3#a?172_$D$+HyQbXfr#?7hNWO#;0^`RzR!v-sA_{{a*9Aab(0#*{-wcazQVwYXT
zu+5*v_WrfrjI+Q?-da475B9pep5h^nKg`?|54CesyrwZ@C{-HVIX_J$>@CnYx|4*E
z>8f7Gl~2^Kt?o@d_&3i_bI;V^?x?a3#P}NB|GYw<X!3S`6%4}ZYd6|cpn|KPKX7Gr
zLG-P~&Adc<dKJ7(b(fpFyXR}T9C~E<)Frwyjl8%m-I-GQZT8=n@@ww3Q&xxlw5+ew
z)zBgk-<7QO?o0n#JdqUcn=eSMa;%0uL;6Fu1+I)PO(_D*EW6VYKkhye_0hgX_)FL)
zL$Yal-G{ISN3T~mZ$YnJp~Y>{>j$h5wWrtdPi52V3(#+xULn=I-4}%D(~=+b+3yrM
z25J!jpN_jxB`kef0){>Z{=reW2>94<9Qj%Lv^7AV{XYKM(&zj%VREvM2#{mkeTw9o
zdi0?qQY`G^KLs@T6PUk4n(Y1K(&W1!@_(brCp)6a8-1F*N;G+Eh$cN~lJmhsCxLB~
zCL`FoUZOos9@&m2t$k=2z|rN1f|Lo<<zIce99fzw5?z+xkwuq(#|)JEz6Yt`pZ_@w
z4DPducq%JD&wq9cdfe$#2n2!k!Yoa~I3aX7KUqR_vC@-y2T_ZNvc_IaFN9NSDIZ>0
z>1q%W&UDglF2wMxzOS$(s$&&dEZ}TKrM>9r=Q($oeT+***8VY|nB{O(Kru9g6yrj2
zD|6%zu_7!+g05vievOd)stL=li8hyqq>CZg5MCol&`t6qo#e7(@g<j^x712PC!bjR
z5N*T%Q*1@L>>NSXY+vj}_KyUTK{Sy~IdjRyZSovkUUPY*=T*>0JJ~5grF!){&HtUw
zuUTtNwuWN$E+eqYAPynJ7g&;400YV^NVP!}dY@W6kqijP(iFyRo9P1}C5C)CrTLfW
z;TAYDL=QV|2<TxWGGn-Kve3UCBH6F&#{oUa(EvTTPWj99I$Uz5(vdgRB6_H3PY?4=
zh8{eWro_)y=wWN5P-;HCEg*&G@5+z@ZCFxZWrWLLzg7pN&0-=(M+%-up&~;H(rZ?a
ziCjR0M;XyUjiUqU2`4Q?Cvndb!RYoxFwy1`&>;e0ikwnbk2o~;9PNl;8eamH_8*?x
z6TxWG)b<#Q`H9NWjuK3(BqicOu+31w5jrW46rTco`_9sVBv64pDl=^#l72e$Dq_F}
zZZo?9$`COmttIxUX}n#QB6=>ZuR4Ea>B0I^95o;`Hh&P@86rfz@EwHEi{e>?AReNO
zWW&r)d|C*Ipq)IB3@~5(KEU}K=-Gh%vvIy2F#I2J{+zACD{``7>o|7?foQ2c&YeiW
zR1nUc6v)Q;XrJ-NSbF;&IR6HVdP|)5Z;$gF+PjvkQhHuhkm?Fv!}#VLThdEYzYx9;
z1dJhk-$nIe{@ec09^X3>zKK4_#`g<rH~V+QYzyDT6z>q<)S~jl2Rf$r@r^&t%72UR
z&bI{kPNBd4e}nJt(rsJt`2UXYBYu~K?}s&w1K;=E+8*EgfD~W<-yO(%A?fv5T0el-
zFaKvYuP6JBSwmkZtulPivIV4%VC%i8Q6w-@vfpvLwEzCo9k;LG`Gsj#^3PL4*qM39
zQ6ofQwLZVED!tXG4z?=k^|io`d`+}cVee_vQ;k`6b3ieVq2U6Gp>?Yj##pV;Bd{l(
zM7IXQXv@hNt>C68PBGkAJW+!|H)MF#R`iOJeH%gQ+NuX;Hxqdd&2Fp;VRq9*vOl|V
zXM6`@;Uzx!3S+9(9<Zu}$6u{I;jDxa@byRpM&^9z;56y>MuOg(snIOJxDy!(6&Y30
z7qKC1rBy|R<OWk-CNTx6R^MI-6xW#2)cz6~8*j=Yp$n*5By=_m>e~wh+P$9ks7G|T
zupeTkfJVac4yVCbpH^?qhR#~qXs{t+jm_?^y7;%8B<@A3mf~5=UFyZxMsArC>$BEt
zToP&g1D&-|6_4^z_Es6;gx#w9tm16$o%2Nu%rd&{d~CAIetKC9#mtfH@ETv5uphx;
z6OOKK{Hkt$hRCV(@iPh{J%?)*Dtr#<6djSf{6@QXcLKXs>+smEkM;S=G?3P^U=-gu
zd6Dh6ZDwyYCH2)#Z&|K5kta3)8~!G+YU#y$6`P~^?j_XHA}@DdiY;Jh(42Jidc-SV
z>_vvo@shhOU@$o?i<npKH4Uf9<5NWRzW-dZIa9`+wC)45B$k)}F!zhGgxF_=1LopL
z!{0>Gjm<W`A4n4O2rw6Wxr;3vKhbS60%d2HEz)=z50B>yo}ZPy<s~|Ki#n-4K=RP)
z@{eoBa-MuNIT){apZC4w2^+lfd2XH2tKU=*x&BLKowKsD;;iSg^Z7Mz*=ldjCtcG0
zuZ&#Uy-TEL_s(`{zI<WbXN0bK^`CC`l3l*F4JG?_j^(cK$~V<^^%At(m1vml^Q+6h
zt=+l04~HMVXaT;-{2Nu}#plFwHwwE}MzF=|qPYtDBf#xSQ1z+M9qaRndG!Xt@KRkF
zaEY$7XL)0ON#L*LyxBB-z1bPO(UvwI#QPSTMzxjbTdYVvQs#NNxLUmY)Z6fcwS&Cm
z5!+j5E=tR;XC!{X09lHei(7gOP})$pDuvrBvt1#B@B<yfZvx?^e+J=;VW>2`<iE1#
zai@3Hu>1K7#Oeb8D@&~2PC{9;SqYuV<$1XfieRb`P@TwmPolEaEcSD(&^CZXXTm67
zds#l!N0H=EN^QOk8skeWF`8f}-8alykq*2s^=7L!{+i`TEsAlaRo|30zz6-+Wo`Yv
z0mLxXR?0P5gl8QHuuzQ8o#XN66-3M5sl7l;`IXodc9G00H{%w`zjp{sm8qcaX?s^@
zo7o@*Z|frClWFSTQ?5S_6Q5V$?X}o#T3zh*S*$8c%p--kPs-n^`-2KfU~)*g1%f=h
zX|P1eD{|;n>>%?`CdA_9!)%N0jPA5}j5M;sp(`l_c_dbteA4%ed5IkpgIJvgR#YSc
zVzFM6pqxd6#8f%7*x!uuF<^Hivcg>I4LIn;mxP0#GJ%dIE(Yvp;0$3Z2?k4TaX-#3
z7B^m8d+_=!w}F6BG88BSk?r1`O<g1on}({RjW0zS7VATDH!W|G;XN}OSrTQ+ldo`C
zu}!mGa=ob8E8iGt7zC$Vo=y(U_xfy#mM@4j?5H%$+S}<V#M;ZeWhrmY`>?j12VB~{
z8y$+}k((3p%jeXs68G1CBF0{)_M=IC%!L<T6K6wJJ)=-dffdPpHk-1^0gomJ?+#&}
z!8+W<g>I0-Q}ij-!e6tjh7!?UNvfH5t(#`iS`=D?^(?JL8tx)X;)?wG&0QmnB40=H
zA1)QR^(~9!E%k`7inN#Z;J8SX^QCrznzY0eg|Z9kZjnc!NJ{PuSrQ`6Sb6KXWQa6b
zjTd1CYxhMW(ewt364P51mcQe+mO2;qFZOV>w+X$On!qx)lOv$Fgihw9Y?#B8`ptQf
z#vR4lFfLG0!;$1j2^MwQtZJe(J9H^MQN^@CQN!r9TrlNI<zjk=CH49PTJ=~dW2wYy
zC^5}ZQ{{8S2<wsepq8Ep0Kl;O_n)i-|7l8?capwcs5q7QO5y_(r5QPQIk*+x6>WJq
zk}<U#|2cUmuByS-L^3u0gu%IfArjQF^>vw!9YD)lbnI_;1|93Gjy>x-R_Hobs*Vk&
zW42D!cYmn9|8A|mlhgG*qWbK-FxhYM9d@AK-cNmBB8tqO*Mr^Uz_*ofD5PMOFE>Z*
z96S(TjXI7mu>9Uqr4=)P+fXdfYhA3TY%Ket(;jLXL@Z3hJoM+tbbRJWyKZ(U9|N&r
zW?+7c?)DJ<1rk8wfam_yos;Hz%N`z%l|oV@+Xo%9n^WjmOLvztFV)2Nbf7~~5a`fu
z`+7;cPzMpk1v@(0Dl}7e1*K<*)V_P1qwBw8oX7G}#IPfhND^rP1nc*)B^o>o+eQmr
zzpI7Rb<P02OgdZQ-n{-Z7P@W_K=0atrw{yQwgkuGw3WK2N^T7tr>TZ%0aTg71ieow
zKqyH6Hk8h}2if*ppwHRY!t$p*%ESw4a!`!*l&G{ZPLeXvM=ST63hsy<c^*n^&_<tS
z-kgn{?doKs&aDvr6WXfak2P&4F?Tr~DT&lVMtl)zxL;(W)nxrOheTCcMzw6u{Yt27
zve%A{Cc4JTH;f~w$QN9-AGvD*-bq}=wE$zf*SF>BZZU347+Me;kk|Qn<f%PSyS4t{
z+`7jk7!p?!JJxT~bNV7refoNUA(KE23mo=u&y198nP~efGn($#N{3o&dne^j$HA>I
zGeD@sE^4y(&z`j_t0mNrP$XK-8ErB>6w7k*qRa|^5T{lo5CnoXh*P^x5(E<Y+0L%+
z{%OTu_gkz*FQo3Zo*%dC`*-aIHO!G6pCn%8QYcS$4t>zfqqtJHl<76mr8qOl%)oVW
z9|ab*y|&e_WB6+BWay5nLRT~q5B{lHhKN#!&qtStwm+8yA^mF0S)9kpyIXG$-524n
zxD&pRcE<X5(}>>CAd1YWkm(|#8@~=ojeP>qiIyNmjy{U@Y2o7vZ_d}9%^746RaS^9
zy)3F+PLR#P)}WBZ#S(dS+(hGVc=02E%gL5Hy%>U|B3Ax&q(MiNq=C>Jof*(ZTIm<H
z^pu{D)-QrSDNs)VW_!#R$F;=vk{MH7wywI*y4rV$2q8+Z?$ydfUH53Rs(bzB+}d5E
zi8`o*J17S9JM|gS<5cLSwZ0^`?qMCh#Ev9wCwgs(K0O$as-S!4cyJo9NT_3}%jqKX
zKaHT8Km<##H2^M#DKm;f`U3j_yz25bwZBLYq}h8y<i}qTbYstbLWSR`#Ay8j<~eKX
zR=Q&Bbqcs(O{~vqQ=qP-xnQZ8?o+C`px0u{1>P21z!fcu-n^CSx55b(B#RR~{RW(H
z6_X{MCuAbM94mxFHEsB2{ru5c>`;-DA!?{Z<FImLIaaEylnPVagSegYHIc^c0b2^L
zN!taMkB098AJscPT59&2AU@I@6G8Tx7(G8j5gBBpj(|awW$9MGF2l?)rvtNqebNyJ
zJ%bd>M^Vf8ZKjf6@zH~DocQQ6DPMshUmB1PQ+!t3w)~d#mfkKBSQ9lH5J@HVn{p!!
zpMf7Q;kCNk%obI&D#N0R4f?@eZ~U0=Rb`)v2_^Z>XJ|*ub5;wiFrR%cj3Cm*bZCKb
z*y(FA`#aZ95Wl@f@v<d@nIhD$DIMJOdD8-`2WWElFBT{~OCqy;d2ODZ;Yrp86uLUn
z@K5@%4F&NN^n*xrb?pR4xEsc;&M1iVy)j!=5FacQZz&MRDu@{w1u@co8OE2ig4p*O
z#=R8<k-8SjksR1Mhphgsz!k(xFEuy$1Pq5rF`QmpD5+0F78F)NL>d$!x)s(aB6$lX
zaRze~XH$H^UOiD}MomN-|3!&*s=`aufNZJlY-Fb~Jb)T2;|nWit5>OfDB9ZnzJ3{q
zY;U<<d;vmD<XT$)A^L@>1O+<MFQw8i1AuhE4rvBS-;rjCaJ0-|(Z8e`L|G{L!23l1
z-Lq!046Sz~j#DK6oZYXgv2#8Sc;+NL7ov7Ao@D8-Eu@C<Nx?OVF&pBKjt0pB9FjZI
zfRRt`+cd?AqL#A3*!WeXF-3-jjm-DkZ~?}W4_ELMOWvmf>h0ef>20D>tsyI{pqKWF
zH@0f{Q&*FmcvNM`BomG@fiE)NXvpIihHdk`<Yi0gxh;7%CDWg5_q;^X^_*)$1KVsd
zT=@XA@r8{g)D0R}S7=-EIIz92vA>d{*iQ>>XD}jWm#QP|QLbjob0VqcW69Y$N{r85
z!P_Zl7&a`=wnf>qycb<i8NafOaFiqU#?63c;%s#{$L1`uIbBcEI?d)(u$G{81Oi8D
zuP?uB5LWdNI@MeFwyXXqp(b&cK<gz=g2CQnmqN2C>}ermd!6Y0+c{{aMghk19dl8z
z=|<Ezl2mWN73in8`AAzzXwQodcitlV5z_H%(-W>Wbi}7-p+BiO6D@<dT%0!DB8`33
z$K-j-wHKRBk(Fop=l3OMyKxB^xm2z<RVj1;Yr&M%B0trEH@E37!7??B|AII<N15%&
zlAQq))vrC6w0p@jrZM3@;7-kJKk$-{*5nj!te|%D%#Fd^b}3nAt<kWO{f<c3x#(p~
zNzG`uDe$i)hkVV3#Ib+&;!AO8)Qw@LFrfF&%?g&cgdYNv7a6=1SE)}_4h0$^;I|E*
zaFs)KJWoKgR79TyhbM$7hw|LizrR6OXZ&07+XR}Rwam$K<<E@6cWvQjenZWEjV+uQ
z;{2hr9S443&WAiwLo5g@Li^Z(SG(D#MYvhX@mQ%pr2Qd8HQB#Cy@X|m_GQ#NGecz#
zqD*K%J)O(vTl@S<GuJEs%M5f1c6_tB|4JtRd-I>97tzNpS33zgFLfJ$2@;PE5~qa{
zPYDubp9asr4?W)<%KQtTk0XtmoCS=^>}s~?&g^D+@l*4FHSE`zY5c;I`YtS)Z%`d_
zn{Kd(<UZqxdT%o$e!?)BTCeaC{6J?>;p~wrGnG90Hs7LX9F$o#U7m;@u+4PL`dst(
zD}=mp6ZtYlU)c7`gT74ROAC72!rhFgbvx(H5C?inLk?s;%Y{VYJ5RNj*eN9Ioo6*M
z1W};5gWFLA-vb3XNIf}7D(oZ*)6W!Q4$7lF(ux}eiwroZcsAw?%Dh5F0t1%4$r59U
zx{Ui2S7}k&SrUu4I)90*(u}usvg(!frIVOq$$=v#!m8z8)Rm`xN`j-`7!{RdT!4`V
z?H`E654K*u`i1${<w!t!q~T3I-~zTAvHr%hwq`YMq$1~cyzoiPpawZax9~gmmtV8L
zX7Q!=iem0Zs};7^m``$v_q#!3(}@+_PaywCn)jId$Py`K1u2r0w;|-kK;s4Z@Ify1
z$`<=B^8V&`RB9s=ojr9xN+MpKO-=O7AX#A6h59bH<nhJlW9wzSemin9zsWZ-WRsj+
zw=<xS)ICJw-I}LdIO|W|oD`-4^QW+Lx(Ms7I?=Z{ifwC`?-o1f^b(f)w1mr+?UyDT
ztTnqK)}dbz&3#A6t_C+ul+JLT{M0WIEp)a)tpKpEpd-`!GWosn-LiHkmRT?cQl1s0
zw5-bH$MUWrqbC@g^=kpdIZXfddtZKQ8AH;WahRV@4rKR}e?WFW;rx;l8?EfVi4x`^
z@nACVgYio0aU4IhZWOsC`>h>kbG=HExf0GY|E7<48!u6@1t8v+9xeOYbpYSRm!@S)
z$brLeUQ5L`JZ7(4Dm-=Idwk6Qm~X~e3-dj5U#R`XK}XW=IsdBYc`c`>U1+6&M-fK{
zh}GYo?L;jA6D|LYxHEeghtT2m+3Yr3_f;1xZSCI;vb==yEz5j)lCd9AJCb5<DJ(f#
zY77V3v&fj`;5sB=sunvJn)PDbehD~8*xX*?SWPNXP99~w_+5_XFT);8N~Xs8Y_JQb
z@kXl`&ZH!Hl?fwJO?!#kM%y0`Kge1y{<4?6S|*_iTaU<v;v0mEE5%Ku`H8WM?OqwV
z71%4&%sTdfymMztehqCFjV>CxSU>1%cFD|S(bU?Ck;bJIzz9@Ns^c4&0pMQv9CNV0
z+FIlcO3GE9fK;gDrb>jGH7WTfEvP!(o`m>3)B*>2><Ma61o~)p)N^82Z@}(vJf@l9
zU8=RVyT8rJzLB*=Sq&30q`n<Ko(}fNKRSkVaA(F{EC4ys`k!k2PiON9pb2)-#K}N?
zJ;vd5^EKf|?8H6SHL5x@#vzNgXaqZk=zuE=YB?v+!amC)Jd;g29h!n`%oA5TC3upQ
zUbZQy1CiR9f>Y?c#cq71Sq7k6o=mTsI1BD{{N26<_wh}E1!oapEjT@y{Q5u&d_tCB
z$IfpjS^i!^lKC6#<jYU6Xof;oW-jmvIP2%Z!*7CztE9*;ovFqhzRq#a79pqT*+)nk
z7mklsU6?^g;!z(L?$a`TLiH~}_MzaXd0VLdf7!wr)qiJN_20&qmK%f4r56=+ilFq-
z5Nm0)nRihsEqD6aQnh>yo)6+#dPeUVevZb!tsEv3L8}vR(N*Ni;$k}`#D6mUb~vfk
zi;tVq&eFDCm9tqimHxgL$y!zR71sCkSaRGHf$*WR8f>E~*3>UiW8%YS+CrVQMw;;8
z{Oajcx`H0yi81<7T{qfE+ZB{UhkR<z4b`)#YPxc{`j!XvaU7`XLm1flmMY7l3dto~
z-%36&Y!pareZ4xWZ*fo`XT_<$X1~5>RkWzWN(%MwC0idWdal0tSgZax8MeIyJ9)))
zbJzJ|LZFs7<8~R?4id3GU)y;w9$XlIv|y7QGY!rYJDT$;6L~DDs&!b<@abh=-OzZM
zG9`x3EVHxI$YXbDBo#M23VXCQJy*8mhNMJ7tj|kwMpfJ}Ti(%FqEU5M#^*G>Ly0qa
zj>cb(uWfpFaQur{vaTuqT6`07q3PYojJpKs6Jzl=HTfMC@5`a~uQi=s+zHvXv9fJA
z>V4dFTq{K)W$v1+YnJqB8*M3hN{~2Bi7Rd5T@K^Hq;#68=SFzQxd~sX`42NvPT&Hd
zo}G>kZ{nDi>24xA&r0d4=~*&|?d?7d`M9UJ@Da>Ehtx^vtT*p&R)@0YpL0}S_WZMc
z({`K#PkQ|(wzl1;3<Cz9M5iJR_7p9K7#!*@%ynA`y+QgjIY&l(;*NBWG|1QMCC4?1
z+|B&;oE_-zXuF+V{-W7wf<$uMOg6OL$LFTIgp|ssD3+6Yo5_y&5jV>UFFw3I2Vkq*
zAX#>wm?YV62sL7w^tc&Wb^+v(G5IME#oO{>idXZ$M(8y)iYy%6=cYQ_S=>a8fpX1$
zxn{lll&x}oY&j9!!bTBcZfaMRn^85rsZr#{)4!V<MNRdadPW*guo*=@xv3L1R}JQy
zq5x*xW+RwSo$SvyBQwsJB7mh<+k_nU#}U^Kn0l<dlJWZ6+pZjI#~Ztf@%|Pln%?|}
z88?Uovfj_<V0+sEwKnVF%K9%r6UfMef`=o5hx7c0)E^nIo?F%ipAOQvb1?!%M0b1x
z+K|(GIcYrIr*<dAyN~ioK2axXSK`ZN{dq{|=WLaK5yH)kbr$ne%)^(vx?a5?-7}lv
zHkYBv+{i=AD?vY=RX?uw%iK*FyMC0B*AmwCtFFR7V%aNR-L!?YhDfb+l-Jvrv;?t9
zk|DmWWNdZ#-k={zvfCxes;+0|h1jP3jk1F+r~2~x0V}U>akW6L)zh{jum5Z1p{R|D
zE}kZ>G(D@Lw4m<k1b;_@x{1$HBJ||apn@!kzc?6IsjCP&w&QAc<4Q~D5<V}iy3VrV
zzFVznW>C`%H@4}1O%{NQDy)bRg#KwE)Ihh|2H!CIVoTSb4vueVdb{kc8_t+m8DA86
z%raWjBJ+gR?p00n-}iM+L$C`jX39BI2Joun_>#zDRSh>JFXcm7TiI8LY}woh@QNGi
zCJk<?`;!ED(*kl`H)?Qv-0hOwn>aP`_NIlTT_1lvzE&c7p)VLK6%VpT0{kR<?!dU|
zBYqW#Wh<*sqH(kk@;4)s*ND;+Rnxpg?_m$Om&tTHkjXn+@N7jUBhI%jkRN3_Arh$G
zXj=$71oKr{wm_DD-TLXR91GKDK$EvoUsKg2$TT-K0%??<FZ%McJw+lbcD90{$2C`E
zWJTMhR#t4!7gG<<((d5MjEkqC5Na<(8Pwqd$1WQ%aL7Ya2K^=A!^!|ag_XhWA~<`$
z2Vq)H8QgB3_&lQsZl*oa@gL6y3dn`Z_yTh$-v>8cJaM6=iQLqSRuO3YTa)7myX%@V
zdf-SmfV?;ZI2(j2{-*}2wQvhlty=vxR&G<H2y9_hLvHFQflc}07n?uKFUntL^NR}M
zn%vX^bqt}8#75kU=&iWFfCpcPok|V>2C08wdJ%Dv>SSg5M1cjl{-TrXJ6x8L>wo4M
zJyv@;{l5pGn1TEk2)`?W2Yb*nL7u536<Mj87S!$@h!3qb;*1a{mg`0WVPy8jU^S>l
zhilF>2jDU0VVc=nQzfz-6UwrK#=V^;O3>=vU&$w73w^c57P{Sh#zRZ9YD`_oz?J>Y
zi(EaOsK>=uH@;L`fQYQam^-|znWHv{$MceT@87L?T{lujW%RYG>LCA|ADI8{ADDkq
zI{%dH{>*FlE)$UWGA}WqP_m?ce*U~Efj>w6|DpMxDguwGm(C2}<=*GEd;d?n_j}Us
z(t2@SIF!#31pMpw*x!L{`TE+P+!8hMsQqhum^zTc!Gc|4$#ZOn6&|W{P+j~5=aF!<
z%{SCg5Ym^pHVM}~?Of2Q74Ee6tnMcaxqup|z)U@hUT45%O<fMP<Iu@`i)Ubw`HDP2
zS1YFI3&mc>xypq;mp|)8hP;gLtadw74bjlgBCk)&Oz3HXH|cBrOo;FRkS0pyOpH{|
z6VSQoXw;NSDr{u~eV+~T=31}mhCDC6?j)%PTYY15<fexy=FM4=$2xQ3eyv3u*i9}T
zulzmQ+zvsZ`_7_r7%FL3^A&8zd6zpxX+}+@qiN?U6VapZ)$U=IxlFy4fs6&Gkt;Y%
zhGiN>mDqd_X6Gx+&c~dAu=PmeMU2ynuZ;Ei!1a7nr`j>ZwRWyOi2rkIap119H~27w
z#eeHe{jU4Q9Am4GG!CPJmN#sAV)y&VqSkwu1N_hZ_^jas1$&sD{uiR=?3b2*Q1?X?
zY<Vk#GC=yR#aD6=gd++&Om}0IFRVS!OL8cyv}O5=wRp}Hzd}Ns)$IhqU2*0_=!QeQ
z_y8|{Y=P&87ccCl6=;h)CkIq6Tqcf`Sk$sK;3to0>;bkF=wn9<irvg9mL_RA!R1vH
zUf5-48XwT|r%_WPGX@njb+ZNd*uuu0^)2-w5)^;x{GATz{lZ^=C%r#(dN#ew)%OZO
zg5WF*Zoxp7u07U)8zw@8Q8s_Xi;JW6-$cd^rpr9!)*Xr($!Y2qOB5HA6pNpefAqpG
z`!S-yVXmtIZtnDA&5N{&;MJDhrXaDs_$OsAnRBd!7x2797dCMWf<p;iS#9>WR+X)~
z<zAG_%HN<|)`5Go;xv}_+3FHY{-(elM^xCJ`(yF_Y}fA+|1ds$Wbt)o1TgU8+N-d@
ztKSeAdo2Iw*7b|b7@pH4%n!=vnlBC>%<lgw^xxmQ5<Y*Kc>d>GI5oldhugtO4gDU$
z;>9~_lW%%F9ErA4Lpp`tD>atbr)7%Sg<&wR9W<`p`4W~mzI^kM1HoAN_(!IV+46Dt
zZ#s~V!CH~y1ACzNQCUz@EF`NzMRK^K6^H}(xK53QsYt`^vUw8lL0mFJK^C|=$WxW0
zV5*7HCt<t^{KHvVl}`$`5JF=i$4a-Hl!ix=^5w5b8s-TFiRxBW7>i#_UPX++a0+Ig
zp^x!#D`19JDl@9gtN*M>+$MpgSTFW6)tQ)V<cn8Smv4?nhP)=)?5;!8UIW;OT`Fd4
zPAd^O<Hp9ySB&q%MPipQ92y;$qiB0;<|z7c_o$71_%rR;O%SWiJm^m?D69Gg2_H`5
z<5pP1AcNn;@YYQC1)C~*y(l#VFxdG%&Tk#GVCoV4w(s5m#Imm46Ogsc_uwhQKHmC_
zMyi6g!U2#O`2}mWinUzFQfJ#{+<pMC1X%L5v!W;k-L#n(M`SU#`Xl_aCCMEK^rd6;
zIUnbFx!<O}g2Px3tY3*+oOn>hO;u#k$5qfre5p6b9Hrf1%QlEa`XfIh$v9p?+<=@Q
z$JFHOmJ^*kEPpT3a6E()#m5w7lm*og1%x}Ce8V6kp}SIq6_@N|3_k6E>KMWQ{UgZ}
z_!^71RL8%pW~nas_~fvIa-wrS?pmF^ysbKSL5w|CiE8u!<rztg6j3Gdhq3aF&<DzR
znFymiMo&v@AXto+3pGB_MZDPqX9xkh&ySQwTyXkId?n;fiMoBf_+kZbOw?k94g!2Y
z)TmXL3H)m#Y5We9ibOYhk?8tpB>Gu(B)T>_=c7C(#s$^6^QEGq<?AAi7?<oqSQTmb
zIa5^yG435DsyEW`B2y$$a-4nD$6^L)04l^e`^6Sq5Z_TEzMRxM*5t0R;PfF`1tfv1
zqrx?*C&-bZmuUP&PivT;RmWdT$vYVM+v8tU$C*qHF7_VCt^1i*Z#r3HV`YHW^5v0+
zo9)z1vGZy`|JA5Og+j^do8jYyk%j~8ToEb}Jo$VA`E>jP@FR(%n+7qQU*|8Z<fN!Y
zl~ydUiOidmI!fobjqq~kc*{0MISOt7CtOEvIvHeF$3JbkG>G>|B7r3@G~tS&Owq;C
zC{8z{hebbK)AR#XC5NZw4}>a-k*3vHIZM5f<XYhmT-Qj9jDWNd{6I*wVhLy+huY_J
zYI{y~`8SA_S42I%VsW-ifuC!T&cX}gWgiSAE%rLXU@iD&Rs0k{p1?l{qjh{f)>`~3
zb#Kmxc@m_t+%Kdi%bPh1+va&E(vYt)qyKi3_KY+<NG<5P7^kouiDpt~vehVLK$ses
zgzo`~1BXP9lvNd6v?#^Y#(lT##@~tMu464wlBmXP{K`q98cCu8Cy7Q{NwnulP7+;}
zmPC=p#TwJ3`Az(ULQvE44|#-r;`h;BlFBFg6{I^tP8}wXqKivms~X3y4?$SLZQ6?@
zfi*Jqtj)oA7NyE<=No8|rsL~VDMV*Rf1JK6qG?Su{<b)*I`>Nsv5Un&Hv2HD)bY+$
zK>L{FsXchp_@l`K_eg1nfO(1mhKwid_q0Bgqiuwh30dOKL)?|Gk2JWjrlCa&i4)lu
z`>atKOO7oSXArBVKqdk{TlSz{W{%a#OA5@58)S?wEE4GvcBOdKXyVKb)rm{<tCOQz
zqlruScQ|{iWD5AxhVELn-SJeb!uIsy%Y7O%#2bve-$rRp0R8l1vDIj|iYKkArj>P(
zh9x4W_=jvL(8jeQb)&iuwhj^H)$zC0MMA@`Y>m$OCNIhviGp(F!md!@TalZd<|Udu
zb4AOs8U8DR|7s`50^|BtW{D>i@r;1PHXCHu2d?zu*R*N^8Q&4J*jDJ(w-ha;s`^#A
zwZL;wE7!cj$II*fMfnY4C`66|bi0T*;_sSa4A<t-=lDs5&IBIR+VZfYr*!k7rsJp4
z>hhL^^Q$vd8#P~@*wY)pDTa}T9MuhM>;X1*zpCAm97EC+MjH0Es5k}Tc}O&V7W0vz
z1p-0C4y4Bb7NfMA2~hzKMA%r!iz2JPBMtjOPquPl*K*JM$HS}^&5gV8(eF3ydm(m8
zQ5VX6M2MGse(5YTA0PpM)TVTK5cvgLcy|)U+F>$`JR#F=z1I+KRzwD1uyAR7tCk}%
zzs{{aUJM7Bycxq9{n*+ZqKL$v8a)Q?t~oI_9TcSqX3EkZhY<O|_QW)eMRC-8LIb+p
ztF*ESiM-Wn=8?r5SMweGHffa{YBDC##eh1*cW!4rGhs1g36o)OU=9oyx#?cHfH^w4
z6n7m#c$9`9iTj^TO%u^cAWh~7%7;2R*?H^=uO6A7OAt;haRR~D=aYj-B&?S?oAPj|
z;s}y8z+MbCJTCi2G9{Y(T(vYd<6@%BcC$CkPOVy9BOBtp=}w=lN?dX4kxY0-V*tE_
z&6rQuh&J>}N3bX&O^>Kd{9Ub(KEp>PJ!-3w9UK3|^saW2mmaGBk|!DZA>am0jlFwS
ze#>3~Us`v9#6Yxs{`hWOS&eO5CR(gcj8eC&<Lg4|D-um`3B$W)_Ao0|Uas!*4Ti|I
zk1NayGY59DXLMG_H%BqETZ(Ahm%j=MpP+UEannyZs;z}mhn!i3?M-uO^ILPBQ(P-A
z5&vavCUfmf#lEAs&n;Fo`xU#`E#>RY9x)=2l0%A8y=lPL4`Mnp1Ksu2tj<U;P>*+j
zLg{@fxJ#h(so-gzHP0Y_TdFfeXX(MhT2bJMQ&xD1<Bis6g&K<_Gt`C>XYeD`7Dbxy
zhtiZt!t6_q#^$RyURdErP*|b4{8&7m7}T}_koJ`ssT2iP$}?dN`Wh<;mXjWH*N#&V
z85la>TP846&AFu2DC1UUC6R{1q)R|~MKsZcnJ!kAa&4#vTFfng?s`Qb`GT5Trj^Ar
zzpzAR<Mwo>hwQa?ER~^6#jQ2UHl~_2D_%SRz(E4cZOVwxqIHqKGK^M$m?r^e&sj2W
zf8hp5g=^fH#@I1+VN8NU1!Ijg{Myp4I$FQV3BQkyA<C$Jb5Z2xpNUq8P?N<F5`4DE
zr1@g`kn9tG7ZCx3Crd1geNS_zW0@r@HZxULY~IM|GUgM_)i?t1c(DyWw;**NwL}x#
zCv-B(0uV?#3codBnJDe&;BpVym2af-jIrnxA&j;V!O-G1IDsS^5-+XYIwUqleMn%y
zM;dZLf3$v0ZaWfql}u4~21XM@z%d6N9|u&dAXlHIZiAlMUP^S=5M#&4BYlW}ON}7A
zVCz7-2p&j1Kucs{kc!P4liHhi)#wbUn$%)4nRh@#$iNBwp&dCxkAqRIWKSb1%Ncre
zc5Tup@;A-w0z5<a&$st?ryhe?d^zR(QysOxng+h3{Rp4K-VfPpoYYN(EA*G8F65e!
z<3dIN?Az62QRYZtJ=t&8wX8vqb`R=#R=G?2#8114v}pVzyDMA1{!5TYULx=5Tb!<k
z7HcZdDus6EJ|D*GgMVhGW<W|`>?PfvV|Uz%X7yX_L|!=5*1Pkq6es9zq{@mQOH9lv
z;=Jd7uP;y9Bg#-HD2@s;SOIn-nq_b=Bw5}HWwIq3Y)11h&=-rpKWmfNG1;$1qe>3E
zn7fJ0D<`^z5tt}-3{y(s!HX|Q9Vah)W_oD-Ni`#|eYfzHncqQR-}Rj(*0w2u{VZ?w
zNw~r5SB(xK#%^`2|N3W<$+ua1x%cw~IdBE$pWK!3;FE6OI-ULwUL^%kkt1ptk*3B-
zW7j-}#`d?g$68wo!v3~ozX#5xd>6{M%vF4Azu)WqOAc0U`Hy|SM(-c#{T};%hTdP%
z`z`i;u-@nE{W|-8nBE`ZJ*Bm@K;I_&9q>5mQ<Z*7R{FXBA-zHAM`fjdat`SiDSf}J
z^hHV^ru0Zw`n^g&Lg@sLgxZ~;^kSvImz93D(z`2tQC9jOrGL4S^nYcguS}5sw$lHY
zmHx)bq|aCSgsk+Zl>U&?aVCcP^V8v^->P)DCzO8pXwt7$`r%pWdn^51rSF-Q{>UiO
zV@mIqmA+&c>3x;H29cJ5|K%FjZc2YWEB%SHN$;Zcr?b)zQ~IhD>ADXsQ_poue?{px
zlU^BLUOC6?iP_v#0*%&xeNN>1>G0E}uRq1Uj9h;KiIwqp+!Cnw)Ul^t-~Y7*7x!hl
zHtFlgemOZ~4^Ev*mcU-B-$eN<ReqBze<&X(ZBow2^`$(^c@|$*S--H0xBe53_t@T>
zwDB+8|2@_oYEUpJ*|&7oK}e~=hzE8a$UKX;Ro1U&(#s6ebVsrchDW+rMrJJbI?JR<
z^m{tVB8OX_qN~yKU){5@<pjc0=d9fK!cFJ)*${s({_^1X+T7;B$qG)^?$ULL{u|uX
zrDtXGWL4gY$4PB=I8RM#vmKwTL!s*Dl{W2CIdgmZQ}z;;L$b$>$7_kZ%`|{}d%QX#
z5=n*c#!Ikr!a<MwOBkVNN$~h0HxN6y7FOW@!Qv@J%qkHZw}1=1o?)?@;F5eA$T1VK
zrKFro)5^5M+8#|uyL2XyYl!#98z3oBWWQa+x6~q%9D=l^8W`4g&9y)T8UY9$JS_|t
z(@r)Fh4sn*g$e=Y<wPQ81Z3`8$w7c@w><>0rpjD<<S=q;{af{4Zx@-Y`!{Txm3VTu
z#GAv}h%MSn-D@cxKLxIk3cMiLFsWBbbE6Qw)i3OnS}FRhU)VYIJ`W~$P)4Nx14C3I
zUE=*Qd3gi+v-*}zYcJ^3FWhQnODcywinmZ}iLajIJ%PmGF9&P*%}F2Tv~|sy^nw0I
zZet=c>(DVn_%y6Ny*^8lvoK(eB$SG9YJVd6sHKGcT^d$f(JWPkY5yn$@Sf@Ef4~N|
zA6V&lG{=0+8DjRP;8J^4Y>^gx5_t=5=$sSTiGvm*J7MQD<Lq9{f4<YP)T&DT$9XHS
zN#(VfkL>Sy^ESVthLH1o2RYBP^@UkbnNflFqjCdvcyH+WFFd>T6+jnDUfN%_M*Lf*
z11FjDutf0qc?4J^&6p|QH-?|;@=t2_h{eB<9%C5w^G2H+E3hca->zd;`KYoiY(%@|
z<g!M(#=mSQ^&qw(TO`hga((-3%l6ih=X<rc+qT;)ckQhO1b%y*5BH*@k99V*pT#l)
z>-~+QiGxFEZ?LDa>(0d6v|YJ*WvBWNI#0xLGx;1T=0sgZG`<?ci}k>p?#3+ibY9Nn
zoxAF3fyZS&c{w)sdE_SAyke#}olLorn>ty|uI!8Tv3{Y)^rx#Vs#l|lL-gB<MfF-_
zMx$De^yyXHY&zAi=zIi?U+FhKv(Q!7IdaoU>DJ@6;sxNUHjhRx#uwSX^FIyB7r64B
zA~zkHE`POO9<s^<uMnX+vz*&db!X0II`v+<QxJWoSGTJQKr%B^E7zs^FxjzvtxO@;
z#FCE`H>(TC`BKx0^@)XlU@>><GQYIOKKwBU>y_vjd(?BJ&qKCAq|c4^XlwHg6DqCO
z^-q^MkYxjepxY_J?46J${=B_734%j1BsV85$;lC)K547)BC}G52KAbUsL%F6V_oDb
z)hw*SGNwCj8rMTmDH|Qu1Z8oQL^$tosm9})uvME=snQmdZ1JeF?5k0jwquyrmc7Jo
zsW!ltlb>VC$S28DW9Jt(U*>BW&V;s6bbpN5PDAGYlj}Q6kjO0n|J&owKa?UGsh)C{
z*F_)cZG?tAMmLye_Tejkjnc2mPV>BYL-9m)KfPr?($h_3Z9^liph>qu)eXfDD7)HG
zPC6DUtK5h#ciw%bO3jTYk`XA^G8xTm3yx0>(s4e~J2bm%97)L987YO4K05G7JB@W8
zlfF-(rc52m#Poyj@~fg-+&TU<)t4)`p0vFsHR$$K*L_6y7}`MkTAABF((@5J6f<F&
zl>C#$(`dw8?!Rb|<~;vpqFzSWmuri!GRNr&(Lcmb8DWm_pGKNr@FdB-zx`cr_O`$5
zPf`vjw#XFdiJ4lyO>v?pG!<B>A*Zhc2ssfnZ-_!Th33qGSOXuUN7MRH=|yIU(mx}8
z3T6!VRUr|p?ExiqI6_u*#+Ld+M!cmC$D9cCY1vCp!@}636cKMLd+Ju3L1V`(dK&qj
zV|s<ZKeFZbFR|Ow<yU5XXIR*ye3yW<^?$r6W3Rfa&dR=Ot0}czjxWgWt;Fl~cT>Rm
z3-S8ffOD9O5~+u6O@l=%RXI<orvR7_qW;3vZ!`Va%6BLBQeDWDv4?CvLB!R=X5G{s
zuJHyEQHhBwz0{%M{$K2+E^qh#(Z<mD_-@*Lzm9ja172*Wrst)ez>gd}KNfoaum3DF
z=#fzRM_8vp{xv*j`tMe}ZD<Q}W2ca+!}vjwCGviGo#yx)nAgpV@2uO%;;j1By&%!N
zW(mwwmCr4@Owh04{8FyJWfeEiyvCAM@VSuBRDDemp9}SQZz|2JcXwSjJPC}_*SHd9
z!g&j%0M!y!I7+ztIzFhBwX?oWgZh%#pVjdG0xR`lF%PmgrK)ROjvKUBL3<T^_h_%e
zb!TX(JDgQpUBhPt)Sw#oxrEQOF|ujUNXe*BBpl1SifIYYuWehY(@-JSKP$I$TWe&_
z(C(=tc`_IEXXF|1I{WDqEc=tL?wk`kd6wbr97{arNgpQuSfd}Dw)tMXS8=R-ZKOeK
zvZ2FJo=`LV?U_CRQ!Z)fxN_oneE&8=yE?(b7S3+M&FXrlCQt!(ShRk=5^68Sn4o5R
zUdnQ>%y@lFZTh^FF-)vHdrlDtYB5i-#}xf%5+-P)th?6wKDl-5y9UWoPATe9YQNmd
zm*Bh<wPHT+hZLdh+a!N%M;brkoO{?AMn=1HEYHIJIt+C&8Dr&rxr$35vmlQprVA7b
zJ=w!05Q>zTIKskZp-4`L-rh_>`_)TK>CY!h*%<f$XrE;RZMFrOfR;Y-Xo?T7<LK4*
zhL`cmbdb<C;C0<Z!AtA)?ctSUzx;|XS@4?EcWdyvQ)9@6*X?B75?&g4dw5l3!|N;x
z`tbUH*n1Q3s;YDUKY;{-lAfSoW1V7+N*qC?5)ny+$Ub_gSSMOV@%Bn-olsAN7Ki9b
zAg9OO(kj+orKN*)tm1@-0%1x(E221{R*fV37?G$IPz(8gzVF`qoD&k%{%-H@xxeT6
z=TXkuYaZV9PU~IoT2Q1UcHbUMt8NB3oS`tSy0J9OvT?fOxR0-=m)@A;DVJCQjJ}a2
zE9p}KZTV*#W__vG1uTCy>mP)YR5zbE2&!|<K=1ocLR@}Nl+mF~{{G5j&|3dJ$#I$d
z70NH`Y{|i;0asQczf1aLLzbj}_NA#eP&36ko=wy5AXGpJwJ)q!t^IfMmH&_JzoV|%
z+5Ve%wNkq5zsrv54xYN%e-0VD!qeB-e-R4)uibw=Y*xD4e|1-BzPjwc>cQR3S2z2w
zl#E^H>uc=4HwI<q>&y1v0?U@O|5_mRUH9L!Y}2o^|6XqX*Y@9C%KtyT{|cYFJhT7I
z<T&vG=mkTWfCSF;Ng&i$$3yk`prI`J%W<!k!Q_GdGQj7LLX0}&^O>FLAO0i*`WM^3
z>1*Nh2Y)80ncf}!vy_!x=-&~rti}(BP>T|#Ce8XXI@m;kSY<rdxm={-Pzs|}33uae
zPU;a+LHJiuK$jv;s=+6IkNCtdoU?76s9&}xS?UKr5X{%f%<sQ>zOIMCi8Sb1X&3ru
zDc6zGnf@LAW%DJUH9OG3SrpjWe1*GF^Cc=6%+~~pIH^S;@ssNSBYsf56O`mGGJ<y|
z8hS}um$1|Qjkjs>L~6;Qe!G+1_$kpp)-3+>fS5fZx|Shk=Pb`1RQeVYjZgFGCgmda
z`>KFX2&0_{VOi(0hkY5*%Pwzv!q?2l)eE0E=9De&CT=y?nmXNg5&bN@@sj_?_V33K
z{O<Sf6iex{e+M6$yL4Y;|K?F(XYja_{rmJGg2(@r`*&IG&h~GWrF7Z9`HUqCN;}S1
zH~V*O#m?tzC;N8}MRu}(uUAsAe{UW_nEiX7_OvmsUH0#3Wd9ob_qHXugZkI@@5Ih!
z|5xna-1BIMTP*0cp}euw)RNSgQuyt@vEft8BK3!ZoE>*sel*!KHu-V2Ibe$}0tM@@
zj^~(bl3NoSWF&@}d0$L|K9=84_vp2ps#P?j?!IP}FOSwVG+`%Aq3VQ}&u!e%sB71~
zCCd;+Vi*!ju1|jEtZR)Puv;|M2P?cjcv;HFFJXS|@&)uF4;Q`bT4j{BYfYZB<vHho
zGxF-LD(@@zG|L>!d8-p0Bje+%6D|2QNX&Tl?evg_vt=y~L4z9O_o+MPJ0hBRzlbx+
z>IEZdg6;&7X(GKPiu12A%&NxdrIb%8;ETqZNqpx9!gJGwO}=z_ZS2rvPI3#{rDd#_
z`&1`<IEE)<IM$C`Z>|M1*~!N<E6ZZ;<dM;(8+&26kHPXLZdQALI1<oHNwbM$hJa3$
zA7XNpTm~&;UA$)xlBdpBPIiwzek`}L>o1_OSn{Q4(i{9-wLP!Ww#O{@BxZQkX!i#_
zm_=Q{+ms(4oqTpI=5onZnpmua_FwMwS*nRTpcm$myvBk?y;tJ%k+sXB2|XFoTx1JD
ztp*3GqxK05kI53Lf19~MQRMVl=GPFcP0g}ujK+}OdfBB0)C)kzvRDOoP&QvP$$yxy
zZNQ{iiRJm#i7!URPpl@SRgXtUCtt>%DaMM^V;27{Cl%4Ehn?(=d92^b-ju9e#7Rog
z;8#^k?LsHfQdGk;F#?}Q?H-N{jq^VpjS9NdyLO<k2Lhx#bd`-w`dZ@>VoJ5ncFI~U
z&EF0{1fIYTpDIq{D-g`G#U{j`j`ViXOy0)1kYjV<r)c<OOC*hal&k$wJ?xa!RJ0~j
zOX5RPC*YNrvmj<L(OjZONqFAI9$JQ%#0SL*)*~l5wJ7smefsex+5BYuAEK`de3|l=
zj_<oheRGnFQ1^BAUBmh0cTKX$(UO%d5nB@reEJ*-x_s;+S~d?~E#?nW9RUJ4;;Jw&
zP5|F0&OP2c03Rv?EE|dN&MXI$qr3iYlBV6lO_O@&Eorray}VZr+uqTfC-q;`k}~3&
ztA=$nS54_?uG857b<bE&hZl^bACsz+Gdw<`?hM%)khInRlBd0B%ZygO^jF<7;{!i9
zlGyejmY|Ga^+mAArqF^VL6Bml7OYdTvTO!5rau}SM3BT``~($Mg^4XQoFGU=<1A<u
zjn8J76=rCjAukg7)y*^JDT?2<!EYYpwEpRrKaLb;Xn6Xa68zQ$zjO85GNUBOO210O
zQuIF?D}f-aOYE%bl1QLmF&><?)(J>k8XEagPH!~4irx)M75))3mT2l4j8Cs%yP~mW
z{k|_1FP6)NuU)Lr92GMQb0O}*^eQz*roQ#x0YDwMVIIj^U(>A93Lt8&nC_;5<9D9u
zwM<?yz<`HV>y4WzqHn@(sf2p%R9Zx&er|6h1I@lb>*Q~%#3KvTgW->;=1Bb#nl&%!
z!wc`d(6rHV6joJ~A7%t)IY}$yeF300HpoIpPkn6D$*bG2md|F33_-n>7)^|mY~Z9p
z{Z$^a+Ry_ZXURE(Z2N<c8>&e5uF)<CT$QF~53|ICRxQePXW`_HYA;oI&_#iI`vQjM
zq#nPMEQu5PLSDH%!+@?N+B=_-#*z;yO}Aa3L>s(0HhO=)QnS@bwiz=Vc9)Z@+3F~I
z_fZqMM=*uE0lbcO+QeYD+D4|c^eh+>{)g3tlP8mVA~wssm!y3dp6BQt9Bv=%(R{){
zX7hVk^Pg6BIW$0TXK}FsTR)yuiT|{2`oO&SI=o86a3_J;OJN+R279q)*q8l*F!*p2
zKi=U7h|5ARjRh*|;=+AcuDW4)k@^@DlNi-EUn<DGauxoyc>a|7KA@}7<hdp3-<tBS
zPV19Ol&F<dN|%CS?tvGbrR&{CS4Ee;hhh4fSmj6Y<D8^BnK^0O9em4lRd@7X5vzPe
zQSq{9WbuiuSe{Odx>pt**HQaMkW%}e&#wd!6muUE2ijt-sa@yLM+D_Sq_cE;PwxU~
zGYfA~soM=WlMQMCYxCNnZQrLd|GbC#6wQU*w^SsN(7qhzBrG#C5*CUd1Xw7|8UA<`
zKeg<N*(zzb)3$j3VB@r{!u~E*_}FBDe8V&ny?sEugMn}5^z!(yYktE=%(gt{X*s@o
zdt!{XMgT<?8JR!R4)ih>12^gTJ#|WzllTmagWu`PJ<GwrVF{pv_v&^Uu$x4iyb3YZ
zsluaZ0>y--%X-{A^)ev!3xuN|BDG<>vmO>?Z}LvD-woavek1c2S*K41LL9g7;TN^r
z2JExg$0TM^pnskuZP>Doj=DwK!tB<}6U;Srz?$+$D#p%I)-~)_ISeDu-y;3Yp<<+>
zW-h_;7R%#Ret+0M+N#1Z1hqG`!YvKsEL1m=3g<-{K4ulzGjnM+Hhk#lA7S+bij&z2
z*+`g0LcAA`JMf;z5?M4fn^!maZY~X~^j<$uQ#D&9u*TxChXMbDG&W<;MOR9QjmjP9
zBV|fYa{Wn`?(K;ait_8sp^<X3@62amq@kQdR#C-d@Lz%b+SUT<8AeB#_d0ddohM>S
zq~;c2rMm+Oc>@mC3UHg-$6IG=(qSas+PRKfQ-`a@>ND}|sG4MdmcqJRbv`nlAEbSz
zjlekOSF<tBdr5*WMi+?{W!~8a4U6tn0Xyq^e;*}sK51jGnFs$HFE(onY&Pga$Jzku
zseh|V9r}m?mun|J?5+Ybc?CY9u$0tal~2!8)xX9Z1fYP+IW1tyT$MA)<5U;#^9?V&
z<pOPx9^tD3pwanO8*YQ?3vVDjqT%=kAl4su(KI5wc@Jo{7C8#n*QxGBngDNsVfjUk
zg#97*W{6H_w`#SvI1Sc`&dz)SyrK)@&#*S$LZ4<#R&9y8aXvZ-o(5Z^#q)7O-E@E?
zwLs~?!B_loyJh}4qsA_T%PO;2hPH?O`g0%!Ay@$CgJYie$~@G$D-45zhuN_U)N*PO
z(X7*E?{2Q8M|EZIovp<Cv@uwEpMKkXJdf~~-bjuBFK6iUtstHBcS#8Grx7=!YNI&D
z_ELK>S1k!qHT|WJu(NHS9$-}3uz_#Oy&H$xpp`HF)29%Yuz+y{r>SrH<v@paMQe-q
z79gEo#vqKo;3pJv3LDe*VDJrj4Ja+vXv3&Stu)+BPt*D*l-rUws%z<AT2vKt{fNQ7
zyXY28Q*zEMe!QE}(Lri$&!;!&z(0fE?f!|kQgqt96Z3|eu9s!fQ;P<whNH<~*Du3%
zeb_<-^(vU5^yynvrYZy6^#`zGZA-!Ty#wsqF>V@$vFzIPB%7(Tob&|Q0qpXv>{)&p
z`pWQJ|I{F?v;R9Y@h$1ca^lxz;+Ln}2W88y%!zN#39rw|GnepJ-JdwuZp$E9Wz1AM
z-Jdvxdw@IKU-;MLnQMf{QahDr&MW)c@=SkbF)Po^_@<A;uDOcc=N8@pKb(<gcA?V$
zU!M8jCeOs#p}suxT(pZkvmqveAbF<ve1&!@&%7@7{9lo0M(ig5{0e!-$(3iO4^WBD
z^2{k&dFJous)N0DF3%hapl0Qn+yA}t%zbP~=+7Ww%WmbF*}gn;=6L}dU);Ssv#;ct
zUop}h<(a2#^#4|Q=9OupkpC`u<|-~-AkPfNxlAa}JWCjPX1~eEGZXgcEYFN(xlEq<
z2ORT%N1k~okY{c?DOaA^9IM<cdB$Xj|4Dgfw#k|Ql037UaPR-IJW~!94dj{h*#dcA
zo*4_`9jy0<ydi|frY^wA$$_iLfW3Nr0L(VDBrD2T87EbIhh>2KHEBicDgYcy-Y%{V
zVNe{ix9$$(S-q$D#X@kq_Z&+JZr|%kU*cfVs~#YTAhFt{4<gPZB1iBb_o4?O)4pt>
zOs%2pu^OzAkv%GE>i#=|$49w>$AM4%AQU`q7grYw9=(IP*qP+<pSF_&$%7Y@hYz@(
zigF|muwX{)I0g==tJq<_=pzjW`kZYYa|t7*xeTO?SOxaPCTQSjUQ9bx!V`3OcY>mP
zVPlv+)0$|sx%Tc%O&xNEa-D{hq2#g2Rv6ZHLI@hZ#DVQ`gP11fbS^)9E~?_!n+UQE
z#g2D?ADGxXmCeMCEBYys_X$gtkSlh;Dr2~`Q?cVtM=Z999h*GY$LHz}gWbvxmlY!Q
z&CEeY?9fXa-H9Fho7OB8JLsHsdbo8oD|%>PT}6+fx7%<W61_fg)$fZQBgB@Zazu|C
zN^E$mG7MFuLApx%Gmf&|_z|t#9;t7U+;M_c<*!#Kxr6bEo9|BUm?zDm<PIvy${nRl
zQfIj%c($OE-0>fOhOJ$`bGhTFd4b$<oFssq$sOlB0CQerynmhKhm72D0PCFo3tW3w
zx#L!{d%H)Ns2sUthA%&u9!he@NHNjUz5T?OI}!|4nA{Q;0Of()QAx7t?ePn-$-7v*
z`buUx<nO<(-0>kjP3xbK{}1Gj4Hi|!>~|-3?7Lfz+~Fa2%u*-DYeSi^@%KnWjdjLU
zrE4Ls>Hn}94*^K>#p-Vmy<$iC;#kQStG&M!`&_@Ig!hryed(bk{Y56cJe`lk8cG%A
zL0I{JmlL0t6YiT6uFMHf55oR)8QnRb-2UbA!hUo%$Cx&2&zCWz-RU#*i6vWso*ww*
zhft&aI!K#7x$U?kc<Pgw<9HM2O)0%nGuz3>(dRU8f98#0YO7n+0-XqWyDB!N8wf33
zlxWXb4d<am`vN7}%Y4P$OS!f)GBW=+S^Igr@Z;;-&LZpQWL>O+i5ADHd?kJ$glA1P
z?8l$2)uEsBsu}FZ7vo?*uIVdyFh4;{-|87@SX`hcbO(~FBja>YpJS|nbQ5^shLwB=
z5HW~vG+kVRQGnknu-VAWb$$iLDaAXoFrR|B3)<wB6VwU%-yp+!>*xky2)+*ki`X9I
zgURv!2|kj`1l_>#HVHW;m!Z9m{NfHOblj5*^t?}EYf)sz`}8%+O&maXYmdl`SJn08
zn~{OV-{;^A%krh^9{%g?b?Qr`q@ZWy^rex01wHIax8CuF2XGRt|E%7;qG$Z%Psowr
zpW;I(j=6F^xj!NM_0h_|U4ygpl7bWkdvNOLT77=%&*cpyLh9%t1%83iM-|9J%OD3w
zFvm#0I=#rd!?@JFh^pO`N&J>S@4Ak+m)R|OC4Cq!dD?WQIMD8B=niQAm1@xY<1VD?
z-iCSW$V|TiwjePaw)SzBzSq-RRivE$^_U#IeibtmyjhgBJK`X)PSqPwnGW~PzLAEv
zfaW{VAO6lr!I(1iT#@?E1mY=7>~Y$&k#K6%Rd@D?)R(GS_wjjDvhZYOafe1u2DW-I
z8m+Wk$nb7qvg1+UP3#y-J607V|8f1Lwp~lpjv+{{Np?5~U^Be*t$|J-j<FYd$0%c$
zeenzHNeJxr<xWMp%rz`0#ZAWKdQt9Ef2hs`S9VC77{K#n!E;~@Ph*_VPShk$(KILk
z{|{8xm)Ggn0NcM7q%Vi~SqkvyjpNF$V0bd43Sn4aJB%%@mEyu}Tu0U`&F7AnujPer
zVg7#cC(7ghGz-mRG}HKQ^MSciUtI4RSbPIgLKc|uPg=qyxlVw*3V6Xv0zlrpyk~r@
zV6g$S<dWj^DVht&ILf`|2qnAVhOUt8R4(@+d1w}rbqfrCC>y*QmP(L2h2F=9iGB&L
zpMjj{jhzs#Eww9L59>v?5Ux2VVkvUq`pMp^B#vhkC1e&LyClPpD-QsBQ=!k3opl=&
z=w{5w<gaz+Uzy89Yrfqshn{`tIiPg`>`@T7^!9+>y2T#=l?!jPi68puv4#XU3s8Sb
z2QvCMj-d7*uC(G4_*r<Y(3Cs$LyH>moU-4|?#iB*lX3s!$T-KZcaC50@04+W>Xp8&
z1D_UnS(`#fgb(yB&!@?C>7%S~breO9da0BCcJ>?R&`x?p_Iq%Xmh=}$lRm!Ys~{1g
zD3&u{2Sx9v<?h(e`a`m^PFn8<0gT}?Wn`Ue)8@^DzUZD)Y5EMNl7ZA|#O4ar9r`>V
z2$gu@BnC2G>*p5dbFjN`r6~x89uB;ceBoInS1LIeW^?*V)s>p+q%T&0yJ_DNyYMRg
z^5(I8KHXoLejfcw7=Aw|ye237dJwKYYhmvnM_T+Fv^=Q8G2Q%FkGwN<w2=IC@kpjm
z-sGH4my2|L`bYyUJoTlxA*93UtIHf08wuW#o7q!1B)RuDp-*xVf^*`Uk|OW$yVQHz
zwVS-EOUMGy<NlPGX#VSZIKD=%G;`KpkMX%D6pW56KEUe7x_D-9w(sa<`8Saps24jF
zcN_;R_)4sNy>^XxW^y20WoXHDdO%3hja-v{qbJOO`-p$$vdP<T*L>`R7L`yPjO3gA
z@a6blAM1<9k@Rz@U6K~jT(I)HAFR-SI#qNc_qTh~FP0^)U7wDo=717#)A@EBw^sE6
zJEq|Eg+4yy_=Wq_SEZ?s`_vnvoB*lr%7#XMA`O4wUvoywTq5tzkv&2%p@0+<x*HLV
zd;4P!^2}2ncN}&&pc_rToH{G-12jqA>eOsm9jN2XKI-;{Qa+=HHWVoq`yOu*)+05k
zMao6*6+M<GxA>`PK<d7!NzWJ-*5G~n49zu{@1M|M?3@-epW4zGgxp%j>gRynBeR!9
zD?f_N)B`1r{c-=M<zYwQ1~u{7NX}D;Pb|%^u6(9;iE3WRTIkqTNPZM=C2Fm*7vCcd
z_X;~CRybrcp)a{9OIXvpQGorP4>MC+>XqZT$e%voC0WB%v<PhE+c2SryxU$3_^|xV
zvYJzw#?#Ap9o;y(d>3y+o(k8N@|Xgj6phR$jq5Aqqbi1>amFgwL=p!gXtXWNfzQP9
zB7t2WCdq&2aPaEReoc<<V;xLb8{u9FJIG#H8c96QR&INN1@W_v?=~w?Kvr0CvgWiO
z7CPWd3a!)xyx{XgZt<z84;CJzsda~LJlgQgq!Yn2&uR+Y;*XVlu%EofPadP>2mRz5
z5nmUYPYieH{eJR5CErN0cLe`*V)%pmJ6-V09_+FZzZ?ftbdO)2=@BmK*TpYS|2YT0
zwEmbe{$Ir}_if*ni(i)D$o0RAU-n`U{{nv5``!Ol_+`T6|1J3C?LPk^etD1rS^To%
zpWFV=@ykpr{6C9d4#;#+d*w^<%hoT#vE*j`3jDJCOA2+yFCQJ1#V_t4!!Nx~07G21
zjb`2AU*kb2#4nfl$s4Lk{+^#a1e_A$ms9-YRZ1R0vbTmocE&H&XUWPVn%p8c;A`X}
z1_lQIAJ=^_UMMVeNs)8t5%N4%;w;^UrLO!5BtMKn0|w@>Gxg;Txsd~48>*V?%NKx>
zpi1Un0+Y``-Zr;3-sc4%rG}01^CbH;DPIdd4E3AZXU*uIJhFtN25eHJsop2K{aa-s
zDqn{87(K^fgo79Gagq8VQXz0HJ0a7zZ^>_<!!2A$c>z!Oa`&hSXCvi~*@J<?s&WM*
zd8MeML7tq5R-g?uYQ6x)l>V-TQH{)e2cc1NMKKHy$E4DIH%IgTfvHVXEcs^>vBFw-
z!68y#tKL++#!)?^C}I>&{--{@kcDfn9B&WmZ(<pA>%-c&oz%nD5hqo6+}Ze1a{pXj
zW`yG=e`1pNn%6{Rx+c1-KVvC<pBmnqT5#qT6p@!F6Tv+1TtoQ#%p64r+{sLg`F^~~
zuN50$%;jKkUXi5;MmJX3Mdg#dJAS#n19`lODiU6ysVnXkTj4%YUPqjF2TN9+I+G>w
z-84^(9@;DB9)~&l`!a)1zT_<J!22&;YOJy&^2=2=26L~dJ7+{k8r~x=+JftU{(QS4
z<hD&1Y}Vv?9=#sCQSK9eIaEc_HhD)2N)Wh;yh8}pJ>i2?yw*wiUMsMj+J6i-NCB)x
z9(}l65<YxBuJX*~MZ>ciLh4$K&S3hm?5kWBY1k(dm(6!qCN7)rQ-d~@MI0)VvO)l!
zqpbLcv<la)&?doxKFd~x`U>;(TK8C!#v7?01p{G>w1xyfepC5e;vJ0Lot}fdX`cz7
zWI@|lx?nwg_{LHiZy-%7BS5!WypLQ2mm+GGUQo49G^ef~I>~h6O|B|z?}cv0TR;!<
zWcYqS!IntF;Tl)vw;~Od8gybMVi_*9elDhHk-_WW&FKhLX5`K9x&_;v{1=S}$4Y&c
zQ&}3hX(?Q3(f&{3&>VdcTYjRGEGVz%p|tIq4)@{mR<-F4oiIYq!M1o)Z}Yi|fx3qb
zPSVag4N~u<ZZhqi_JIIjd2h9ae<Iij_4w+gD&d2S7e$MaG*Y)&2=TUZ=_$===Nn$J
zrqUhoAtEo5gW4JXdH3xM$!!P6=cN*Lg635D0jeBeQ{|3(Z@HOa;se!k*@tp%`&7Ab
zWh+T?)>do0SGMuRwz6FZIbUdn%5ArI)A(Pk+Msc*P^oD$>iu4*;d*kUdk`Dw$70mB
zmRUC^q@&=6IWN1k2zL_utN)<F$?3z1MZVXbnvbNLej4drs>|f_l=D6z3b~}?x%4-3
z*FT!7Es74WMc4L})-;37cpwv7J*5glqdbU_{ww8XML_^F8_60;6w_X`(!-}W0bnQ4
zC*n3=zHERP@5q#<0UQDFO40!<dbN0H1EQ2^vNENc5zVvwddJ%IB~+JzPXz4rA_gM&
z%3PM$;9h2B(=U_B8wrLKU{qj3)#ByD;mb2{NqRC>O6hx~reQNVtFg$9x|jCF0Y|()
zga`E;FJHAMF3!tG>CEm3<BHPr_{stCpP^6@*xvNpEQrr#mkIQy7|HLt`U@lpgsbm<
zDTvScL?GpZxZ~C$3clf07}yW^8Lb2`e%2O2Vb_LE$!(|`FdM1bGcX=Nmkjq#lwzCR
zkb(07<V+7D)=T|kTjp?NV{n*)?joI!8YMh5*%XA+dvYihYEkydgqNpx&tzVi-j)-N
zWWvkSr)Bc5OmEGEm#4qT2}kHr*l0`8ne>S{8TQG^Fqm+TKetQp=Y}6V-~byeT;Z9s
z(p;zG=(y3*OowH{^IB~bJ&Z8Q7T!zHnxstFPkv<F&<4l7QA&}Rn~<RG{l%VC-~|yt
z8G_s5drn5*%FccABLU?m??HnO8p%+V+HWGP56baWw;ZTFnfdniVEV&({CQ@0E7Si)
zxHCVeTv9~E+^;#lTj@sU@$LE5<7@s@(BA#{-f#EE_naZ#uN>cn49)w_$Jz0{n1O+n
z>GueCGd|v|(StSqy7BhBh#GWpqLadOpB+BVN$tjZc4AOhkqHL1VqK)(fmd=Cv<Lhi
z`o_r2X~nAI4R0Sq*dVpk8F^y*njE(;o^>v2a*{WcPh^F~^^Frsm6Mv^3ME#&=-=*g
z@|)X^2frTcB!`VoE?@U{^5yH^e16+&z2keybxgx*p7$DF`~ieFzDs+e8r;0!Xo@GA
zz<kR9l6LM#odmN?om>KRiV~mY*FNB+cA?Ot6mpV9@#m}@-ZR^N7wX4wMCNx$z4e@w
z-%EPDQfNaxRgc@L(pX_ma~O!xG`LC)NPl9Ei^8xuFAGNLU3@C3peR+q{Fu`e`CO|3
zDn+$b$|}y`QPrUFV(y9goc9zsJs%Fr`>m)xQHVf3nCwk6^c6&=9ME_b^uc_B8r@$N
zsjuln{uE!SDMg9xyF?m>_f^C`bA~Z97(plZmOr;h{nb+c$^bjK9;5O~4o+E8<IqG+
z<A~_F+6eCRNgG!dtKgZJ&kH!1<V@7<M|t{Zh|iZ_26<Zn3MfQTVrwDCMYNFEO4dgP
zNaxqv8O{Mv{RQ9f+bs&nBEF)0E?qpm=(1Rdx%^6<%jQ4BHpvFhy&5OfKNtMEL(j)m
ziP9buXO!ax4!v-_l{H6h4<Tt(C$25+*La)tvO4nU82IbHbbJzR_G3Zl>p<&k2YIcp
zcl1|ZIVZEut}5|68`5y%zr441;CNRA?TxFly?y9&zqh-7X>Ui+0|@INI%_YaGQ9Vh
zO`W@4PcNdWO9zGE)5`4VPvD~&;~g*7XF>_=HMakpz4E3-fe!i_k7P7rT<Xt*3Fn0d
z#L^Ak4`GP>9jh{wjpnZi#kJO)YB8&?Ju)rtj)8m@xao88;bo8I+)%!r?4faUeNPsL
zN6Hi1S@Qc$Qt0W=;`j6csQF9o==qainS!Nu+$VewfE@W;@F8o;;O9&iVZGOyh^78)
zj55+{#2%)Pv7roB81Z5sJ9R|#UKza5KrTP@1W82df5;ozwCiid!#lP|?PigYh9LrA
zgu*h$koTB`LTu-OpIZotJsPOJt`Ge=$%12fA|`$XU5mOW7R4%1^Pj?_R?8yw>omN?
zXC;vts})+B?g{op`RpXuIM05leQon6kMn(d;h!})wr;N77)`E<CR(?j<QA;7M%<%&
z#PZvtm8;@-Dx*!rlGw3DTB~LH=UG;XBlOMj1MDast15I0`1<g*1-Iez7E5&03X-U}
zxc#3xU=GHU*(4d5K{jvI!1OHFc1^{SU>^6=f~x8=XR&(4UzxN2R;|%o!p>kWm@v`)
z3VNG=s%0XxE@3plMe^lCrGc3Kc~cBmDZ5P!)Ife2#)=qATSizq0>wqcwwla&yRN@2
zQq%mE2|v#zAV-m^$H&jX3bm4;zs$fY-;Z-}S;dZ#NZlT}p^emN!knAczOTnG%JSkk
z|A+?%^YUvoF>QE@BU|3Pm)~jGB{bC9zPq-{;oK5hXb3ZW@IIO%o;QyDv9eZA_?rQk
znXB&b7rb|Th!9>dd0xvjri%{&@4`R&*zCs5`FY;&=z$Wnqd0i}J^4UR<G}`l7_s`*
z@Gt?+Yt9>&K7xX?1fFxrt-thxu;!ut(8A0@Cg0Gwo&DGPZrn19R!!}AVTd%Whec1P
z`3HGM{%0HB6tTzvOR!aKM+i<Xq7`*g5#C{KFuFg;wu?4uaFap77K8?zucl#Za^#W0
zP3h+`HOk5(^naH1UhGEt=VpZ^I{#SMe~?wCFM!}GXYg95Vy#A@>GP&fMJCw`i6%I2
z`MTvssobwJ<)W$m(l5|>G<9BC`e#Ikaw0dWlFw%4DgWs<gt(=kzsuPAD{DVUpPe1Q
zmZoZlHLH&?w>~TZ1;7k1hY6N?f>dRk2wmX<p2dGHSlWg^@eC|Yzp^VVjrj+#^t}z8
zVCgjFV^eCffv3TMk+AWu?TN_}dvDr>@6>IkHgbmhx|2x#i$Hs1zE9q@-?&Elux-A^
zo{>j=;yEcP|K9aRt|I-eC~6$lQ_W~Mb0fG^*#x_SQ^n~ovxAeP%9=+F%0pXKrKCv1
zW<#ROhe><0fEb~C;Rlfb?RnnZi4V{CnP?J95D;Y0Mj8BM4F{qg`(4{Av692G57_TP
zdGx4}`aWs|(&2Y;lT-1bH|bsG3ftk*5@VX2O3no%4UY#8y-oH`gzeC%iqsb!fG%x?
zP4n42*h@WWVM#fg5w`J8x&~K@PC<R~*{Zq3Ezn@swccgSF768hGRSK5<96ETNA%1K
zh$Q<l<by;E+duF*fxTHPVNTVBDs(jka?t>tP|Ny*VDWO|N(5BS^s$ykBPj;qUV{Ra
z5#Odif|_6b$%=|kO}qZ!CHM-LP7E!dT2V{HEY?_zV+<riK5(?mr8Y+zK82pf+`|`q
z#IY1-&?ls2f8_5|_k?_Qu&PZyYx5zM8^$!d_A(zHd(_l#cI3drs8BC6vOS-|sZns1
zU}qT}+m5=$;C8sbp}tLf+EB0<cB7Stg1g1d!d&_baPy@q79zAmuh?Nf$pWR<JK4P?
z<MUq16e!_BJ;LyZZCa3W7?fOJ*fr1Gxr1X6+S1IXdx73~^L+^@haYKt1R%o;|9O~1
z7at-b_3(-v#vvo8Xid{gMOra9O3v2}ic66NLwp}HK>JDT0Pdar+hc=yKFWx+Vaii4
zN1tQ(p4QT7!&z*R9iYB`7o*<92BJg#N{lYWmdkuVoMZG>eFJ`cWAgskKxuE@a!_Lm
znh@31e+9kYZw2aL&fMY=dvnS%*5-L=zb$aW?ug$~&Km){lHn_dQHBL)_m66qnoIR4
zu6Y!GUO_QJ5+P;~NBp`d4C`ij{ABDep=#+Br1)^g5&MwzsYcuCwB$~T_YpwSYrNf{
z6h7CAmF%AxwBHZv6N>as6HCLU+e@72&bI!P?AA|}B)0DwY0%9Up{046oYV;zECdq;
zMzaY^UCsvF<m9t1%e*xWT6ffUJ7ek{7D3W*@>8ecY2Osp$(+1Ed?GXVQiJsKMzIdC
zU=y9xWfbXSjF*$YQartryw}*gkUMg)EU14)oJ_?UQ%mGGhw`zL>St#jp9g0irezed
zLwEhQkkLFQCnKcM-k`C`w3LbCHM8TK<ip0i;A95w)LZk86o(tUo8c0~_t^~a*q14V
zZHlDf@!n7>>*8&Hix7j^jJT(m@w`9cho*SDouC=5FPCB>qXLRF%+mOo!f~7~NU_oB
zXramwqo7}Z>EC~pq3fziLveWFV&S6~zdB$4ITe_3iFF2MB|b#%U*;@~{<>WMajLd9
zQooO}e>(^h|16{)1g!?U79Uz1E*Pfj`5L6QBlB;vJ>m4ci1!qIQu>79+>rV;*9^*Q
zJR>ji=uIY+M*3Y{G%0xofPYg!TYVJWcn0(}P+jICm*g4H+M5goQLHS!w{qEzLhnS~
z2qI?;6ocx=@vMm4+z-m|*G*BmR@Pq`2b<UyVj#+ge6tVa8EBr8!xP7(S!|u&l3~u{
zeZ5Uh)s4L*Pv_&9<-~lF#^4IP-?>HAjVJa*eAYV@W6*V<NZ?VP{%jdjs`{0=th(;n
zLAX7^2W+HXRDkRQ$vIH_KAIdeP)YfgiStI%tDDEjUF@|3y+h(Mus#ph#h2pyu`PbE
zgZ@EFtfeb6w;!Va=1&YfNS507?YU6Ven}J2)KtA45KXiU^F0hh|6fC3=iqH%q+u@*
zGqT+<^H@JfKxnS<x3Z5~L|gV0-fyBBJ-sTa;SkCqaq$5ZPSY0i;mSWB4DnX_WKfdm
z`I3;YjT(~Psus#(gIi-2t-^O2h<C?D!qjb%h9k&quOflW>PeyVVXyt(UJPbSk0n=_
z4z9iIo8a2V#9O*UixFeMwL=d8*Pj3SHqJAE8MUFm^oUK_{oocy5#1N^61vJ2;w+wW
ze<RAPpj@VX?apDI@gWVDP&_0I)v2k&w9^lp-HA99=Vge4S`CPU4f6VrL*g(^9e&vm
z=g@z&Tf?bu({P!nF8g#AB~w*)*l@6TRDDY*k=ko`;jstzgmdh{)^AT-*q8H%p{8sN
z-U8DNj&h6OV6rH^Cqd}#JBT1*&(-TIZ|c<RBeT7po9(rN-rqh7dwmF*7H%Pq={nx;
z^%b1wc|UnuIIP9{BRr+pEwNqc&9IV)H$OtK@LpC1xA>5@{Je$eCRBbtVW#^AycT&2
z|8B{<<|O~dPrg{mxBAH+us~tyX@2r|m3$S+;m80~F%25v>>E1`@Wu~i$loo^Zfg$*
zxSC80`}j0_!Y1qBl|*?-A?bwy7)L;h@=qqe`?di!^xifF%d{cN92X^R{#r{*|B+`Q
zLy8D2c-w!ST?bf+KbHww-WMz3Lksqx!In7Py4A|OeLjGB|DN=l?$BoX!@M<6&ij~0
ze&BsL@Iu#0s5w0r<_eDgr-j~$uo>R@*6$p2?)z0MH?2#w0)3gU8H<t+Wte(A2=WJ5
z|7ZEM^l+9ghx7{9Fx0f}yG5S}Z6Vs0(3`zCGD!~$qqv33Y#$XLq}BQ}AWEO_=4hW>
z^NI3~u*D46W_QE^|0GGxa;j&=c}AYR#T(Mu;ureGr>gj-S1s4j4VLQ<q%6F_Z=}Z0
zHINi<wV&%4zph(z>e|oGb*Sq4lX6L0lpf4eaCGoy)T`b`1E<1uma%wbP780pYr|+J
z%KH&zDRF=R@Nr6bH@_U7Pb^R0$ni*c=Cn8`ygnz~2aR@^=YX8>wK?J624QTi(@$i>
zAy1N?my`F&obcM5@QWxt!*>3X6aRQlxHqTLVV-4~_?7AQoU-@i^nG?VoaxIU**I=i
zW$UZRsXLyNXM9fGN9W`@HYa>~PI%XxI`7X3@0Jt(O-}yZa@ty-)7D<(@8-Nn=X@Ao
zm*gA>jn?Z9-hJ-^8tm$#3<K85U&HTTg2HsIFYB{GCy0NuqeC7`GH%{3K|0B29Q@`$
z%beuwVf^GYT(e4MQsx8BPE+h2qgS(my_9&o1ZW5OZ;^5>)u0OWFaVM0M<Weo5k=oJ
zMOz|OH(5VCOtYWy`CQMUI-$n*R|rLJWv5S;DgApvMNX=*(MmMd@e^t2smic9ck5(s
zLP5VbOmc^}WfN)sQD!D#5S{T}S3+veIE~CThd##WtL=t~icG%Fg<atgyaMUEEyICl
zY#0~>J@*CGY<@+^YbQVr9Lslj|9C-L|F46{C`5XDe|DpPF~6k&sjGwYIpay|?FvuQ
zcY@>kKi2$Y)v^W(D@m%xpTKr?X?WqZeW9rJ-xeo;P9pT27WmP5<Q8Q*^pRQmai^6q
zlC=uTK0WFC$#IUMWXe3F#8T!bC06-G?Kj)b691F(vnM#l^Y3(vk`FH|pv}M`5BFJ-
z;TYD3IewV@JKNrC_t|)n+lsvFTEWwmZ`9t)1X&ksT;nO~-F%_sbGsE!ZJKHCPn0BV
z!yZG)%raXtjWO+-O|3v=ex1zW{pO)Gd5ylGs`LETz20A4Ucyb6^7{qlTTI|Fc+F{*
z;II$`QgwF<__E%o2s&Y6=!uF9!4x8V)E{g6X({TdqEr<U<1>mpkVs6)+anL`g6WaJ
zK7nt(JMBc!??mdBxoS=~!G2}YqbBo?`b1~RJ$L(mdPS?^S!d~I5{lTYuA?<?SRJX~
zRn4N0MPB-Gld?<OKBEDB(8+GWm9lz^CPwDPr+Ujk4pie`tOBln<Rqq*=EWzDl?mFk
zB3(^ta<*)7QW*Fbtc~KzhD=omzp>P8!Gq-5nEVhuW-6aPIw3}myo{0#O4==3qLt6a
zF*vZy8Gco_(>8erJqM^HnlY(=L>-TRALn*9!gMv0YW84m@t*x)ETS5YJG7>k@y)|Z
z#s99$<$qI!-`}feUivOmpHxX_?Genh7XYtUKq2;kKu@s6QTgQ@=;2*S00?LfvV<+(
zIopIxxKr?kmw&JFZt?Dd_J`1kF2|iG%L#7?#Q}6V5uo@K)nf2IkdSvX*nn6+arR{L
z;JRy;<6d3IninZ={>Qlb(%56k+B)ydTGdih=eQH*Imrog&4>Q4#8V|}=8`ar-+BBt
zG8}EF=9*amV@;zr2T6C*J=H~!Dky<gFxgN_jCav9)_L2HsjKJu$OvE%DnX%Fxc9fD
z-|}$@iU6GX(U6n?Z(Bh+Jlq^?SqdW{WwoVr+RQKBt^WI)`Dfyx*yNfRw_%EYk_|{G
zyei%jN1y9QsiHc+Z>3~0H0=9IV`nwda}5n#JdYrPF61Fv@j8Bv#r|eYALIsW=31sn
z%k=H_fCUFzP0@Vl$Y;{CFvY4WtVS<=U#p>4yzE20+n;^;Qk(AEB|l#G)^%^01{g_Q
z{ug@e2?i{Yb47IWSepV;qbVyyDJ3X_8In3V$KJt7?T)5KRu2zRdIgtgWbZ&h**nb5
z$b(R}WWxe_1vE~!AAidWvZhS*{$?DgIfu&-m=dyi4;ksQg#v1~sw8v54X1OcZyKMV
z_VJ2!_bKWHy$J6g`40Z-rcKO?f3vx2A~1^vE3&w%>NvdN$xf{Qa=aU=0yd!iQfcif
zR<K>QAz{nuex8;Q;reMF2=yjEZ_vF!>3*X=6>TU}jPH1nIX9SV$+*6RA{BW1_<PAO
zI!B8ebA!QH+cJV^kUBX1k=9@I$)Jk4@U9bAn|fru8X5jbD?d1FmZ~RkDVJEZl{ogr
zi2w=Fc;(rjG(`eamS)jteOqiEEmxu$!sk~SR3Kv~t14dTJ-voDZMo-6qn`M=iRqlq
zG)!VTylTLgNncNVZ1^I-KNnknWGIC@&@Ctjd|*k=)?|7e+(eQGP<^0_IjcREwBk;s
z)n5BYx@&Wonxo0ntj1EqWTb&;&5WuXeb-|6BlPDiWhaL05^ZEQn(Y*OpRw~!nTN7W
zJ7p{D=c^-AuI0~f4t9%b4%?}GIX;=4;^Z%dc4?2)sZCCtc7+u2W*8<x&&B)W&E>N_
z@WfCFhvJ~)5Wr{m$W|T;W5!sb-D^?hF&V+`X{p+3BdFY33m2s=Vawze_BGwW9K(_B
zfW>=i9c$(~Zs8~Bs?_OaP=nI`T+<{>xi>(Acd9DsgYe=q&wCP-6GZ`v5rw%G{RuU^
z{FD3`m*1E^nNz0B#ajUF9cv*zSJ$1}70<o-q~Hy72;GCQEmHBo6UcUXC-KqUbVyma
zY_2<xJk~fbl7<c%Q@(Q)J<R2E*Qs6|kCc${R7A;VAaYz01+K>tGah#J2i~xD1L`S6
zIEe)TPkoORNQnhvrr~~{(3j6j!(oe+j=HBy0yejbTu6)5KSW00YZ|qnS{R%B2R?2m
z&BEm#>cduOGz2<zVp*29N9v7{#yeZeRUy=J(D(P^HDgmFvC572Q$#Lpe@kQJyy0Mx
z=HhkyEbOapxkC>|{zv|NfGF?Pzv)B)lb=hcq4KnvIVqrT(FCucSH2x-_%T4oK2n?X
zlDV{jr;N{cDvj&MGF#j)cn|QFkT9F~)%YA_gWKTEuuq@n3|}N-)0xujv097dyJ&fG
zt%XY{>a7vi&Q<mk57k_4^1eZ5$(N_J%MHu<%<4g;0}87$Vit}GtwZLXvx^s((Nb!j
z1+6Pwv>W8rhZMA^C9-~qCi`=eVtd*9@LcQzBb#282G`WH=W!y4g)hhfp-#xR&^rk*
zB+omRM=x6bXrS_xSsqNmI^u`YP<(ga9ZLPHFic<OpEosTSQM4UK+eRgs-@eIns{_I
z_aKi(=j)&pc32$xe^t3<%yHJIC;B#8H^$NUrQ?*xv2IlPUHv`NOrFh_XCf(6Co0c)
z%QMc;Go3tYathNj&X#Yx8lUJbOMvw`BL`ZTKck%K2Q%kJxHt2urN4-`Lx<8QO9d2A
zJ$c#R?i~S2PREGO@PF`w8*=R%_|Eq$t=BN~>V8_n${UYrJVB)49H}HXJBgo`=2b;z
zH>YEKXYCC@-&fCyR9rnTeK9J;NwWgLO@9*bpX`LB^tRve`Bqmpaqft^Vs^r1tEBIB
zi|^xOq5S=`Qx4qAc8a3BQ-kzA1*9M2r;qi^S1En4(igB<0#7%1^|AVXtNfLI{(b!X
zCCZ;qei+KSkXv}e+wWQ?%q{FmsyXF^5B7b>8PA9iyc~U{oT#j)o1d%N0pho~pZ|&t
zV?$r6{gz0gTRgpx<bXTBL$-A8i2qV@s>Sv@Ir8Dz7hdJ}tHIAPO*wu{4wC<<<g5MU
zNq+M8m3$J(s0)62gje>WAFA;7AViTPx%j>CTEEB;zsNx<@=c0(yU{PNSpVGFoYn9g
zu^}6tkrV!Q5Qb`||C|&5R8F`p8-|me$8^Jnrc1K1>34F%c{$<Na>8X^wg+!y<8x~K
z%X^s|IkC58W7B`m2@lAr?}coA>9=#jJ#z9ya>8Ha<as(L{8>)8DyOa7u-fwdhyD-f
zj|!z%$>@(Bl{Ir${ZXq$f0h2|?gxd-yU`!b^7TiveErcwW-;Bp{^-%y1odVL-&ue3
zJ0<*g^haZttLAR>M-MOWq(8bqu224T{n0OAMx;Ob)Ym1n1G4CHhV}jL>5qyIv{BCz
zw*I>MqZPX%|8Pyuocsq&NH_YUIk$<P%QZrgSIyqtiT>!!?|0E3{g9FYW%YUsC53#$
zzo0+b&u`spftq|p`EK<`Z`^9V_h#ml&uXpU)c<elkH#a_XEx0O>5uF#RW;EN_Q^0L
z{|)_74LTm_j~sVzY=DMKe{?;!e!J5j#dH$*W%?sFka#_*ilQ<IPD;;_+KK)MABD=C
zrRSzUdT+@N`lE1!x%#7*IsFSMR3R34P^hgCEic_9E+r!$7w(Ax+E?n2R#1qJPt+dp
z^+!kX#$i|e(cW#sKBhlf`dj7Q;{EukEOce{N1x=WM?29Uoe4I8y)_*QjE@-PFV!FI
z@k7mLO&#Put3Ntlv^!ZdmxTXw{n25Itn+Ja!Mf2O{k`{2^hbZPg5Bwll9tj*e{}OL
z+0A^ciCteue^ll7?O)L!ZIiu7XQ{Z3Y0@&i1w3c<N1u1AKf=<y;uT+al+W$qm)eHz
zs5dg1c=&*os9<)YH`*FM^h@+cH>hBa-pCh@7ufW7r!Trqv*mW9FG?7x`da#;^`G?k
zpVAl2c^rOh*fN7=e-OsMBP2Azdz!A}OZ7z?4H!awnOLx|uP-{)guc%DqD2x={|EI&
zbDm(RJJA>Q16+617mc<4>`Y(u3~28EmcHl{$+Mkz#uSuU+8L`@zpt+^3Y4nH`1;ar
z^hK)+0)3IZ4kbG2i(0-?U(~XLzUban!W^N#=oe;u2g-8`m$TDDebJryzP@OEXMNFf
z(--~Mxg8xpfkdJ)LQ#Yi8R(4yh0#wq{`h+Oq6ZcT%)d%sbg~f=AM`uvi*Q`!Kj`tL
z`l8?S49brBq9W8OA^Vci7hS0J%;<}FK^bCIr=!<cYJsmWI)^9KbXnF{6d68_`Kn~}
zHBsHffQ^=-XtSP#E0d;3t`g9O9TrOs3QmEdsqaF?pNWqdoBEC}!JsAju4#!@QO>D+
zDgG8JqMI=hL42DhOA)DvLbB-&-6m1g9Wj#>?}3HLx7CSDrs3GZ1oBW%)PkPKSjb4j
z1tNvvp-s1nqLmCWM^SW9t9=d<=0I;W!9I7cDB3t*Yv}73{;=4>B^32;7kkcD*awBA
z6h-eoW@)A<iW++J6-C?7QUU|HilPbDq3#t$RjBQ*#Qv1`CazHjjc<7Daugx5N7gx*
zuLpSr?Sj9bcF=<e-=wcVp^@chtn(PNN*}T>Lr7ngtM5Sn@lDKdGW-wU`}>t=^&tVz
z73f25kf~ITKIAItJu>gvfu1fvA9BIG^c<;Kq~Az>4Br*%Kg`HwL>-1S-er#l`VT%b
z`j1_F{fE*|4buC3LVBqG@XJ>zeQ=O|m(l}$$Kif@nbP}_4g-{<?>O=jkb_&eJE=R<
zciiw)sPEVX7BSR!Y+o(rY{V9{Dc&LCGTh>nk^_CmqiIXNTgkTqF*>Jki~D^_j;nm_
z#B4uDqjLOIFy$6MsN_K3aiyR9V<rE9WMAJg2o6M{3hyBC)^71g?jVKwj!}M*@2JSZ
z6!E@Izr20)&w@uX`h)awoPvecGB@Rf2L)m2J08r5e>o@oSvCwyA!4iC`(|U)ALoR>
zkrQsq2_KPTYqT*NpHt(*FJ=0e6MJ_yHvM2u_<)@H-p<yS{wOD0l9OkjoN%9<@T)ob
zd*{S|H>a)Ku-fwV9S&E%Lcb_Bx9-fjs1Av$GXG}Fv)jS$=MQdLp4nf@|5e`(4q8#;
z_LZ}iiP-T<|C!sKtaRFC0>mSAdJ?p&Ay9P(%I2M#0p0SDK-Ayg>W^-oCq5{0Qior&
z9nNolO<sLde2-Y_z*GLoZcMBwn!pJ`^*J^^*_S#X8L<BA<TzY8QX^i}k2~O(F#j?u
zAF}{(C1XSwlkyaw5R_MY@WdsF&-3GZB|h&F-<R32{Zg35#UfA02j9tKG)=RdR4?M%
zUdipRIl*xUU@Q)9&icJcvId>~<J4Z}e}{Q$f4oEbsXL@ccSs++Lwdyy=^O+3K6NIR
z1?OQMZ~5!h(dz%&{M!9~d*2gJREZBb-Pt$AsiS+?atABo`kUx@#=5&Hx`c7*Z@=ud
zskY*9{1k})5Q>66Ya_=otyg~-+kdNnktF`M-4(0-d&=pHl}TTYp5uGyazZM~`E~Jw
zU-ZbUJ+D1+NIuW3a-yC39<P`VI9jp)5x(Q@ZM%P4N87DN?=R*=W+e`gHgexTv7(gq
z%$n|}-(`h^^Y0a<+4$wU7Rnhqs^fz4j%Ri6faVLz+Zu_(70dAqE4w+u{h~D<Uiin)
zdgR4>+YsV?00|#HNefJiNBA>$ygVaNDdxhA!GgUg)HlDDv&ZAr)AYbobLbxt24AV;
z(<Q{)df*j|w^n(scO<)rB{5eSMa~x9&UIeL`FEeljNb@vC%*V$WX638UH8QXIHt&q
zCH_a`V14{aAIU)eTXOREXAFCB<;6$09cuO}*L^nfgb0~l6;xjL*|j&{!pA{Y{IibM
zPYWY6Dy%p>)U(c(4axRIbAhw;{awlX`x*IpQy06Fd&=E*$>{tK7amR}uq#n!l^4If
z2hnz<`j*G?x!EyP^X4pB{^`JTe!1+^f%4e3^nHAEt)<c^|0dr0XLPB@jb#o>3w~|M
ztv#Fyq}BDY<nrpI#|@m=nfa0V{m(qHv2Xrpx27YpvBwFG{qvm>8s^l$IwLgpsj<ZJ
zL2Ts4J+-6#C1-yonhKMj>a4Nk1ovndMB40MZvFMDbGoUSoOueXPlrF+DkL-dT-x0G
z+{S9SyUvvwT)9(vxswaipF?zl@80g@Ug`Jwc3ck5@Ev2VfE>K_4`k*eeHLvjfWyjL
z#O&s!|AWs(^gd6X*7MT$P<)Y67b`A}TjNgP2%Z&ufPZ&zfjyNi>$|_6x-ZQ;S$ysz
zPWc$tv@8c-DK*0m8U9B<zAf1IBO{N_ypyIUanjZoo9vRhzjes(_K><I_Yn;^Esr$H
zwz_V~z50&Sqd(%#$#3rFhb5cD{^~o4Wu?qfWZpdUeiE=icv$JF-^}A@%d)(@ru_JU
z`mM{W9iiX6d^ytX-7!mzxAo0-kavOSH`)GP+FIuHS%3BfRef}3|1Hoow!NC6UkfyG
zs<A4sb|tL*J7(csJOKOih5cph;10dcH~PMUA8+G*A)e5Noy^I49uJ@mVO7F=af+Xk
zo@z17KU_{R1s8tAdQsO<zphf%RsC1^#c==P)sNff%<QMEKo>7JPU2@JdGP^ytU0*>
zid5Z!Hzi)SF3YPu0_4eFddvLWZ1U;kz|1)=er*$SS>r*kswV*7Q5T=JDg2rVmBHyp
z<x?U5WA9;r*(!`L&Nz-~Mk>ODwIzO!h_8hBeDB`7g;2c%?+sa;m3}EKQ37`}nNpaU
zgr%;elnCf0pyTXXh4`}ibm>ckx)?kDaZ?2c4rEmv9phq*n|#-<zHqatgjk~A*DWUm
z5v<f)QD$of0`9-+$VAYT3>YY$orqnT2u$mZaC?*P@w<=Lik*~}FxA9j+xnai`k{M*
zd1_4c54y)1AU;MGwhm1`8JYj|6}u`tC~|$N5Fx>1BG(^IaPX^*>l&XKyjoG;xuPVR
zT&BZO9AmscxH|DxKCgJ!G_D(+e5G-XpXaFR<hm>N)Q;+J8JW;M^7YwE|8Nods`8$V
zvGPLx?=dNPT=`(_xYnap^HSk+YJRgcx->m(@ap8}N*Xe{adqPhm&a<TAHHaC)9B=r
zjPJD3jZNm+t0wt!<MYX7UKMQj_#bFON;C5oY4{^GGI!v@*0YW2sUrf16J7}&+bzG^
z`lz0ja1>sGcc1>%$*q(4Et!<)IO0mzyGx6np?~=2<-Wo~bmH0%i!k9w5c!6#FboDr
zX4S8*1<d^!!jC^*E#DuOUT9ddxQxsTIo49adcWWcx&pBEE`)>|`3E|$^$PJ5RP{Uz
z++K~3%)*@coj>bgK~3m)euCxM%ja*TK?&beXu5-JzTf#8>bg2lV&cj{nHLqt(InA8
zTJ3twbUd4vmYP=k#TDJ|Ok0s&R5*J=I!;|iM~&EI@vRSE2K#by<<jBQnQ$EY#ZtYX
zaWO9c?iqtduzAi`b6i=rBXaP|$mpS?;T$TLO~o(|m!BKF4#uTr5_8I|-u4iab87(V
zPI9H532`dFh<`)M4{Vj1^S#&pqB=KuFObXVQno)lh;$yKK2jVSHwZKQ%hgOjI7Ie=
zpdPAOS!S5w*Q$!kZ?8pDhn35LXt_0WHq~?Goi3%PD2K9>>(RFL)}Hk>yF39tjJX9{
z#Km9t!CU~EE!Q{LayOLUq8+t5TDcW_9)i)wG~4`vCq8TY#w<BXz?Dg)9`jWQNJXsj
z>8smfF853{LBG3wTgO7MvLNUrZZRqDSkQ1Ve@o^;c$4osuIYy0_l34ng%jH)6kJt4
z1uhh_*i;%Q3k&yC0(bFesYXn{6u-)G>&xqiK*7FLK2jKNi+=%Cu{Kpdq8rq^;3JB=
zdbGf-(?M6>MuW`4ODrY0QkELWGE||wlL1ufdmebPKY_yQl*vlg`&Wp^!Jx#F>4oF0
zouLo-x`WU8@oE{F&9^*IX$slk{+!nC&P7(uvT5Q!_UVLI!%_(pfSR^GVE{$W2Jb0^
zEkVqm4EO__uOa53`<YyCR-S9glU>^^e4XheSII#bVDIRet{2Z3om_TWWd4hwr?Q;Y
zj)7neZ4MosT$(|z=1RgxuivHLHG?-+H$FW&+1B`ykBz@moqSEmaQQ+~L56Lh*|+lt
zZyeqDS~k;B<neK=<?4cCW8m1+!Lg^8_wZ{ydQ!4)x#8HL+5pGGV;hEjs_~`Ee=xYI
zX7J{k#urB?KWbcAom}G`j=c{fO|B>Aj2PFKSw_1Bays~+E3UQ26A~+cRoleX6ROV@
zst_L$;#<SCqruT7#>$Ka@%FDtE}Nu4$)xJU)*~V_#C?qBUfhwuZ^)Op#V>O?E5y4s
zc7<!i&APAk7^{`p-}TQ4|1PyL^85|>H$5!<TOa?<<?dC8e+&Ey&QJx@sKDEue**k#
zp5NdQk!6#gbj-SiRnVdRb^K0BEE30dM5I2!4_7f_sfi`kun@yB`kPBsWPXDf$7t%D
z@{*?DHSnoFL;I^l>1L0JJldctCTXPhb_as`iFka7%;#yI$o*zr>*T}ufdnVbI(u-F
z1v2;xu)KwRlKFOvtI<3yyjTZk?$Fuq7(H4{l=p`}NuLLI1t@on8<)O@S)HZ6j>xP1
zPyHU3S9_LzkI$>cg+Bz(`z~yehWZ_{YO{REwo6y=k(pm_Iy)<hvrT^#$S_S9%{8S@
zAd~2WcMnWQ+XLPTrc*WarLWm76ufVXy!S$|dtQRred?V^S38jg=TImXlmShLxFPRd
zYBj0Gtn-keU8GI!FARzT)lSM`y9hSTw@8W8QdPtZ=P9nJxk?6PVw;SA<h-NGH{UC1
zTMLaD_E%==81Pxfeir=;{qwz4n-(TRMeq)F;0^C1EHO^wS}R*a0eO(Qkbo}mZp6i`
z^Tzm@_%4OGKtK`HeF9QFA$|B<wWR&Px&(ss0FmZUfUZHKEaCZni$v@67fZn-U$Bpi
z{@P%&_~>+Tc9Tb=!|24_jUW6R`>D8gl#|*6)xxuBVf(G|!(<qY%MYFGkbu=My!3bw
zs?QhJa?xbPWS>>yzQ%KHuW7s@K_#OxVRp|chacbIT`^vgAM(EO?r)&RohR9VU0me7
z^hl_*T}j-+p{xe58GYDb^A5=lwV)4gE|VeL3>WJoX$AHXrGiJ1TY(TOR~gm$=y#$z
zfT<FVDjCWyv&82~RJZ;HBz1u|T#s)zhmJWsLG3r%u9Th&or*XF27Pr=Uc5b)x}Yfj
zek^tPa%QW#?rNUvyQms$RpgH28BJu4#Mb=UCuJu}Yp6W@EI(G3zCjjUyso`NSyZ`Y
z!?|!<?HtWxG<Afa0~66JOR}tm5M{a_LkR2#t0PNkvh7CpzX2|2G=NWIp4u(FZ`;kG
z{!=<RAs5t319qX~b;n}dBio+k?e=eWMvOlS^k3b<_=(katdQC7sj+et8Sgtf*&ce}
zK@)j_o)Nun0}7%(_yN8MXFl=o1T$Itbhraf?`{WFpId!i`rtX4{bmz47UW~s)dy!M
zmU@rOuf01HVLmC@cNzFsP0MHDr!U^n_Hw4bHa>7lJ4jr+0{+z-9%*=9*f;qFnuJ(t
zR7tdAt1eG1-D>ZT%$Ti2Jmo}_|9}{Ct+@cU!Kv7WPoDUm0xF;18E!>MQPv7vS+y`|
zev_X^-FP8L#si*qbg?N1n|RiAjL}If1J;^rUzIkQebemidmB)jzj0DDa>>fE`R|JV
zdODW8u*7(;em%2;&B*`RdfIDmY?}`T+ZUP1s}1prjbDrjqtEcawY#-l7}C)I-p{HN
zxPN5iVR+&EcYEYD*AUU_t1MssD)h7<L{H$U-dcH_1qjCP3Yq6`?#Mh3xm9a!?nZXh
z#~3Nd?Qi_R%zB3H9O<`H6tv@o?Hs&gJ5T+WwL_n*Lra2pRg9d5^gW9Ycy>xMLbP%t
z_}~&BAC&SHKTn82!1W<sKp$m#>M*LhybC7a@wN~XD8u<(@WA~69w_zkfEv0z2M<(r
zg9rLBg3S1H_IpvAJ1G}G^o1sL!4I9*|57BwjkS@sqq6(o!GBBYYEb?Js1!GheCo1T
z+6d8E_qX6!M8@?xQZACfHFz@@0l>$>D|~yijItSL;PLYHQ4|>-MRC19y1Z3^SmlQJ
z?q^GA1aYO0r#glLy~~|{k$jZAU)c6EJACQ$t_6{wz#|v!Q}@!5M`Horibs`jSw-5-
zEd1*|p({$g=Wh`)IE{f^fhv4dnRMKK@hg|My^<ZTj}MHa3)?ImojM3QS&&iEpizK-
z?zNqNdT*<~?w|i;9FdtnlT6XIcafRm-$ys<b*oX3qH+1gVVzcjljCno_W%t()UO+d
zUOlKI5o+Wa1T_wE9eREuj$+GKX@UbyW2Es{WX`A=Z{q_^@cq&=@^cSAHdKu<H67~L
z9JyXn&*;X({ElcIln_2Qm1J(P-kW(3nX1U-!yywIdGt>4pWcJuo^S?@p#!n0ycoKR
znpkl?6h<rmrFlrI0)!=2q{u4;eTOB54pmAkN9H*}YqoUhjqu$e-U-%i@84w0c3L;y
zfx3h2;X6<)gDz1`Jw`l6Z;$iVpvjZ<+!)S<R!xnw0A<Jx;`?kRdZaI-1|Qy3te5wT
z&c#6YOD~7r8da84svtd_Quer$Mk3&vdUM!GS`%=&ms9kkD^|B*c+VqHVZKwk=A&mH
zMH0Gy*7m&diNUzbGg)_lSH?XoXWaH)7G3zq?63=6^B>+#{uSNkw-rnr-%b9Ty3KEk
zn9zKL>tOg{vR1n~xiYf&gY=bl10)Z<@TV`q+XZ^zebyUqm&cD+!4i8b7_Y{!{g<=f
zQOtGOc!eF=-RE4lUAu4Uw5~g^`SKxxV_WNl9xY?dci3(}5~$jC>w+)-eACY6^NPE2
z@dX_a+(l+A*N)?!^ir>^_9!2ksxZ{>;og7{kXSYKQbAMlFdqaHO$F)yG(m{@iI*38
z-{@R|<ybY<;ExRz7AZ;(rbs8a+Hzyq*$n(qv@85Q(>0&qG?KV53xBqKX4p*}Vass)
z1p8o8wjVvxcQN!%{kXJi{eo}p1Mh7nG@z%fubrnrNqv3gep&pRtB)L{>b^uD*<@?<
z_4JYN{Mv`FfIMXAWd`2|1g12Wx<mf3I>Q~>r_){wwNJfMGv$4c<KHT0>AQuo{CDLx
za|DhU&wH{*UR!@(UV}yL&9TKw^QpNfH*{-SIr5$8xV1&cb=0<++f2jn_oz9?O_b`V
za%EfspjQLgjHUjx)zJ7o?fj^X19hH`>?fy7Z!5d;QsQw`-8)Xoa!Ovn^X@}=%3;b#
z@4a6MK<-(k>Y(t&ZG%F6Kn(RC^j$1^cl7<Q6ojj(UGJn4P1c#m7x0tw9O-adbBkpV
zBJRBUy_O&S5mcM{qeD&lfeZFu-TfX_GeprAx;B>|4e^i=|A*uB^_t=tbH}Od;W+0h
z?2mK%EUR;tVp8`s@}oKrvUv{XO@7O|&YQA^^+|sd)OR2CIjP6%EH%UZIPUa`-rIJu
zbvhhVM;srch09-s=i~kI`ahC6_rhNMq8-uBFI8PGm6v?a_~>h-;WV~l`X~Go(4{b*
z7g{>fJpR06{5r+!ST)Gs3B}bOtf8i=O0&&%v>JT+d}3tjn7(NQ^A2w`tC53$w|VQd
zxssa}gQ4>#q;LPFlIf?<FCx7US;vci{zkrbJoD8%&OPF5$8*8)#{#y*)eZ*JfR;nd
zS&Cr??C<off$QJ^B47_W$A+z0rx!Md`Teh(UXnL$4`z6vmoJ9HonGo~WivF7Dv=@X
zF!?!s6=*HVsqc(oNICX;lza3_<~Fbst*~3rVl5;6dT?GbP)Xja(0IBvNb9!C;$ezf
zMU-Ug#S{4w8~P{_0UT<)tnRz7Al=K%rl|xe@!D|=_=|AN;}~JB0p1TwBmuznEH53r
zQJ9PejbuISeMej{TeK};YpdAs)6MQ8eQP^Q*ZI7;`Yg$#{*w}tNlSg1)JFygHXNVx
zB6Nj(D1yG5BQu_%-;zwC$<6qOj^Yd5oYYHm&VdP1>29)hd=?TbM}1M`m*ffwEf>U+
zZ|f+sO_N_wCBR{-hP&X>0gl26vt+&qvsD!D<NX>CFgtHagbYfSC7gniRoR?m&)Vfq
z%cvq%(uP;epgtPYzKU?!7*()21OC3NO^OmgV(bdU*s(lhM)2ve$sd^{%k@(Bd&mWw
z+>YEjX&$`7A|NR{TLC?(J{@@c)7qWE<I7s{PVm^Gn12}_FO#ezc$`e&-v*B--;jmJ
z-Oe_E6bK$owDzL`Om@V<c~PiwHhp8EvpyhAK4AJgQBe7Y&Gf}U<;M7TV(tZ{dRKe{
zi<gg#R^rV?Cc>3C?_noQ?YMY6Yo`WLJU*QA+jrcnOT8@{Vel~F3idzyVcQGZ4|a@a
zTLN_wn?H)8ZL(?MBo0nCMcai`gn)qmQ2awS%%KIlW~}X3=p9c7rTQME90R3SJ2Dsl
z2j^~Q17(0*aPIakDwOJgb2pV?XeE*@hXv5}#m`kG?=Gn`=+QW+QD=|>IL&{Z#Iu<K
z&G1`<p{K$5+%Mba+4v-wjuQ;qp`VBlHOPw!IExDAP-t<Pk|ENw_(&fiRNZ<1GC!|9
z|8?i*WGczd&tIuJXMXkt3ch-NF6=ZvpC`X`eqJScr}MLzM5f1U=GW$FaUC6EvTi5B
z<|ml1NW*l--FDloub8ikDVLeAiF~B@?K)r5pUa-hx8+*LDQF`^Cx-*8FP}z)nMP>d
zS2YTeUCe#Bjgk6K=%6gPhX5A7^55ZT{|qDCLzMd@Z7{f66lfc~!vQ#!=NgEYRKw#C
zf2%<1nG?<tN<eKpMVLM~-Zb~YI@?4+D{vAyhmQjqeo05Dt(C>rxu$9Y$$@jd0j6#!
zyg5jj;$#jraTZ`_ngy7<72c=xpwJYr&SWfquo<O5?I6x8H6NKXGGrC#ht$QXX`Yk6
zDK)h$u_oU;VJ5R%Bw9X=c^2IQ%WDGqAYW<hZ!!@+{s`YI5^-^I<{7W-4z@{4z2hPc
z<7v{+ean~<rkKMK&Mw!1nPcwJT<&EDG&~onR|wj|`gxZ#<#tf~UH3S&0n^}P%{lVe
zq*U-ND#F7G{iNmo0-x<NkK%E*_^wg+%&mhrd!J}g$qLo^qJZ-y5tf!3@vtmNO=&c8
zKdb`}Pb<JAbKJu+U_6u2o#@&Fh|I~%lTSDjUldwxIEkqdr>)w`R7qL4)#kO`;lp_g
zS5L?sjD9d@Sn6AHZ{E=2sLn~^+_P}gi;`jiEzx>g_rsBP{p<E$khbt5RpAbuiR23f
zTf>ick7$oO;wqB8P3(Q|N!HD{jNGGVJJkT?bUnD(YlimkalWyEUXn9JT1c$cdK)!n
zu~A4L!}S)sX10=u%Og9j`{cn*8B?^oq=ZV%6kC0lS)v`vPaWr#Lzq5-n5)(ALr|i=
z*`sj+*ziq@xEkHlOr2-m**DT~KBi`O@)VHlBsbg1WkICAKapnNSC22WcQifV*v(dk
zA}LZ~WU8WxZH1AUv-rYVZ;H>xFuL<ZOE^;kMw9D&Fx{vUZh00Wv8M+jH+XMrr~6O`
zAOe$RKHRmcqd+GwgT{Gd@APs3-<6ua#1}my4aaHl0o+ApyKE45qCp&xB{yt({-FLQ
z@(pgnG>uuu-9?d?s0Tyx+;~x2KkZNEsS060R*c{+4$LNJ4$PQH4J@%@nonyDPD6eP
z?yiiJ86K~ofmo^xhB8vWg<7O3zKZFrKZ%5>`$Tywleh2(Hbq0%Gy_db>4BDF4#I$D
zHD@Ufpp44itigvZ{DF_Kh1r=F$|ChY4q9ke3lsbnE>a8U`7P8R0+umDLM`Hcd~{kw
z-W#Wp^QxFo7LoS|S)d=Wh?~6MgP4RU*?P_1<)-Vnh!19mqf-2KI~DI6{?W@qJf8JO
z^;S4u_E6D^jltROo(T}0bbme`H2cG!nwAOvq-HQrx5}w@nf7UwV_tOJ!l5ttI~5dS
zJGInHaBIfjsnw({JXN6R4t;H%?bMfu^74VUS(r#JU8%U@kNF8rB+s>|5##vro@~h7
za>8CdaSZVdUU|R|hQUq@F_^(qUiu#EbsbG>u|)fuyn;k)e)>0zE|BNk;>YOI!j=Ac
z{ktoye+Q_4A2AQ;19~rJhV|4!x`XO$ZyC@`<jV}K?Z9=D_a96?fOS(-ep7lhX@33M
zDtC%<+-2E&n_;ag>u*rR<T|$fIDt?96csDb<DBFccov!d5iu-`4u}W(d%^nxJWewW
zr%*mvu*uhL^TEZus<5K9?-}%@-=VlVFcsxY8q7uQYS8G`y7ZG4F~(tguTOI?ro;RF
z^@5)fRnIc|hnV(&$n9_#%%aO(eMqXI@sR3w>O=&2dh<hZj4a3|?^%o5;H}mIhWk%d
zR)a6IC&0*G@}(YWc$R67rp|@iyaeqUcK<ghBJ}E>Yxlp11pSq@^?_efW?q+upk?j-
zy;z0!754tip2D*K@*VB{M=N$(#@@e}F$DJhbc&Jf#{rQ^`;0JSMj#B6e(PF>3BTJ(
zu9_vc?~aSx_aS@muERds%R05z@kp}M6@c{RmZn<(0}1UoU%_#C;sZ#_REi&0z9W6{
z?f&r_TQ6VYwni~`k|RI@b~fm@%1ISI4uV$L@qXn4Yy+=XgHOP$G4~>68DGYSBRj_-
zlV)LJ6oZN5kmEw;`N_!4#|`_OPR8V6%PeDRFSG-yQM$xu1aB?Iw}Bok9RfZ`okK(@
z-~t3@MS6m<{v@!E^le!E@fxp}5PymP=0Uc(@D&jBDK(j6KXBCTv|Ekv9Cgnq68@ge
zch)ngx<8hNu*ltl3pnqQV3(XjE=eyir(De(n<^;R^-Nd(IoHK7TcJ0Q0NjAq0Z!3W
z#aQkOam210ng<H$iu$8%KZPrr(Im)Iw=|=b>uRy!-9{#kj#Fdv##TNXsXxt->1rMM
zSL;Hj<%<`MP4-`_JXD5OVHcm9xC2nh9P!<KHAI&)`%*M4Tf7^<6=JvR$)bsoJ1O2V
zqIce9io)@SQwf6<X;@|m#9n*QlrqSH9589?wRn?6Z#CG|wDEqEM_YXs!#YVe{B|}b
z*Jo8w3?kXV<>FM~$*Tgr8!p2->D?mrYl#W6o0_iBTXT)tMKsOf8(S@x&Z=?=a{7SX
z?fv1L0exinC>&5)L=gDU;>@%vveO!=|7$S#GJk%}n;T%Y7P}FeFAEtHCO&;VS+gvg
z&6B;8r&)d-pc9Oy520<JFBOI?O&@4m5PFxR#}n^AE0$c__G|b)vu|qqHFN!6jM=+@
zMI`(S!v4$b@7M|qn9wXiXy_#Uu;o*QhcTVLMAF+U1Gxwvw)L`qfzeIg4OA4adwehN
zYChZgcg}?U0~l4Ac+-tHA6UyPVH~_aWaJND)T>)E5SRq94%61*nGPwq`0QhQ!hgLj
zT;&VsplMfQB<5;YBlU+UyX(Kg?zs#rn!3&2PK&03cbGuG`-k`nv<OO`?A1@Fi)axx
zWpaT$caNqXE}u&$l#6Gp?*Ex;I|oe$$cE?cm~Zh9p}q7hlI?s#3WkXis>kOj#=m`)
z3KOUOJu&L~?}<uYo}zIoFj)Z+DqkaB6*Px#^3`}Z_St2*?G04jWjXCV$cQXXijQ%s
zUdl~tXOIQ_9T+yY4wfkzSv<PDU$iI4DVmaG;~(1Vc9DD7I!#Z%w(kXeh~5pVd@X)7
zW((6kMh!O5JKsQPI<|GAOwWRrgn)?cRO)W-DP*8BwSSZGMNV75SHASE6V%C4^^H>`
zQ`}ac<5V<DW1BvjD#YJa^Y)*?MU*9OC98Kgu$2CLxF4El8W1_DB@z_@mq`5sq-i0`
z6j8C=OHZQ!y5uasSekw+ET6M~?jz3R<ILN5J;e!RC>uQLwKz*Y7*wstM3!zX<ajPp
ze;Duw$L`;8(2eg;oCr?U32bxnSH+SaRYiW?9IJTKUhHzMldwmBC$ON5@nAtzgCK!d
zPG0JDFGJ86%<(a}VoRj{H_Dx?bW%Sf`lsN~pMgD(a#~I>Eh|w)%_o$A-rtL#7E89r
zl5fViCSK7R%Wul?Ggg;NS<0mbJC)lb^)J(^ebQKB8Di1WGADU?JN|n-em%@6Yq>Yk
zgfjs$A4V6(R&0spw*+t?LrV8#>Z3cmLviC5Sdg<qim>9;HIBMYqpopY41b_cOSjvk
zf8M}MX}L7yoB`Cz&`m0pSOxb@r-}o+RF*LpmzefBv0`8+;fQmGOLZK18tMVRnMgmH
zp@}Fi56pjS+iwIf<m0xao@fe&7U_2q`82Gxk%kY!wdpb;?hqeD7q#-q-?d;ju|BZq
zeapSujrhn^lo+fXqWQdETve7noeB&O(9VC*)$>czlRDvVd6e6g>M{?e>XB3)!T_pU
z3wc_h?G_l=AN(8!_G5N)VBdO{6fHia#oQlFJi~GC*J)sQB-x3R!uSL<y|M|j(tcxx
z@xK#jX~wgxKI%@LR6_T#BkPU&!UvTU2Oz#4$4O#5(=w(^xtqrfl!n9515j0>IGvTa
zzwrn;$y25{$=@un$WtcL)U-&ynT_^2^A`W-PUUNv*~AYWhLoHFH!@(0^dc=YR}rH>
zf@}RyUdzn&7OERbWNQ%VB=47IhPHIBN*ijOpRN3`#p39Hq2JHN_h>$0AdKGk%ADb+
zmGQE6Vi~9UrvY?L;Gd<Srs)Io;_Hyuj#)+fZgD$53-?mZ?$8@nD{90m{CK4xgmC?t
z%QSNdZK*xX`@L2@Ickt`n;8~?D`%aVW|KQ{DBdnwD5{C{#fEspI|ZcG_L#S|HWW>o
zN2l)5m}05NZG`TC<L>Q|H}=Af6UXLn8_SiMZ-W+VTB95cb0d<2;W&@8aB$qw9}GO8
zguHE-*7;>Cw?Rk`r^#6A9-A5ev{&aizC|z$QDB}Wp`(q579MZc3<f8`p{nSWE9kXr
z2e#hvqA2d;bIIm?9JjF6*LzRlF6W+Noj+JFDa}9jEcA|Gq~Z!^Frz+x5G$ZFM~q8@
zyTFBkzOr{<zKs4<DSAl=!wH?LVR@)SqDcJ)zEP{KpD!NdYMm<9_=ku(R@{xYIv?ZY
zDdsDVaVmrJgq(B5yR^zW$C*pu-*%3ZxLi}#>0EINQJu~e&m$t}kDcqlr{&l-26$!P
z&u!oPJ_NYHenpvxSh=ciK@#HG1OJeX!#3NnEu%^m%V(K>&+b&Co!7=VF-)CR8XG>Q
zEIx*p6XS<qew7?k+UaG;=&pIZ3>iJ3q{@F8GMbdn0ywLxq89&MF$agYZBGULui}Y6
zEK<{lMRU2=(7w3q`m@H|j`$*gfp5u0f}Kq3g{k#VAk!Yyp12e{cI<VAi|OOku{Zm^
zswg<GW>_{qy5~5&ZCwU`3*M?Zht9R%JvP$t88Z=617p~j+hP@H>Ucs)6r1DHKat3B
z700E4{ca&Wi&ZSwN$DN}@;lhaF)5%q`2=T+s=}m=j8(L7t{ThV#Jo7J1094;Cs9Q2
zf{}~pAi<LYItX1&I_-Qz2OWqHbZ{(tf?<jBwZtl#y;G;qN`9LtV`$8~P6QAtxhx=p
znS0uq111nVO~`84=-{%lNI#mfCNgvoJXd)mcy1QE*yiV~D1LqxhHp*eurye^;1pUc
zO}o$11aI=4;r}sRyva8~7&Ls~8>Yq>oV8q3%3tdwKaTu*X{_Qci9rp9ZKV4V%3C6+
zgt-xiTCj&a2!FF0>qw(?oFXL0K|MJ`ew%4Utx7$<KXM15SNJ@!3_{H0r8K&e+EdQf
z<Vjo63gRxx9fkG;{>$qLu52C@tNbK>G)C3ju4Y}BpCYbzgmpmuJqq&vLhZY}wCW^0
z9j2>bi=Fv2>%?NGe98TnbmdFz)L!=YSd(jiEZS=9L%=`uFZX@|GvlP>6S~>BhH0gC
z3*LDwJE^N+2$sgMABrx0w`Uu7Bt|&bvtF(2al@b4rOM?$ZLvU=ES7)x-`!$4Ig1t5
z!OTD5Mp>s%SgpvfmqjZ!Mdm-Jqnn1ev<tLapA*Qg7RtTjwMZ@8gW)gI4-(uKKUPmR
zCO?)w<m;@D_3z)dJ}+rUNV&DAM5z)a2Rn;HK>NnU!2*p<USwXjQ6O}t)t%PITO*2*
zU6&FuY74XLVlS;_)@2RrGOvzxaW2d93${N0A9rs8A7yp@|0j?@wBQ5<jV+e3O*JWM
zLzS8s&_oD4QztfcrFBcKQd&1!5;sJmnF!-BEw$R()>gE(wXL<bDj+HZ0t9WXxFNPK
zxblpH23K(9_x_ywJTntO+wb@L^7Z<^{&^|$Ecdy~x#ynko^$S9ZpybsK3+dS^g}=R
zwVRPZKX?nnqVxk)16w&SMHb#?i)WxKh64~6W&mLAV7?guft)Ocm@GLsNWjXW;%(Cj
ze3YVBCe0%Kb(RJuD?2T(^WE~u6|ZP=m%ls^VbP?db>2g-qF0s7*Fqdp5*apxU;#EO
zi7voe{<zn<@k5sB2bytov4Qd84PM>55+`VPke7~ss;n9LW3T3FWMoWeMBl#aH%drI
zE{_+#qeaxCe;(3InS?Jt^#Te1%(H@tiY3ogIMpDgsZhet(Q_}io}`02dHj${e}oAn
zE{q?-(8Iy2P~fr**{z*CZbal?!p|B8CfTqIsedbnGjo1;z2-SLY0Qe&A1bctoY|i;
z&ucDao?FlBM=yg(9@U>-eg}GaD6Q$_-{o7~sih?MEu@g_hHBCEqmZxLB>z<K)tbAL
zi!uLZS1~oXxq4GPeaT)Wf?sd4WG~XP?yu`Lz|w6xcRf723ZTxOKr*TJ(I`qHE~h|0
zk4H@EXU-*mdGc+)_Iu{EH+v7qY_`O!-}YblE~4p5y?@T8Z<~KP)P@S%{-Vg_BJXZ&
z@%o!>J0#Gt8zpfV9qaXqdosuHPUS){j>>^?WS7t;i$DGN%Vq(_y^U%H{y+bL|9CoN
z6zLDPYW_?+BWXn3<)CZ3&{DAbL-%EaZbP_)TDfRk>=@A7qzg!w;Eldl80Lq*3UM1j
zpLM)xiw;CgZq4fhyXMQ4!OoMO&VVtFoAk;e3G=KMcQUQxU+qu{@gtnE{DJZG#NHZ_
z%La)X#*aiVb;fY`1w0L2CVd;Wq)4ety*j`B39tGo98Ro~=JZ78kI*a0z_396JC63D
zq&lVfRe$$sIobzN40hK0VT0)@VZ(KgM7l;XhZ&;l3+;`ZaonY(E18S*e12?co}Uq^
z3?BRfF(tqQoy3!FsIW9FJK<-Cz|UTC{OrP+AVT@s|NmG|{?r}U)73odzn;Df_4v5;
zbPVtRr|W49K_&iuEax13ELcy>m;MI#r%#gSBqF%=gnu9s+>r$<2vHyu3QQu1SHBuc
z1nXH6MxC~DahKRXENGjFZZYQ$5g9i8P%WDMEVCvO!MpM53?i3waAd9((4^Bh>y*DN
zJR^d5_eR@c^o8vPWUut}&9I~8ua+niL%81fSgdG8F)Ya)G*h?kf|oh*CSLrSQAGVi
zxSkA0J6Qxn&&=_aj}QPtx$<88EcFL{gRj6+Rv<fAW*9Vh<0j>qVZ6_sygC@r=>$^@
zH;V5w`gQ|;NPZopwBLN>GE`tMBpX!0d{o6#0l#=IUj4ea{9PpK6_Etd+W`9E#r(iu
z-*OW53L>X)>y%LavP_2B`zhB~I%+YH!6RK)^NoTI)0d1uJwZMEI`&s7Ic#8H?<po>
z4mMT)F*XC@b#FxyC#$JspAaJXa=dy=c0amqcI;h|t@%#cgeRGfIh4|Z%w6ZVRdK8=
z`pt9-P_7L9ru_k?%434RTAqC%zh6Np<9N8x>L@4Qw|Q`UR57o*F9#p?=C-pJo|Cf|
zVk=a${p^L$w?E(&e`q?9X8vh6^KZ9tj>-k)m>L}Qd)puW?O@{L9QZBjH@G=@9w^vP
zs&M*1Xg74t<?V(XOMfa8p8b*0$;F`^(BLk!4$<I<8c@J%5ngQNve}3+a8>3~lR7$>
zJzPy*&3wSEJ}_G{H6r)*dXY)JGWmtIWp9H$o_;>)^DdObq{w8i|Jl<;Gt6Bd8m|47
zaXj%)G?J^$Fzn}SKDNCA4`Fa5Fd51F?9=Z1RsO4@DSS2PJBE_XtGx+%^x*B~)BeH>
z!~yw~3;<g7CZFbz5&JbU?Fgw}GUM#muNfYWcZD4uE*6Qv;lfLg+f5p2YWL*Vkh=DQ
ztwj<?QPQ$16rRJj7h&ZW|A{bQv*-0YhfO&88Y%`j3gWhn4c0B2m0m9v_M)QXk&30(
z@rB6&F6SPWYa=>dU8enUHk1#C;JnD{qHwHw6Gsd)|FRgT$b}^G)Ki{pf%STumAvq1
zHsjl$6^^YIw*5oWmdH1VCJjA@Xn$%A5Un{mOL&G=pg)qi^%SdHNqp^X^YeXrAkuXN
z$dDnGdjY?VEeq)oSz^7My?HD&g{&{m-fS8Q;#5O<=7vzd1#pdj#O9ljzrQ<%?rUY4
zUvd+)pIn*w5-sKI&XBLr?p%UVIy)cydu|5!ckvkvsKB1Ell(!W^6v}xyB3^ACqE`0
zr{-H4#D5o$a~yR!`NYCCbM(UTw?Yz1WS2y4&-vdCuZ-8d?y{=ImI*ER>-O$u<oX-s
z<y<cU{AujCRK6S1ahp@;a#;9OG?^FMhh=8-1pL4eS`XUJi?JQecEzk$y?xhvV|+B&
zynn`Y@-cctIzKFK!*EsOq1t%$V?U?q=<1=G&7IZ7?F&uPi;<IY=mI?GB3<%53;i_I
zivO)sS?siZjmB*qVJqZo^4EZDa(ZU1wnD~HC|-9+Wu!|PGRzQ7a%LfBv%5)dakM~$
z4#8f|+UDLdkPmG<)Wi2}E8WNwN<7@SdQuIRyi2S6<~P|3dqec9KN1JT4x$Is*|yFT
zrBi$5nNjKn$sZreJj7>#|9O9V8{VQ(V>9n<A1@Cj#NMWT03ylo-#FkcE|DrYr8rjo
zzU*zp%lG+*LJ~3SDPCUX;I<Yo|4t;aT%St4O;0ZKs^29(it{D(%(Pt1VWF1$koQoA
zBrYY6?hhFp85S?4sPx?B(sRkT^OE#j>je@DFN@Kf*9D3$4i2mRyH93cw0M&U!>S<E
z{!!0`*khIJA7XEO6pHe4!FuVhf2!e4vS;3sy^#fY<7^h-J4R9Z=|^t=r_krfBgd$R
zqfx)TpIWm0v1O)oaAurA)rchS2lV8h`26n%u<5@sb1g7o{FVZWQx-ujzl;XW6U?iJ
z%R~~>G+9h;G<URO=W00JBwr7!RC^VbAT~_e%!3*|i0VOd5@VIG3T`k`ND%{5rGKYg
z+nLuA+dkeslDLVd&WGYDM8UezkuIflEbPt8lwkZ1_fB$Sk_QR>i3~dd`I6SOgS`5U
z$fYHf{6m5kL(5oll7)&uM4`!p1iu_q1R{T8w_81h9>N?XB;t*fvbkV$F`%DX{+6MG
zF>S$_hcUICzI}Lp#E#c8QlYXHnSB|I$y3JrH@iF>nF02+gXWMA(6_hw?JvU#R<eL}
zQ^5i7nT^~qPX*r%$^L5U{4s3Z_T8uPXfj7ok^hI2LV;mbcF%zG%e$gVcvb4Z%WKto
z%rY(1KL1?^K^`Bi{NmVEmHKZ$KY73tNtFpGGRM~*Hnwu3v+sFJ7M=6&h5FA+YJ*mW
z4yTxFH6QKDvYN-abrf#m$Er~nU3sWT6hor3Cvihp;jEAmKaTto$ES*i5SR(9b&*}@
zUE+_z%DV+K{<<~G6|s96zy3z$Xni!K>yd?KR6d&g%6;8);mnIpD4v>ZDdsG?6A1Y7
z%I3BVjHOPf06pY?*}}!56QZ%+NiqwNzq2K31afr)>>=?(+NN+7<*L}feEdeOr4y=n
zSsakjvni1yqw0N`U)(2$i=&OFsJplnKifVs^E*|a<FBIpvATam5-A?XWKOC3*Q}T0
z`(kr=EIx*vi4WSI+}wFiS#kSgsdm`R@y2F8gXM+^F+Q{8H{vEmv~RwEw|XheJ}&w1
zq4sJhuL4GcX2XJ2n<D6C{;MZy<p+zVM*Zlz5O9Kb0so046b~8jARAW5h;Fw}85Q)?
zI7*~zuJ{H61x<`2(M#QkBQ>BXps{H7-}W!GPmx>oJ_?ByG41sRg0i|=Yo8%+VIsK@
zg~HZk!Gg5)q;6Qt_2PxQ(Q-DS{?(fL7MI7~ag}A_RC>k`Vaq(otXCfegrHp6wue=O
zw?H9tuHL4{e@1Hc4_G(BepM*ABQJZ&*8<_gl1`AHQ-O8yR76&@>VN9cTdkPHa;<`{
zw*wXK5f~aqsnznr`%uS$U$vsS`X6Rymp{5G)j!Y|;aUkrXLPDPj!GZ0f211BU`>~q
z2MgzHLK4rbtQ2m;bRxs{izolyoP1yR@w)rkkI!V7Hj|%NLzIXKBkgF`HtLyKW+ACH
zHw2_vaqgT-o}Rufd4YuJwlNZhWj<#4{~bRQ;eb+u!rnODHO4%)|4z<1WxTM62rzE2
zP^v1<Kd5ZuDlL8UxJu=Ci>h0Y&F%%5skyH*ejxuzd1|RqOL3G6>WHfGcyb6;fFXPy
zNAOh@xej%m2a*PKnNFfxX7KZ(zF74GR<L+gXg^>_SjW*e7qy)UEb(K6+dCnB6~EqQ
zob}uPEdk%GX-_$I@;*uVvQIzg)X9JPtBwn#d?AZy#vpMRm_=<r#B6{mYC-U3mC}aE
zP(l<7CbTwsoc&ptZ-uzpvdm9Fw)x(YP~MlBy#Rvfl?|t8bt2IU2*~ILJH!*#l87H_
zecsQ>@d=C9RU-eKyli@DZ+JrTo|#7jKPRcFrS$0|%o!^(=jdg6{Da_z|D7qheQA@w
zwIy%qR$BUrAKsX5|4FSH=4x3jBhN_&x@ZtAJWxkBhB(Rw$uY;QKL2>6Dv2a4OEUK&
zX0oaJlK&utqSr;`^fnD)ewjYjK#@}C^9ys^=Xf!<jeh%`TGpfG98vC1(+%@P=u!~<
z(~h6@$`;#G_b+BD9hdCM?iR31YdQ;D`z*kcq&EH|#{m58TKfy1s<pvtz?%Bt4z=6o
z|CXK_U2^ad1adGUDcbg6B410Hjechk8eVBI5m89YGME7`FwQ{gj}|-M0aCv`;Xgs@
zjbmNkkFp*IkUCs9{UN3F-wsmF4irvSw!?3rivI#~KmYoUkZTWGf8h2Iay_~Na%a)=
zJUi?ce})|rAf`_}!dspl<Sl<5>az(J{EWZ<H^o`8>olhhscd9*$m$YH_Th*o+W>p?
z@7mcay{xApevJN5W=Qh>qhyR?+XY{tmPRAZ+44YM=v<C<@#W$+$aynD8RY74?~x6$
zeV2Q?^m<#?dY>PJn>8dH)Ew0L&Jpbon&W=w>N2Y1I}T7s7A>c}3h81NaT<m5iQOez
zw2VRZ<|J;8#ZPH3lI6?;9#%Al0B+Qx2KfE9y|#nLo4})U(-cH87&z@oWntqY$)c)W
z>#PNN*-Szi5?mEYAXj)xLJ`jlzWbN5l!7gLWq8mQaPf-BRo&j`vnwl<G2FK4OH_)j
z;74@9TrHELK!0nrl<)4p!ZjAJP76Ar6hU!zVgbMzm9;w)((H8V)#6WxV}oQtc5+(d
z+>%P<ut~*J1r5n+lLkU;Va+--7<p}ylh*`232c+{0-HW?B3mQrUnI2E;SIwVRVs|s
zR=V%cCACeeviG6frg*cQ+~yzsA}Tn!lnCP}^KzMV;~1=mXMW9?**`@6;NHn=Vu*EX
z$fvunFN)>FwWpl8wk~x%;#$)X*^&^1OeVuDFyd3ckhr!F&2H|jvb>B<@TUgs0;O~5
zV5IAdyb8s&I9+d0lm5h`S-!xyp<lNmUD}m#_JT>o89*^6AR9P&tyeM<vL1GW)qaV-
zuakP$m4rVuCts45RvSy7igbi+7&9U>XL9aRs?=YBP@P|(Oi76`V<q~mDOx?DrbyF=
zXvI81r{*$Wi<r8|uw%=nCMReLnYg(Vm^-FSGK0z2c+qM8m@*1QBZ+n~Z%R~AVKK&Q
z&a1Xrq@1R1>d!QFy!LN-UQ0i&rGGXO%@ScEe@H!<_KM7JxD4ciPUXvZAe5_taot+U
z)tmDZ)laTwqA=eS%GIY|tJ#RtF`Fpl>fA&<Lw6!w*YhfzC~@v`&eJIKe;L<m=Putj
z^&>A=(`^d~xtzYrUjge*pJs_^W~N^Czexr0x=N(l`Aif#tz={YPqHzI`p?D>saw$<
z0Lird)NJU$hfZ08bu!s|QHA-->O0`aOi7GJnufkiDUev$&kI|wSBsye#q2!cQx12`
zB+zv{yT6%UWC!o=tqwb8+deR%G5*itic;8X*v3hZgFw?^q!s_i;_1bt!BG5u`n1~2
zdPVdBC?}&sDx4t+1mrpouFO1oA<XxnIjXH;+rvVAPvr{mKcg*aw?dTPe!JC?Ifl13
zEz}~fuzpiwf|Hlid)akiD`zdIojxXP@~Vj*eSTnOZ-$Bt^!HPdfxi8<koon78K8T!
zyZe`XmBB+ACn7HR%aD`&@97C3k78Z%)7)e?22zH98t+y30_qC*j=jp!Z0Q$U3yhmk
zdME=#9R1`7fdV5JHWpPLs@(kv`#}YS9R&psw*Is|8q)TW+tc<QBq`B$QT3umCkAor
zEIj}|iUm88AjgK0cI5msK=@@27a$e)>NEZbXnB(m+?@RR3DWmQ2xTFCZ~Vzl==(?t
z<>~vm&-^dxJ3xoNzY?#0rR_h^_t8ZDM!NQ<*&XQn&2%u*wI8oSvXQ6nXL$c5eLrgF
z^u1Ew??B%nNtk=<&p>`m;PHnyT5iHZ<2cHh_WSa5UAEwmq#KQQguQX@AV=A`J)sQ#
zJSm{<jdLe$N8DkuA#tY|)E%p)qwboCLh>$l40)eJszOKJA9zaS{WwsIu$lrEj8&w4
z@61ggSg>xy6g_+v%x?1Wd{}0a9m#v=Rho<#J=<ZPitN>Oyb9+@q7}3gK77w4t!~}r
z!~=CB$A?|NyNG+flFk}^hfs>jn#H2}Y5!tsh}UUKy5@ug9w3MnqK4XrrU?B0j%-(p
zH0CKh#K0*0SSo?S+e@S!mBrV8XCyv=SJ(rG3|&J-=o8p^wxjdSqVvn3^Zooh{Dnvl
zA#I1i_xRT!Q^$(29xG%qJ)<`Bgwb{jt2CNzv|YChtN>P-AN`lKec8v-_7jY@YZe@B
z?>8Tr4dPQDL)*zv@swzlS!bc|*(uQX&e%@r`*nmui)5bxOZA@-k-t(;#C+vS^=EGK
z^nJE;MhgFox(eyLVs(^$Lgv-j(r^iq)+9$Y$!?Mh6vkWmtw6u7$mu5z-=9*Pa6LIu
zXZ6MR-4Y+u7q9MXdjZdA?1N8XOo^vwTi#>5r15RW=vH6)%OBx9eVEIByu>}0tT|GT
z#~6P5{sab*&j()U!8)ISS2S($PkyqCx2?4}-@H8N^xZKJyG|3|D#L&|4G$!Qv^hNl
zwdC<wY8Y|*PM=H?$b>Mqs{Jj?DMx@%Tw4V=V8Pui?o_9UmXU`Yj(#Sp*uGwRR=NKs
zeFU^iI{>49C^vHG>pZKX*p&Kwe^l22{p-3%b;Z~hPP5Zbh~@6FUY54_*qKGy>%HWf
z?2GZ_x68AnTXy*+E#%-M>>DS8`2P$0M*C6pcn7V$k5%0JvG$Ge;Fl~8*W4swCeYgF
z>IQJ_+`jP(fb(By>2EXw(CzFS4;oxI`unQ^vv2HU?e_Vr=xNTrkr&l0_6?1+0Uz;x
zHV$^8IN2>H?Q>z9H&7u>!K~Riv(qTK6zscJ9zNcfwP>+u@<Nc*e4j}|Uz(W$Q3-MD
z*lPEs1Luiu&tm@6L(uJ2wL<oTBEKV0w0TH-+5I)22-jG6A>UQ-GNPGy6K(spK0#8l
zTij31mX&%VRrw!$MsUWS)2u~~G?X_XRU<n$Sn@0Rb*o-byI;Lz#$at{Z}j&%Qe3R5
zDAUBf=0e6(YW*6uYBn!u>PH{ku1k1E0NIVyvz=`3T?kN}pH&#PrTsnatL0}TrU=9J
z*%5jBC(f?g6@5g->dHA!5Hv(H-{_BFo5_RCM%lBEBz_5awb%M52wg0Bimn^MO&u@f
zZ{%45-IA;lur#8|>&HKuWEv6&A&h93X_w8Nlgo<r3e~eLHu@`0M(yk^#~?n?Uv#4)
zF3%;1^s|Z)-3#~yTaSMhsWw*rRQo1%>g2~A;4jPpKGL;8Gf$Eux>BgS7(V7kGeInY
z8H;Le)oYnB5_fWe_pBHND4}hz_mIdnmPLAl)>0m?A3N9v^L?6;@#a$y%SmO$_R<c*
zVUrOyS?=L<#$JX7^wJG9hM(Jk^ASC;RPvl=t3s5D`gfu_2&E)4e1o;GP0B|NVf~BO
z2?_fABY8}JD@>YbOp+NOd3;f(2#gWlA{Bjq?tr4q5xM()zNPzbh<_?0fzuqf`1M;p
zWUZcD*+O8{m%pVWd|tOq8_yW3`_~bX=+c;+&dVjs1_PM%*xhg<hw+W%k$;%v5&H8S
zObftteI{(7G1Xqi!2BoP&bM&yjx8LPYhm$KwD7QN;g(~pg_W(eu-?!;0l?YY-<ElT
zInU__oDXB!O#J5$56RZ5?AIOrT9s`LE;0-FG<;YeKGe2E`0&{9;``ynsbR4+#e`bL
zkGX-~$3V3jUlRp{tpPbD><aUNwBpuQ>_0lry{ys8hTO{rd%1|0!ll{Nl)XiDx2sOk
zBmb9J*t5@lWWAScUoYG3OtCQ6)qN5LGRO1C$>kbE!LICOn=aI1fG3w1(Az|%w(Q4r
zcU<}nXyfu-k&+iqn!-^+c$#BT^9N6MOv~#{RCWUm{o&qjx~`EXPDW2u-o{fzqbhH4
zPQ{0C%=rz~S@hi)T>!T~m`KDiDnj03cJ<t+GDt?f<Q>*RbKRSf#A)^crwDRXR9?Ih
zZEC=fkNmE)_Bv&!dAjWzlHq%stZPbx0{XJrw)wY6n@+ovpLD|YKgMfbnu;hBnfEi{
zS}I|uJyF=|q(181>OX%!bgb^__TQTY1$|^;(;!(;9z(`R-cq@kmbArqb3<Esa>d%0
zIe2dFag|>*kF+uOT^X<Z;+So7AG^7{Ey9=WC7g*s!lt=xm7{M)l{%4jn<9&v1`vGy
zFfYPG-zEZzwcMw!wx5aEEDwN1#Q9&WVxl{aXwDJ4Fj)`ZT+;S=G^*o^Fmz2Q=AVXQ
zo&MOUe-76__12llqPEI<x;CCD@VQr4UZs8m<oUs`n&Yv0k?6jNZSIBbrOvdi-X1vS
z?VF=*Lr589r*P1pgE*9v4W;8`c$-(bna|=tcRjz6MXvK47g{-5sZ?4U=H65J25%bX
z-chM6+E59mU@C)DoWk6Bm9OhFiXWh7Z`;kp$^&`vrmh#oD-W8Rs8pyON9@h*o_n1^
zW$r)X91-U8ROW^jen~n%uTs7O4CGq(vy~rJD&|J1DL|}fE<`ReEMcAX6HqNuLa1AG
zAyN4rzd~>AgX@pWvfwh(wLjtkg;kjqQv8|zC{z(#5+~}q8tHl#GM>ve0Z=qBRUOTL
zRh6sy5?5Hgw?Z}fLbZie8$`Nfo}p?1)k-(yOM8W-4X|AsTxt0$+88xk>PSoet0}=H
zDWxbZfL^H5%Ub!zFM_~vyqVwt*=p5-ovsDi!AlPu6|9I^Fn7#5^m($pzegtCY%*E<
zm*J7Qn|4j$^GBB9o83@Ey#k%85j2xtg;mpR)wJwb4_?0YBd%9PLsgMGE(ke*dIU_>
zBlsr0Dy!$WuAb-){b?b%d&G%5tw-=yJ(_}~S8Mf*clFflSWgZ0M0Q$_W<vF7a+02B
z^*knXS+b>J$9ftxoMBY9qJzN|j!Sc+dNf^04|3enVs(9&x^}3ip#`OEwyFR=-KvMv
zc7TuhG)?zXx7e(SfMFqEdvywPsBOaO5V60Dpe@j(f*c+E>Mv3~gnh($MkxHZVXlgb
z3J$VT4G0)Dmk)pjo`+Xy@~o<O`lf&b+)>%8o@*C72qBgvlfK39g*?8IVMkSnU(B~w
z0^(203)C!s>P5F*frC%}Oq`770va9}HbGf{X@EFo+tT&#qKJ6sbpJ10<?I!&BCc)m
zM@p2gd$qmKM(ic8vrf{&VrqcD6#1VH#s*DmRZHq@Fiyr8XNq&qLEXW3>YQ@~nBXxW
zv5?#Jh}Zhn9~%!9@Z@PE5WqNdTxAPk|J+XzNHE|PB_}ixP^}JV(Y5j`IyFtdd{Ubu
zZ6aRwvj3QySeg>L#!Cia>@EJmCXFDZYOdGwNt_fV8>~8ZH($wlK1CUungu9r!?E8R
zFWh>`iQ0T<22hs2jL5D#9Ert(&Yr@cL&P<mm{1^ClU)=HRhyTsypSm=kgjw3L(W_Q
zT-Aoh7Rs3Qir0WV3tIj~J@Q+9HPSU5sU_Rt<d&SD@hHvwT8$$#m_@CMHD2`^RiRh@
zSEwTMWyFs8oXMKmz(uZoifBPIHCo>+=c3FKP9Mm<FPJki4D~3SPhN?(4AUcCiN0i7
z?Ar!=Ai_nNim=|nI@O?{KA6{b)d_FKid0>Zs{7qL>dHi(eC&531>tuh2bp868~To#
zL}>V2bjQY5Rye-;5#PN;(VfIYCm!U5^X>L3GC${r{mA+eF?%n+&HU@K%$KbaY0%ZY
z$Nq$YF8FpzW)?TWJk^ib#ns;sNlea{@iN0yM)hZATKhGT#54H@YBPTb+rQ7fk46#~
z<lk3i8r}N>xR7{}yKdm8KP{1Hcu}E?%-7$}t$)&U>^%o{;x9l1g=H1jC@g&!pib<4
z@CSC9<F{&s)a+Mh_U1d@dYAF`?y=bDe){E5ndr%W*69Hp0PCbkKlYQ5-W<V|=$_HL
zfWXB%H^~^_{{}%*DcP}RKBblV6A7NSQj=UsG;P$1l{`#~q0*5%8+4SHx+lOMClLp!
zSpVr?T!Vsu_r81Wzt`Yu`F24qKMSkd-Cgu%_YDS-9jd+!rU#ktu-ep(cQMz>DdP1n
z@tcxY9d?Xsz5*0#@+y`8g1WzBm(0y4Z#@h1&1ee>-N6iDmu=D_il?SRnb;NILG{q`
z8Pk2`q<ycP8kOc*{j}C$2lJ^gsz~Aon!y5<0(a7}$YW?H{%Oo$Gr9U>sZ*%fbYZ5`
zOR>WYqg&+_`*62zEs<-fE`sbQ8-S7mToPN#mHxV=UPQAMh5G_Iz2eC>@=6rw6`ON<
z#c|>qeAytS;({>(j+Bbck$G2&)k_`QUZeQ&QeniC?SEEI7HxAk6KWEq5CeJCHQun%
z&k>W%?gs7)@f~WifJYL$Z{QbM_c#2oAQtljjQH+q!x$c|;Z|fzg}eFXkywH&70}Gf
z>oFoB`Ci5gwX>OD{h92i2O?1VqhY;M*$#ervy~r_BQ8UVMj@5gqO9Ugk>3LbA)Kjm
zq3{@$idQ3%Mr1LR9Pfu6?jSibiRF-vTQ5MM86pXZlrBh)<YSy?gHt8lR*91v&$LQD
zN^bm)s@Xwq6hu_dTC2y&jrY2Gb}Bapa@wa=S)rWf2lZ^QdYs&NysPIU<i^L(-&ubI
zRrN=}4&}yI#9=>PZoE>J74}C{qIxtRg>oa3%ty$LlY{<jFE>VI-mB4CTGvl#jIylm
z*S>s<=DUXe2I6A8vcbO}SwTjD{<31pg@LRX<1?cFPF9NZTTJ#hNs&D+sw!7iah~7C
zZ!9Ti9-McO7FkaEr!^-R>0-J{`%BWHgve^Ny++S`qJ+J=a6;nMl+t>Q`d6xa&VQOS
zbFl68`EMQyQRXnbJ}1v0L)m%^cR9NyP0gpqcoN@)S8~>fLTPGG=`|lCPhHFBG$naz
z;TJ_Poaj&>Pn~Tu=j167&5xC*Y+KOesrZ-;N>~B`Z1oRV5(rZ>BusVAFNmjS^sQb<
z_7I}hcw-2q-0Z%4^DbCI6D&KVgFvo=4i{x^79Y;TtA51ZZJ|7|2%Bk6p3nhl#_!*m
zq4&rWE&11FnUUPcZd*U%3EpEb#)_AdCy27m%M*9m_btd1aHOz|mw6|w|L5*K@<eaG
zf!fTE!}k}t_sA2c<=<Cjs=2|V`l3>QYaPsO4_Lf^EAt@E2)<?s|Iqo|z1g$ECzC^_
zd35+>xRHE+aw<fo@pg5c;c<w!UZ*@kpYd;iBNNOl382qEjyA=5?>W#h|HNW2fI^p4
z2@?XYJBk7OOCX%C;Z^>*Tx*l1Li%&+kV-4*U#M@Wb2@dY<j3td&0{s%+N~A#F{x2a
zsxBJrf;Fa_GTmr}1^%?9aI_uwn`R&IpZ1#`sCL8r!j(WM+;6JT4IFXj`%PculMv7!
z<4@bUzXL^~_wH#yYm1U@KFa++>k7~uPfty*Hu!KfI{S;936^|=%_bbIKO}(ken#vi
zH^-8Xl3IZfGnv4`2r>z51CX<bX;z}eUhu>J8&Hp@kB5J8j&g64kt$_YOg9s3`ZlK+
zCtt%fnEHzH-+Ei#h^0#&ReUYG8};U6In0us;DG-&mV9l3$UM$grOVBE(|(%MQ!tpH
ziE62sMLSb@J|#h>IxpFbR7ub4;D<*ghpdc|hjlLNNxvQZ#?#-a2~+JgC!fol!16ac
z>GJIOwo<<o^5#cDF9+#fzl8Q;DJDPJROL?wN!*K7{(DHrg5Vyb1wpfq7ub(&R})97
zgNO(lHhn<hAsTFUvA>j_vq5mIdnJ-sWz=kTLu2wG@^2_~;Z1J~u^ba>k<i$uTPs6F
zHQ9;@y=ISzVDcaFMRV$iW?(i<4`piI%r`N1U7v1BkO}u!l$aRiog=zu^uWgSiQCMX
ziNN<RvaA#}rY9FKi_(dSh?>!4=VtW>m)R=6<{)M;HK8HMj*FX-8kkl#5y!T8avFlK
zAKS9oOP82o+x+QS8DWRj+b`sM+t$8APxUbqit&CzE2$*e7nnJPkiQ`E8G7n}C_Os4
zPTM4V7<7)pUl^-@toT7Pwb+hRr9Ed}2k|7Htl{L!mg45*mh5K?pNX=p@Hax_muiTz
zaD1<lI-%`V{#qWB&yDT*8BpIFdGBU;5J3tuy<riu*PW}8mMNAxwx;7^))QWfB!R1+
zQPoDb2M`=rlAOe095aZ8BC2yTa{F<%kvaSe673>gE=9>ouYf(}_G8ey?z$Mg%Y>rz
z*k-QLtj<=tjvKz9j*=H|E_5sVJEuRo)4Kgq&@posIP0aAA~QX+)<078T6p7anbDA+
z6lTN%-2k!|PtNcFlx7C<*kIfLGmyIvd5EMbycwKX%4U?m1_%Y{o!uw6FgRWh)62o}
zbDt04IG4v-0_CiD6^C}bXs#FT)b3+BKxQkmRIq{4v<8QKjy8`ZHUXi|t-HDS^v<o&
znb#4!5;#F$&YN`0d!6qNIKQ-W>%jJQ?PezL+zgM4r*P$EHtRcOs)!^GMS6@SiL>g)
z<F!Euj3m>rbp*fhq!sm2KaowqrjM<U+dXc)`rBn2h$Jc`d=^Zkz0H2v*%`o?xxj;H
zG}I<ixhH#KsY|1|(^c6WhboTTE7m!w91Lk%Qy`vX^e%nodd^~ZLBXf=aDKsWZ3M8k
z8%(I_a<_^iuJA^aRe*>l|B89cWy-xuvCvXe%9ZA0Z!=@sPR$noDyE7Zm~x#MwNb3{
z$4rA|jS(G-r}pEB?_ZxK(cwJ$AwP{>*=PkF*lFEQj#4<(Y>6kWJN^h7loto2W(u@r
zecPB&u1U|R81t~!&F9o!dPbG?K@W!MLF%NkSpCVBwe9Pu99LViNV0tyc0Nci$mxeR
zmLSdKWBCHm6kElXoIE}B+%ph`IxkeF$ugZ*ccBtZ{^4_gzw=U$)RTZ-=hyvAPa;bv
z?Mt{Hkwd4JD=|VoS#o%U-2tq}f;^LrEF)V_wOsVeGwG_CR4d87TPiTFT|xbqmJ_0=
zWz0-YgdPNmk%P3z{ytHm#-%l4TfymdyYgr|MA%@TpSd2WEp6pGj}VzBM~Q7TfPaJC
z<fd^NPImcVxqlnRE`{;+$Eu&?D=Vny(JKP}RZOCY^DF!|>J4!gufM4L*n$&*JMk{a
zpAVb9MhHmK(I{&{0oVT`*e3H2h>64h!p7S_Ep(=A17!;u-^jxEuVUPeSoCDcQBvZF
z->1Cvy=uUp@h1)D89xizaK#AiSeqRZe3GfuV(Ik3pX0gmul}s0hd;LnRp`2<bVbSl
z9Da<r-NwcBfMLwTW8T)o^w1_{@`%EMy>vRNZ<OXFnJ(utmVD4Mraa^_nY*kh)vgqo
zR#rEZgKg;wYY5VPuQeGOw9{R!dZC=gn;7%54qEkVO^oW3kn_Mkn(S-r>(rfUPNL9v
zpxp8Foi+M2ediQ=y2Wn<Yq5}DH4Pk3-MheE_CM|FYkHNurdzMr4iGx7_0nCpaSv9m
zh*Wo3=hT}^=na*<^<BP5-#b6}0?bis{(eIBRO#>CTdOa3;MFx&hn;zq7~!r<7pkB(
zAd+1w>|Y_$lrCLHehR$qATEITv&0OjN9%jLiYW6o9nkIa7P{86?M^+TgPQAYcW$+R
zZQe)Qawxs!J2vVXN+GUA)tK~EM%SQ6I0FECcP)2-RU<Wk5tkaQ#%`H9fPQTBN9_w-
z`ThMJyaku8=RTaV+)P<Tt*TTNT|!0vi5d>&&!c>Zq6GXJdbq4Rh7OrcbUS?id%BaW
z<tx-;$GL=W$?$8_Vr^9D?Rm7QxgQM@s*gQJ-R~{e<2tgSUm#|k>WYRPd;T+gq}cng
z%KO8Mz^2)g?PhESuso1{TL`2DRFVBjUe^>IQ6<hx?0+i19|@N$l;2-xRG*yu{=%m<
zC7+o5-oz~Ut$9-8Po%w^{5~fTRJ`OWzn*IeFUJ}o{y)HP0Ie0m?{Cb`CkMZq_WnfS
zR|@v}Q3#epO7tJv%fW9lx}|%8{$q+5`VR!VQ2$A;6B?q!I8D+HqB$S;zJlY!IphrO
zj(rr9zXsWHGAtoZJHmIc6~XyyM&$5~OP^KVlpyWat*{1Z=TajPV5ES<+2oLoKwk@o
zuB@>@MB?6gP~pF7<e7+Gxr}Y|x9tVc&>u|8EcIK!i9nk9P)oPr^=;cM@hVb@f3Rpx
zs&g&`Kb~yGS!@-ObuV|ipN{^sVW+xJX9oqOuSv+Rwj4Cy$KeL=oX?5DrFJ1RBHPy_
z#5ZLvP=TKIQk^&JLzny8G$44f8(QiovV{4sOQCnV&}jy++g(#}RIcLJl!r+Uc$e5K
zq~RtSBY{el*V(Dhi!*-_!A3tmg%`JX2(tCJ&*dkQz|Re+o`;D8T?VrJRL<N%u7=HC
z(v=>4`-S>Q0r%-s!G$|-<2F`*=VGg4WLQUw>S)F!kCmi_hS^4#gwep29)0J{ls0YC
zRdg=r?XmF0NP6_o1A?^<-npW|Dm;66PWfCF8P*VhJ(jvcXOtM2iQ*2Uh_;9LboKY6
zAWj{v)DcTBox)E)h{sYtZQ+^UVW@4|XP3sB`k}R2{CDkf2-rWj1_EH?msl0+yh2|V
zXTOu%FO}6oxQgtjC}crL(;D@Ufj-E}6U`h3+X>6tF+W1ao)ID^+x*eD=oh0(pTGGM
zu@E#@Nt>nqkHKbJDD7WWJ<|#_kH4y8orUM;;6p2P0UoEw0jf2(rV_Ux>X`#vAn&zR
zwpalfY2eB1r-P{&W^~X^RB>l`V7%Ry)*G+jP3A&)+0^-gdf4X=r2m;y^vu<Zp4t*i
z->doedqpbAs1YDh<^=<Q>tFhAp&5kKzrjkKLzfK!GZ#~8zVuz9+m>o3{L@Sx*np{I
z%R=ExhzXgm!)2LZk7rs5&B<Hsk}W4KOO~H>-{7No@_>h8X)Eg`2WIN{6guXmh9jKZ
ztv>lTvp>#yXRupt7d_~&T+aZ0c^=F{o=*|A8c+VoF8j9{)IqE3ytI||lEoPXSoke~
zEFi1lF6ydmWey|$x`)Gvz5nQ_Sbzps@|B@M8(!Q0-Q?M{7EYe9>zIms>$z>!%X9O2
zFs)~E8>NBRzRKAk0+Jd@oXLFq?+j<=PvYwTCXX2ZEU+h@PPLyLKRs@DQP)pQ9=Q`%
z>aT_0g#7f5VFmnj3nVb)r+?yF)6uhMo}d1&?6E(fwL*F1b3&g_%pN<O@qB{tYiFhV
z^N~S9__dW6z;6+<M+m>OxEB06Sr(x@a>AI0`wJs0J2;G>pDco}zgT+eHlrgP#+K^H
zu^33k0bUBz&*RF5X4!*e@#Neo9Sj9EISOh*Tnhm>vS&UWOD%R{gOQ9aRMla{WyF1G
z2f1L0aqR8n0yE-1WL0(ZZ9sig)#8t->b#Gu3Ki}<Bo|pC|4=+Uo<GwpjRSb(AIjtf
z{K3xxH%5dQ@M1i56sD&dg6ZMAZ1j<SyrBl(P{BSt(Y*=)u27`XESQ@cDy9lpgfmuu
z6jxRrnjN@uXtZ_;_98eNGMHomFFmWl8HAneB^gxFk_Z<rXtF{3pi>wVG<On&t@MxZ
zbG9`nwMtNwjw!pAqUBF$QJ|#mu<7t9_$I0+7AapIJmDxW*s{u01W#)8q)Jbs!IN60
zBC8?Xk}Xss?hhd<bD|2>+Ky^1EqTG-s$RDO!`jB)Nn&jbzP*a#(*c$zQJ~!4%W&2N
zjW=xyo1<P{%GkvmBp4p>N65g!-Beqg-PQ5c&c+Li%~{$%2Ks2BHDm6R!47}%zGRO-
zQtJ|Xv>-G8L>UV#h(|C0lCdOQjxLn>D}SRU1XuwO&hEhOKHEyRK`09n0!K6wlyPUB
zg=MO&Oohtm0Kl&T|BhP3%Gcn&{f~xwP&`^dQX9Ysm~zL42cQ&~CMxU1AtqT!eDoZw
zWtF6EMz5pxOi*o61)g=FI$qgg9Y8EmCui=a4Lk<Ysxni;GObpo#g#dcGHP!+FjAAP
ze(6v>dCwneJhsoj^G1OU@7@BT>MixYb5loX#z&-TU^pT6r@m2c+cmirz~-3S7};Jf
zHRJR4a%uk+a3_-)pt)9nE;$~PH4%uzfxL=d!Yneh5Zi49+>8aSU{&N-z^VVAT>&E`
z3_IoKK3gKq(OB^VcIdbSZJGJ80oo>y1gl^T44OJ$_@5f6GR+8XZY6l&yrSAluRjii
zuOlH>;?t5T)5M*TJ|;Z_axD?N2sFS{!&>o2Jp3V-oY%eUj`RM3I>fv;L>7J!&VA+u
z_)lQBRo+1~1#a;#T#VeaZL{Tt+J$ElSIe2fsDE%?h-}CC*P%vc4-=F4E?;oh!h$)8
z);CuJ;ad|bZhOu4+c?x|+c-@%5@Ibn_&=7Eked0M8Ki~|KONRkE8>$LyXG`w4NG|B
zkNgyjcM(tt_-l5*@Z!MWf}Npqe-(qZSW#^ZlyM_qPQPB0+UIX&aapPB?Zp-==s#EH
z-u{cXdHGqD{_pHpmow{y^3Ip(^d~3pR9f#pF?q2IVccH;jyWLr`ia2>@=kAFQSfJq
zUjyWy3=u;9AM960(pn+>UT1`#9Q<Awq;7v=@M}VX@LP>vnUjh?k@iF@J?-Dm5p@O8
z%>E~~3H*))(f?2I8$fG?@cSDx_Q}ETrqWLoex+z1eiZr`;$I2v<={6NE*Zk_JyV>p
z|FPT(;OBCQk*<-%Ui}2^DvW;h<4eD$jmc$3>IZwtH@3}}k&1nOcb-RTG<1%wkiPtL
zrPG+&dGE~>JBZVqMbA^`ls9(Fs>e{cklV<@u3LGDZ)<tCKcI8Nz;VTqId;Bhs%rr+
z%wp4|f{VIT%hb-T2Sw)G!vl`?(<wh>S{gv^t}gXNhweyJDn|@4vCuzupw3ZmkKkQY
zKA^2CwAV;Ts45T=HmbhPFim04e0c<#NM$O*dU1JSVJIW#B?J--v46}tnFQzk@uTG9
z=a#86{?AzJ%GyE2ij}3G(Y%>VJ*qJI0Q!!6K>wVGC;LIbV_%_`Sc*WD;s*@S;-ALm
z#A9VS6qCi8MNPOs$ddX3q&pKaZq>oZMTC*+te^Q>I%}vn2aoj7?<3(aIY%zX_>lu}
z{25;5k^fN12b2%ywWTF<mcC~K$1{OTsFKTZT>jBr9$wTwA*M_O7sDTf7$YezuJUOR
zE)(I#_^tmZ5yoV9FsYbkx&d%^=3pz(P~`ucWtm+l_FATGnR}Q<0=8xS0d!$dW~Ptg
z0u`1^gam^DQd=Z66j9Wfi6{XDDat%cd4B<Q`M+dgs9Vp|tzb=X>k>U|D|WPH3&bbT
zuT;e8qRO?0f7NB1d{CxfiQsz715!H{v;3e?68clqstj;0#l(iA2+=><gd?5&guxul
z*(1^2_DyrXhKWlV5|+8T3eb}cJ%x6wNp!ZrY0Bgn#n&gg+yCI^UfM%gZTlZs*YYVI
zojOw4O4~oLX6TZC8dD?9S6kG!T3afZsjl-u6Mq8l><!q|P(`rS#q>JT^%8xPmSWnO
zRv4HYc&3uKTL%4BJ-AR~@V~^|37w})X&x#?nO3Z8Yt;p|B~#M@%%E*#309p##`+?@
zNslf0CWy!F`z#gdKR^g~Kpy-ZeBu4O-*E4r4Bk_Xh12%=e{=<23JTm!0eSW{cvx|#
z356~Kk5yBn5&UPDG}@5H{!1UE$1mr{zXE1rgCl>34r=n-!30fxf#YOP_ArMpJ=rPY
zMb3lr^p6;}+W&*vXHn84v|w$WqNVFf`<?T)y}vDsyHcNjqtGWssyRxL1fo|iKe|$>
zsgzO`QQI=fy{xg9F}-BBv{CET@U`GPT#)v%O=ZHO)Sik>z$)@cKjhsCG>D8o%)BF~
z%P+ozC;HR%0=t3+SG#z&i^6=6c(T;$DC;c*hE5vh4V+X)_9wtT-AfNyWnWh>*RIbY
z_j_p<WG5R0DU_eae@ER>&V?QdhIa0B6DltWNXSx2!bKx$xNsG}?tk?+gx7bjih8W{
z3l+Cz^uGQM>l=Qy)R8(0-#`c${@_Q0IHbf=5=wlN6o<Q*)`}k4@P|h~cxrIlp0bL7
zoV$VGW3Vv?C&J~bHEX|XBCvyp<5Unfvuu5EFwzjZCW0*_PGJUF*&FG)e=(?&InF4r
z^aN&&X*#w_ude?kjvRk>_VxPGzfPw8w7IRNjxMuL*zeK0kmfU8IPu}5+CgBAvBak2
zk~aKWVt;Dr_xV0Z;088PVw#mmYDqH0&wBNsPupIu0oFq2heIP>GwfT6KnL_*`);0A
za4S$M@*)#CIM+pes6u-N*3^n4d43<R(%R#A{A#zNOg~^9GA%56+w~Cls)oL3b87jA
zr5^4v{`%7dqj^EN4<!^+)UD<~2~c`o<%Zw}F!sbq58d@o9%QP7CrKC1A==n&r4p_*
z_ltF(zQY0>=vF*=lWPLJ)hqq!^Y#1^h~BMx|Lpg)k{FDJ_bHxusUZBWtDDhl!5IMS
zF}uiA7__EE_^f^E!jeca<ld@QTZKJ(YvkQuzYdJW0N2?Q^)Epx7wVkmk+hw|RC1j(
z2-%V@em$O^R_>p^fUkH84Dv(1jp`N3-}NR83^R!fcTwEx2Hs@kx)tClRgr1x3?e1#
ze&Xm2i4$M)<Mai@Y`j;iz<*UTZt5ntgi_a;$G<{$zLrsXb@7#nNk3dGBh?#2rQIDW
zao!9rmihLeC9nEfyu;jJ%l#jTx7R({_GKMH5#khcJk6*-m9Z$>f8{D$>hO}6*_8Md
zZB187kqZTXs5{HXUv{!Xf7Y|mqODT<q4i#xV=kbNkTEU*vcZ4CyYJ(;@doFcBJouf
zwVkNB+sdd4xamK*)l%=UUv11?wo<h5q*I7K3!VNl@8F5gAQKd#IQ1({!W>tv`0aMp
zwqKO1C9I+cTgD?K8r4wBGpDduq$C11m#m&Ypu&4;7wnJIPR5hd&bwk)9%S6Dv{L*~
zTw0P&B;6%r%7kTOFWAjk)?<9({{}MS|C2$7yeQ{}vR@9R(*gDZtp2FHCqq;zb;+Yt
zw|?t-+c^ie&%aNq3Phkqk$s{>{vtiG899)BB1unYsser=_&^l}6=;#%vlV`JSJ<c9
z+{LQw^6*J+r)qaT59r)e{s}aY{S|oX+MBG~A8z+aPnJZ!`0E6HsrA1;A3}Sq=x?}$
zcd}>?pg-YAwP;uN>+Npp?O(%&ZVoO);@<obmFu|sys5+Jwu*%Q;&PS<5|Kdx&#x%p
zs}L6?!t?MKvJ`h%HM<2;6P{`SHmCtFHh&2J=ged*Me5_o(utMBVgp42^3IVRvt$Xo
z<gucQb~CxJb5*S#UHG<ksT%~ZmKX=2pC9s3_OIOr0KTGULxka=W!H(au&1r_O9(5O
zHbAqE=}{JH2sQ50E-!BIfQjcNZ#;ItqulrkY(s?-Kg+dGtL;BlDHf~9coG4Ywa(I9
z9!cC`q??@6tCt*MqkYh;##aDe#|QCw#-F{^i`?7ob#6YQdWHAxhrQy5{T08bmzRA|
zMCT$u8l^4<0U_o7p*Lv&2>M$8j1OUF__sjBls22&%bL?|V~d(eZ0(66g!E|!Zq{B&
zQ)Hn+5oSg?f-N%V*L15fvh;BRwTqL(2Rj1NUE(Dtlq)v;e$}Rp?j?i28|js4>=f`k
zCZX)E5tn>*_FWep`fD`K<y4`8GKFQ{BYweTai+)PjU7i6MdrRiOZ3mBvhC%DL_XO)
zh)3`2t!?BWvZmTYBlG^r^XB^V%G)y*B0$gt1_}S2_urT1`q<9#K@bN&*t5Z{FforB
zV(AO2B-KCTf9YWsgG8me14ve@^M<6(BC<6Af{<=T5~7OzXy@R!SR{2LD>+9~>N1*s
zpA`X%3#ca<i}=hU94X}76$<I*@;Be#HXj~^fj=_r=<>`G+y94bz=&S=dqOEVUe`z7
zDcM>XNsH0$IeeMHg0J|hVq7O?RQPA`2)BJ$%mh%Y*SgDedUgMdbRNdQvg=~$zX(@Y
ztYSmMZ-ihHWS@AHYQ#^(e^*8QO}_wj07v_)L4U+A7;2>JWr4>Hx62MLq&xZHwtr{m
z{unx6M(ZHfIa_d&_ok+_v%9$Qu)&vRoW9@tA)QZ;op!Wz77MQVt<lL5>kfx0q{e=U
zA87{Lw!FX1-xc6H!m}z&oVzNEh9D`7g>71Z15@i8f{C>ImvjX&JR15Hd~5YVNwQBB
zQY^DCdW*yJXc|xCh{Rm}T#FWB&74o1$-zi#I-_Q@zvL864Y4N}b>Lr|pa*q$UU;Sh
zZqj4FNlwsw6S35Zth4FGk%gyFU^cK_7n!p^^D=kjV0{2$lP~=g9I;z;JmmnDi6u{j
zN^FkId5~J?;)@@Yv+u)ll;Jzh2d3O$t1nlM=NNl-QeWKF$0}w6lzEw4UhIvAUMMBm
z^Cn9)u?u<*Tf!(Ge>S>=3w)e-N6x3*J*02Ee#M=V)tuUY$z<vb{doq6P+w=ka52OP
z(_@k=yt+3yPmf~kEG8-|M7ePDC(Fgwxf3?W$jDKyces2Iin2i8RZ%2{0#m~;c~Eti
zf_M^_0?L*Vn-U`ljTNFJUCt4eemXFnC~!s{sHPVixaG^$AKc`G+zEK@ci>SyBI8U)
zaV*Jo-Pt5Z?fSB<hkj8nEXG*!f(rlf&7y)Q+GO&DEy(ndG`>30sbthy!e>KyDfr~3
zkNISm@DuTq^dY-j8#CMo<nplkt&?ewHtvsf-o~2FKIY~psDI+e)c=r*t4q}1jBp|m
zFP8jQE>v`IbM=Zi;aSVKLQmG<Pse*SCfkf69rRXC+=*PyKD!=9y0(EQ!ErP9joAi2
zxz6S|R*Z4rn6jq1pC;WsVP4f_0^O=(i;LeAy=8B9SM$*B)S<54W@0or)HQGt@0IJ^
zTDgdmlD;h(uV(8Ub^{lcL{RQu&?gD>JgH}5t<6Na3)k2;bc4y<$_T^BIgG*|H92<f
z4}MDlj3ch-PGNkv!FYZweNSZ+d?Ieik0M=WvPiGFWF`F%MO@ASFv{J<2A8A&u2^>T
z*Z)0q&3f*3vKbq+wMtpFYGTF9b4y1DG~OZtdRKVHOZLxLWG)$icsm>l=in%LOY#$`
zVHKa0PrGof@u+@cmrH=4Fg?T?!LmWTgWeI*2(#gXBL9ofOpMM77->S<YU(F!)gx<-
zd*q_s^Y|R<|I)2mrJYZY)jcC|E1}w;<`!tiz&FZ3(JFcC+N+$r<w@RZi6qKOgy%VV
ztJP6dDdJ85n;yzr(+cG+FQ7+(orPB0uM*3IU`wuJP%d$0i&fM^M)^S0A_6;0)&{Jm
zJC(D(`D@8p?>RZ^m--l<j!f0(uNAKhA~#I}@=vu@h5)s__(M!+5~--4@WbBc<CugZ
zbN4Kj>~kiv&kLM1$>2HTG`p8ukB~;>pWfn$ht)@Z^a}6d_1{MRabT4E^PK+<Qy<7b
zJ4}$5f>0v-a7J<@aVwu=`iZB%?UEaGFc=q<g9!*ktPPJ&6|idBhGX*?{}pU5pOSfG
zkm;ZEW2vQUnZ}$$Ob{HY=*^@y;;&d^lT3?z)6YM639ICz?EX(LtKu{NdI>RsAB<!e
z;eUFmZ|${~F_*P>v8<s@BCS8&z-41{GMfYo)uqPX$qMm-H(arqA0coJze=N2t6vSO
z%AaRu2XjSv9B}Zh>#+6;UJIR!H3zmGkHm|QF{!q%*gy@+pCzrN#$U&RO--l?!uNI|
zd@s`VTeY$5Pk(fOG0calzo|oB>hdc4xw2M2xzp`>CHQv^S5xUmJdQ56*XfcN^R+r&
zAB}YFrTOxpX&086b?z+YYcX|%*11RmfeC~d|Fc6dJtrEuAyhV!P({fhl?1F!CKwL6
zX@7Nh2wvhdtCE=hOyY$>M!F6cT?DQTS|eV#M%rqyALR6-yc{gP&`%EDmSMgK&-<6;
z!ZNxb+AoE1F24nm@aACEv`88&L;V*i>Tg82gQ`ToOdg(Y^6;mjr06fm&cgEBO$ArV
z0?ET8L}lstbd!fK;N#2yJxc#d>y*XG;bp`T`Un0@V(v4M#BsbexgLuFBCTICUoW0)
z@5>y<o4kAmxB#917D6Tv%iIUKHM-K-7%jF4nGh}}7e5v0`Zpzl;dd3Aa=pXw<tUUJ
zbk64C)^{v5NX{BBnc?sH71Dn%ZG|@J;RgBJY`?%JO<_&<TV3(Gmm-N0o12|^*%2yj
z4(Wd5Yj7F4oF!o?Crrm<;lgmSrOt2F<ReAP7Tkz3TRP#(vd2uo73zu(W~;Zcf)<<~
zxr)~KN6J#*Tw0_c5j;xI`YoeftWO{WBDf9<*HOx>rAgC^vnPlAAk+^y2u}JzEp8wp
zd5Btv4ZQSarykt=IxVHg#*WOX2Ujo@!lxFgkEyZc{0PlAz<WFCXJcxxHvHK52AY!z
ze=a?Lj24J_=Be*S%8b0Y0h5wV%i4|W7qhTrtOqGrScmf+lrLOJ!i;}IZMBeu9e**D
zGb(lI6#unHWb46EFt!D7=Ay<+Dl;3zv{RF&g^-9PFP$d(7z8O>`Q$h>G>YIX{e(~|
zJyhdeVx%KIb_54u!|HFD0IUD5FK`TfPOrZx5U0n!J<eXI_4=#!`Z(PA(o03#{QI8H
z^~(rlzplN9iJB>1Pr>HoQvJl3Df)&1EN~)s^j_VwV(MX*ujs-?;v|=h(&A2!o#DE0
z<`HzEcO!b*>HHJwY56GY+2#Jvna-tEv?ROHJuhl7qN$f_mbKdb+LyS3zb<$=U&D)G
zRY3}?RoOk<lb-DUn)^dw9SSHr8B|NpEcYLs8H936&p(kyvcFN~q8?5~bd+PfnomBJ
zy&}X%RT|o>!pI)x?b5P-#wsv<P_R2BSxJ@!1uW+B@6!<zMp<ljR3);99d{xUtsMj$
z1j(98w(WWE+Zwr71o0zyDiLw>dAt3px0suQ?CXzR^+&zJ2t%IAM#D8XtFrXHw^@to
z;GJsR7^|0@Qr_EO3!PRMt02pz!9P)a*)8>xjg;T%dem>V|G?gWJ{Za1I<OgYL&({`
zn`w&e|CFf9FvkW_Cssr{cM*A%dbIqx60dk2_=OLySJZJS9L3^5W1TC>jB$n-mX5Ub
z*tRdPH3;i~u!>065A_s?PbuIzSE)OlE2o7XZH=dyKcnb_0H3zx0_M#I(H`aP+avYP
z8`I-eBPkutv(24nDcXAim#GPygfPQSLKr2}yxXS!G1<x^T@l)WKpaHT3M2n|Et<2d
zy)Nc;aO%f1{)5lKr(UPCh5W`Mw?-I=a~ny1t&o!8FvgQ%&7x&bK+A|OSBL%?@a)<k
z_`FhhO;PBp9nkRZf$G-T0F`qbhVa3!>;sjNaDbhb%2imMJkts*BH8WeAE+LlO_6Xt
zNQ`O#5x_sCjksjCJPjHQEi_BTvARbiiIW(T+mNTQO`c4o>4}<_WdKEgqw*Oiy?s&q
zwuM<A8K-~6_o2BXC_W#m^2Fg#l|h=m^w@onWuSX=L{+~3JoYk+HB!2$8krlPM&34K
z!!!QC7aY*cIy>((wlE;|3QIB^{ya@+&%pYJ(+54ZCDcscthe=Ch*(@nok0qMztAF?
zr`ryby+MWMDV>A9t<rFLGHaAR>78yNDxSS+h;+TfNHUXvBKA2IR>8*$AhGXO50Ty*
zvz61A%pNBgTuvp~zdG;|@k~ShgO3N4+uLG`v>)X@UvdlecP*f3Jb9;n0<Xz{b_ZYK
zIEhNzrT3!UXNYMnW3c)I@psmjx6flf?&&{$J1zKt3T7$jPV}hF%oTbipTREx^3T)E
zC~<Uq@D5jg{MkHB&+Tm>;BE0Qrz%nXKQfVvnSDa>HXM%LGHbM#Tc%E1fq=4E`rLSL
z-}UZ{2RFlwK?UEXf{<H8f(y;kzP!h(M4p8woU4U!z_69dXld=N<psw`h)R3=uFW4L
zp+;)`C(mV;ejZl8S8yRtwt_e6|H~ka)epxf+l8xnzy38h>`-%qe>vYTjWTWl<F`+<
z4!;Bd)qvwicAkUVLh0Nzjl@FzIntQ!7{@i0MG~j8m^&_Qg$6W;u;$h;s(1r89xbq>
zw6}s8_?%5b_CCK9(MhNpl_=|uwP=VW_R=b1&6s%P`5lbR;!bVf7XL#fyP`Z0wF6X@
zy5h-aWGxt3*_?u2Yq8?O@MNS*+ohQeYMDkVa^2vfhKDplRtVmh2nKa|l^3=4A}rzG
zz8Gbjo}PqIfRRUO_1bn@Ob0`4sBK>n!Wx|K=3-B6+ryAXp33s+i6s86W;%POX}|TG
z9W#MXLiuHbmHy%as61^-x34hCklFS;Tz~a(SF4qT@MUymdPF#ejO^lhdT!%0lKnRt
zG(j^w_5vi4<CTA38;D|Hc3`UlY?^O<TFAk674+d?@uc66iFHsh5Ib>*mmFT1`q~gd
z;`1Wu+F!9_l80A@gC8K67yz#BHu5jhlz^eUrp2v*gR;y^8bFTTr%Ge{aW2H<aGXLL
z!M*2tXxd8AG}DLtgP)|wysR;zDl~JL#Xf@3+X>3NwH<OSd&-TnMgry+`T>U~3^=#U
zfS`y)hI&@2^sX-qD9S$W@6N~Jy0hGD@-;ftJ?Y6F7rq)BUYxjn>y2uik;FE*-(d*K
z5*_Pw?|O<`Cg|K&>5rgnb{|UX=w2UE=48~Txzbu6L@$FW{Ifo(YQG_Slfgx;=l$P0
zH-Sv2A@qEy-(oGp;zqigx_J7)>vo4*qxW{qYE|MTY-|myn_8LP_OfxLTv4-w8@C;|
z3QSlvjgQ*bY3q~o6WR_Gz95!{M-n4bw%Illx>nywWEHfGVVRTuCB@p-T|wbbfmp%0
zOg;YMf2&ABXyyE?b*z-jZNz@9jbYF2`;P+@o>3QZ4)XSXP0i_B*BgT7{F=yDbHAuS
z0@tz7f2nh34SO6jt0IZ-@h~vqU=dbq^bgnQd&Q`gJQSn_tMXF(hX`5sU|ZbNJ|++#
zD|K@&GEF(bo}G$(P&(~U$qYj;=V!?fGp~$Y_aze}T6yGO3F7A0Yv(yp*0&sM<2YlI
z$)c^s--`T|YGUS&FHcT@Bpr(q>@=rdm*y?#VsBFg-vQbt^+Kmp4Of3-Dfa3)ovOoL
z=Fg`xQz7R0z<$0>&Q+7$m+Cwr_(LADGRZpO*bnH1sY$}wYfwmIowshFN~};5X*7hp
zk=I*oH5Z~or0WJpJ_=9Y0uNBrTMf(x-$_%vz*AR3X1CRngY*6YoFfT#njsebU>)hg
zXp8aZ!S*LQ@}CQkz~Ob2D80>F7u0L3wlCo(FaV<$N}WE@=1`d~n7P8~gEd%!vR
zf{{Y`@pP_N4sM-{UI9)Y3pWAAUw2MU`|I<+@wi}j?(||aw)W8NOfUZg?NHyh3g-b$
zj+dH?SZV-*Tbmc_g^y~QXBA~yHE2E=SNKU=S5Q2bK7lp5w(awUpD1Qnhl1T6bNY4m
zFS>hFHUOyFI3_lajfMG=EsG-#=JC|w{T81b%oS)4l|?Js>Ns0o^TaW|IduW{(bI68
zb`FUGJgz>G#8nI<z)Ni^T&q#ZS6G`qrqS7&TdP1&2sC~%0WLftPXV{)5H0g~plHx%
zS#fV4Y%8w(ko<}};{aQ6ukyyPcp^_<ge+}GGD#@JPe;1i0gm>A<D3_adUvE?1zD@g
zk+n$IyAF?Q`|&^)IEW>q_P6yK7h*~0wg+Vi5;@bZFEVd71I!JyV3n>VPSvf_`2~0u
zSLz6@Pm!-?B4786v8`(>J688lc!Zozvf7*4FKIGiO^b8uqe8G-Dy3N$XQGTDKciu0
z$gmrnSM&oF2<dHJw)OVybt)(aGWNFJ=nmogWePT*o@}Q+e^zNDQ^zO|@}6utMY@*K
z-e%IjMY_Ika0`{X`FMjwy6)z!Q|hMBzl>B`*1yJee&M(xi34~80z<b3c=aWfZLI<J
z3Lb$=e1ovRJc9_9<I79<N`IrAEV;d;9QFV9Z6)J&h?SYDbwdO(^|vo*W{-CPz03Yo
zNDRBe=)<9E%sVuf>&*VDcx~pFeB^)TvBCw?@BH%6!F-ybz*kfch?7gelBk@*3$fZ6
zh{ZoC15a2G-83A-qdVB=@=y8=J%h`(phx<rqMb<mej6!P{b>=CSGG3y9dQ|W%}U^z
z;-&Ag#MAy+EMRS)u|mXtUA49T#15+MbY+U<Xzwl&^_DOjkVO1u4%YCaN)n#<Ilapz
z5$<MV{DkP3?naWJYbUQcbuR1q)G}>n8f(O1Q#h)>xsNJ4dN)^yiFS%Y2|3cwS2vez
z6&SZ|n+-{d%&F4&lf8aD%s#BpfgjU9aNjXUx|#@^RCd1eCHuE#No(x*4<x8bVhJnx
zsgFEwX&ew0tU9cRANnWT=n22D^Oo57(sqG+3A{MNiTbE_t3Tsg@{N3|{r5^#r|jrQ
zTXyud)Z?XZF*xvMPt|R^EXv4|D9lm5fs)F1{w%*FM}M9l3T)+vRsn@etj!~BYlO+s
zi@4^IOkl25!n_guGm+2bt*~65Qa}Nl`pYFp#}aCPbKN_Y?dAbRv0#MB(Y@f3LMktz
zd78fRe+Pvmles#FCqk>{g@2A?djm4PnWtfL^gjz?$r?L->5-Q%d&!)%$=8W3XXuYV
zjoK4Y{+(SncQ#@w(mYYC#;1BTw(wk?CaY(9dH;$ypT~o|44tf+$d7)X@`zha`SW@1
zFo1;T>O3X+Dmgv6V?9xq2&jJOyL(evVLeqrf1=hOmmj^|)wA8e)E^YOp^H?-4*d~e
zH7?CX64yI9un%?hH0;=)2I@I)r}b!7RF9^_@}p08=jz;m5n_jW8d}5}IiIKWbHTW9
z@rAD1xjL;f*iD~`v%<q#x4A=g2nJDpbmu>;S)Z<14^PzT;M%`QLJGk!y$!5W5LXV6
zsnVyJ2zJDV5XgQCO89k^9lR6zx2Vr}N5le&RJxO>+XB5$_Tu@_3|)nwMyeB)9X!<U
z0)B<OV)gpd^=<Bw^J;6vm1s5j3|}{qGu<5|%I^>wEAmgLfCy!)KgF&bVy2q0AH7n(
z6Ky;$_q_V?@9o9h4R`uzV@?}?QT?xbtDQkQnifw6M-JT^bSN56-C~~Vbn<4N>5LjL
zsgV9&+>zQw=H!^cMoNZLA%X-))_fqpk#UtYs!0_Q^HSp~alG?e9uVa{IUv>4yC$rc
zPAuR$Xo@R?jUMSw8ww!{@JHXQO@FwQPCE7<VBzYn!tdcq2WCejUE{SoqBAgWMsVER
z$w@upCCE~kl$c)^bx`3iKf@`JQv&mea)JKppQ_Jst^ye-K}D{P%T`Wfr2S-}4(nqu
z99?J5wo_!>^E{u{vC+Mpx3kt81E=scA{Fs>Iaw*<`y!U`FC!%Yd@i(C-DAMn&Sd2v
z=Ktd)^z~W>9O^q@cLlW;Wp1Jq_-_VwipIcDL1tDA6_|xkadxy-@LcwmtGjI<``=g_
zWUF|DU;FG3wy^oCOrEZpdu5B5@70g}`3S_F#7xYMn|01+%A1mDl5N!Ssrs=O@g5r0
z5J^PxWju=^x9`8<-s9-|dn}Ye9krS7b7Sx8-1}%GaYX)oRc4p){od|<c_i^Ta0tGy
z$lSn<egDpIwNLfa^RLS?bq2E{KYPCz<Q>K>v>=6(`U?@RU{%|K_Fu-qZRe6jD_Kp2
zJPY(WRO<6)VUab<aId2G$N={S_)VuXZNxoLzVsXPFZ+a-+NXt!%-8j>tDCg=&tf_7
z-+x$jvSu=0<b|L9duW8|$xaMyFMEjzchIPRII^Up5r^27qY(;U5y8S0DLf75nsu_Q
z+i5s`zDGm0IQ&~-`cC9>rZFvvD``^Z+pAa_VT+-!yS^&-Ju1^xg?~6#**jdR-fXZ{
z*duMa@*C7lEhFy>%F6n=>_JBOO(<_z@ATlpj<IpqIVaPe?B{k4_Kao_HQ+Uomr&%3
zBo0CJ3dO!iVn6QuWt7Z}<SMMX(!Wl9N64%Ig||vb4xG3v{2SL$_(gl9L4F%KID2hQ
z{w=i6FuRixC%+U3pA|WsVlc;fd&{@6&Bg|$?KF06Tg*g&TyJ4X#&@rG3?7RZxgNoX
zuOV{P>SjJ5WnmY&P=!K~u3L1$nl9%n=ho--mQ?BnkG+BMf#WJ8yRU`gUAj(L(;&Iy
zzqkkaQE=%_?8~QGkgyYI^eb)C;?UuEGAk~Vtqth~l5k|$K)e_c{bu6viKslYtZf(n
zCrp&_Bc*^=(s3lv&q4gP|E7X$Q@5{(^-eZ+nPqP;p-w43c~lr*;nh7BNxZ}puli-H
zCw%~p-S7w$O_ahRm2GDz>j+zzuu-}QXX#uSxE}iL>*424B7REgUz@Qu;b(L{8x?(#
zYP9L<SN#FK>fm`Ll_JclVUQ*0>0DV=K%LPq85ew-y5a2-@MP48bhUv6*<VbZZ{)&u
z`~!NmWHWF0n{D~#k7MSOV5x=#52CT+6{^pi2QwELt?sZ&ldCcwpY7E3&`Yk!-o`fC
zG5=zXz6=uP^;5&dyR1cMcy8xgW<MMdXM<L3uW!|HBFnc9H1E&fIn)R|vB0Ab!J{=~
zsE%B<JYM}$WZ`{Cevz($AqJhq#a)`EF|UmIx6+U&ONi_$eLf<pIH0-i#kRJ1G80S7
zoKgL(m;U-z)9c?8q&nX%k6iJZu9m+%5KT^}dMDR;554Lwf2qXiyUk%qWY`d}0<hGr
zZhu99>)iOEmoE9hAVa8oym*6b73@sY<{&Q}|C9uUw!MVz^2sC?@m>E}t>5JGc=0=u
z;Cl4WL(b>)g>dpWy2*dZFI^Q(ek^&e%{}(i!g<f@_xI+sFs(BoX7Z3q|4tJ{h}bn$
zP4;-jtBAfRvzT+ox6<Q2Wx4Rz2Y8YCG6MvcR-hhD2Si)jH)Rgigy$xY+3_NimlN~7
zO5@J{kQr}9{_(d$Ny7DE^R1+$J%Xx<WmJ-zc%?OfI0VzrF$}3u1S5FW{}ND_ZyU&l
zrI$pSk_!=79}m3Q>Q-NHsXIku9<$9^5T&~@yXg}?WX>LXd1_3DZ!vd3vpZb(sNmXg
zXCAfqzJe#f1zDzScX<KCi(kj*vyEg>UyP+oUX3BhGV*=^4#$)8QzKmEa`|9*b$D`a
z^&%5in_KcJAsX(ci;$4%!U%^qP@&VEAD}us3qa`kQA5@9cI)}8LC;qQJ)cMwP$28~
zMv$)LFc9v7O?3M!8VJ6utZnNy=C-yk*-;MRcvE}CIse4G&p7?N7i;f{Tm6IJN(#bz
zwi&Xov%=By+vM-IC0_luzhDI-rL6tzcX1@dE5gX-Urm=}bg{<B4iH@~@HhB@Cp
zqKk&eJ?iz0jKti2kK4>Qc7{1*?J>-O$Qh>k5m;|C9m17<WMvE#wTTeQDpso8PE&~G
z5L}1QjlUPI&Y!bd2-g8XZ9gH8qW=wj`L&y||C+uK@7JHc)R)frlxR!I!9ldLXw3AT
z(U>iC(9xJzbhQ&2Gyi`;U*7!>^yM;%9{(kM`EB?AjlLXjV$R3V7kL*Z|Al-?{<|Dq
z`N39HE9gr7&afj#SANPgmpuwyS+LScP9JyP(8=Jiv0qkP13*R*1zYt`4FZI%$f!Nu
zeGOQB=^Dwc>==xAVOOMx!;0EwD};7gt$oP$!7NCbJj-jm<Z~wOtWwbKmY(;w^*VJm
z&{ic<Axzk|bQwZq({dU1Hu}HX7mCR`M2{`^=j#SC=H)ak4gly}*&_ZO(iIb-Gcy5*
z|6}GP5ZtT_*<kmdK!(1h>Y?@uH!XGYTw&vD_)v{U)t<Gyr*)&xUrNXGyjc5yYT)DG
zc@Ilc@HCK!j2ZNF(LfJ`^*cSksR?0aN4PCxay}kzLxZlLiB&rXw<)^m4>$G!1GxF$
zTNZ+;C;KtyH}%}RBcQJ5Qvqtrs1Q)E=t}!v^wjbDoIiw};RL6;Y}3s)aAfFy#@}ms
z3j0B>Xb1%3avxWI>$w~uYE2K<zSnwD97r2m+4{$saLfF!!{eoXX7m~r2usvH2NGHp
zNh~rF^>5kPOu6S>>UeCZO~iYhg=}^*3fE|*nBnM#Lss}@z>EDvT>K%&v&y@=nWV_o
z6DtQRN8jM)y7$}5Xs7GP>hLR#W^&5z6VOirFBOef+Dt6&4E2I!0B6A<HYXJWP~knY
zA@=!_N`~X@A}EA-5V=OfuII>}#Fh3OgRn-Sr=6?IdnawR;z^~yo>9^H!4d60MMOm?
z+SooAADO{L?Ssa=lKm0%0er3U_dE{mNsPd$S^MKvt*bKs0oJ2bP60c&iXvSS{qcC#
z#1sg_<~JchRuR$9pU8QTainj|c3sG&fCf*4c>RI*mkPw+WyRrEcaA*XPT{&S)iflQ
zKE-_Sj<Q4bl)$BYbLVIZjH5vASy|>lZbSJ1N%JBq%qY8#+8^_8OP-vQ&yeg<jd+<|
zYWSr^(!xk<g@<2iW1MMBdd{|=E*<?K{Y*l6o)Nw@_&+3c%7vZ3us3^OLy;+PS}R(s
zpYeNB*gt?3ZYo!hB_Pr@oZe-wq(#5so^YkC${r9<mTZFnJKak=<Ba0=*>VeR6fmAz
zB8V&69As_Pdy9B%HAnb!gO_FgAuM<;oRf2_aaH!WY9srLu%%(ag=#DJ2kHZx?CN81
zak{I3e>WaK<~|D}LOQ&&2>I|av;&Cr;gN-Nv~wP&qq=}R-6w)y@0MP`{>TNM+CPBy
z!*reg^{irv;Ps_14kU#m%OK1rf_el;a%Y@5`8PE<8Lu3&t)&4Z!P8}R_OHS73p~z>
z%AG%`F=Q4&r3(9dTx0SdcTJ*a^5ErQSm_Mu5#PWmkdBx6WzEbF*gi4&EM25-+Rb!A
zxXq3m`di_gu|HoZ$N&3;n@KE=C;v(O`rq$5nyRHBNlGFd^5p`$gY}C*_TR-Rpap_^
zk(G`mAEzAqPJW74&B@Kp$#<|oc*%#rGt@I*XBTRDLc>Cbj@>2B0hNLk<PT+nr{+iR
z_KT07m|PQCxYC_Uh4B~XjkhBU-;x{3vB|&FQE%eh=Bwk$=4+=;Og8<rptntH*k!ux
z*`lItMGbS?uZbk~Vc{~~Sn@Id@<jk-ArBWeU){jr1QXYHyfZw~rCo%`qNblFy4!Z0
zy4+{emiSgZDR+0$`z9veSv*`YO3%7&?mrmxOJCE?_;WvtaQMf?!p!A-QK8wO%*~C-
zCmI(Ql?R8hyxiiSz{<ciVb)@3>doI|?#2#c`7Ubj7S1^D)u?v0x+Jf9sHtvilxH0J
z_C)TPHbh;yU*(=@3)PkTYwnrL{NA)C2m46ZFZGhcfCT?AWaE^csq_7_7HXXgJx=Rg
zyGB~?Tr;|>M>4v9FgoYYqi}{d#Bb6=f5@99I<1Ssk5pLU@2K$gR=9%~MQnFo3}JxY
z4*j{R-rbe{tt(xp(%+}_{A~}rgi9sdxb@qUxdM-tklyJTsLJ_dqz-V$E<XkuOmuj_
zToMj@{N7%&{o2UFE!$y2$E;5q8q96}>D=Rg7D;G>24rYv0^9G*A;V9p;AaI0p_reJ
zS>>EanMZ<24ke-vC4@40e0Y@V)@cQ%$DgWU{Q`g<O9=9{F<d@ChG`BF-eHj_y8llQ
z(0-`!#gd()!u^8ScBt@l?pYocZp=N)qr$@6vz?&A0=?V;6#{zWpLB=e!^FLqv}!9k
zcJrP##YaB^{lACNIedtM4|a4|dg!H}rtpETaE~jzuL}Rj3U}~A_#od1;X_hYf6<k`
z#+819!zul5QCj#Q4`%;OCMd**dDv46@j-;(c-C69V7{R=7Q=urE|I&`nTpeBbux^8
zh4qJfod}Y>=Coye&z1hTkU~YeEd5~T%4m*uqfhWj;F;c}s51CO6Z2xO>3eM>3t_N6
zXGN9RqPHok(aNQ!Y@0+VHQ4LCtn4(k_Ia)n3~-kRHJRV~hpMSdpQcJLq*}t%r=ZBt
zdyYe(@WEu;ix#zwim?k7!&jtnH7$u-4n<HXH)h#*doB*8&p(74u+Q^Px{C`^%X&)L
zr@w9p%_WCIjM&lK);5q{E^XzpnA^6$VWRc5#+6+z^K^J+!^lV~DcD9~-v!BXv8v=R
z>Wwi!kKa2#j|C)>j;GHv@rFD?!d5n7OAlh0?KP5vaQo`IP!-xcf#8oH@a}WY!{dB)
zL^{2{N#no~qRcsDmt|Mbu_JgR<X}oOUF-XCkQ}`bvk*@!z>%i)h9XUC3o)c=EqWUe
zlmw7m&6cH1r~dIU3u<SfHm&`piU$}|#e?;QWEE1tTJl4(dP=Bg>BBtQw6C?6-w43)
z%=?<{X8{|LIN;4@uXj>ZP}GbJ-gW|bLnnJDn339{%^Z7_?4d8=*;K+p0j}C=lerk&
z83z>Osj~vy8TenK&Y-^*pw2tD33X<AeiR_!*|8n!aM~-`R19^F4N>PAe<dGfo-u@x
zv%HQBbn;Ddi)OBAZN~NwmN0Yr!%u&w=<^$gKG*FWefr)jL?4-)^XPMy-bEey9IDP7
z&ujnB49rW;TBAP9T9aK#*T(ZaM4l<@`XLYUk_j8`_yM`&yk~fkI>_U+)&fkz+J-8|
zdglZIA#lmsFhjI(1w$E=`glxcF0bx^NMa6i)10crBKAVt?)F&uTEg=Oo^(-xKY#<0
zA&j{vqC5?V!yu=X#8ln|p253l*Lfd;qE{h_S`btFNnYwx`Rv<mXIrSRf|sSTMLA%#
zqS!?CY{}I5W4gyYJa>f_G~)>+v#CgCl2=2?tS2v#nO));f2?pKIII*mc$Wq>J4g1G
z34Hl&Qg9;3F!mq>kTXpYBrA1JDX&PtX*niGQHvw$t1_?PIMYu)4JC5G-nof|cX&T|
zds2Pnx?Mm1qOGta6hlM_P+o>R-9@>v$5p!EK@DvD2)&Uhqd=2P+^1X4o|WYypRe|I
zo;KSVH-j)&=AU0eD^nXgmrD0LD3Z`dl1{Mfn5jIrzvd3-rYod1aI|JYBHQm<xhodv
zg_f)$+i?21Le;oTw%73lN$|ZrTqfIC<S$o9>$FU^$&JCLly%ad!<j!s#P5L{zyOzp
zH>dBJt$(B6B<lm(Gz;_JZR|zSo0(<)^+x-3>V?jI58i${SeMUm9)?>kWq;;l%4CHW
z;2&_|ROlcDmH&%dRW;9~G*$?p2H`x+i9mE`Ja^MgRpI3@E7J9hhSa%oN&$8ex#p$O
zy|s;*W(S+YV-sU-ElUF*4Z+W7cae;>N_r>vwY@aD^w2eL5Z`|enm=Oa$UgmyZ+a62
z)=W^Jr<*JkyU_%au(p%WJ7$hAYCpSU=D4EvlRIXTq_#=k5AKt-2plqqD4!52uzPsc
z-LzlD6TcYRL8jVp*073q$5JKTnPb!|Yx7IMH~Sag1Agwo;eVl{VNI9i)#1!=ww*)F
zui!d2e|jh@vh9Dyk`G8ddsMoa3_;2P(|;i(Ecb}kg0%Sh@#nnd)`EO~SPKfTV}e?^
zF^r$03m&wFYhjumsK|6(L=vy)77CV*^JQ{KWghR5-(=kZ$iXW-7UZpvOZ@!L5?dI{
zf`#$_*!vdnsH&^&2__O-aDsxO#X4%#q=F<N2?-EPfZ!ROU=&a)(h$f%A|Z*%4B(YO
z5@9-yQZLndX=AO{d(jFa6-WdE;tlYIceE<!7`ccR@B;bYckOd#&P;+J?bq*l{^!fX
zaL(D+wbx#I?X}lld+m+*q|=uAIYR*m#2#nUX9yC}L8d*1VXQ>0u$>EzOJKGI&4h#F
z<SOI=#ur4!Nq3~cIC&k*DSCXyI9VX$1k{LoWZS4V$mc>o4XwhMM`ExdecU55WRjpa
zO~aVk4%;W8`wK(8zXUC6&oL$bT0FU*_)j!G1E$rupz;{2uEy7i``pCqcVJSz<r_|_
zd6W6T-ajA#M9RL0%UxV!%pHstZtL=83$pas120L=>h<{1(F<0sEzlBs;KB?Ewj$&W
zXKL?ceUgmOgwN59)Pkn(p@Aa!8^#RM)7=ZffMpBo#MyGZ?P6>&;achW#$+9z{NqKy
z@H>oAdaX{=tY(JA?eqi&>KbHe<aqIG0SUM3g`P>?ifGn|q2f$cW>csVK2vU2In62=
z*Mg*cZp2f+7h{I)M7!zJkljS^_@vK~kTO|>-G`2lB#FhufE2s<j_`eW@(&(4xsJm6
z84Hf(qwt1NeIw2YQ4oC#<H$T4mvyok_5#9I{+a2Jpb%VQ`B0Uqmzm&x4n6>|2*D_w
z4|Lb_$@O@WBB9=BR3FekZDb(8S%@e9g!p*no(GXF2!LlE_>W~kZj0rY{3fwxQMcH3
zxKr2bzu{sjm{7f2Fe{6L=T%Dum!=6AsuhwMIxJZ!zu0gQ*>C=<<o^gggN|UITgcRp
zSHf4gFNyHL<MGXeHJGpjIjrO?TW<s;p1`-uQy3r0`V0p<!5+6JVnO-{gy}H9fwm7j
zM_3b)uT~?5%MsWBZubpU^dG!FQJ+qsTN%jvLH}j~OdR<{K4a9x@!CU3*R<XL1xLBY
zNh04iag;vHQh2rzLsfjt3OkB0DEQG)6UPFU1H99BmTm4BLNv`=LoNsfu_c_y!Dq-q
zE#i<|11*!ZIGHR0A2w!!J9GY1YypZ-(SL&@oW4g*HQ&L`iD2dEoUe>_d}69u!cgWx
zLBc>D0HK?O^t=dc(wcZ40AT~OFC$3OJ%%o<pF^MMe*_rybLbNNMtQ?zGlPi1>I|m=
z1)c!lxEw+dXPr-AFs0_zaMVt8qOk$tv<@TR6=1-XF`0dcV7=;5Yf2IZsaaFfHkK7G
z<}P$=dz1bYu^JA1!0ZhhBwNF4?6t-n5kn|rcldp+@r>==W-MhqX$ybN_Wl{(Z3;cH
z?@{`jBI<9Td!Y*_HO)c3rayJX&9MXvtSA>TU^uQdo`5%-07%6ij!3j%%DZL<r@X?u
zKpg0SrebCZ6fQzgGeQ{kFrVuR7pqCHaG4q0!YA(aBvlc=MT@jRh&CpXIqhhPgNK-K
z98(5XOpZ$REG842Q)b|_hGnb`g5h%#m~YO1P5Fq#*>;tZA8HPia5JxSX+7n}g-wLP
z(m6X9?2VMJ1?!uULU_~ANU-zSdRD5$$RH=#P@iYuoJ1}}2!;rk7F;5gI{!9L(93xe
zI0{`~zpk(zd5JkztM~u2IZs|+k7@E1v>&Vj+!!#j3)%`7f~M)XAV}1RezUM%>3fT{
zpmdnUQa;pIfyU~uU+9={xwDD#G}e910N!_=Z9M_!Kk&w*$CFvZ^a*0^my%4Dw!S{x
zim>F;8o*}CrG<B4{kF))T;l)2{C)|xx;#t*LsyV0Oy=?{mJ*CohMFiAYJ(!orVzz?
zrm&ukj#8I6Lm9;Ba{R|XwRml{>pohw)K+NHosGvJepf>Uw|NeL69o&P1}POah>QMW
zlp5sh?OgvKgC2z{q<=EfZbn+=iGeHnKYhvgM8pf91y1y``@tIMEX~x@aofS_$|%ha
z29LBL&KScorPuung$b%2Klv-g2TztNuV1J?2M%WRkfQTJN=-Fr0_ca`ud<P#^pRWv
z2A2qNg;r2?qfdSSL6ORVIqI-)8Y_cHX?Fk%C`n4>d|HNVrrQUx*iccXo&@K!R;aoy
zbx}t<@qqs<*AK~r$xKD(58Qj=2vv%Za!I}c$<_3f){KVnU@+zbu=ww=)<9;}0t^Oo
z1T1I61Wi4sZ;SBxe|!S^4ds1`AS-1H`arSzKcNoA>YL<kIKVs;Z)mFwJ7M^dDyD;N
zljVOUQ_l*OBoe(3OBdr!6{n!HvmY{lnX?W_elqf%jrC};NrHaqV+jvljDuGNa_|aQ
zmzYcfTXAy!>i1s|(f4D>?vnFYd6G{Z69=Iw^d2LLT4Y=c78RB^{4q>E;2Zi&APP**
zzyzCOva0YzHA$w~NYl4JhRkiOE$|w2GuzQcRMiL_7~K($z=5>j4N$A}ICHDHf$4F?
zN|=-!?v^ezyYyLf6bgyO^Df|7zvoX9N8nH*_JPfH*=l1b_Q%y4E&4s*7QTUYauIH3
zoo}>zRh{?c1fhiC-0!nZVU4hLlpXw`dCzVGx*O*?NiYeI;=_#%y<G6$n;8TV7E>3f
z?q8OGihTH_BoUTiyEASP!uiK{vU&dDwOL-M{7`~9Kl#p5`4^e*Ud0!;@v*LjCF{0G
z#QRCIxKjc?X15*qE~56^yuha)wu;2vDQB?gD}3@1<qj$+Waj!c@?uG+U{RCS2hz?y
z9o>KjwWd=I5akEfay8ZpRln&TnV(Xyj;rbBKaGlmIhrZd>lb|~tgLARiZl6QQSdC!
z&@(UnoCsbou-RIMNDf2dp;L?FV_N)+m=^z4T7hFwom>18qM}-SC*X-`@iD5!JrCXD
zyYH43|6E$^Z2bAL#Qa$W_0LPqaIYBXiPG`;IGTZ0KFUveC4Cg1a8LS|<j3%W>C#cZ
zim)QbY$T~)Wv0IyAI_zBGBsA?Ogk!Pt3NU4b@<2VuE@bw*~^GW>9<R5jhlVFpc_Tq
z%#FHRB=*^RW&Jk|>W3Pw^`A-guXSk7<?Vx@*lC(TR78ONn_y<?O;UiAO90Ya7Ov;P
zztz(~<TcEjQLv{m`EQ8A+;}(gxP!L~`R;uIOLiDH?ne~g>%q280*3gGw2eh*T)1l~
zB$2v}O?WZs!-^~}2PHK970HQQ=fRp_!^CV<V{6IrN6?Z3C?d8cl<TaPoCk2MmW=EX
z(-OdGBq6bq&(d3*OIzSk#-$yznr$M|GN=8+(Luzq+j}^hcB48D+I)Az{siZORN@iq
zff^b+%4sSWKQ&n!TX+epcb-6I=sm+6pr`Z#IDoprYp!a(stYSZo9c77q_i=BW;7*c
zIDC&ZkXjEMAQ6gE$7BGw>xQZnuABNgPmo3XyR>1cu{05|YIG^Z1?&v0i!u_t|HPsI
zOTiYj9Bco5cz|OzCcA_cLpx0V$Az*Lu|*QVL#m#J^cliokwkRLjKX`0nH=%MtoVF0
z3`_Cktw@JB9S>M>L-hu&OD5u{;V5Pfo(mh3xq>C&T=44`%gpn<l&dy+Za3jG^#hQU
zk0WnmYV-1>105TkOUEVmfK?zD?B_zFA!?xdOQ`)g&f$Z>5S?f>zQ#dKxg9D{#9fLw
z94N51L4)Q=J?Mr<h2=-+hv!m&lUtFgAC4qCH8R85<?*e*M{=bnR*DlXsI)6KxxDf&
z7Ws_mRft4uTH^a6V2XcMHVyYRB$-lrBg@9D!8y{V0_YFB0jK~tqrz-IN(Im;WgX5q
z_~Xx`#=$&}ENdL3k@|mZ9Q+!Y?c*R>#wF%rOVJ5`Z7UEa6ss0_Al>lX7+Kc^B%^Iu
z)Wc*gW>~bbpirM}I4lr7EO04=(lJ6{xOyulx5F43E+CCixvYUfqTNXRV_*pVBa+k~
z9xYoXQU*oTcscmMkb%?H0I?4Ysw01FVC3A(zCW^oA)2ghdY}7Pgc=yX{*aUZ*AOMm
zrG_qoKO)VA2F@2ln!vh24(uUZwpg7_u)0DajKbQY=~d{R4QQ0Gdce3k+*#stkc(}W
z=!FgiE&QbUZVifvoh9~S*Tb46KK)r_miPqkVb;z04safNV)%3PfaqX_)_`!S>A;#8
z(!!;4BvX2tgw*x{!NTP5gl#~4h_FKrh({%(Z9q&$V`2tGl^PIlfkY2FAVA?94j9Qe
z)p12Kwwe@g=G`deoGwW3LifRhybq{p#hE9i<mE)dSZk=H*hhm55jxS<)9j<cWb0Z;
z6#=6GWctYyh&4W9h6a%@XPh|)=%@bShz15_p2HGpt|7uB!WoI_rxNR(LV(f2*|5Y2
zvCcQ&DVzoH9wsLeD*-(;wSEacNdn6kR;@McgDm4?wa}2(2(|Lw8F<z<Wf?e*vdR1;
z**K)Olk7wRIxNfl1D|1r9WBfJPEy)v^f9y{hDIMp8ZhmQJ+U-ulVws!p-@FDS%wPn
zzNAd<9)V<wZ4`RXyWn02A<rmgE#wKbbP{d00<+@${m4X{gHUFyG}9mx_K`7dk!E)4
zlkbc$?Z0e+G=mGqM4EXQNGAIzhbR+7gtPI?y<u*CGF~Op&dtjs+`Qc6=DVTqCpU*@
zVrYs!l`ruR_YPje`ytp?KJ6=rTjPneP*%b&KJspR9HVd3EwD#Jd@AEFRq>zgl=wQv
zKgD>9){4vFVtnPA0$3!?+ifqOvrKv51;iP*BLd|O054kpRlL+U9`C3x2M@i}tgojn
zK1;>-Wqg5(w=aOle4uoB??I=>w$9PNybTJKm+@)JdE$AJGe<sFN}JGfxHn3ZLVK37
zp5B<{(C$}W6F64kq315L?H5Jm$GyMIze^JLJk6Frr1H;Y{%--NzUFt<PK8?cYt~jp
z(k^HYA39j<r-)SzIfD&37oX`Z#;?9by~5P65ar^g<%{k{BN_>NmOaKxJ8k%02EOSq
z^nXbZ=56)8QFRHvkv@b%G4S+Oh#i~&jE99F@Ny&DhnT-oAPM!HfIV<)+vo9D1wn5j
zNM9C`=v-TLNJTGY^dAsyUX$y^g4(<$cf5L~rmzgFIJqkKB8(yOr>A$Y-kthREd7lK
z@e!t~V&ggcXE~;Km>REq!?+Et52wOOWUoVoKZVAbl|KoY!L0m`>Xnsqy(yJ%zC%@h
zHqgh)Ygzfp_y`wUY@CZvAmZ9rA{6@o-a)Z9;-UZTCUmmNY>R7*MeMv#&mvoDze+uU
zspmzf?vEA_*+Zb=!Ly@SL*o>ShZ%ZWm?67^y_3UI^o<{bUYYt5nGI5q1-9MB=x<b6
zJJ1GI7BM1)A^IRiJsMq<%7L?h#xAsj6OJYe<qJab*n|b2l^~gKj8|1l2Sc%75lHZ5
z!OJrU((eokf%LUSSE}ec7(EBk;queEDt{ITkvz_nSe=bt5VR6aMQSw)@vUxRz8{Q4
zUBrB=ys?jeN6DXeG4H@2B7t2(c+tl<V|8W}O7qkd3EgrH>QY+msVqP?FvrOoiyB}q
zBH)D_H+lZiEYB(M(L=#7mgvA|m=21K?x;XtjZvViXJQz@*+{HEc(B;rpg4^9&%xx?
z&^;F|jbcKlAOK8g#!GBt&wKH(m{_6&<$bpSIQ9X@`Yj-F<5r09s;J+gP#N&+W6PY4
z7?gRUDpRwS`3wpJxj)1*T`Y4b%Y=<kq`{FvnG7=2Pi6w%vk=BejWLZ8ag6X%*Nr&^
z<ymvU#j1)-Rxv3c1gxJ4v_=gIIA}f?9Z&3L9};OYipC|oj1j0*ntLs0OxRt6M_JVR
z0gT|#tfR@VjfFOHbXnvGACaa!m1gXV?%#`0u%Z~;V`fs!KzSp@{3D8DD#7VVfjhz!
z^D{6#<0E^CHuiN3J6D*2HGm=3qcH~=!Z)I>2seOhK@NJj2Qj`yIX3*?ri#^HrO*=U
z`Pljh_Fu<4Fu`k6^p6&xgMY@+DP~_Wggci_PzS}KGxNZnBox1%`~YhtOY}7AQ4?x7
zg*<R3xGv__t5gkllLuap8f*g_9#XqriXISg#axOu?vgJNUOvNXxM9UcIjSQc@Xsa>
zfT7#<usRqmV}CcMVPgJtLWHq@>+QgMAAIO3zdc9`3mAJj28=NjWU3eA4Q?LMIAOLf
z5xsBx7<}9K2>T{_F$jkx-vz=Br%F)*x`6_W3X=V=%v7~VrJo`n>vS(alFW|@{5T39
zoQ5vMM?`6P;}lGP22CZx72uQ&Rsc5^w6g+4=r01N%-M^@Jl|w(;;c1hz`nyWx|?o$
zud%&X+1~H8y?bo$0o(hnw)giQi<G|<;k1uoeahZJ9ai}l&Cut4S&reM1y(Er+H6}w
za5E81$aod1%Ldbg4By{j6W+2Mqlu4YQMMen<wY#XLd_`3W=nPlE<?rfri4l;LUAqp
ziJs7zrFg~Q5P1ZfuT6B_j~z3@;0dM`)XVJ{4s2P2OyLWpoHw-;tm4g=MWJc=^dM1k
zBRh7Mh8TWx-`;~DE$|NPmqU3jDqYiCwY{6fm^rI2zXNZ<9zgGN8tRfAU&@yaIjH7x
zku;m=kI55SQA|F_lc@raJ|3ioQ_>#-7i3wqgDZR(mngsnu5cL=F&E9>3Y&qUdSVO~
zAdvLXAez0^Bx!*=cuSlXybzQ1eJmY*FFb*C>=WwR%`Cj<i!TdepCG0@p(k)*j<83N
zjx)7sGT{jozyR(E1)5Zeh#ExT!A({DPZ&~wasmp{LLuJ$h+RhFQwzM>Chn#XJ^~gt
zcPvKSx~9AEe-eSjo+T3pq?hn$=GVF=xZdJvBgBX3XI}pkX2s<{V?ZeDB5XX<sOcpc
z7jmh>WYD%{;FbKjihi79DAbgXlG@~dn7lohCY3cQ-}O!M6_ggB&K#>w_Rf<a2@f{m
z5hnI-m?4zC+gbt<49!7=GI!fXbGK^1NBqAeh+IMv1OlWnV(+%3SXJD@$Iul-G!MTE
zX2jm@kM&Z(P9P9yR|af?8a)|{4%m`gooC|J6L=;;%0wBg48y@B2K_d`;t71b{7Ho_
z&nLuM9?_yN0|&Rx3xSi1TM_HKtw~2gb*rlVNKbQ?Wu-oE)sP{jQ&8|$))Bf-J?@x{
zR<t!42;+EMp+ZA0gqrl**5tqk5T11cRI)IC^n^mMBDphl1@kf!6L>;Rudv`i6Z4|8
zn)wLaCoKxxw^3Eflx<BkJg#eNO2q?OfED$4vKYx2xI<78x`Uy}UoVB;w@c;Em%ye)
ze1wE}L1#58B?)0#U<H%V%rFp|BUwX92uObs^)xfxx~6>y20Aggq*I`G;p-MNv`z5C
z5RwJcwKa{vo3E{DBm)p}7oKLWzGfPNZ^E`1L4fTwln0H+eQEM}-$p(H@3skcSjaL2
zfG+u(gI84%iy%f?@YENa;&A+03-vkm4GzS-yZSjCzoEO+*BV7?((l9s=Ls&6k=ei#
z$T&?A!Z{Wmmy!c5Ja~NE4WzipZD6}uB~CL4q5kW7l#u(qzXcajNHM_`L>7p84xksJ
zcJZ49y@<4+S82}B6~wi~OQRY8h>Dkf9=d|PF7b+<5Fb$S(vL$|uvc-t7bZ3g9EVkh
z7io<17#Ld8j6m}B`_WosrYNZM0eO^#(>t7ROx`3%{dPQ;f%Z+r?(G!?lL~%QP%_Dw
zhcXwTk<O;KQKE4i3?k@n{J|EMuQ)WS@dO?u0`$M%Btv)Di*XKe1aj$Vf$`0tYPQHW
zGQl7|S`VEK8911;FmReZg&8*OlZ8VgFI(%0L;rWz|4dF6OgLkmFc)DSU2|z+HRc4;
z6_@@NGBgT$x=p4(8yB`QTrd)gVVk@;q_|X|X%U+fP=hpN@@=@+6>h5Uhpe&uR8mSP
ztKcKWf>Rqvb$iIHSI$NBIVyUD8T~UvgX&2RN5G+fKw)+c{XV~Gkqk`(dN`ItzasKw
z!iQ_sRM*V5*g3R)a_f>q|8R{UuL!*m!=dM6P(^WQ#el2{@ZN(l>MsCN5iIUQJtyP%
z>hfZRop@jeLwCwn@arw}qxtojphSyb+vxqNd7$@G5vRX>trEQ4(6&8>Jxt4L=QH!2
z;WNYUZN~SuG^fLGkm;Fbt@#eY#`waPX16U(i!DvJ2P1X;8tKjROFX6PTnTHW?j4hh
zBU`7kZzQ0}Dc<fK&6HTWh_@ZL5_{R$Z?Us7rd$nmmBh=b;7}jP;ph-$E620r0ig)%
z>^OXfz2Lt}9U!*S_$$=6a=VTAi(}&5G4bg!@wfmps(jdw@f+=iFS>fv@96kM*(Y>t
zRnKty6}<V*mFmP?{USn3u>Z^#tJqsrFiyQUgo6RSmD)e2Q%e6N69!wVOFE~*fmeal
zma2c}RN?kC6AsGrhiH%BpY8D95H9UFVt#tKK8c+i9c$tlEtA+0(XqI-!-Q30lcQrX
zgDIT54i~=s8bNe4eIP7~KA6T%9};h)4~e(Yhs4|HL*i}pA@N~+POdpT`cU)?XNR5W
z8DUZIGL0QxiMPQk@iurR-UhG4+u)V>2)uoN0=)J0q*a3aP?%^j#BNo=pydVOVA8S*
z-srzB+^<$jKW=WYsMJbzY3Ecpfg77Dt#hhAW~z>67G?3%!=dN;g~-rFdsJ|z_L!;d
z?J-l@+he7&wZ}?jYmb%6)*dTWq&*BdRC{>ON~jN%2PTW0EbY0O4HM*{e=a~+tAAD)
znfdr$@2`r9H{nIcsrWH5@!2u?&y9&cf#19rTsZY)i!UOi<73Ppd~S`*30?DdoJi|5
z9(YR4&fl5FJ{~0AHXbD2HXbD2HXbD2HXbBCG9D6t!g$C}>jGYcMZwE7c6cS;2Cu~1
z;FWkAyb^DNSK=e^ru`In=X3!t!lK}18auoaZ-ZCjZSYFG4PJ@2!7K3*c<1~ScvHH7
z7hzHGGL0QxiMPQk@iurR-UhG4+u)V>2)rpj1>W*5;6+#zyi8+<SK@8(O1uqTiMPQk
z@iurRJ_2v~A>fVW7n*<^%`XrZ#V_z3#V;h@#xEq^#xEq^#xEq^#xEp3nqO##%P$II
z_+5S%@FOe=etbv4FYz|`CEf<V#M|JPcpLl@FYu4D_+7pQKSG4x@!1u>8|7U30vPu8
zFvHG~285y*Ho{u{{s_ax_j>=#2*Z{5@iFm5G4WY3@dIMwWu6H)Ag5!i$nl4QA2;xr
z^Nn-mi^A*cn+ZD^_rsA!c!Kj8?8XYtCa+T}U@#VTVL39A(myo}aX8fk6KGqiyw0gI
zI;X<C$LVN81VlD4AM?<L0(8tGJiF{4)hZ91(QBcEZRjO8ZhM!5j^S$aCwMG+@&uhl
zLH7it@lT717kaYAkB*5Kda}jiRFEm`sPd0BzdI&R5c$_!XB3dSMk)WSVXv(8>=x?1
zmkRhT*eSAU-W!vlqJSun@XS)EkZjub>+lVgm7LqAQp3`|55pE>MEmZEImZoW%+AK&
zLFeKQyi9Ms;nU^qy-mJ@vJ%1ZoQ*T6nc=x|p2BWnr-yL9md3poV^s@?-i0?tO9ze}
zD1r$%cov+yn6wmPE_D{XSv8^>S;W{DpZufVMNCoUEV$SgVgK3_2j6hr^&zvRntv>j
zhW5v4@aM%cmvusi;Y_6)KGCu8+^<<3L&XHU9>wc-&XFRDQ3L=tt$dSMG2Th-{3Fz`
zC0Z=+*d+tuSO;6;<wSTh6TxU}0!-*+TV4OtER;(JhTPZV&e9(%5x}#`fcyo$<tk`b
z*hDB_;<2BOVEu7~ndU_4engvR!uMkfm(d|ai{B93iwYu#Avq+=#^iby0xNhJ_F4Wx
zs1C=idQi5YrJ@Y~3W-G3qBAv|xy-St!pjJ1Fv0(fV3Do}V8GhUgn~L>1a(^Y*f&uj
z8pAs)B(pB~Gj(UcgMKQO_)5isKBJ~qe0Bt`h%ZJ}FCaB>YITYJJAgn9c#0I8wHumn
zXjHay8+oZuOn5LJ*jkc$*rwV)1@_b&4W!e+?kJOLq^G_-3B%ixuen)?+UMw?p5Sa(
z!>e)nN#)SPz>_4><fF!Cq}fzRCe<Kf8zS`VD=kII+IG3yQVXnYFE`aL?bV_b2&`^*
ztCzL7nG)NeDS9<YQ|#fm6JT%rTHu)oiE2Okn?&M<4wU&J0q*qU4yq5>@#CZ@_-nL9
zD$kPn(gnTI2*TV$;<>{~RqcW4*zsHu5y!3bUHQ(XZ*f0QoF|SSE60|JLpX)F@aecN
z8Oub31KiG@A$I>=NG8TJM2}?I%yX_>SAbh{LJ2+`F{GCWj96zXT#SMM-gyco28Y3W
z6%@q42l|R08H3#mphd+wAd&%3>V^6|%bjY@Jk!|E!au?HFA)aRbY(b<YZ`<h-SBW&
zB;AN`7&o42U9={BvvcWt1+Aa;UAO_qd=|DjpWS@SyQ7Zzq5ymDfG~#Aa47(H92K?2
z(wP=4hNeAX04990#iqXq^yKt}AW6cggNZ<|!mjHS=7+x{dp`daG2fn;Z`^ell|G^e
zl6w8{R~8lDLwy4aN$uS_Hcu-6GfR^Hk2AW<P{FtfeAuy3aAZ)K9-$zFZ&u-}7!LjV
z<HEn;d=tzFgB<rcl8%E3pP+kxBHy1)iXi(%SMje)zMtZWgOO=>eFRxyFkv;ETwsEO
z@hc}FqF6D0xB)pd<NsqVHiX6d$4)xD0p|wWGW~v#yV<7#Vu`$}nSj~m1#3>wp8~sB
zel}x6SxqluFkv`;Fyn^~^80Q2NU;9pi7Ix78M_Lxc5cowe{gR8F)~NE`D%Vqan1wF
zHg4X*jm_T!6yWBsPYQ16X8fScoJUr$tT3lC+5ed;nNMD9V^QSh7uPYIj10CS=xEc@
z*CX8g8+1byH(L!5=)j|j!!DF{?|MG+_T!=JRnhwK;dcHU=D+ZN0j_IgAyQvj7FKdw
z{C9QxP>b76vR=#^Z`QU$NK^dN&LY2>qFAH>rr1^dCd3Q3-KKv|+5#!BL{jd9JS4Pc
zHa)7{KLEwVg`JVFuU5J1k(;2ITviKKC(*lzv;DHjctV9Z+6dE|!W>QRx0EeCe6T0Z
zBSdgug4n+1W*=S{b0%U|7PIF8WnU)4IVLe}!Ce5ul5uNH!<w|P%d5YlN8P~4M&4eq
z#SIxjM^*=4w(`XT=hY`n#1#og(bMjWr8aw6^0`?3z1;t3%7M~8_<Ir`mh^N+2JQ(a
z?}BeQXXAX<62#pEfmgM_e%>#HYbt^lYVO=ueZLa<R>LiexK{ys@G)BYW_RFyF}?8P
z&cZjbP6&LBl>YbJ!OP%AU~H0qlh(AEn~im7<#~ED<~;aYcplLyh)(lA&1evFU5Yz6
z*QM91_<P~63rmEsH#j(4iQvC5{nPf3BA=qKvdP=~#is|LrwcdE3O1P=Hd1uraXy5u
z^ApLjz?JZtM2zCum(O+QyI@g<lgsC)@DtB8d4idsi_dq7$p}sb>Sfc=Meyi3v@mhj
zFjxfPc)`b<ZCdqV&ZZON9gZ=f{+}@`IqZp$x8Qpf$JB6P<OXyX%(%2b5#qe06|gBD
z9~pW@a*{vUxDTL38c5p}Um0$4#Nrn5&c+)!K7lcADlez8jGimwA5Qy%LnQ!U>1QI0
zKK^#P)7KOQK6mehRm?6=;45vfKT*6a^9TUm-qqW=MTcJb{VI~DUP-wGyVC0jJ7)Mb
zS|}f$)@L9sPKLPgAG&6rCr<YrBX^2jhDOs&m1jJi&IR6X$i(YE<h~kM&2=wWh<udd
z`WDeu-iWsN6mZ3BUG5N2ih@&Eu!{wQN7)MgCrYvv{3)T4f?vR!RWQaNGP7VXVn}e}
zrckQDWz(qD58IsUQ}k_MV6b0=HCmJGU(&4@GA0(`;xx@2>amWlF(`n4F33f*u|f#O
ziSxq3zHt%kH_gW+%B*%5a~A<pkn9ATh!N>O`a^FzgDH@!udxsLWgJO3E{2zImyv7D
z<2+!V@5bDglD=7AMLs~xknE5@pC37(Bushwd{ChHc&BArim_Kp1|Z*o6orW|Lz(<@
zQkYp#$EJni-(<I%RGg^afHHVxPNI;e8`RmqgdzSU4<?PYLX4Mn%6TjXy}x(+#fvB(
z1u$e$Mj{!(d0OBr5LyswnF7td_g}bQ1{@;?GNS(tutJb_E+{?8!)u#zVTJV!s=qMM
zW1ZYXzQ^CnKH^z%4Hp*Ym}fEfUBTh9i*C+mCZbGF0Pz^iiN<I&Qtj)7@q`P7^*g~(
zg*5GW>TBZZa<2_fiN-9@JN+~9a)E$1%5NkX8plUbAQ?^Ihz!IT-%Ij*ki<r#2apMT
zD2@R9<QRit{b0glhVzb^9AkMHPIv3}JlvP@8Keo_0Z81fU-J7+e(%C8i8(e<fpK>`
z-;uYkQHWY}|0Id3V${`Ul*{ObV%6NauDu^$Fh1L}Ia5H4MqX#rZPL3R=}i;O`G%s*
z!}!owg9n*e^x0hhfiF3Et^o-aOtzLwtJ|Ma6|8Pg(2J3bg{QD^m+x2-nwsb?MTwDW
zRg9YGh9L$;%3O!pLGO?6_S=|K#rMOvaXFgEYK&p#3;E*a%b63S^&}Hg?uygTOJp!v
z``w|0EVL7@Zk4^GZF*4q7ql^YOABnEPb7zNHPDVuYQ($J&wx06-Hks3l4*-<dBggf
zm5Iz&3wW&cS7hx4l@8T8x-g1UUj0kV;!75`xMvjM8U}GDX6#Hffop8@{45-jS~{Kb
zheX`#dL^b@^kD|m9|8QMaEv~aqY*v8W>78}2a^U~%|LGDhCMK@K!V6y08K;R-h~S*
zlTV_QNPxm?Z}>7G$C<Z=;78A}7JdkI4qaA{{WS@a9gfN24$A5qQ7@H6lc_99CJ6--
zo_Nr{Xjn$9K(eB;h>L!T-^`_yMUYQ1>=H&Gi^${n<ew)v85|%#iS(p<0%g3Zmy?0g
z7WF{mkty^*WYwTVW1G6D26e)45|CLHI#ZR7h0jH(L-&%=0rgzNgTt1yigGQ~nB0O`
zb8(P_SBA@~VKF}tu?$cCVb##Wr+zI)<8tMUKWpboqH(AGS29ge6Y2MYs+OOGgpi>>
zGou$Wnp!q=O)(<qB@<mkz^DYuXRm53T;`xp@Iohm^>~UN2#XK3a%jPe7Q9K$wo?^^
zv+-O!{95=!!j*rvEgl4X!HO?}6?NWS77tPs1=E`@6a0Pv{6cV6Vlhyo_l7LYQL9r>
zz|6eY>=|GDWOT<;Bv)xw2kT9QA@I)fegrX;bu3yA&URo05cVjTb7TlpzmvlO0fG`B
z;ah6LZ@?3VAW*+yg2jl|w%3`A$oMVFRPSqzKbY@ee8zo|aCR-%OsYgVxQBHT$erK>
z6FPvJsMcjudb71oh)y-_JxLqZ^lU^4I^}Wl%Mx@hnKWdvooc00EYkh?moi@TK39-l
zku>ulN&6uwS#EEMS41Uc9EAv@7)XfV>Y)6M1}}+r`<8?O(8Q``3%dnEFmx+(Cs4*J
zmn%)!SkOB<{GRb_I#sKbOwd6P_$%Rr10EPp$DO6mqo8`Y7H&KP!!uZz7%J`vj87E#
zmUQTbQVhopDmf>#`+i0L%XHBS5f-+M*eAeC!&;YAl!1zn$p3*#j`A_Y#UJ8M4c9#$
zFK}SyB9O9LlKDl=5_`t|&@mBc;_~J@yhm^%#iP^(?GFpTc4O=m126@I2oh>V2JH@7
z`vfwtbTJxz(u@z|SfE(V|DX(*1rAM^hbH@fjYj);DULLl4%3`XPSPp8?PI+dEbglf
z?eioYF8j}hI?CWp6zm@nvUWLJ3ll~068`Mb{yeS{cm=7DyJKT}vu*taio6~L9bSL6
zkG1y~&TvHcR}bjvq`wAr>aTYs?T_>qyhGahYd$Lak^Y)0a2%w+MkA6`JqAzdx0Us%
z9Gy1+0n$%qzd4((1ybAZ4bR7i=)dik*!piPJ`8%bi|)Txc1l<M*Fo$@_1_LSy;lAA
z&(8g~4nZ;fw}K^f>OZcXJ)!AI;wjvINfwqA=`EhX*$|AKjbv4H;7(!WU-~%Eo)+Ll
zjhh?yIcJ~GHKRWrcYin*j>U{KGO?g7J{}~AGQ#t<9}Dy~ebu^RwJB%9Nz118<S_oU
zz~@}&eWc$;MjpV$uWPg*Rxbh0buga&f^+$05M>zZ37*k@yM7Y-1kCxk5^%OtDCC4M
z|1sq&<7_Y|8@+;^h#&f>wKQ5vSs@<d4;P4lcXxxxils#S!sBH{fkyN`lZ$Bo(P%`{
z#MS!!=suCp#vV>_f5#%gx$^t;7C32%hs3-ZQ^P?D)=;n|zsN+<@MWi73BwAA9QtT9
z8<>udDKLTPUajxD_$QPnZ9kg8E<I?0f0Y93SvBDs4=Y4ypuUV&PIN-(Q|N`Em1ul-
z@UAs7Pe&1l;C0e!-Hp=2^uY2D3%_GgI#B=|gLNOHPelb?pv3CDPm75YC2yPBBZ$Ed
zqeV8VfF>9LbcvI&2Kq%@X_SW7R;n7|KZ|h@s<;kfD6@mb(;z&2Z$)T8fa%fXQb?en
z_7&{99w4(GNbS&rpaZ}O4n247aYz<OC`Ecb8O>?GmHO?3UQ0O}^c;eRrQY2mL3w|_
z9R<7#bm@K293@lW2+&63*=q!a(4#X@>61L@B;Fh-#F}GkC4^IH;#W{5qKQTI-(&Zq
z6709}J?BEY>(`$JFgaVCH&^2W%zg_JUcghl7#=BM48Z9oh4F?pIX1<?A`HwwAh}+K
zW}@*A=rd3jwt4PFAddco@W6SG8~Np2xeFQ--aDPTQQaoo=3MX%s&vlBCNgv+E7w3<
z7Kl%7YMJXbj$#DvYYrZLK}cz)D0U_hr&=69gU}CcXNS)vH()p$hYIYJh5?4<7~w6X
zriZ8CuoS2bJBo9lwu~E<o1E;t`2j?&%E#d$J(beX%D7GUewCJo!g<JQtv*f`v<VO5
zXfS17mxfCj+6r@ek6P%>#hN;wmlUm2S5A@#y4L0Q003w*4qrk`=xiE^fo5cf`$?y~
zgf7yJd%&Y3EDRlg{}OckiOd|z%2OR*kB2^QjQCT?%Y6d){Ra3oczVRzA4jFuJv(P|
z5f5R7`%@GU_T7o&1Wi8!JaCs<-2;*V)i@Lci7{v67xKkjD!G9~<~zASWUe?~JymQS
zq@*XMB$7Q!8i<1Kj%2+BuaKugJ$AcuUvoB!3(UYPGJl~5%==$(x#(KxT6g-frP6cZ
z<8cn753+GSOl{+`EDutZs0l27$ovsU71VoKtvVgB_(!0(d&K!45it(E-^};`Jw7LD
z!5+!&i!|HaH^%ws0~22mr{TWxOM=hj8>Il|cv64PV~wrV#*1^T_qE0=co+U}TXW0z
zN|KGy?Lv$}`3~#fO8~?`3Yuqy`CC``pwzO>9mK`H5Hf!wH~vlpXMmJR`SH;U>{Hyq
z@o^lSt@~5Dzz1;AewT$$(XRvFs>SUag^2)XkuFLQy##LH0wTXRQGc=)JTHRzhwng5
zXQ`U7+c=cN&WY%W+z&PFeI}aoJ?k0T{(89mQXlkgvt#d?b_e#WdrrT`%mR%qw-9j=
z3_ax^048vSoxAnGQM_O}@Pa4M;z`}7c*g44neds2tD)hAoc>|rzSmRG6YjtyUJ-2y
zA3U&wW`RG3`E>5P&iU71NZ`6<=<0Ak^lDGsF6aD-WE)VWs{6{#TSUNy+@WEa*x&d7
zqz@k$9xSsH^%nGuCwD*h$P%56ZHRG)F58FIL?Z5(cjvBgHr|Rbb)oP=zIAb-FfLE>
zB!JiT>JcvsQe0U6Wz2=;?ocMKlGe4{@P)!28hpqqB%}d^xUYFH^%|%y-=}H@7{^uh
z&dL{XaSZQ)#sVN35x9s3XH3y0z(>q|zaHKrPBMJ7{caV%oB{&=poQIt>%?JX)_X15
z0xtNr2K6)a>bQns52hn6*gLrhn@5n~-0*N;B-d)iCz*=+_+;Zb4z}g{RB?0LS7NI8
z6AZrf3XHdC`uK_T*!MQj<8Vcf15J87o8%hm1Fj9gWS8E5f{fFN#;?z})i-hVL{hAb
zhoSBNknt;b)0y;3dd0T=vu*#zcVHJYNA!2G{6lLP?(iM0=u?}T@D6rRLd9``J{$3Z
z>*;OD6@Re>D0n&3*x;iY%l;jk{!rh$ok<Bjo|ds9@<eQUrC&U5w8ig=Ct%hZ3f;wW
z4iC&)r#{9<o)<yfq0`p^<2HGm13XF|=_EQ%yXZ&d(GxqBmzZCFptt|zz%J*?A0VB=
ze-0_KVNGIqyoY)ke*=U=6ol6kv{2r1JoHpMJ+Qx}tP{dz8FNH-RQ<>Hptbk0tTa_t
zx+-fP%ld#wiCvEf4vxp{O@IO~2zf1hL-n`tLzF3Xo*u4KPsX5#PHv7@_*-J#zf^TE
z!8_XFWZk8xTj(LZ<;UsmQ1qetH)E95zd+~#;{o_Kz&IhitEfISZso9*`p~`<sxzbd
z6xW_=fo;xI+}Myf@`nywy%guXy@m1+`rcYvalw+ZT0g%qiavy%ZeuG)s8&9!T6ruH
zO?sN$Mf#s3ayjHR(%%A={##mfF@0C%OM2>}k^VB3{tT=96T2w?7c4(hr9WGxA7rK9
z22|PTTir*rCv5J7g@)25Lwu(F3bLVADo&yGPJ;rL(}`U|H%%3T373Yz0m7>HV8So)
z-IaCahc`3)pts}<#sU*^e2b*AZppceGo7jIewgo2b`vtC>|Vr>V8VE0p&EH1SqlDQ
zEo9IH2)no^hZZs}l6WxT{+k3y-jnluay>HHRNcj>5vuNSY`dyX%x1(qQm@CTNB&&o
zr-=jf+)_14NWzb>LW|vuSHKF5Gu7YxqFREbE#f<1<vNXj`l2Gb!pJV7HX|Nu6$+R)
zffjMtl+!0HBA>+cNpU>|4jm-f3bd2Syiv&(n90(tWV6g<!=sW-Hj`mnSd})=Om<>a
zvRpG+O%Ym{L<LeJ!7U18CUf&6NvYAf-$}`0vgpvc?J3qbFxx~Yc#_L)wv>I*PT(yy
zD0s&vHC4%i4MDaLnrUwnwxw)lIn*#-m@k>{6{Lq1=$ihzbOuaNU{9FO?y$@W^HJ3-
zq-K9uov0TI2#RhoyyHc4;%r*YTIQ`G^+#{lgy%!*)B97rFy}}125Ok2gQiS-Nn<$H
z1TV=lAs=n;6#pXQ8xJ1;2;;dN;rhRe`~k-AG51NS*C*<z4Xg)4H6B1l{UIQUoDZ^1
z`-#G`1MnI>EZLoVKpnsoLz12PFEIj`a}PRPC0pBmjP-Ik5H~3R#6@}s2qYnQ?v*8g
z3CWK}m)aEnnw}>^47~ypa7ovIhr|P63IF`98a4J_`5OdbP`W-H;K;H9cfH5yzh-+c
z6r7v-5M`Gst6%5~UMD|~<>mVEm?b-{iiZ=NP{J+q(JHKpAq=Wj@xn=6Es7Jvi(-&G
zR~%<^>yJIcDM8VU|5#25VPaXWpK~5yH|21-=rO$P*PA(zaE=^8Bv$FrOzIyB+WM>#
zEN8WVYjJyPOF^Xfj<b5>Eb9eVrtB2SP|wd(y_YkB&z<@%ppU(|Mf?t8wXDBtz9s78
zFdY)LJ&-ms;hXbeDB%u}5GHtq-$Tu}?!dnEmgi1JmanzPu(pYnw|?rkdBb^NL;8!x
zFbX$efy9JQ_K&BawLi9hoH5Lr-!1zj-2Dudad`9icd#<Ik%`)?1ts2(@W@7Qc=obw
z91ldtSkqYNX$upOdA%DHC;l-)BcYzupU4LBGKaijzgTQYVI|XPm-u!nrn=fCrhY>9
zhq1f<rtSi1k@{qd_^wV{#Bm%w!Gzx~uzPHPr&cgCYu;qfk_T=ip9?0OjilkdWA4f#
zntwmOMQni0AoG<?f)sV2pc%Pr)8=995Ra6>a5y`}YPMXA8oJydz8v6K8^o^y4vvHy
zF@nqu;%h-R%bm;|$~yBA6+NBNhrL1kF)~MO5U*zdO-om?*v=coISN7UMnMqfOma|x
zzw-{UbwykYi{kt?BUiBdJg{ryq~-e8z?XdsHF|^i4kWiGICidHMi{U`Z0Y5o1B2ZC
z!JiNnprCBcU!2cq3}xRR&cW<q3`L;+n<0^Tp;j@V)kXm2o6N|d?80|UG{(J=@Gka;
zhb4!%hqnSTQEfXhrfs(E;pZi7^!D&Vh`Z!v<bPdl52xfXUbctxI13<YpMxabKUlHM
ze}Q>$V78zQ(hZ*?qSN*;^bc&T;xE?ywV*fIn3`bYEtn*+F$HnSMKVj%md*0k)>Twj
zm2m!q1ifdOw-yyqF$-}jr^-Kb3a`Q#MmCgkEMS0KHQb^9Nn{<^0>LyVyak>n@Qng|
zTEq9SnS(SA@sKLsWz(u_XO{X(s!n<j`G?g?WeqRIu?p1SmC}2v((~Y7)+(KQWYz||
zD*f|p5F4!jra7xtuj-2&C$%)JKLuZ~ote@KrOgei^{2^=LOo}|KvD}#5Xk^eU-CsZ
zgs~dl$W6$vl5F!R)V%#5{bCB`glb;f9vlgsGt2=?$ex0@Jddj`Ld&XAI*tz5H04j3
z#Yzd5ru+@7E*5JlZ_i-mcFVzG$Dp)m%Rzz9)Ec@dGotdNOg(hXJnEsZN#HF>GoZCy
zY^k7q55<=J37V}Gj@07WOcSm1kkC(Sp%T$g|1zSPJXUH*ks8i{dYYb45KGPW$#|un
znbL*{A4HEUfR`|Q8|w+*hIkV`XVdqP{Rks412y?}`I5pu!K+%2uPO(k^gSF7fft@5
zF>u}cHYf7nRM6R2ml!5DVMZ#bzb<*Q!*>P^g<v9yeSeDklVLiz&L_45#X^b8GwJ>o
zvy_>%FO#M)X-*_*iX^=SNy)=xkE2kWKl_JEfYk%63q4@z%y-OX3yq`MLh8+Xsd&l*
z3Qda`0NJd=hfNYTwFqiy+IluV2wHLnn@EHt35!l<0M`iKM{=<RHv)+W)?<PnRH?Vb
z1n+>I8&a>23I3}pWLZq`1eN+>1~12d{A=McoSSapydMt2{oOSZybmzrYU&)AM*#2r
z+uXT3oQ>Vt9!@y?YtfzqdGPPlaDG>N573LGeH%%jc;BZrY)^!S^!hKf)7ES08?kTY
z`$%DZOK=Y8v8G2uN1PAJa;yNMxE`8t0|v9RNfUZJHxBmV!89#63%2E1KU~k40Aj>D
z4GjoeGeAULz#&G2;rRusRkzryto1?M-l^X+6cz^9e3E(k1{AT96iKo5DwYRUV0?>f
zzpVU1|5{)~lJQI0exko;=Ab-H%Qefp5asb6Zy*b%ErKlH&4)nL@ev7A@O2pwc|`ac
zBUssn4&K!ejjxxnF$w3(polQOewTh2_=>)Cu;GO`)o;#5+KnrB>Ef0xSd9KFm+Re5
z)q)r@a<AJ;srC255oYo(r6tIvqS1+c21|6n4%826nCEVh#biDD0puuTWnOw{tDiTg
znf1Hz+I9WIr2^Ph03TBSFQe*z_F5w4czgZVvHpGNT5DfWE`I9|3NZDD@_pTRk?&!$
z`!y`tcoBgZg&b}s`#8%D?^J9=n1MVYHTIAWlAXjlRAr?ilbk9Fck&w`DB%Q2eln6v
zujwBS1az1tSr3_jtsZ_x>M4|rPh&g_qjENVfEJ)0NyU0z!M94zdY;5i4sIMla#hbP
z*7K-j5!Pg@XJ1r3A4tXtk$TQo^++n#GdQZA(USb)a6QMep5Ce+axYsw!py>W^Gn79
z7^Y@>l2tvDiuJINNP9lKn(f((<f=Unor`+DQvE{?Y^z5Ya=4zJQcrQDp2?DFg<{*N
zrx@RndS*#I*k&;68P0kJsCxA1_V6>(KMzXAJqR<~bC#+{Qn5WJMAdVwBzJ}Dd20ab
z`5M;S<`xpR*yEE6xL3#>r65))78iF#IHI8wM{3XM@G>j_5FHz3jx&BPwXN%TtIMJb
zrRamj?nPdfBWN`KjLBcV*jVES>uVXmJ_l_}90}FP$7q!=l9O?_^|hW~-{4Ch1AJo}
z`dlgBc!_&s8;$qWqnir+RDIor2a$w{g7-FJJM9Yr*&Z^%Y{X{cShKO4jqS*1EiO8K
zqgEfX&)V>1T9kmDqB@s?qc2m0MhF4t@Djyf)`@el1bz|QX*WVEss8~pM8ljU+}X9-
zm)3A??^AH^B3i_mh1MaK?(aQ@HLOXK9p~`+h^ICo+kTFz3lhyQ9T8F96G#?8Hc4c8
z3ageF-;bheBE@*UC&hFq-r5{-JVZ8+l_rTS)Nl?+stO2Hd@j14ezg%{K3os0u*CR8
zQ;a_a_P&tSBULyX)t#wUHT_skFI1!7+CRLOGg>D_-rvA`WIj}jxqV6{Q_$h!cdR~V
zWnU7+3%|1TEm{zULCA6<kC#EDACVXbT~Z?wsr3@iq~Vy1?-vc<b@TlayFDV;CK%94
zLk(|!2L^K+XpshMmq7RSckTJ%Um54jLVs$8^uhg+_7fuTVx_r^77HWMyY<)RJA6AJ
zLZr2g%op+QSS^H8TVUs?2v@5iN<_~MX3&Q)<U^<`!oLE&pEz-nt+nR<k*XgRYrzua
zf6sTSt$uZXF)^YqPDM|s`_izb4%=Al@Hec*ic0N3>-p#l1Op8J7TDl$I6|dSxK*B5
zfE`Z&c*=0%B%=&viT>AwsWUzf4Ug*(@GM3T9xgm@q#PU`j>O2S>(KTXeSQC)Jx0N$
za_){7i^JVEw0v&h8f$P!s`|aaF^#5iObKRC+TO3-;5|5Rxe?16-@u|^GAu&hazh)P
zrsclkOJ@LM`bZG-tY@Cww|rY+L;=&<tugj;?%?=D_fQ-Y@vkAZ!`K=#0PEK?Ft_69
z#uN5TN`F~!nTAu4wBn;ve*pXq{6||v1v@aFisXa`VmR#6kyucjkL4QX6>tT8H0CGt
zBm`{e3HHr8PaTk~$Goo&NIr@;Pbl*PQX035_5fB-V7Ht?(;q?Cng=8?9yA!D6y<)2
z3)PSok~WOtcp3~0P0Y^m;v!wToh$n#6JJ~0fp@gMdx7S+ab^pZaO+0~-zY7JQE*jK
zdp{ZP$7oG2`qCJ{Y`d5T9-vOua=-R}GHQrB(f__@XjyWi4~`XKlmR=p_Ir>|v=91d
z!0SeLnuiRZ6J&9r%j1UT*-4yH#GJgaFi|fjN918tI7UB@;KR5ksVMFME$OIcNJDm<
zsDrr)R&hsr0&roEQ-@N!e?G>b-9S*C@6(uL6}Tw>oUmZz39TgOf)NO}noPX@5%Z&k
zSDx-0b2j_MpA>zt&vzwH`{BqRZ0*`lH09iwbAIRB&iN&|_kMNIbAC7gz;k{#0qOR0
zeipy5pZ7~K&-*2wNvOEc4s79}KZPK4h!(g$(P~ZvUrpk%S<dgx#QD8Kle5BB4;?Az
z_xj8EJ(oLhwDA<g3!x8KdGaKd{rp~vaTk{Z>(q%p8vUghA0j<ylxOtPvtTIdAtkrA
z^ZPV@55u>T2UfQqYrPz&pUH(InJ%@xImztroq7uwSTM~_0pC+e*QwnsZi5}Ge|!cz
z2M69#xX?uUMM%$u=ROPotLO(KtX=Gh7>=R(nYpJxqc3;h4H~J=`!a{)E3jk>8f6nr
z#L0hAmFQ!`cR2RJx3z;nJ*{;l?M;}hMs^fJEhvgnTh)$&xVVQ!??CL8@oERU$<~5-
z`S^r2>U%Q$*lodtGZ_?&-zdjCk@8GVl|h`wy@SxnC_KxwpW-;g{<0*98jH@RomiQ1
zJia?Bry|k+h9@Ao0l-5I(Ab8uR^dT7_7aXz{Cn<Dzcx;~&ZfNX=n5=%2tO<<;L6*D
zjulR@PMn<c)eR*BT<eQtZH~izC<ZQ{`Mn&dL|-02cTxbI-4|!plUP#vi=rPwHu1L|
zSVE8KTHp>Ak6P$u*{nsgUNIHr!6iS_!&4BHTj2OoK1&9-B&$!t)%H?SP?dy>#93e<
z!~&%SfhFY(StmCIunK@s%n)$>)6kUyHUODyW<WGyGVc>mMKmo`MbzVA7E!|xb^tGh
zLNQBJvqDV5OeiKFug=sZOiARniH~=vViDOm6ATAQ#CJFVIhiF8Dl_Y3m6g#j)Pn<v
z>KQ_6aNk`mnLLny;8mRooV|`Bsl5PFYXj5{X6-dH3ETHRXssoQB{|7TM>QsBhg5;_
zqJXi2@S0$-F=j0Q;aUih)icsq*)AI#X<;}<x)|3Q4G1g}<a*p#K<Za4Lg^s6e2|<L
z6l_uY=mnV55M#kiX-|5@icE@$jPjqZu`oDJpBQ$RZrZ15>qaICfd%UN0%PobBHeUH
zK2~TH9U_SDHI(f~jx?2HWDXFD1qyl_+cWHKL6R1@kOFF)>R4>R!CQS$T}Z)EkwUJi
z+_6X6BHr4`S1}4t!+s?vo2S6$m_Zg+lm<gh+tIJ-FSZGx_Ad5>@_KS_Go<cm2;Ffp
zP=GrNb|4k@UvN*sPQ2-V5V17x@+;%S$^LRkXYj6nK)(vp1nbG=5~2xfHC2D!AE1;4
z7*F#7-ygp+!D#1dE|~ZC3f9=ihf=UHq4$?UI`nTKYZ~jNka}V#YE&O=96Ot!Ft(4u
z^gZBt>zX(+1;y*+mJ9QhgN?7~j$*vxQG*Bel$W9qYd*9g-nbFbVgL7nBg_V#z><x`
z%HKt#N*D&L#Q47#jw)sg>e>bPzek%7&;R|yM1344TKK=GbPoUb<uWtKJpm*iPiV;^
z!WDpn9w7=(XvG8sVAjaTh+)e#bw7&PKs-3!^xY!OCp<QthzBFxX7hv(Xb<N8elZ|I
zfz2#8;xJ#22XtWQM8)#KKq_`mAAyX)4o}5&yZlTQ_KF$ShhZdpr2E(mi(}X?R9JIJ
z<=u`s7{pG1(Jq&&v@ae=X3YhjfZV-cT@XPMpU0WPc5g{0zqtc@4ql?X;_u`M-DfG$
z1{2m#6VyJzS@p-f;u8gOMZ;pk!%f)Fe<1}7M&F<#)W)j1x1>@A%Sf0R$(05`vlhIM
zy^2eZEsyvy1fp*NIG%v9{CrB0p{#F%eB=?G`u?-HtQy6(oA)rp(&0H$VZt-$R+5K#
zQ%*o`=`aCn0ob&HD{Mxi$0DJ+gTeF(xX!vb$9@lkjSf#188zV^uOMp)k#T4`R2-#C
zI=oUL{|uD(AE3kQfFO$whbUS)3HT!&-hUd->GKqLhko?wAhkX!dWIQ2AJIoZYQLgL
zZzDBa;HOB9TCmQf<^g=b`lPZWCADi$k^){9bVZR`oG^%=Mrt#OE|3~EsvxzSu~!dL
z8;&F0`mGp_-0gn_+>p;S&NFH#nlq6sj;^9AE+i9`ZTgugq2^U2JM|M>;O2LeH(S?}
z?9{t4>?@F>o^o<Tp8l03SP$ciJJ4oOFQnerv`;r<4IC5$a>MiWIa0Mpj`-s+FFdpf
zOYr2zT*WfHFOUa`Kl>CSvFkJ^GmYY8`njNWO8iG7J?Ul#d(XU_L_Bvz4P_7f{z31L
z0l{G&;rM!YE}&5Ds(z}W#xH<NvrUH2d~e@vhM<gmG8PCaJ<WE(cw^5Y1Qdc<VwJxU
z`B|Fr59@`8XHDdBj((6-1c(M8hkXU3>=j@yr_>E-5Cw&X1Mpe1fhZ5`!fHn82sY!}
z6YBpjJTSa=>YGm`j+7H6ETHuHz@sNL9xHf1FLx2C2<uu;s0cc-J-+_71#230D4}>b
zq=BhYhY$YVPqaI+5l=PfZg>mIecN1<`c7a2CMCj047IS(!%-4jy-4fi)5g~W>)bV1
z=(I0{et^$<JUyZ5^fM)rS5e>-<5Khj<rQ-+`6fDA--Rtig|l_~8-1uJqBgoPML!Wk
zLq)9BU*Zgf=Km8!jB*)VaGBCKU5Z0J5hH0B9a7`u4i2|1{;<4AZ_!UgZxnH3nk%%m
zZqbe;b9n~*hSei*LkRy)kWw$CD$3cooBa{|1@((?8F676^R8#!+>O2uSLGm9KMxIX
z2T$;X=D<3F{e%u&?Fr1m#^>lH@iyC&<32DJP+f}E{aCDZF2zCT(Mizcz>GxnChm2k
z6PZ;sS4flOrnLpu?i9&V+D4|J0BDc((}l?2!g{ai7D?U%$eNMnj(Y_g)zkqv8(~%s
zkdgi<2@3ReHvj=~P^bbmcEfpR)8Co2VNHtd?4`_m1dy)FoK0dT7aUIKWpZ4JqV^K~
zeaysiU<sq=ug3eJdQ)E#f_{f%<Rx?@4hX?F+FoelECd@xX8(U(;H(}E2Pq?Q%_}l5
z61me28)2Xa(tHaKg5?9(l3NS(+eGUH(Q+6AQG|X4iZ@9E9>8!EutTntQf?Dg9WSG9
zrF72I^08BY6yqc^1XbRz<P3!t_-^6LtEWXGP5Pq!s`J?sqRB`Lj)pq#^J#(JvdfUD
zf5H(Gg;l?K9AQ~+l4?ZXZrMMI@#7X!MDUc`Cs9FAooK71F?qejB{VaxE4#<RFv!8G
zCB|V6R?jvaMepSNJ!|89C0f9MPU8GHZPT}b4Go_h)$nmQ|0M|7Jnyr8?E5&JncbSG
z9hLAiZ24guBmw{3E)ax|S2&xdQlrv^1JQiYnBM-XjC18rlg3rWWK%!K)Q<`H_TNP0
zqr>!~oQf=Be-AA1uanhdF$V%1<A}KawM;PyzAFSJN90Di7#tO217s_Q1tzd~97R&5
z?&d)NvQ;fWzSNl9%y*h);>o|TC(Y@AW%+Rk!4#i`e&-Bun@oj}hW6@xF$u_d4eVF}
zb0N5-plw?{Gr&*`e3G^~{Nf;f7cUXFna}f!lC=z2XiCHhGNu;F)g=lCH4_B~reR|<
z3zEgjk2^uWHAy<^pFkj{UNxPf{rbs}8#pR;Zl=PL8un~u@Rkzr#TTPrpGR_)wlTQ{
zHR(aZ0K>*ftTvS8Yv3bq1RnZg@Cr&c^yp=c2ii3@Gbtv=$tOZW8OBk4W3TY2Hm(|J
zy{|SFhTqp3cbe~*`;2>R;on7BP><JEJoc#i8_RL5pCdTx**Ke2o_7Zxo?yZa@^#8|
zK5&eDVZstDP2eyK+h(3%A59YVOy?u@1ub=p*7{w%oECNd?M)3PI$Fa<bZC6NC)96*
z2TJzT^@xqvoPS&EiQA&(;u!i83>@ct#gBU~=EhpE&;7E$wqBi9Pt<a8TC@tuW}QIo
zJI7+&3tH|bSW*T0+#opyOqWN3CXZ8c<&ign5Bg91!V~Oy3O)ew(F{xdR!e<XYyGwx
zz&%QXwO|o8PZ~CN6s2OQf70FEy04oSx52|&c01>f#V7-u><XC%Ed5F_p@3xuUer=s
zQ5akGiWc{kbK!AZ+BJN8jPsWAc=mDcStvE$9k+?hM^2Ew!#5q?0uy<j6bD2rKhQ$e
zaMV-zxiht2vNN>|Tt4@MS*L+-;F#sgtpI_k_3p=lhH^ic{RKKB>sM;wGYm&it1F?@
zPt9BJ#}X=az$QG5CopM<91-ku625_bg-<kXmviA#@~eh@bDg(*31}hmJWY^x3tar~
zJsuwiJUa;BR|2WC@@r>me*nkf1K|2u`v0LT9a0+Lo8Hre4<IEIE^aoYp1Bx06@d2~
z9u^K4!3C(6`Z{|10fxd0TWXuudZ3%uup<tdT<3yMS!Ucj&V@sz@!tYenuFlj`(5sU
zP8eU{e=qXCj|k;f6Xcfw_8RQR=C=CIFlCK~mUsvmIB(1O9}Uw&iv(nN{r|w3+OwB{
zmvC|fJmMYjgz`>6nBIz0=S28(Q-Rl;Ayo@rDPw0%W1iI2B+LkHYYjTuu-dsmrm(no
zapi(@;p0L;-+?ObW`jb#C%OaQ5%^b09iTV1VIKoFYYpp&`?neL9bR5j^{p2{sXuaa
znG4>M=IoWf5x=eY?ZRmCf2akW%&di;BFVJB#CE{P>E5@r)*apP@PtPF3zFsM-Q5sy
zQ@_I2wBRAs;}%xD+IUomrXHX=+{<2hS^a5yA<%7f@7>B{)%fs)dcR#1?B)*Dvl3=p
zk1>8L?`nu^(ZB~ny)R*#Fo49NtfCWJc}N&jq!#M;I<noW#lhs_C@po9C)C632`yp;
z?$(bncv_8>q70J}jeC4s4YUV0S@e5JYyB8jP{#xRp3rYqbK>V41EiZqsg83@A^r7v
zq?V7KPvc?SEE+w=13#??x*H!`<7N5uY3k3zEme5XB+ohztfCv*X3$lT+K&i)2>Lh+
zWr0%kb9<nqV9)dM39hLzG??X<N8SZ|1U8UEl0AYo8ruXqh3CK)l=GXb;vDYOT{v&9
z*KrI56RtxD`0(}e=yQWS2F&FHEDanC?8|sxP=a|Be_9xB*$Rd`pD4KLKf;8Y+^W0L
ziU6BLw<zN07uzOpqaN8W0saA9GkBnW4g|{jWl|R$fQ9mc3_O8CY{N2<%VEtDwf@OV
z9}j3lix~;U>MtOZwbe6!oPN>I;e_QM0uNksoH=<i85HP?K5>7U_d~}8p?;q+B`3eJ
z5E8K%;bwPI!}p2KhHd}~YmKAY1h#mBaSPIzusIu#Q$V+0f{0nn92EGpz{3Iw)^!Q`
zhd+blx%8B6fC<i6ZswU6G-(;Lj6{}Ep&1>8&ZVE3Wukg#)9EZWn0LiEd!t4}6tfz2
z4a&rlSt?oQKx#)}<925w?-^LZFH~?DEBJ)+c5orng$m=eb@2vc-Q2q7DjA$+V?ztW
zH!(bCi+{TfuNX{bf1A85bbTTyXx*p;3aj95+hEV4YpMG}J&sCmX*G_5fwb`^iw)-8
z4^&0^BM9Ny0)5pnRC1Fd!Y7!t(11s<$Hx`rR}ea2AaD){V<m#HIo(5Fu@|?5COnBx
zm?!Pje}g2(W>9tdcB40ti7n)x107sMPz<>UmP|HcBR^x*c$mkC*PKNCdgO(F9A96G
z;~*nhcPu9Y7QRf=mPPtdh+(*#;UnAmUt9h>Gk*=R{1uc)`gO;JNy0Y>f$uYLIs$>;
zH(5!rUV19ndt<mhsmikF4CY;h-iqLBA_O`MU$-QJejgS3)gN#QTnoBtAFR4epIC|d
zJ)(8W8%H=k0=lfRbQXX+kAWKyIIZ>jL;(1n1zn*37S;Ky(SILbY=`G12(uPE9o+%X
zvo;7U`wiuRsUYx;+yw!506M6fh;}v}-;MM<z=xeV*hIXqEwjWyaIV=1B#zGMIf-vd
zx>+yN@iXWXk#25&l=SwaT!+(w%n7YpsJ~ll{VZNPs{dQgrG+@1?N1?tna47MeX1)_
zOR#4>9}rZ|Ntnhrk;Aq)7p6;<6jr+PEfjyZ5C)v-C8_0U!y5ST{b$1`r!IUSv&frB
z<ghK*t#x-i9HDQ;OuN<wK+{}~d=PL?^dc4&O{kYnqmQQc0Ln*zv!yV7J2s?x2t*J*
zxyA{_--kFvj)HYbFdp_K>3c2+4NGw!b)G&FV~HiTGeNM=?K7ll5Aq>R3-Zls+72cX
zIZP=MSzg@6^vw&B+>le;^Y)XE`0p)fJ6eX>Ud#=~%N(bWpE#XB27U|vKrkPE-d#)G
z1;FNuYatbnS+?~H%#)kjw=B|Dh39i?{?+zwGpBUp5|*jx`Sl(+7J)IlRpuxFeXx3{
zsj~Bnp8i=lxj&aJ3T54acMx_B9{RmBcBO(<$uTc*_BY{r7>weRgb1U^hS{NlgZu&a
zm*{0s;5t`+o!&9gaC3lI{EzWr|3BwZTUW%zm_{3_6j+#J-SEMzWE*^TIeJw$@Bzxv
za<-V3P^1@Huo-_r6yaom&7w~HFZ|T{!gBFN2d^*lN9ywgyK^(Sw-%g@wf8ztYMYTD
z$sxDcV3U3$KIl9Mi0EN-9K}Y|qbtIGcqE<&^yx3%@(DE8VX0hr<W#s7c!UP8{GQ-o
zE%3T0^=o(F19$8GL{BPqj^Kp*{G{-*tj&2N7Jjh1_{ur|7g$Aja<Pcl;ozqYUf-Sb
ze_%Zv2+!jMwpNAd9n`wUt$xl0uy}aqBg{?0?hR;lKS7`tY9PHsqQfY_wz|3}tNQ0S
z*MVSXuZ2{#VNV`fw_b~Q`48SFx}3f4a;f*JkcgYPJ-~hYd}ptR^VNG@v9s4=x=cm*
zwUeE_mQ7agBWs+!>TA?{upR(7dujDZbK4?$FGi<V-_tDb6Poe<z+!oyycq9~wa9yU
z3*P^}QQm7d;{6|6<$capyf^HUcM9jN&ZZ1hRD?bCTd?U0hrN58O?TnN6StjmeoGPj
zXC;9}CVJx5Ql@?df`2QV3W3{zz1i5A*{@+N@(7d$$(M+0*bYnz<3T}`0OMYXjF;-4
z#p3Z28y+vW;c<!$k5}68SZl-MH8wnsu;KBy_0nd+qg;~g>?L@-yIH~okAJq|@zE9u
z7d$>~!{hR;5-xb0AHgF8G2&56#f51%rIH@Rqje`7h3Os}792W(4f@YIzK6aBt5>L6
zJgEr9W(N4J{`7(Gm8RCH5gw)FD@loD?QdHrOXh;qm1idCAYdfNeVA#Oc#DFW$wg2&
zA*{t6x2`DlE8_(4P8{U{boiUw4p{R}nFLO_O-64$TIS$*bU$a!xv;bh?6TY?(r8$Q
z;YAuoJ~YzUyEtb&>@Bg@!2$wf()qVd7(QBS2h<ne#^G4I8;X{TplBHripL$z<6TG(
zV)`zgDc#UuE;m&@-M9*Ym;d<>k@Q04$9RI{F^OAK1IO>)Pe3>xszUZ1#p!|jEU+Cd
z3VD)I(?)k{OHpp??EHVA0A93k%<N;@PZvE!k+y29l-mCteaz8dunnt`0j7$%@7H}C
z(?Qb~6@{)F7I$GN{yc)S0J1#N;S#BhZs*7~(2;iQMrUENARs+sa+UoZ{a=_Y%xl=7
zZQn)jWSewPXjmK8_P~}OtREifa(DZFuBGl~5B1*vJ!D^H)~tmx*P<|ozUl`CqVqig
zUUNZn%I;9O-_RX*Je*2XJ6uJeX$fqOtZz9Mu{RCQ827dFhPX(~M#Q{;m>ltN$9cn(
z`kmI$ny791q9`;B3+D}lBlo~8xb=F~cNZa=kILI`=4Sv_-f&^|wkNJbuSBTf2f4Ug
zchHFNeN4mp-gvY&l((oFFhAz7)P3vp*S`P0qu_bzf!y~YGo$UIB)T9tY%48L^$(Gt
z{h#_?$Rfh{;j-Z|yi?LyjdIXmLVwGIKyhX&G7WsAHr&KW`}`RCTMd%JD){R#?d^k9
z@~zge36_sogva5s#v+g&_SL9dg9?pH)lHt<kDc@HAZ73#oL~_U{sMrUiyIEQds3kt
zd9et4B5^4DD0k|nqTCI$FBIa!iqD_ao*I_N(Z7M$S8-0oX2}-49dn!*b%w^qfyAKv
zXs~wQOfPix>}&XbA)WQkhRV>n0Pe;?XSn)sy`r=0;y`C`aL0w`KM@By+s|Bj2&I#p
zV7!Y2^oF-dM^S6h0KgGNV|(?*-*rJ_6A_4eDX_4GRdDcVk{3*NL7)k2f{+#TZoedH
zfdVx}Ua-0Uy%pWYG@HW&rVDcdeL)@90{Sw5#-dQ>OG00JNMCKfJE=uK9Tm5S__;7l
zUwic)2$e0~qTF?}67=h_I@9l>SQ5%hpNR&0zy%eltkY|7kTI6L2vwN8<{|;e>n?CV
zi@ZE|CwY}w<;Z?5{DKjk|M&NQ82CR7{2vDX4+H<-V!&NBtF*GBY+UK|6vvd(x{9eK
zb-vn)s_8?9l-AUEtI8Zx{dK<TnI)xuU-k6b(z23iRVB!PlvUHiaZ~CXrBmw8t@isI
zSNTh;d==MtOKQBeB|dLeU3IOaWSYOuTUJt6R#P&yyt-nl*HMk4YrWGQrAXm$RObB9
zF$}*R_`MHJ!e8*a4nGfm{qXw|=WEvBcQ1aG_>ID^2Yzp4AuWE_;pf4xAAVnEA}xM5
z;dcdo!|>~Y-}8eIhVrA^SW;9nwYmx|_D}V>s{ECeuBvLEOE6;g_`vEK-$0}(D;-!_
zF=b%Q)xPrTszK+brw^>F@OcN+lun&dI^A10&{thMwR~Vj)znIVnRj5pIBykl)p`e3
zpfJDp+;WS6(p>H_;|ebsQ{ZtGj2&Be$vD^O0=K7dl&jR|I{iOY+37>FvoiX-YAU^@
zbzWDkx2C$*=PGqg@lSX8s$Iobk0Y?>x*Xm)HQuQ{Z<(v2%IEM^yYYXa6;$J^Eg3Mp
zqON3GWp$}9C6zg+cwIo1w?9hqxuyeC(@HDryyv=HDI=@>m1Qnpc}1PeTU%RQ>ze7U
zs{>+~rDA4HZS^c~*?BIPdz!1Rx)WekE^k$NY1LHkOmCGB;9lkT*7@v^raBx)hMKAA
zX{sB>UOjU(+Y#=F|586>Wu|p%Zs&fmpo?@!S+##kr5AlNO1*T^FH@?kEC1KK<v-9b
zBhZd;w>bVw{gOR6`|$dO&_%k1I7g>Qpc>H^dTXn^m9A<2s;NFWTyjk<ts)Bn*^`}M
z5Sas?30$Y;S)u=vTwqG|95CmB3bN5AU?muZK=bKCGP1J{qvehx8*dT5_hUd1B{NHB
zct_V(&m3Qcl-{yS{FpUndM#)j;i<(o?bLdG{@SX#k~(kYG)GC@)Y3{!DrMEa60nyt
zFmQ94DfMAeVPBMh>D1O%lzB@zl+3@Mny+LEIQ;C=+A=f$z}eNcGwQ(hy#q}?I<{h_
z?RAuQ7AClifm7>hrw-)wHt-_Mb(eUj7tZn3mYzGcrp7S~Jh!@b$Pi3EwcgUo%Ic{l
z6;*ZKS{w5vfq5#x1T98u5*@=F<7z9wvcZlqEun?v?2>kNg|FP@nC`9e)?ym>RhN|3
z)|Osv!d+2kl?k5lKRyMaL5kSYvIZXxwH+yckI<W%XH~syvddd3(@7l>RqC2n0U{ev
z2f@T8SaDTjCgPWgo~TYKt(xwIq~e`AV~DGyq{@{tyS!p*xeIgKnb-JJUBmNaq8l)r
z1pu4T(N=-xbIc&jK^bY7U~Bz#6|*20VP5r>)=u~O3YkRWv&+1`(u&F)cB32PRAN*C
z5}%$4BC0I~)u=%o8=uL+Z5A>p{j%E98fze7O0Tw$t?Jqmck$>k8Ku5#h$s-w3dd%a
z@Hb{`N#=CFuP(=Tb&a<YaD-pn7cdF`2vKz2v=SRA3gDRk1^%B3-z9bcFS!u)kIe#<
zSw9ZtU`b^GYQix(wyd<0vFSFbj|jd_;FVPZf9Yvs!IMgB#{q+dhs0@iNm=FC!4bs9
z;LZAs!Z%QnhHQ!^qH42}cX}RC`zkNF(B8zc*(Lag%6&PIhQV1&E8V1tvMFeQyR2|Z
zA)9}2@{#N;q@V;@MwCopADM(RI<~HilCc@Fb%D%~ageGjx`{{BzOlY6pC8h(`+}%u
zW|4ptm1RRD!`LY#)K+j|Ay9yiT>gu}2FSB)$`mvZ=qLnMeiR>40E-y~d`uZz_+vyc
z`XI2TlZ^3C+8>5?kQqu6C$vb?Ro@|&6-(zGVgUb7;MYQAPRZEtKVO#cW%m@%%uHY~
z9G+8BS&6<^-!}RqnzMf#&9*X_zra>oRH-BX9sFBp28#;+i)l+ZJX5treMh$?ny77G
zX=!PkKL({`re&pNrwvZaNy|-7OHWVFNFS7*nVyxNojy1{Cp|YKEh9Z6BV$lTW=2*<
zcE;e0oQ&K-X@k-SWegfLD05KOpzJ||2jvXP%}mQo&&<dil$n{Cm6@G6I5Q_RH!CeG
zJu4$?P*!GER#tY_;H;di-0ZaM^z4l6LD`wvS=reTmvgdn2d52AADl6G(BRC$S%b3&
z4<4K|I5#IPCp{-4XHZUNPF7BK&fuJ!oZMW1n2YLjQFJb{<s!&xlBG_VQkSA8Agn!W
z{Jzuy!(H-i#UDiO>y*i+jxgg=N>goRs(Sa$t@HXy&aJEV*G~18jKxHXwJ9_m>RCN)
z8lqLoBbM)qDyk}GmR5>{H>SGE>liC~6vv3l>M4%%l~5+)oB2MfVye$E+7FS|>nNCA
zTH$jPUwzS(D<K;?##f;%Uj@`G*LceuBTH-Q{19L-xx_ooaj~ghsdW^>`;(*2KjmD7
zMu&H1jqmDnXL@H=*Ir$s9J8u0QRtMRpi+p=fQhfQUdk>d4hW!i<)t-VrP6Ug8B&AH
z5X)_^C8bqmOot4P5(ulMl_Eoz)|Qk(g77-*m^ERs2bsur%&PFt28t)V@k2*Geku4R
z;kWPgA3C<;w;sR6_}ztHJ$~i*72}tKp9{Z4{B~{rp<@evE%-f*-y;0#@vFgaGJeJQ
z<>TkTuLfnaM++ze19L+_6Qb9_v&=ik<(B{RTuj5iBU|rKzu@;(R1UPN!9sSZ%j2z@
z4pCeRr#R&j$sE!;guE(>?g*=Ml~t69wu!40s3yu>Q?VNGc`t($KhjD%#3jHTF?<pE
z_{s`j3G2mr1{#}kM;Dcaapi!tnw?ofm5`?zYCirQIetV9rX(ZO8`dw0>WISm|DICm
zv0v=(c>kpR9iQU&41V9>7k~2p4j+DB;&+j2e@EG=`#Z*+vcKam_+`So<T(7+;CB<k
ztIycqk$k%P`ON2E@cah9O#DC{LxxPPhHUPgGfZi8aFJ2<NEKpfb{vi&XYcR0;;j80
z!;(=he*MnGJARMj*NAXMKV&LGI^YOZLOH4rT@jp`W6i#5w1!_ro%#0PT>ho!rXR-Y
zFH8~;xMU$~TlAViz?u@sgo9cOXfP#j?JR1^R3z!NqR!0n-)(2kpu=b=;R`o5(jFJo
zrJb6N{t^k44MtP0yAI3G|CN^I9+tW&wq0yf1(dnruC&@?Ys-JLhcYq`QxEaK4d<ZT
z!y}~sC2AOanBZX?Doxmbqm!xD%pG(%BM|y@(a&)ts=u+m^z@P<zt1~IW=6MXOm0cZ
z^eTVJ)H!p~GfF(wQ)iUaR999^y}B^NTC-Yt`5f!Z804*@hOq>aR4p`;6rW1C6XKS&
z6SM*nuCxO7tn!Zfu|mVfST!`D$j#~brvRbqNH>{#jPeLIbcP`s1c}d9+OzaQ=G3HB
zSC)BeWo4-H4;~ARa;3LW>D1U1z&N%fhi?}`peoER<R_VxYG-&=Dfxe@{>Q#5XStSi
z<*vl4&$e(vT#V8MOIieGdrGgyW~8jd<y-Z-`kun|IYZMkibPr-Ew>mzBY_nhf`3$x
zfkpcWuG_I_AAA0fts5;YN9k__89&mR^w{QP63}#8d#r$PgAjlQnZcb}WcL4m6#pRC
zf4>V32iK^`hHjhA$igk3hr%y3B2myDDT>1cZLb{f|2BO?OCw|#Aqx_)n*2$VK!L3R
zP-Gk~oeJ&_g0)B^qVNY;(v(KY4$UAtIMKZiby!zSF;e`I;ir@O5h$@kWCzHGoG!=o
zA(TXl$PTUo8XW^m#v$+%1<s((JjR9^dHpZY&*6+U5KC^C?Chu0NAyVf&$JJ!KH+kQ
zV$V#*zX+B)!w>cHlt_aA7<`9=<?PNV?h?KTct<vVgoDUl1eCz`$+Qqk^dH4xrwxhG
zf}m`*evd5;Xg(tRMeWIehX@Zm<o*s+@14MNMDOolt`r^?LEwJ`KA>!{DPe*89%35`
zVA}e*6MO<tq_ha7cK!Arho4(D5nf7x3RB8qwW(5R;iCuGA=-!9GGeO3lwoV?5w*`w
z(ubHo4`*0*&G{6NoyVs@9Kqli45`R;ZEu<s0QnvS5s~k){p&7PMTDj0!(Vf3HO;cO
z^5irAcFvVY$IcEOD&S9jckno${9QS#h8x$kN#Q23?1@7K2?a4WvAwRT=bYmzD0Xu@
zTxs52*3Pl(YOn8Hhoe~8kkMd|+vw0xLSY@Y(SpIt`4v9RKgCrs%>|L$Rpq6A6N?{Y
zmA@m?8$vM;YdU2^VeSMBG<OE3nFcMU$<R;N*H@MLun#{lVviwkUhId}gozkSQw-JJ
zOti4fhgHsLW_wgviG-EVpeM5DJ7masD3zy)iuZrU!a}Ln-9BXtz-2G?fDTVzaAf@;
z`s3JI1rr5R>RdCF?NfDC<<+kLN8bCu$312L|99s7zWF!FbV_MZcF<i#wdo&(Ez&k=
zlQuLc+qGRyHPv>d?QD}0w!1>BinOwHDT*vziXtdmbZt;{DT1OX^1<4o2+DTl&-?d0
z_x+yCWHPB>_p?6VUml&l-gDmfocsUWbI<)V74svq%djGBwhOv`N(opbt2^OJv)3Rz
zo4uss?-f`0tQqs?&hKa@UO1K-!c5%aSY+9u!e!*A=Ge6ut%u60DTC&)$){}AnP*O)
zCp&)3Bz%xr2l-EU9eUIz*<W>3k7{I^q`z#|yvlRgvqF9=ru+a>-ZNL9sR5II_83)6
zWlzfwAnCb1>jnzdCA-Mz={V8Zt~+<$JeJ(pmdEcE&J#5#o1ACSoGQWtrbqX}lH)kO
zjvy@Icx?}r$NpXPY_$LLEi#uK!m=gMl-Iy;MWtv)Y5YW45n)R)J{3{Kvb&6=NTWYz
zs;jD}hEJV3&opSm%zv{gCd!V5|3t%%EfU3VqM{`a8QSv(5#4Sz$lDZk%rW63{{8#R
z;`(6OxpSt@p3Cmu*^ybY5iq(zU9{qmp*cPGQ~n_Q>_c;Um||JSk^TAqs|{gTZcld0
znx<;>!Vgg|4;#{xo!IIJsF#Ng>B%aZrzn0{dYSzvJ=DwSH)A7s>s|jD`zh{3Mr?=b
z5j|2B*zNyObrw}$UCmsKyrDfx;t#z4<^G@Tf4Tp6`(KaNr9JDsY%Z3zOcZBg(=pjA
zbC=3oV6)GeQ|Fggs&ccm@_qM%cykJ7q~1Jk;!KQqv7j)^+!I8z<`{FyjgNB`<<sVu
zGj+qDDxW)J)--P-zyYP%XO)-Dne(3*5Ic?!#U~f@VpvX(+Ol-W(e8N=c0_R`DsFF9
z_>8%15h9PS=m}<Dt>nHejM>S0<1F^|&zmt{jzs-5dXlRQE31-CAXt&CG|uNx8cXbm
zW<nU97k!U;p3#n@W!roFx|ml#W;?uhpl^!13%J7U44W?J8aRB>DR)XmH>|V&e(G6b
z=yuNMG`CoICoQe>JF{ELi(-(R2AhSwJVoqs$!i!BSq`BM3Qv^NX0s~h9Msvh;rcDu
zD-m5aG}BFXUYWKfn-Yw%?xld)DCgs3Cy6Y-itwpn*CKTHUVXRn>}WWysAx(kOIaO`
zOWVoibI*~}kY;oLfP=kb2qVtm{Kv88got}G1nag+#Z9^0nMp->s5i4V=P#$Y^KljL
zqL441eLQbQg^D^)!fK#7zZcuKVP+Sx?HZMH4MEMHBPSd?cG#S1j;_SGTw64r&85gz
zMb3|wpF2e=(@YY}Vq9)@%Sa#38KdG1kS%&R0%SrqPmW`#%BdA+vBECL3|T9m&17l%
z6xUPWMHJ22)N(Tpdi`$dv}qhf43|x<oXUiZ%6BJuZd`6d5Va?5$Y{N|jhC+9hgsmD
z8G{C*f0*hGJ^E;mvueQTl2Z=GwwX6eJZYT8;UJGR^V#z&jUPf{95L}&v=R!d>R=_`
z%^56{y4Zg!D%zZCJ~R20@X6&9=99*!d0v~dg--*Y)qHCCRPib0Q^+TWPc|RLXWOsa
zoJKwkeCqks@u}rg%_omfJ@R6)#-0u_zEsyIcBFXuIF`%4hq1*IMvW|<GUWtTu!fH|
zM}S9qj<_*%MtbLe#o5^d{v#9lPjU}(oy<9|F8<P3%G{?zPB`35`f{TN(vhPhkrU>Y
zO)vW9gzqw9Z;|Nx$nU;jVk=OGoR*k@+^C23d*|SA)5VHdZWwox_}QaJdhpPnaB}5Q
z-nNukXBJI5iea*1>g?%L&MunhE>Gl7oK{{j@8r=_j`Tbb#rbmzDk9U$k1U%$t(={*
zMH4x7A%DZg{aKJ7nZUM(f>SuIadPDm?hd>nS)86K|Fep4!jyt3iIpeq<Nv|@li!?Z
zUcX=dcF}_VFI4XLuHPSE{^KiG^2Y)Wtvt4X<Jw@di1Qy)OeJH^E|G0N!$r^0wWb!G
zUC{G2qRyT0Ka&1kR9nHnGx21_=kwcX{Ik<SKA+F_^B;c(pt|2;|Ho{MxDyxgMur$2
zrhsNu{I7c&IJVP6X9y|u%u{cRH@@Sek9L)Am=lIGuwpP`=Y(fZt(xLGT%)x<8Zrmc
zD~hL9R1PZ(M~xeAc_wz|)!lyoZ+EkF7`~mS5X_l!dgh`3p7wU<XgLgB$w6MXM5dHe
zB;E;_iYddq1Kno&!sG?(Ec}z7F!y3M<9_9Q@42(jkgcDxvy+n(ZlM2Mk-4U&Y~eIM
z>?3=$qcy(1hyH--yGK5gBn7;SDjO<Q=Gpc0BN)^&b)18WJV*|Cvq@Ln0-R2oGcP>#
zOfmFj3SG{jnzC~-?qy;ZVT10R%0Y4s$vH4{^!^<B(X5Jr@|7NkG@J`eH|B+B%%94{
zqjI{;wpq-H@_JE89OA1eKWFN>6|4uz!5yzlaz6}rt}t&wJSnIMPn$Wbyo`9=9=bm%
znc}8n%Kwf-F*-4lM#VwB9;%@4)(_*>XQKA{;8^_1tu?MDh_5WST%!B#J+1uTI`6<b
zOiwP(QELCnLD<xDxTuD;r%L0x{!e(I;tF(^3tpVu>{-Qr?|SV%8Kxq=0Si^?E^EiF
z2U1#9l~XwNc361Gf3)d$pG11Vu%6U!`P{i@&5uZiMtlB~aI`&pdIB#JZm;R#`P^K@
z@WB56aM7u9)f39Z_%CrEO`BU@F6TMrZ&u|{*)8I_LZ+PUt?r@<=5brg^ol`Twe??Y
zKSPK2bcF%qW00izT(8j&@>ab6i;2%2+LIj@XG~|JJ3a356s!T%y6Bq`>R55%bbP_J
z$ds2|C$lQNJtw%spp<o3kKgq?WNfKjM8$3^k@bB~^ZncHi&&fq%lkP;_3%uTwBM1Q
zFPbs5g3H`YdC9@F=#B-?TR(eh#Pvk~0NcrtNA<LklV8zQ;Su73uZXI!e|ydJP`M91
zs>iGEw4<}R_GiW*Jo57~g=E!|ofvW7{(G6|iT!Uz2%}!4&B-4oqM6}T&{@POJR;md
z`ES?Dkw^AK@0ct_)2vV_a&sb^;6xh@IQU@2(gW>ax|>@WCuEoqqx7(2|GQ(tkep#X
zX?M}XxZGytEfG%fZjLlr`fuGkbwm&6+T5=a^=8KJDi*h<cZbR_*;39qEV;<(2zQJZ
zmyEj?T^V;rf+xuH+nD^9eBTJ74Z}l|j3GyzFxOlZ<!$FJkSiBmSGt=_?>;HHV<O>5
zp!?n(2@W@FJiB(m|G#$sh!i7NVtap0$FX1f{KQZ6t>DpIn~}I)XXxH4VRUIvR&BBM
z+>z0{>Vc&ncE<7rqnOF6tUfWdtT5ug^S<Tydq$8{5RKMdrk6<xYf@)3ALY(fZVX_E
z{;%G#z^w)Bd^ER?;JK1hnP!WD=a1ql67TP1b2W>%oEaV<cl>dUi@P7eGck=}k&n8Z
zCCgRFb*Ek70c91HNyCchiP6jQyp=#%JFj3ddp4WdS-DeT_ineSpUm6(;Dul<3^R?5
z31zYy-lWuhGi8uW<IUBs{|Uo4X0nyd;;uoqJ<BbEW{tRji{#FgseI42(|pH$8sD4%
zuiDf>(Mn1@UEsFIu(zb>8ig=<mnKHJSTj9~(ltx|Q6HtLm+v-D?QlOzY6j<<4J>2v
z<dlgPq#IA4cP5iImf^k4EfwN!N(koNn$vkdLkae+OEL35)3Mn)KqF)wSh<a&W6wkQ
zAT=#Ae+n+va=AS+T2x8V=wov}cj)khgmk1gUPH&i4>{sgwvS`wb!0F;Tr{$8=|Ar?
zhI<G-^ImSIlsgCMBxjsk$$W_1de!L_-ykJEGy+Xm-%6j&b8q{MUXAJIdt@{|Z^wz7
z1C!tA!t~IgbOX6eGl7GQPn<g!Z}^;ZqZK<a?1u5S5^%>32NTYT=^M;0b~A-X93%U(
zk3O3H-{#iL@>#R-*^Qtb9es410QO?wYX63={%vjsO(=orl5mAwAH{~ho=;T1Pq}xB
zRk_(^l=7~gGCH>70wS}XBCSDslG|Us3vQ!bxwA^|7?)<1RmqK+IpSTH=Mi2Fxi=pC
zw}+?i)owe=#9aDiG>O?G=QYxf*U`S13_G{)oJ3YBZPTqsp-7kybY0Lnch0n#^XAT(
zb)HNs#OKUtJALYG(_71o0vyC7DjI0ITg&lw!w=46FvawJ+O<)LXc10|$!@mkWhY0K
zyj)i1mB92WSs0qpQ6R}G2U@548FEClexhSag34_^uDg1gJ@wo(rWbUIlkHyL%a7z5
z`GZHY#i`si4hnHwinZD?%q5A{%V&4#>|87+iH4)Q@U;5T^EuEm$17Jj5=pz{r$Q3>
z6G?`@R4&Lj#v9vM!j;?=ZS1!a*Va9yV)_u*#5m*Ja47?XG27x^b}ku&(UrtTaIc&%
zRqE#)UUavYu^-T750`c?;QQEBUcGfIoR-Vz#JRV)5Tj%Oju|U*b9M=Li_fYUc{T&K
zxSx3*HGdWxbUXjaq;}}x-L{d0_S>0PzPbEyDd}92-^?%5=8o}uh}VA-g~SXDR?p<v
z>G#shg~&*5O}P0PGOSaGUFM-v{~sFF&t3V8%<(VTJuM~?hK5YLI$DxxxEL66rK3A{
zVKI5;+_JD@q3{e>wI9zKw7XGJ4B)sD=Z6|NN9mrgHS?2s)7c134BUPeC(+v}D;tJc
zgMh{SeK$Kg&NIoyfrC3mG&nMA+F2+Z*G!wpe9D`Lc`Cx>qZ3E$cKqlTOk?8H`{d^-
zm^e#slp-=kS5Dtm?pk=9M7K}RJ6D;d+%u+g964IL)5|($7tvhDaLn1AZ$LPS{XE0w
z&zU=q3yxU1!(Uy*Ub!QunERf%RW<(iL5ijkcnHG;H+IkChR%wjY2x{rR!~rI#>th#
zW#Pwce&d1`b}8d-<tH>`Gn{ybDu+j=&YD*=p6}*LifD>Vdm1t%E@!)JehK4GuGfEh
z%r5?^A4ma2PgS!+82hBC4AaszRf@`FaYc#Y>0UGCj?iwnXt{kKd%%@sj5Jjyzb;~S
zR?nG!&Xnk>#VJ!O%|-w0H`>Sg>Dt}-@m`!np>tcDMxIMAYjJjh)nKTPQVrL)IE%8C
z+5)cFn{YR@a7QF_5O5>-3b-Mx)casR4vv2XR)Q%vwmAI{R4NBt0iFi7{zR$%buG?h
z+VEsBJY1<q!Q3O1`T%SK{{`k8sZ{zOTb$*&tVx19!QX=65lWqLbBj|}sMJ+pRgqH9
zfqhDp`UKp0BDku>S@jF0GHz*c223Wsw-Wyu=t6Kazf4nldyA7dOQ{-g4Y&^M|0|{D
z-PPihGkuNJw>X=?Irk!;-!xnVZU(Of7qCitJGcpa2ps-vrJe`ZfS-Z=`2mb;?`v_E
zfM>03aa0B0?*|#E?)Wou<}3C4b(9mA!>)Uf@~h&G*oVmH0;Mi|xW#Dz*Q{@Gwu7(y
zrNznqEq5F|(&DVGR%)NWwm1VXWb6hPfwf@6LViTLp~YEw5$OhN7gKMbeKEd=M=7s^
z`2INgI!>v&4J}SH`Dz6F%fR_Wi?ajF1GkfodNA}GrJBJ-#GChIi?i-`@Pow?-&3>?
z<cFVze^2Cr5l}tT;?#j9;0CZ1Z0-vmXlKI*W`lKLG1$C`-yx(OmA*`Tw4a7oXfI&i
zYb{O<?MS`e;=BhofFo!})o-*oTMhu<f{%980G5HZ|I^~EraeVIpq~5i9vlJIfhAxA
zSOzwMRbbACloJ>M>xB+(0qZ`Zoc199;Bc@3ECd_D$zT&00h_@^pxRD*0z=>$Fbp<;
zIpAh658MWpfURID7@}Mv;1;kB+yOQUopMqiQ?FnSSO}JaWne8>4K{!)!DetB82*HK
zz!Gqklqc8(=6y;#+Y3H042GLqoJn9kSS#N@r+<LeU$i*uDSx$t@6w;YB;Wgxo}ILB
zP<~f&9T?)*In(-)9<V>y0Oo<sK7NM^tmDTSw^MK7K&#V#KjPtMzbAuvp;l)(81B>R
zya%f6R%iH+kheF#X9b4)!4Kwu>%e+&o4o%~tCJlj{Qlf)3^pFXy~tqBfykpBl=g3R
zHi4m^@cUP^htL4-P?mN9uBAQX9Ks#Q{fU2It1}Ud4CXhqz$Sh>YYW(LEO#sa7`c<U
zPZ^B-n)HEn6~s$B3ZIQ!FjU1o%V6!fyg!8ZVE=*S8!QCXdE^r;0jt3}aFx6VH-dS;
zp<LuWs174uFbB+A(CRdT;otHbUFg5?!d9pCaMH7g^dCX^iwJ)t;lYwz(tR;>u=x`5
z{ZsOPHTOQ#pKHNtun}AZHi7HFW^fZ&vI02+2oLrFBj5nA7R&?dz=>eqHLcDfumM~P
zHh~+3el6(-Yr)XL(81wg&ULNMB(MZ52kXFEupV3mHh}BEMsO3@1U7-q;0{o&<X&sa
zGX(Yn!(a}W1LlKy;3TjFEC)-$YA^yW2W!FAU>&#~tOpyx25=|X2xcBiykHn?28V;{
zdg=`ffs?^77y)y@MPMGd0xSX7fTdsq7y&nf)!;U;7HkFUz!3FY50+A&P2d7BbOUmv
zp24+X1gsWY?MBK4tOK`!4Ry!?n{Oh0gTX(wI!nO%TZkVlxt;pUAsu(n{sr$O--36;
z4~Fg`e?yRSFXaVB)>02(_yOt<Z2U9)LrM38+$Rp^JWPJUruE1HYagLrz`O?9{V?Kx
zl=cSJJx)5nh7J5KozTHnu;dBk3`fqBqz4Q=MLy;G)07j~1U7@s&moWY-1I!<4d%$b
z<Y3+_gcp33^nvwTs0Z46>Fe^Fccl9b`TaZM1=oOTD{=*!XfL#X^&WDMLf$s`z{m%*
z2eAG_+8G%7nD<AM-%qIzFz0jLgAF@~4{ZF3e1J_mX>atS#;;qQGO(eQbb-pD{f!`>
ziZeuDo}YWN!B7Tgj*dZYCihZ<jlDRZ1ghQJoPo!Z-)zn?fqC4Oy&eqh-{$ND8@R)I
z_;KWKAop75QQlw)*Z`J+&0rN6K8*XX!5pv-ECJVprC=ji4TeU-5B3M^z+A8%ECw6E
zQcw+QbC!T5;0`bXrj6n~m;;7$$QRfQt_Smmv^h<n8rtUc8BMu>C17|M=@C4f{DV11
zlh1tOJ*LfB0ER|$7x);`2W|r+U@KS+hQ^Xkus_%Y<`u$Uz<DgN1Y8M5zyU?XJ03X`
zNJlZ@Pb5EJ85lmEd&dPQawqsn$UTK~U!eMVo3lc`PbOY4uat5)8G0G=Cqkb=zJE^r
zfHQwVe3fm^Mz9Xt4pvuFUXytb=AS|Omk@6m<x@jFfZ^XGXD0c(mihym!LnI|zm9qU
z)dQsCSI7l(&LW>1DNnHaHO`q)&$ZwNunyb;)`Q!@22kZ82kZkjfdfGGI`6>}aE;U>
z*d+D%2I-;P>%mQwcjH#tQ33gVv(1?VmcGroIItdU0M-8>NAMl;Lp{`i6Ty=As7Fx!
zopejRfD6FT``r5uHvfaOc3|TNZO%ICCFdjZ4K{)+sh{fY$OY>@rXI%e{gXDQ9IOYc
z!P-x$2f=3I9Z&i`qaA|v;0mzzbMBZ2bH0GT7&%}e82%^i6jc8r9bhfEP2RWCPmZS?
zz)4_z8~qIox6@w15{LGD0`aJJCl71@7l2K^c4rM3vf7=kV71-uglSiily)Zq)&<&~
z+!FGY+3wVVs&~7y1FYM<-C08W3GLbLYzCXM+nxTjqwwDCP8k>h)hW=yK48f{?al}=
z(y!gA0>k^YI~%};AGJH7Q<1lSyE7SVKA_!ME!dxUz`UQ3-k%ZwLG4ZrSb7NQ1ZxMj
zI|C*m=P>xdhC#>!Lpj7tJB|!(cP4_hha(TH%WZekP9xk=$OF|+Ne37j(e5;W%_B)C
z?Ywa;`KF!M7m_cqx~Sc;e+mCM(hoL|Z+9xeP)WP95v-ov?({ny`ZV~!&~)MhOU`U}
zwh7LHZwhiF?M^Kin%C|$f+ZE?yOjK#OL>8H=e0Ylz^31{JMV$D3)`LisqigocNT%6
z#pDkxxtR3O|00*RJL|#l(srj6tX|&k%$!DhxrTIu&DXa(o9LH0w^D!8$@iU<9~il(
z-Dv`I?rnF5pGiLNZ+EJ}#&x7i@Nw#K26CSyKVWDh@qpD&6P|uux~bhM{}p`Cl3uX!
zIqCz<dl8&X`oYi~;s>k1oR?@fU=x@(7y8TX&Pq^iA%79#d7bZI>DG2<JJ|SUyEEz6
zU=!&9>;KWtxmC*J6WS?Qm*zMdD)_#e<Aln25B3Ke!CbHjEC!pwQc&&gIF(>MxCE^3
z<2Wn9@E(p+4>p1ugubWaYynI9I?i@^pY1qG@&on(Yxi=T0bt(Vjxz#m21~%YeH^C@
zEa~SsRbXge$Eg7u!8)*ZKgU@MMt<Zt8^Lgx^im#?{=_5Ya<Jp<q&(^mahx3T-vIWP
z{2l5z5wLoQ<K$KH9h?Z(fiuAdZ~@o^)`HF8Dll>c<uV^Wus>J}=7M>-@PX=R$5{b3
zf$D7dz&>E!PYDOsfs=&}RtbHC<180CxDHguIL=nE39PE({YcVx4*3Tsf~DY0Ff__>
zR)f`GBUqR3IE91{7dTG#xwH##AeaZ{fhFKXuoRpLM!*GNHCPMQf~&v=a2?nPZUURY
zCa@XY0je>MlXf2QgZ;oTm;>g3`CuM62`mB2!D?^`*a)r$!(*u@FamA`Yr!306FA^E
z!~>21RU!2chQKl~3|4_TU=5fD)`2D9TCfgm1RKHapeiDr3kVPP2TQ>`FmwXx{4L)n
zzz;^iwO}>45v&Ebg7siC*Z|tULoS#NHh}{{6uy%OhQWzo4mcC6ok%@Y^S+dPU4Z<l
zw0r2);C6Wrs`E)N*avI?2Y~7f@(oslGr`a_`Vm+MHp=%h+A-JwZb5$aO!$zWH=ptW
ztIwwXz=kT)wUBgz6TzHwY3E?w0^+@hd|gQT!0<(mvmPu3w}EOg=~;{%Fb~YTgmM8(
z!POU%f3OM6yOeT3UIZKn)`IVW;brK9MZ{A}zX#>7j|<9o-bu<=74q$qnVMGX!?y;#
zAD`Um_&|2!As8AH%pTV(a86paI&SZy4;j|~0AUsWd_JY;;41*5e?p(kXD0MK;rISR
zAYD}D;j__kv?V-`fEG=Ugj>m{Fo9MNZ6dVZp!|jTY~Uk4pkh#s35II?CkMkbGlHRl
zVA{w4^#XYtpBlm(D`EJP{I&8~2kk5&czlJW!IG1MITNGYlDGbphr+k?KwP-IvS8uZ
zU`bJM(%4|>*x<~u!N}NP)!1OQZ)|XhuP|8an-nL-O+z_ys*n?(?`mk33AE+VA_=tB
z(8?2N>!Hm|pfy4(OQ5|6trS{N($Ali$4+RI6KI)!e259OFtmvYwBgW75@?0ciW6v)
zp%uo_Fb!1%+7A4hs6nsH#|Fa_g9Gw|Ir+id{9xXgU}1i+BtJMQKUkU{oS7et<Oiz?
zDAyvwMQcT=fv~Sf)&P8?KkJY+J~&{a5i};4*ExKcFxv?~iSQ#k!V~wbm`o{`kcFR?
zFzFJ8zidAJq1Dn(_7{R_C#MC|ymxuLTh-0G$-I;Hm5i^7cbmF-x14uP-Mp*k-HvYF
zZQ@-T<((|PZM^H(%{$vh2X*r<%)9&~@1(3o@NOn`xu=v1e?lvPRtYUyenKmQ7Kx*Y
zUXefFc_qb0>!e<{|1-Fyqq=7}@A~s8hG(QtDf=0W1j>D>e^fBL#u^z6FSf@8n|*#K
zBN!eT%pMgCjS8j}1w`&-KEp4<PYl|BgpY8G?HX$-b)O4u0UxQa>*f6=uG|`belUBn
z$X#grf0q%=9%%#?2axSPy1H;69aqY?1YWltRAdCx#s((P8YJ#w-sg7nUh-7Vd+Dz=
z!e0;!E%p<^LMuNQp5aD1GMH8nDDQY4hwc)-Rq%;#yDQ&W9bt>&_(s832H#A|F8fvV
zo0Q=nF}x_l%l$>c>}A%dV0fur5FAjWM+S2ir;H5dE=;BK`D-(z6Veq25Q5Tni?%pW
z37wE5JJU&^=v>h`;VmuBFC?C0dKAxf$Sdk1Z>%d%bWR;>AZ2ti`?GSTNX6w+_g=b3
zNxDg;m+mpaI-h^GoA87Ljt+!LpXi+2g{%$nY(I)T7#5~3PN~sL?Pb>GexWZX%zE;G
znWpSN5vEi9AYEDl^)ogQEA!Emv+y>;t5|c;lK<JAcu_E3gMxQ-;MW~;3gFF$Et7t^
z8Q#Bw_Tlgv{X)C&+T++@j=#Ze=5A9g2w3;{;|Zezl7@-!7r*JX>(9>e<a~3xE{3-j
z-g<cdAnAY5<?T}5@$H22-jK9G6b7;-EutLXW(@*R#lg@*|2ZA{GQzvzygSu<x7hDz
zlxBS5dkt8{yTelWV%lm#oyVskVx6x$(ohiBSjPqY=Ot^b^asQ)Ko{lzopl@HvfpLW
zmJm;uK2c(Aj>R)3E}o)*RZc}D=@w#h<Pq04;wof~?j$MK-?(ut^%sf`qt7q4X{-x1
zCWGHf^JN4Fpnp1)nTRktMRd;E>J~@Zid~Ehf-lsrMiV_GIw$`@)^})w_V8$ah$FtQ
zm~vxyU6?vHSnIbkx=*$WBeM>fi-_|=i8EE=to6tD7tu+%8LTg|?j-uu?JKl3qsvb3
zXyc<3bdvD3B6|mXmq~i>n`O%1)9a>8L<wC^^F~@!ts5OUoHkL+dX(X-?HQl6?Q-~5
zeb(Z<Ed6V?%h#oC$8^cqV1v(pMN(Z-6cBmqkvIHHPhS4_lPBeonTq|&dRdl~$38vF
zm+<AoxAJR`Pn?ak<8LZQ8H<}~Se30Y8+0JFT4Y6_8Qu3NvVJao);}iWaT!%dGRBSM
zf4-o!Bzyzm8wg)2Y5s#7ca4l?i)q&-{=3}<z>tiB>@RT=zPX*XSm_7n65i-mf?B<3
z_pVlzaM@|tORCj*M#7yH7tY`A1}hAh_L)bxQh%#+8a(!7!cCR*&XyF8W6VyI@{zG)
z4SnxE2{VE)Mz6;7Ayw|5<hC<cuQF)L7`_s|jNh@IDtuY+1#=eKi>(^6YTDs?-uI!u
z8M$B1=&{@#@ICfhPwtcOCCKfQj&6#|y{qSPi{U#jD);=Za?5$YoORZLQl>Maa>sUz
zFVf!_(WJk*<&Yn6Z6x?=DC0YY?-=+hl8nLe`Z&*DPq!(0oI(9btb|Qy%4g}r&tXbE
zA55DNK>ZLrVSFC-_>-HSeeU_fU|zg_DW$CYqhHort<F~x|D$Irb&|wiAbJgDztlE1
zKz#fq{_VzeNr+)|G=`G+7+m{u32{u)tq!?U_Eh39?Y>gl(RkDBu}2DdS5LS)+L9yf
z{TRZbnU;w?vQ+G-8k<?iVm*cmO+kJ5t&>o%l2P<|hXqB;b?2(B$a?)ZEzThlX9!tv
zEKFIfYwV@gGUUtnxx5$l_`@yE5j?wn<#aQC=J0g{f+q6raY@UT>0bLo-!HZ%2J8J+
z0g-z(IY#0Xc@Z9Jqj5e04*{h~OWwq2-(a^ow^7cj<QwX^)C$vKlhot5Knd(yh-W}b
ztCK2eJKas&QmHe~W;f#<(>nhh38USZfauE3U`jClhyS++}My-oD@0j|vWvPb4(
zv0)dcFjKU4CwYlQUtsK`O5|+HXmw_#BIk=TuUtoGrHs1VDq3bQ)io)F3sbEN{ewdO
zMHx6j#s>WN4RX^^53ky-)p=cb)4E9m3PtKaVqIl*O+$WQ5h8XXXT@Hvj(>0DSgxF1
zjz!Jd8e<1{ij9DYz9eN-ur_5(a79XCu#W7mhE~t>x|9jQ2A((aX-xTL5H6cEB&jB>
z3b7TdvuKOMTAizD9QMJa!{}qmV$9{16vgp1DHO%|{y|}@w{MWLW~WGj6b7XIt%rBf
z_*Q3-)ba6d9%}q2N&EX1Q!VQzY9Kq`%%4pA6P{*xs*i7Vej_}GCGgN;K8f+jlt-4V
zRsSIMHKEn{Ncw+Pl&3`EIXzfyTQg%kCLS3-HsNPlaw73aTQ>UsH_{~r6Ex}yVr`bR
zAiIgZQg2IrK0#YZumO#3VRqupFHQ?K`>i`ul9p(Zq&LiGGcr%DYH|L?v;8tMVT#u{
zBcPY>PXD;J#W|F6kT{pgTx_X@qoT$xly-bVFmG0{&_98ZG2#mj7#F8NWNgYsFYKJ#
z>ik6FIFC4}w}t*vqZ`7|HX%PP!rly?-TX8CNbrL-6Q)05P($V~#Q%05@&JwEFn?11
z+0gnyV;R+JAJMsLRfm0-k5bPbZ3b{N_b-O0FiwuhE`^p4jp4-n@l{pwUmi5$Hz*8-
z#*qUFTjTL-H>j~S>OB7CLNj%WZ*Nv`0P42fXp97(P$S{jB3s<xZupYmfKuZVniR|{
z?Bqj|IMp8HpZz<1<(a=QpFYq+(99eq8t1%NBa$$=gefM>@jScnOzc$F5=QoXRS~AB
zBTPw`Fx7-vMVK)iVUF(<M$)*7FpY$X_j#^^wiQ}&2VYm8=T^d`&1>!CZ)}EUC(vxE
zhy+?Tv{vLt>q62o5ZcZJS{}3=3ABmOniFU<p>0o~Er7NyfmRFcy#(4SXiW*Ub<iaL
z@#)(HZA${J3EF08dr3L*C;5;)b0RO=-i4M%0}y$1WAi7pe$Yf-JS_*>suNlrx?)GT
zd}s{`;U+;_pFk^zwl0At`|{Q%(3V51PoS-awkCnL9@^>zS|c=RPw{Df4_aLUZ6~yq
z3A9YcgB1z1Ftp_fwBgWd6KI9dY7%IZp)HA{g(<NJv`WT;ky3i5Z*}#-E+<SKVWK)i
z_*O$(xeGoCvxzVpzadOBVRjHEpYU${XxTU)u*m5{V+m8oyGmm&VTuV8P2)iLilG%k
zo56E5Zv#$^>YmR2XbHc7@Jk3kKQ4UEFM_!^uV<LyJ5INeY_&$@quciH2p{u9Z{*$N
zZr;7eyGS?h6t>8sB=2&GqaW{PlD?NDK5SlZeu@3<`7UJgFg3w&8}It?-b2r1GJfx%
zgW50svPV9=Qy5|~qcG{>osZ)KVWJ;SpKbWH=fk@hc?mwe;e^>ldbcq?*6{4wG|@4u
zd;hA`gXkO?kLq}~Cm`FjOJh^Ze0Fk&MT7k$dO`NM))Rg|A7A7<pUudS{(i97I7hfL
zyz-)fi@&<j^5fc#IW|Tcn>mwyxR!mRUkhKh%NK7GCitVq2AeIbIJy5u+`cNzrxcl`
z>_J_}v)we=<fp?oQ0+4|&?3UL66RX5t6y-#>@r?#t`*7SHTD<Qh}we8!t<~#C>Q&h
zPBOcjvoRA$;%_bt<R`VZDE|?}lSgNeJ+f^Q&*5%7@pA&a*NgEFN5qlD!cuG&hW|qN
ztKqL-;PLnQcKlM;4e&>P*XrCV@qhY@&gDy8cjv>}3E#H!J-$sn=gXo0mR{iT-PUuy
znec77(BoU&bG|zGN)~y1(|XRg1-^~!IX2~z*K@uQ{Wgz1%7$;>p7Y7x|8+||zOPT~
zaXnPSmvgDd_iE4i*1@;>GLLU<&-t3+3orHfe&2Jx0sAl>FZ1~3^o*~B_E`#FsMgg_
z_V`YGX6?+Ihv$rQyGMtsH9z2A;F3E0(6Yo~{9*&b*hk0|zlEz?e~w?!xZ8}**-V_<
zd4CeG?GJzM`BGv&Im~iZ9!6J|w>Z;zR`g4mx`_S__2ct<_6qZCKLBrR?M0S9SbGwW
zhFKglbAmj=HxT|!34cA|Jv+kqA9MWEI`lP~sXt*NgpXW-{Vcj}aa6vj_9;>KqmjFU
za5MkV>f9xAmvxbQVuxIu6C!sb;kOY!!zTP$@p5s2N_z1hN&cD%*T|muzl+>+yU4}*
z@Mg@O-0;4PN!PYI1u`a_5ifU?D;FP~#9v6bbt_w)fW%)G3s>sD&kaZIN&FGQ)v;H6
zr^H_#jo<idrF_pL{vtDDKR#gQ-gShJ{E<D^qTlkOawlY^<S(<BsV>z1(RfKp#VH`m
zSjMNY8J>RFRW}RI-ccUczf!QwE?BAyYEtqSruqwEEfm(#0e{sX({@7pDRsfGIC~@M
z`+O3HLrRFbv%e-~sa|GZZcT(foG``gbH7{Uzd{&O=Ozzk-4?rHKBFrH*Nr(o;NQV_
zCfM%xf&j);%*!e6B{|xlYLMFp8;5N>%Dxb}QD4Z|8f&3FZ&0XQ(mK+tEw3Z2k93L+
zV4v=#(<Ys?D=FtKychegT;7lAltytk7j@J#<5)If(tgC5wOO9L{RofCK|IYOt3U5D
zZ*Fz&5m_0$GwCYMN*OKb@)ryW&2tmRdkH_0@GA*_jfCI!v(9}Y&;N_~xF!8N4@CZ|
zR%Zmy@|VwN1u}A>wMaktvn!*^xa#@ZSOWYdelCXEcsFN3K;$$br}<V-&K=)PP9AbZ
z=gg!{?JoV`)~Fn|A$fjR<NL#~@>>t#;^~wSW0TPBjLw-ov&A`@XS)I!1dn=K^Pts)
zDQoQ1fW+~OUBz)vVjT2tS2s2yV{|l*gSy7i%=@O>Tb%=>pZ1Ez5%V9LeB`rb$of%Y
z99Yw?eK~x8?AyCqox`R5_i<%-Hl5MwqRpOgJ3yy(vvT-0-`(mQCw!lt>a~mbb&wMO
zbMYO=^Lej=Z)<(4GeyefX_s%A_;TaRzN0Nz2b1tb$17UOZCB06YQDGCnI^JskIIVW
z-?P_@54Y5skVMpI<9p0K0Ke4zt<EhXrzV;g<9j?^);0IXKJ_%I8MktXzZ{<BoMRE+
zx?SFtN5^<feF{$<Jk6YW2?|efC!T5I<6KG#y?BhxEj*jy*~Ix6(~brv@HF|X^J6?l
z_XtlKeQ@<(JRfIfl*jXN!t<*bk10#x839k(Up=1fr+D?bYn})^Vb10pEBV<J<uT<p
zS5}|zip68}c?t2Xf~Oz4Y$m>2dyUHzKR-zDk)9B&wyZ6I?yPp=N2chUQ`k>v)&a+m
zUzp;3q_Q4Si&9?uSgSLZ_9{C5a=f8k*Rs9+Dkc6uB#n_;5m{T2wT`n$+eFqkuWjT|
z3`v~k?%KxM{>-mAzx0&&jIZpbJQz(qpYgaL^>>~KOg5pF<nT%RF=NK2R%fo%%ero4
znl&zO?Y`ReuO~^QX&)kUGBS5O$2lwunHP4FnJ@=4GV_^6TO;WMNo01kg?eN*zkuy0
zGEeFx)9fRPt;NOCnU!27BcE$8*f_B_ZEkhm)yO;$nbC2##9f0hOAsS`gF@DEKG)Fj
zPaou!TOPdCuj6|aeLAq4^ib|zU5u5w_&RxAbfjlFG8eqXIWWm%C@wwTnn+Pi%6Jw<
z_LUT&rLhS{1;&w}T+-ACZ|EIt6Uj>wUUxlZFV<81&%1-RKTq-^bIHtu@W1?{)ww}<
ze-sz*@p0v_pR7vVBdqy>(+8<M_|_c?@1hoGZ?B&uTT3`2SXjugaA)V01eo~?@sTr#
z8?R~W@L>r}&LeJsmMMYw6Plb^ZGc7+&7aV6p{<7&??Wwywl0BI3T<rytrA*2w7ufw
zErHf=d#m%~INC~Rxe2s-XcH4?8=yrJXj`DwK#QhX?5!&F!4hbFdA6OCG1h{i6TG#D
zw8OfFUmYL*rC9hd{JDhBBoCrzTw4_1K5nMyLX(uyWWvlOzMzERPiPTnWeK!J&`J|%
za{hC291ZoO)<ByC?N|{OwSN~{;y{g#M@G+XCY+Q<en+@*oqU`*(03AM<qFPYb%Zgt
zG_$hP7~HU#*_BlgnBO5+(ms$zy9FL;^YSM&X~<F^(X|Ol=R|0dpA#eyq87{k%7qpS
z8|Ox6a%P|A0>VjsoGNYD_!rGuvGJF&roK=YVl@{e%t%KDeg*|y7B_9}N-?(mR+0a4
ztMf$Xbc|=_Io8djJ4XxiK5a1h`()Q;Hjprr2-6_)NUyhVlrmdr%S4KuPq7II+EdD@
z6rT6s`L;e(DaV!YF8OrV^(}R<fpFDVw>h`-EPus(wm_?a_cO0-yNunjaV4M4)z-Di
z2R~zfio5|dnh~F|b}b!>ynFIuV~WwYM*p9{9x7`eJL#BW)^|l-74llYY<12+-^R&H
z7*D)$9?SSw$;T68=Zn0J$m{nnY|PH-OD1n@u+e9I(d`PkyLVR3;cvaF&Do9ev)`ED
z*&^7i-R+$%@vluf+K=nrml-DAne@Zu)Ln#U`yAq#8w{7>Svj7y)ih|5w^c)!KM<xj
zc7pVKZ%+k_nH_rwbF6_>V5c1v%&&_RNR*cNVRj%-^lt&rs?uG5Mb)Xyq4+bPPmnfa
z#$(fOW$wC?{(`ooznl;-<JJb!oyob@&k1TjPEU`ei~g{f{!rqdK!51yrJgUM6~4{z
z)d}AXE+0ElWL>|DuU*E+I?|NCvd!#6a&;bimt>z|p8v66VNtN;lwje=U>;k!Vm=h%
zle>-9U*G23D)EdZp7{KP+2Rq*hC+UdWDy%B#bcPTp<%?wdG3)sN8^iM9|}PpPM8Q`
z;@8j%p_N0E<|ls=ZZfo)(4uQZVQ3L(WxJ41Gge{xZarzgjA#4u<Gpl{OerJTqwZ-h
zcGwyFMe@55xtTY##jPco@XQUp@MDdKryy`*)Fa~BE;Yn85xy6soey{8TIy$x?d{oN
zN0VipN!toj?(WRe#3}7^1QGANvCVl?+6(m`KR+Dj`ryr;m=ga>=9tWh!AT>7B~(?S
z+klx@7!lcIenks9Sklr!3`WO#eyRq`_)JC+=Bv}VXN2(f-Gm{1;xv)*eG|mhlr8QD
zQ{Pl6<%L|nZsXmWB=5xkQH#vbA35_Z@lHc#bZm{5_xbK<VRkf*<e5DWJR4m$5dQqb
z@h9KZW_&Ng>-A4Fj`?Nj$t^cCCq=qa^uIjPaxsm?_Cbm1)0n=B*{@?{fBUZ1#!8U4
zCuJhzjs47c%B1uDc8Rl+Fat^B9TMhl!o<>P{0C8=u%G<Nnn_2$9T!;8S+5`m85@vS
zgS<XcW@o$d%pOqbo7F!5@f}50VD`<<JOcar<~HXocoW--@qZX!f`1vYQ*Ism=Eel@
zx_0vHFnbOmVjc1_SGAeFFe0*y&m?rk0BEoBY`2Uv@x+gN32SMhLvR4g46TRf9hg^2
z84o`ay+k}MJa=1;Y})Unj@c<Bysf0M>`!gZpLn*XxM}Qk{zNpB+gA(Y&!0$pmpd)C
zz&o2~i8r1P$9)G|To*U<;P>DQ-O}c~=JCb%)3~x<oKW^B1g3T_dPzeLnOKFa>yxCx
z{~Hp4R;6#Se=??LWNeuUU*@fC&Y8j&)5X;Fbho7#Rg6yIb(n}s2)jV$V#gJEWlWp!
zbP}~C`{wif?8)g=#>|2xuFb?%d%LHX;^WHkcZ!T|uhdcW-!*N{#qt~rOAqif40jzP
zN(k0K`s(13vm5qpg<aBT`V#50&cbNvlt4B*vO3X;gzn2m<`U8|`R+D{WfU(RCz|nJ
z^0SI}rM&w<-kEy$#!J!dPeq;g?tHS5u>I@X+%=ZtrHvT<iJi94{;gZmR8K(wOWSpf
zBa$4bG#sRp;X9E2EOB^yZDab#*oz&0mwzU;?id-QjZ8#V-aTz*y`xKd|3V^=Pa@b_
z9!msOrfT5Z3Eyvc7QW8@P3l}MkwVIktrW1jT9tk!`bh3)8F6o$vxlVL<lBsYCg1)$
ziPPPaXncGUekb7v-q+^*I9d4n-SA^Q9iK;dxmRi6+BRpd=(|V9c<D3q721kthm8x?
zTUJO6F=!nrG=3D(!8?&L>Haq7U6E1i%81ubF+Yk|&gjv`q(<dPvv;77e!Y{sL=KX1
z<mbDRXYBGZ!Fr#ynAGJ`m$XspwnNXFeT<UEmB<{<7;OB-1EVs@r)T$$m9tlDzFW)&
zRL+z9Qgr>Z-J~&$yzR)l_hk2Mu>EC$`20GZy%N10qVh{0=1vnSm)8hm+MlttWAls7
z?a#Bd6ZH1jZftGzw)ER7;u)~M%^7SH&-_mDByZ=G*ZSo14&%d>c9eGvvKrc)8$H`R
zUME}a6t*<HlY?m`0av$02;YbDodd7r(_3HRU{Q^n2U?6?!)Ef2a@$Zx{g^f+y0ISq
z{O`ywWv~GKPznE6DWv<#e6Ory^=rn+nC(G}_FI3llUA@vQy7_|b3Qqd{qa28^N<Ns
zykA#znA~sF>T_)@74r#@TdS{>l52sFH?~5UzL{fr`v1^q8e``cyY!P%Yn4<J$xdk2
zV~o5A@=8-Yd2f&Q^8DT9H6X7xz0LWv=>IEsCC~H`cD!5HO8UAUcq}yO3;mP=%JSrm
z{eJR9mv26fK1G|HBW?1V{oZqutAu=$wl$<}_+xF()5x;#9_5uC^Mm;FWd))S3f=Q%
zCDujB<rM^^{YLW8r5oCuA)<FVQ{s(J?mT5HM=!FAo@8{`6ikadew)uB4uZ=#Sugxg
zv^kWL87CL|e{06c1$^DayDIM1nn%5KZ&Q1^uoPcxGEF!#;6F9#i^72QV)C4c^F2hA
za+lUj(pJW^{PpLvolI|lHiKvTcLYds#ze+Gxpyn`$u=h}<zmJ@Gq!ZL%~`1HWIN!U
zk^0^MZ!^609`CN#Osrwu<w9hIsN#v-OSM8|eN)>hG3$OwhLe#2ku@1vtN+HGR6UVZ
zYIVE*BC=K^Ys52cJzW=Cp0o;?fr+dg$jaNqomzgrn6YM8YfRQj$;+$2<S~zixZ~M>
zPrBsJw4Kjki}tJ@Q<JC5)hT7@fDw&t&ZXqh4)Dd)WlYZ}(V-Q|<>i}v4nPerc#-**
zl-W16i&ATRa#>@|I&LMhN;mi1=9d06JP&^kJU3D{_7nyaQ*XvbNIrI#T6@LPfEm#-
zloraV)TC`z`e6vOV8_d``b=_uDaZdsLO;Z>CS&Us(wO#Yn{%p^`Cl2RO&X1^Om4^E
zcm0JVvg^QQ>{O9Ca3uD^Yi-Vc=rsExWPVfGBeN#C&0>6|ru~!0rZ-sk5k1*$yYR-Y
zh~>n}G2Ub>2#C(h!$e%f*o8>*C-%SGL09`0a~Y|tw^4wUL%g2nl$4m{Vj;V85O3a)
zJqrEJopff8!y{2X+HA*J_=xOmit&*nnv#;c1go3coFMTfZ5J_nrX*PEe<xY(OIfcV
z{zC4;drgz>lim2c*yk}>X2-^{VjFd}r;V>iWNt_1=J!3Bha$7v@{aksk4!Gp*qqWv
zMdy@ewmHAcB%KyAjXjlke(RTMgTnr4gOvYiSvZsi6^3^KyrYU*+`2gu-h{nY5~dH0
zq?$V^sYb8vn5M_=k(V@VK+YP%G$GgS)m<9wXd0e#7cbp3NTt0_8d9Wvbd!b^q+wwH
z?$c088VZp!iMs}0;n}2NLdUv9v78YtmbuFKj=74sq~|BAEbkO5zR7s2mH3ex#{ICF
zIK=*N+wtg_AC?_wUJLHnr?ZtbRc`m%{`nYBJ#71Z=s=UXC$UJ%<aGFGC!OcNrT%Sh
z+jr-^7#-5Sis7r|KE=Il_>Okti*K_DJ})`nZVc`uZMH+_)FCsEyBEu(eI40VX2RGp
zR%X4E(vtQAGq03%?vVU(4`YM{WBaFFWp+ttlixaAcP~@)Od<WgO1Hah`5;&3H|ZH#
zD)*hqe4yPqM#?(X<?CG5#=n%~UzMQey?JDV@TYReBKp&Qm&w4c$0E5D%1TYHlZ_8q
z%BRm*x?)DVGf&FrZe$u?6n#2h_5n<g^Qmk<u)LiIRxiJFsG`96F0e<IuUrk%S2h_G
zv-TnRoQcd_?y5BT6q%&6r}-5A)xy7<$A8(k<Ci>b6MpW<%uFFqGdh)1Y&<mj)EjG<
zZ&xH=movIg(li3Su!DOxO`d-C?b0Ot3xt0!kAK*=<DU%wdg13j&s(IPdb#||WPRJ)
z7Z<;mjqLmsRW6fK?D!4?glhVTAvK34A5Sha^P_&~n|$v7H2M3KiS@Uvv&q7LpvV8r
zx9698Tl2Yh)Wm=5x98s~{6B7Y_L9D_uoHjGPbO!;@gR1Z%XocD?ulKrANS%)`Ob9t
z<Mmk=-P7c=UP#`zOq&T)XAxvZ^4+`N>_TL+At+wcOS@Q3m^BBrJ9mriHn5v?j|<k?
ze(roqaPQ58e>4334rzBzlyvVKO?SdPIp)JM7X|c->9=?T^#r+MD2aYH>uwTP-gxZP
zA??o1qVwArjg#r0n4N-t_E}%X$|ycA9MViGlSC$kS4>=CI`TyBk6lk(-(b5KJr|0_
z6K^b<w2Mw?CGEo**H4nP&xod-wZ-^#b@qKMPAPWpq(XO|D>50!6P!C8<5dX>%^Tp!
z?Dzd;N*S+0W<T!9%}pnrFW|&9WgN5RyNp+jKI^iW-4<WQMFIcYNy}I=AY~F_*;Ve|
ztrb0ae>ARSa~n@j*5Q}z&M22^;!}pqf!yDFhxEf6yUH}_kF^oA4^XsHlDUZKM<R0_
zGM8}o?>e6Cvt60qcoN^Y^P*j(V-2xTHs5#Z?WR4TN>%kq=u^2*m~_Uq<JkBR>oe>R
zu>O)f+pe!_63hGpCbT<yNS<HE*`BOl#LJ7Ai<gqL{(R$KUyZz-C$>AX&~IP)jq=6?
z>n!V&<o5)aGL-yAPNr>fU-B6AOkDbs=eN|lJb8XcnK}7-WUW4(JC?Cy+`gHt&3N;Q
z((e5$m9VExq`#N8JF_HR_^7(*-1u?2)PF@Xolz9MYo#<^yISeV^WgeV<lgoCGq{iW
zd*(sf=Oq4HfgboYjrtnTqP$sGH-27IzFqokt!4dK><#jkpj$e|jy1@fcWjHZo3yDb
zkQuD)v}ZY=cpC{58q(&x$g_PpVQ6>petp(V<UFF;3sLI7u<QN@dP5lA>=XHn<TqSI
z&S?@?Y5dhNX0AJeFpHvb9sAwmT7;Y}d0t%UUE`ASTuqpr#C5>lq~~KS?r!^&(M#Md
zmSVjYN~-EQ)(TpYSwp-JNqKIOcxy3iV|VdMIV?XFKkxYN=YFI_<)b&YPi=Qbi48A5
z3PIWuZT~0YeFfC0@GXGP=ALrno2W|8m*|_2vREte!8c3N{WF)ZOCK@1#VZT!l54ta
z_{d|FQfc(xZ1Q>RZpb_enNgd+FuM1b15;L82AhAc;2M^{8si_IM;P9Gc-u#{IPH@E
zo$w~i<3;{V!t9*f=6n(&%+~JYe;k#+&f92h<gbEvB{q|he;v9nL4H5vkC;RonZ$i2
zJiBRk&!fv%qEwT4Uv*1+$KFt(MW8K8pe=${3r!0ne-ds5w3-Ck8fZ(P1x+Y7T!Sa?
z_Bh&RXkqSre>#q~4cb;{*T&IWq0L;~-mz~|;tl<rvbwD^tv|FnXz}uLp-sNMGp!g}
z_8slc{c&-WLfZf>K7EzYO4oFzErHgWKwAlI1$WlR^VUNfephGO2561Y;^W-{ZNc50
zY1^R<sP9Zur(wH7dpR!6eW2BGNB$iiEe9L`ZE{(=bCuBOYfJsuY&90^ma^9`$6rEU
z>lp88qf(ZW;n@feCY1Rr;s4fC81IptFZSQ~UFj5WJ*n2Vf}*2|Btb_Te`bGVZXlir
z@!%cx^Z+_v?!%Jff8)$v1LODKPPiq6yV(mD-+zn5S413(xx<Nd?P6vq?9@myGN7v^
z9~;n73;5lD2#wZ$V3?;H6YTw%eO1KF!8#<lJ}~(Z-IV<^>YMbX$sFWrWEy=?WbUZq
z4CQ#&3E;mUeXv62`{HB_!&d^|C+LDbdA8@lm(W*5b|qno&-7%U8kOyy&m1e4K8+Cv
zga5&(Y#9JW_8RzBM`aI5E<2xzanpnC?mlayht1v#cTKMj9~t>zzsCvj`#q$6EP$`-
zp;-HnboQf9uAPAmCuQ)$P_M0F*L3Xt$@6paIM&vfH;OKr2;ZKhQ{>t=!57SnUi)kO
zc$K_g!0%8D6}gx4KB{YrmU5<bp`Je|be@!{<XhyeBK#|i#YWyV!b_QI&U$iBn~eGL
zI~(O^wL7y#|DMHrY>j2oC%T-ElZ{I6cOCQb7>1FPNuT^3@$A8~J;bED)0i)9Er&3X
z@^<H+lJ-3alQ0f<vE^%hR-?4Fu6B*FyCvR5$lQl`50H4@8Y1Ixr#Z32E58}Ca}K|y
zA@h>Q2osApW>1+m9v$;+D@EqCUE`G{Qj_0SWIn<8f1AWxFYz|n@#Fsp2}eIIo!{=f
zB=IgMOth}Jt1bR4vGqKsar}=-wT%k+{|B#^8%HVp8RYE+iQ`iE6Z(VXZ4qI%@;f7$
zdlRODFeY!^u8(2cSk_6YN%LlWoMB{cLFP2#9V~6)Q)DLCipKt*BIglfeI;H8CHOXE
zf8te1W2Eg!9twYf&fqso%o(c7qj^X+2Qc-6_rhmIlF!eL|2h|$E0MXG-%<Hk=Dk-U
zGg|hY?uIMU9D_>XZa9CQl)h(^Y=QR*@-O?;?YrPjDEDmSM@s2iC0%WX5btVum-`p$
zglG8^T3spQ9kh7cU@f#9XnlDve-dtEDfS4o-a_C{Xj@AuBWO{3fhoPwIr-4$$@sH-
zH)R&zrt56~Uz1isk-6(QZz^)g553YWtC?m$g2X?Wcat9Noc0K`iO@16Jbz;AFM?JA
z&D_D5xOWXBHrw61HbMH3gkMYe#>d*72Yq}=49{jP2_N0NCUyEA;Y&BrpLll1Zlh1>
z!@Ic02j|29g0J}{ze`G;zR$D0M~;`rQ_WofOYLzrn$1cR28B&|7oR~NAfD)WABLwC
z+Gc1Ic$T)b5FHt2vucR_o}!E-%p$_5C-}`9zzu_0!S%DzK+<kj6J`Wqqz`d!E6k+@
zhF{)o;9Z1wMczBMp?1h_;@v9VF^qX>huJ^6Lv|RlJ&o~;Fbqd=@pC(dmq^h`Yfoof
zc(UDjUBVwY*p#C;$HULnv3IM{$6oPHZe0eHFg!cppG`R)M4j3HL{)JJC3+5>*QWex
zNXO2<b#5<p&~`wJwqZ$29y+8MT0PJ9Q^-mfb7h~WyM=f{K=yNIyQ{252aCN_4Sy-W
zWi&(hFM~hHo|H=7^(&*?3D?)pms2|X1iH+nOSsH1i6>=D)F;qsZO#7~U5eRKf<;Gz
zkoFxer9bc+OXu@!e|nggKJHK8Vpfa@Q@hdoh^0@L5WbZ9n_?0Eal*&ODe7jaJw|N1
znBQj%%g>XO#*Zx#MqVBAwlPN4@oX<cUN90rM45gezb92(+~V$MHsx>j<em@=|Hhn@
z!QO$}t=BTbD)JNU^P(H7ptU~JnI^wYwKIWM2W>|JZ7sCs1lmSu+o9opHh&VwR%qLx
z={Q<5wD+J%JCHvKXP?P`q%S^PHnha_4TP4MzC37>zIfh=&?J5Fw3*N(eetvf&?J3X
zk`VqREw#`jeevN|L2H5*?E}*1*FoC~?O<U<ub@s&4<eu+Uq$Cg{Gt!+w+=P>BB~SQ
zeKqgH<aeyRf6TlOaVo{wev*bhbPQ>Kqj^?kLHX%wH2_)-w0(reNvI;~SJIIOeK~Za
z-=K>e+3P7hlX<sdoO@P9cxJ#O@++a&!;{Lh{0VIdv^CJ;^Su(<YH0E8rXJd=gm4?6
z)j^AvkI35sZ6&n$vfU1CMO-*Zzg&th>GyPJPPCllT|eG!<XvJJ%e!3OZSCgWMBX)b
z^De?Wo8L`Klv#qzZ1hbw@7CCa8FWZ|KaTlF&Dd;wnyfe2R=e0UUF}V?t~vmj<;Yxr
zOp9}{=w12ISYyu=GJc8OU&LKDS1@`}zkF5@_91+ICTue)+(p<8gk5)>7k108!!{H4
z0rZKH+t7Vj(e3@FGoCdvKbG{aCM>$e^Rct*-5amRu#<tGmFTn#h3=Gkk>8mtD{L`s
z+fBc_KT_JncHXUfp|gFZe#O`ZExwKQfmRPKzAX-bwg#HaQ{+$NjesWd;=`3d6M6Bp
zvLy1VJbBVj<WJ<)c=F<HxH@PXkQZ&+qVGlLG(bDiri^Z)<`VS1v7_VL2{zp7q|Ics
zx!1=&3!N~ws|^=TSAX8uzQb?&34uSsTxd1W;`3PyZ3(n!S|wa5v_;TNU-Nh?J$bo&
zmp^HHqI1?0u7YP<D5mW(k6viWdF9E`9nv|xUkiWe-S!@zSrMLX@RWZGp8oKxCQTdR
z0hAp%D5;$w_0dRx(0lFfy;;SCqdvU3F#XoM+nDheQ}JZC)nb3TyI2YX(@Zn&Ak0Kw
znE&!N<68+3ujR^%pC=jH!}w}qd!}wr?pxW#ZeG{+tRb$^^m!Q<>|1^!bGA<NV98%S
zVVWq1d&IuDj4-CX#e6-n^q6^;Ttt3lcgZnzPCqR9N4s-hCi2cfo~i5D{w?>WEbARh
z=2qN8{tlC^bC_(2ei#Ao<}bYU`J3Iec>2NY752_XCEdFsd08O*JKCKiu<7jcqj@pw
zugP?*>|O6>zwnsAaO7=5p8Zz4bEfF4d{<t)A6^dio*D0lM>CtUreFVKgtsQ&4?Z?u
z#sAJif8Gzo|LhaFa#=`B51M>(;f&A!k}KG4N+Sbd(lZ&pXOWA?nl->5Be%Na_n#n{
zHGmy+sM8e3JtKwO_%j=F-+*T?A0O|plDrO_OF#OSe4+!&;0vcZ?!B#i7g`mx5zzMH
zS^k7p1Fh=Ic4s$3VJrmepe=${C^Xi^qw^lA%R>6}I^LZr?ed@WSko?(&HugelHCh(
zaHgx_U0}*?1@bojw8eP_+4l3u!)Y0B_*Oxe-{)LK8C*h~-?&a#(p^7bFa2)5WC@X-
z{^kcEI!zadO7fdqPJYuJ=eN>!k0gEF?xpnlVTu2Rq}j!BD)QvFP`3s6txu5`h{{VS
zznCvT{4vRkk2zkxr196-5dY-2X;b)O_GFIt&Z=7np%0{4VWy4^SU-(@<COlj9k0AS
zec{=R8*S~*R7uy>2O8gEd>?1lWX{5g2|#D)IA}I1V3i=b6HN>u>2H<D{PWSCpYdnN
zL>+rSBV@*(w0ZO$r`<V<u%14Wv@;QxYY%ZUkaF=i58(B1!bS-D`z~S4Y}36bfHIM=
zrG$NturjX2jcGUmywixDA3*9On|_(A9Oog)_rnJyZ_lxDU96xj)bEL1=8^a=!pNJ6
zye&t0KHN)?m$1Q8`j;YMi~Nr3OF|DX^dE0#ZaKVK4gWgqwr52Lo$RKk#y_rOKeF{Z
zsYf~;lfBcua^4D0C2=;wZ;zHZIe6*)evCxtu$x+b)9GEGzTYq3?A`eW#((;~9f0_c
zOuZ?jUd+;4LTYD_e)m*J9))LES0177cAZbZ<W~(o|5m@c*uRW)dy;$qLO<zOxBE0@
z`<_qV<wpHuSImKD=pDBDn@{tDRg3+4tF2!2>qcAMYv~QP`q0wrY!mANV~o7szxV5x
zY_-~wkc+e{BPsE~<$k>>RekE$kEf_7E&X^Zzb&OVrKlz?l=Z1z{5=cSd#6wHGfqn^
z{g$N`+g`lMh?3e+DZlQfF7fHBY_-J3h=8`|udG$D`t+Ta`p~a=zR@Z$Z}ymGU1q8C
zH9x&b-g#W?y%08itiILMk1f^GBSOY;_GNx`tK61C`1sF0GoXFp)Bn(_&8Oec>TkCG
zLaP_Na<uor!j_~?B-lTE`bMoj^XWUZYOwX6wY-Uj=#!5Me#xibx72n&Oc#13vd3B0
zJ6g3{`XjA&2<^(0eT~TDb5G)D*?iWrEcLZdUm8?f{Q56Jb-QIfouxKf`lT#&z1B|#
z)f=h$>MXT6?GHY_To}+-2GymR8n)XqH9<eme8z29$5rSTda0E@y(XwG@#~v|>R!Lz
z*h}4NSxb7U$1VN)Uh19{Qu$G;zBZ_Kq>-4XGd1sC%KFk3ahvZUAJkj?)(3;s6Mns8
zu=-5ya(Kqp*9=zo>tFq8u)0akDLfa_FAi24dT)g9#@*%R*M0OogVmLL>K6v7Tld`e
z%R%bizWT|*>aA>j!C>`8wmiSFw_Z9}?bt{EWw2V(Z^)a3+*dz1NWHM1ervG0ZU2XS
zPYhDd0f!?~`oe_+G|7155DD`9VGuqzY&;2gFGo(%Q1p|p@u4}d^IP{1RuA~~9fQ>S
zmaZP6uGKaPzav$*4OUO3wFjQgQ49C*^W}lP+S6_t#N~j0@DbpPL;bwDZlJvRaG-u<
zu)6dxji}28$qk~&4?Q7;@w`u8ZL8b7>I|)sMnr7y+3Kq(jV8>L#HTk~k^;k9dItEF
zPc!?d_UkW#>P=hU-Ag&bvM@zI)l2;~ML*n2y_N#u%~btTFZKI0{g0qpE!A>ux_%?5
zR;BCbgX+D2zO0wJC=>F9S^DZ;>VYhMGm_{C+4p!VMZWbLdnW_kKU4MAfO;oQZwsg;
z8EC0j1M<lGsFq3}=|4}QnM^X}g?y5$syz?8k_evnjr_<`TP*#KrIy=nf5?8-r*GEk
zeZRg|t4lap`+KeatVbJ}y$4c!WAvG7s-COv^CHTA(<eISbH9ElMg76L!n!9#y_(`O
zNCNuO2QcU__v_V`YWC|JE%k&~l;7}L^w6hhS-;-tS9KoG|4)d43w*_|`snR$twd`x
zioHtpAN^{9zTK}@`dyv7$BFD!RMk=|_xd!CTf7Pyc(-pP)#uaew0c9@IKu=t=)EKb
z`VULrqSe*5y!)f3QBh&(BlXoj72)U?UxuFi7rNw)*4P_URf}+3ZtK;l>T%JcuWEfy
zs=75r2!BqI=Rc&*H(Y!C%-Y~nOC$lP9Uhy!wEW~UdAQN1w_B=39%&ZuTIv}|*4uvl
zx}{#W+*AyVMtoB&Vq$nnBL0Uy^o~?@qlt1wDux9Kl~2YrKZEZ_B8+dL>H2C5CBnDW
zW~hN?XJVsyq3w~nLY||8jh&-Q<G*M8-2Z!@TH~cW{W!adA9!;`i+yq3QNF=9`}CKo
z>JguQGgZCf*X`POt9|STsp=zJzmTeK(E6qnwLC>%lcJs%Z9tbZf8UEYHp*o#@@cLR
zUS;WvUAtjlcl;6h7v$X1j%70B(F=~F``+ri&393%`pU1DrK&3}&GQYG{#S~+*_P+~
zY<+#Ix=tU)bEDQ*r}9IRr2Mj!y&zmH8jKR_H6B49G2;6<YK|c*q*0I<ZEx^n<^9x<
z23+D}YOS8b#Ie-hr5Y}<^lO&7+0sv0YOSRo5$l3wYS@1gu`aaLH9o!ER&PilUuNmc
zY_-y&`M+ps-nCh0nvkKJMbe#qeYvz)H$9<?g@(2CPYjVL?KTmElKQ8mE{Ur1RYJSV
zr*}xq9<OPoS9DVGp=7kDg?F=0-)gG|J>Cy|W1(&E>6&bEtvBtZZjglBWa);zB-yQd
zsqNBdnr+>@msn){K*&RqzDH6tcldvjqPOm?mWXjylcpQ@R_oLBhP~A*Qp&HVYi_~6
zJwvbATRoWp;p2d=+gtrJpqKBhUS(|GTYVw}$Eu*N-AmmS)Qk2~OM2;*d#M|HL4G8p
z`4NPTA-z3Y-PBt*Wvd5!L*B8Q-kh!4cheiQ)t24$`fT;NSjAuU(W|o6EqmztY_(wz
z2s`)GE3(yvef8SC)XRPK>b<0nbcj(<9_9}r$jSP~fO_56_wG!!(685J(sA{pnd&)P
z|2<2!rRYzx)D5Zn<)FGdO(@T#N%-w)`l^6hmadlu)HCV(5&ohK89Z;zkl}A_hJ<`M
zLq^YkX2?*uBw(ZkWaBLn@10A;Gp$E#^_->ex7AkKkgZ<#My~KNyggknWfbz+i&E4r
zT3?l-)@sf3y(#+{3H$$!FznKQORGj(zpT|Kwit-5wuaK82b(}6(NBNKP_JR9r>o6=
z|4ZqrP3x`c>W?Y<k_`2Dif&6+*QV;v($x+I<#csLntm!>txeaDq^ox`+zv7PWZj&i
zp7rbZQ>F2vAU0Z6n^V;$E%cAIekN62o}!;gQTM0p3H?cF2IMh!H27P+HkD-Rt5Vfp
z{kkDleQ4{YsbXV5xfuysJ)tGxo3$kTMNuyBj{U@Ie#<h{THF3GU2RCQS~Ao}Df)|a
z^=PVYO;;bL>NhgfO=+4RW&Au%|4+KQCtVX}Yr0g;?HL!jMLp&jX}I^<*2NiWaf<#j
zU2RFx_hqO*rdqFLs12!ld%C(NO<$0q9!k>}1=NRWx;8`Iny%kXSFfdO!rUb`ib<9v
zE&kK%2o`rP_UVUx>SO6tyiPw>cQswI4&Y_+UejeFVM6u!CHU(wvj&tcGl<pJ!G_-J
z$Iy>Dc#W9|taGPa0ryo#?+X~e8vMAwiDu8JB}Im5k)LL$`_kPJD{4bYs9rMbJGI*1
z<QJ>PFz!(WqW%BoQxEubgHOgS^CCW9DKGg~*<Glvw4J^!ewhln(tZxUWlW)JOJ8NF
z8jEFfV(xqT65gyx(eJ0Lhf?&L>FR${o^(51d_;Sga?tnr)ureozpC@;>)c%L6HC>8
z+kE=6H1%eRz9@}q(@m-B&Qzf=>!3{%+u-Xg+UkwAev!F`t)I-ozSa+AsmHXwD@(nZ
zu0PCDU#9D~vecrCXI#TvV>5Tjp~wn+;Xi!~mYt`433J{4zCEw<=?84zyVl8QPFsIv
ztBd4)bQp|gY0qh*RI#t?ZS{pE&o6iiHsf-M?i#Oo@BSmcoJtb-f%hkU>ME}^v4uYH
zsk3#HPt^-a;y&27e~k|<<GU#x%0tq&eJB%_Rs#BofZ8mNZZb?*^gUt!Bw-)eW!Txi
zvcvFSowNH5+3JsEE?fQC*0*Q-?$YPHpRI04*N<hZuQJZ&|MtLci&3wc`txjceJ_0l
z`ZaX&bJ^<E-um@yb<ge-`2W#9`EdPfkNbRld91JI%ZGg}7`zg4uG`nwM}ByElN7^i
zz6>rj`dl76-cR*Sa?`_4t&yHpeI-5BeI-5oXc_5Q(O1&L|LuWZ^?lV-LA|-Jdc4;!
zH$n;Nn!f6_-6rt=2fOD>_&sj&@#V!mHD5a7*w?pvHgW!#%8>S^)D3Y-h}K~@brZVI
z$G)$(eEMF$y3FsIl*Q)_z9R+a$dAy}8=2gt$gJQ+^q{40Oi`~``fn*R35W8wts7F*
z1xAlwqW2@@qgua_qTbbdZHoF@ngw-zif>##PYmI=_7yYN>Fc9+F3I*?ouWVPs~%3%
z5ACU*$kdNhL_z&kUv<Tv1iyAqO?lMsDNHx@9k0~G*}9>l<VEf=zG3AA#5eG8mT!ko
zKbz&d4M$BDUJm_AmbxHK-<zdw&boktdNe50`HzAG_lix*C0UsucQ0S>qh9sZ`hBbX
z`Zm8>CvDc{f~VgzKD|-PxbcEkPuu$|^)GBm8Jh^@Tz8^x44#NTrTK0bt`GeBo;3BE
zwP$;(dM4#{gfKn)W0q=3(~oAUPtx_hLAvffZ)d66O#N_BZOzmhf@*u_;a-IOibUov
z(zB~~j_<h9s3yD;ZwJ)FzTKFMzUJ3Y1=KyZ{u}C5>qi3WjTGIGp;o0G3WdQR%1h}I
z=0WLz-zV;)R!M8Zi2NI~X&OaPeW0KA^M(G4n<JoK45}{zteGsylE-_pB*<TbGekXn
z_jq?*9{V3jD_7TAx=pKBIZ=e?Kx*wqEre$64oEqt$&D|Y{Gmm?nDGqV8B~8uS?mMu
zNYhP0bzj;x-}a#TJW#3BXIays(<tnHX$Rn8hdd51ehRGqm;Kg_wrcW=^X*grB%E)5
zxAY>OEz<CqjTah*!WVoO_%F{=TWxVbT}ihI%Cu>7mRgvqpUF}WrV8b9(`cI0^olHX
zQ$R1yQcnhSO8}R>{ygBCnaDceq%+@6Q!BOR|8*()l{EE0n!X}kZAuf$QYm&q={aww
zc77;6aYkcrwO;FGy(R9ed#f$!`XeZTY0y8&)bI3Gf6vmd^;Rue8p_u}p}f<pKhIZ$
zB>bY@LVrdKSYkN3(WgJoP`EO0$#fkx3$?}yy*3qh*1A;vUWRl{+L+kX{GTr2E(kn9
z$-#Zl>H6vvagyAYqL%n|U5a=*?&VoC(^y9$Q?RCZB&f*_y^Sx|P4}Hv!ox51lY20O
z(mVQySK{qH>TXM~>f`&XrSI(H!)df|clEN?HM^@<({=mq{Qj@LxQ}X-gkP|m{-}@o
zaJLMY{v|f|3w`t}eblvLZId$hk4Ufb{rX<t;@;{ZoH8NxtfjvWDe}k35^4<kdujTy
z-fCx>ey%rHW9ch;tLM_?xhY*g06ksbPDy15<;@KJZg2HvhJKa*1KIH6;l81_+L|SD
zKFkt1tyv<OL4p6@2<q2D>WiR$Hl*(DCC?jr=|@9qYcGAjafa_p?J||NWcKt3UQb4Q
zTQ@OZqrV&Lqt_@ImZ$577>G@=(Q*%hg>3EO)kFT%3A5PJj|~-{MEfxHg{_|(s-8sc
z4_EJK{r)huBSqgnRMn>H2Zo9n`}$C|B28a9Tsr&b!_)=6tQ&@^I~in$s<%Tze|C54
zxnWYypA1pY^wnSIsO$H(77tZV?5%GfrnZWe^kP4K#W2;{PcI&(ZrE49K1^M+pT1$3
zy8cJ{{h?~xkM!$9R9jd-H$+`JK$7#=fT2Wt&B6MyVd~+7H761NbcjUt{2}`GA?mY3
z^bJGQzxaQcy5dm8-*c!YzQ#i}Df{$LO==bolyuY&)NJs6aiD%+h}r>VsJi|ziSVhz
z^c6#JgX+aY)J20bi2U9`<GvoEt{=P~BDW4ck;os(kqF<*k^DF32rngTPjgGSj~GXD
z^rt?x$fw(U>Mx#klxz@!ZRjxs|CRnricFDl-z{QluEjacqQQ2wMT%Ogb)!~yo1yk@
z&jS0(cj_8i>m&aCZs~7ag#Fd^mVW3!G21>oP$qw09Y|Nw7xtHV&a(chCPm-dUtN-R
zf$yyY)nn=Ut^?I28T-6=fXr;4JV0&CIDqFr1oVmn)TaSmbAWm|Q-64XdJUz0fXrn7
zKlZK!KCY_jzZTN6N?6NI3do{@Gg({Q(q-C)ZlNh<mB}=jwxMY<Wa%P;VUtD004}JX
z8Ws^*B!UYfY7iF?mxv4Cf)P<sTq61*;`aT|J^wp-@6LO(H7(%B`$2E!KWEO}&pr3t
zbI%(-+`D^^iW?92?ys!4_6VpfSL|@%5#BHMt$6lu@2dT#K7P3Oo*6p~9PZsP<8xD=
zn(2N0kcz*~^nP|o1&E$M6gB+dVHL>up2M)>{qry>=trw69zMdmud3qfM|fjZ^5umy
zkHjx;ohdnfW~Th|2Qw>xJZ6x`4I=HF)OMa|f*;M4Civz|X@cKXNfUf<rZmA#Go=ac
zoGDH4rJ2$MPs}t;aOF&Cg2$_*2`;RXCV2KRX#)I>CV1d5X@YwXlP36Jl{5kHckfYw
zmpt9PN5u~g^X{swc%aJrBbwmOD(`#yR@_wOU9jKOE33SVXY4S1n0L*L;i-7B)RhNS
z+<2IG`#}{Ty60e&`Ns!WAmir`ekE3pL!_YJMf1$`{(MNqvsK>Uq4MROhaQPv{^U@}
z@mGh+FE2i<0?6fu33BCOg7})?wbxCp_~dl&-W@8gnC|^#hgsK5_kOcu#mA<5mrSep
zpXuHc(<*@cW11k(OrLA!|7$Oq`em$34`REzv-jg^72nz!){i58xU=`G=@s|y>|L~T
z#XUQFx9%)|U$_fS_Ecz`b`mDilw<!Xc4PRcN)fmS?6GR4e05~VS3)NLSJYKpu4XaO
zH3V2taZ>=uuLr=KxCKA_#FU<2=-#SQ*&Oh;Tc-TH!u$GEIrsVZ4)2!__f{N%f2Dl8
z!dD*Y8{a+U+fyFc-TUQ^6*odvoK}HX!M}If3Gdw9`{MM9;oU_~!wcU3X_j1wYaqYf
z<;(Je@9%aT{uNI1(}q*^jZ+4ueq(nrBz<9b@2cth;qM2gSKP6??Bj0O-Men*8>ayJ
z$Zi$*_piGhjDPX#oke5+3We}PQ$9WA`W=2+gC+6{Q-^B2TXw5>s>b`nZWXuHc#rHc
z4aog_Roo2;wRgp6jd#Vq?LdCIf5kwp_qhYleou|}9A*`S-k}v=t?_<**ekJjuUZF^
ze^t!@&xdCw@b{yKUp5sVet+bF`0)8x9*MvI`O14GvkxD;5&ypb)fM>ne_maIFP?sN
z1(=STzb|;rXN2f0$G>|DuusP1n}5V=@bCBH@{6CukHg>BRUeJNAE;i4UrXf{p>j7#
z<z$>d|6j7H`uUx`?@q1w{I1@O*g3$OQ2`xv$`_|qd}Y@u7w-g1&6JyV-UWYucIRC{
z`Hh|B+b4Dw{N`OMZihibOl%MDTJgxPb~Q#mE!RN3+}pbsV|#Cyi;n!j-op1&d&^S%
z#l5{Jcd2;S-rnza6XcfN@%zueqT;uEd5`Z=alzi+GkYBQzkA6R>lb@_ckNa2!@a#9
z?<IeOZN<WG?qIt1_zS0;i$A7TJTuK3+_3@{_8WIR^3v(v^%WKWz*fghqhC&wzu&o2
zv-!myD`oe)x#IRI-px~>|C;*mIc3_wlm!n>0n;hxd8guXp6K*@yjD&K`YLdy{?>};
zM8%kL-s|PNpe<&`ZBq}z^`A$exlhB|-T>iVQ2_vCyB%9`j`yaDbskq3`_os!aDUBC
zVxhfaC-3$h-tyz=vSR;oIuxfg{(fk>{C&wz?@@)TGE;H&RI#FuPl3r;cYs@i9e4~{
z`VAGZ2!4A?#q(3-eE-i=ylbY$P1TM!#^Wc<e)7+-e^2u++40EVOcNCf`^M{G2b(7L
zJN*6hbZt^wE7O*+W8i!hcO+n{-A}&RaRxr2J!eljK3#XgbU8Ww_B8J=Q!Bm-8{Q7z
znEKqbDfdj<1z-F{R>*6n%TdS;V#)LCbnVYfor>3}e`|-SFt}pW!ASeqZWY%Y;Qej)
ziYpHAu71Uw3l8u;y$9?F-s6=O*X{59V$X`7Bm2E7zPX?G!M%_C(SF`X_YqsegZoz8
zx1aak{e<!p`w8-~8E?hPwm-Tp?>_E7Q}%>81bfLTw@tww0XsC)TD#%o;>S}xi1Y8L
zh&vFr4b*y<H{p9Z+c1K3$2dL-d?e1XZ^6!ehl(Fh^PbzG@+;G1WAWK(vN`<@l<H~n
zw=`N>nilq-88{mJ%#Kq&I~8NjdveE$-|px=Ct2IZF*GA$^q=yPsm}tvf5(akcJv+z
z(BL5FS}FS{VXxg$thS~sJ9?_5A1EF49M+=>?{4|(3fQhHyid>!WsY{VEc=LEb$!KW
zc9>#}L^yG}ZwK*tJ5M?ukJy0@si@dzhdnCJ+TpZ{o2N(v;1un1k~cQxrqq^LfAaFb
zFSo$UE%0&+yxamWx4_FS@Nx^h+yehaEl`zGEw%MjO$SfcbodNS$Cy^0sh^vfjxhD!
zOq}T;)1kBY`&LbDlMScg<A<m9tFV*c8DyGaI?QyGsUgBY5Ab&};zLXunT{|WhmBUA
z5fii@_iTExU-Uh#Yc>DDTXBSqXZ#LLD{s+sOfC!JspRjyJN5Gr4sGQb`?RK2Ok3~b
z_n**o;Io>>$ak2(kCAWlr}X>5+cj<dq^2Xxe}sI7;P)ZVI6r&*{(V<LMaOeH(=Rdo
zCR6ie$Ug(K^>-m(g+e1m(zl*aB;VH+M>kd^-N@^U<5%^@;^<C2Q^&=~r8-_?mj}nK
zZKTUc=Ut@VS2DHV+jN+8;}<GjYXse+q#Gk$a|E6DW-Z?+=^7*Gjv?I$>0%Ld&7>P9
zT~!2~t<Sbdb*3t3|KEa4t#on}|A3IEDgtkf!j}g5j5(ie79-1obdAe{pDmpy=ViX9
z)%k3{syZo1H{g8E(ya*6wK|_IT~68({9VrX1cr*FYn1x}MLc6g(hZ#)&aZJ*5Fc<p
z+agq*5~Q>Ee@nW5Qs?*3Yq>7gx-&v+8Y7*p_wecAboTpF)8QKBH^Q{g?-#K=8PX3#
zlsiMZR?=l6=&mJQGwE6*=tfA_NV?_-x~E7NBVA(zU1h7be--Is5p=UiS4q062)b6%
zd8Dh1pu3E8<JIiP2)bd?jgfAgeBJ(elysw{8;hXxQe5vyHyTCfW>D<si-PmD)%pBD
zyXIer-Kg`sLi9u8%M|u(2*+C;ytOMx*VyfR&ICVKofG_QvmT4U8>z<*{$BgLS%R?T
zd76Kj|NG2gs^7LQ4eGg7@!;oEnQ!H|@*Dh1e!e3G@*O=w`41(7`FPdN=R9=_`>N9J
z58y5;&*(K<$Is1U`10`n8oDAJ-^~5O5DfP6j5EbGy8!D!;=}fPemn54aK5AD+x)2V
zX=NH<x(z;5;XXmw)4ERQV+?mN<r#Wh(*dS2rei--ypeRxOk0^|m<})<WIDujl<AoL
zp8Ur78Q08A{*_G42mDjT&oQQrOq(4#OJ68$y*j)d)^%_l__>xN#?<9k8G*ZW?&nti
zuG#s0BR^ODO7nGp7xlU7n(%tW&enR2{hIaoji!xEEx!!EA7?tibdc!~(-EelOzrQB
zO^4naUd}?lf0XqYd``<h#MJ$r-L3VXqu<!i7dfA=aXx?8`Fxx6`5x!<SNZuBEWl&@
zd<Z{(7x&ZU*-aA|60{^XZqjbA(snq2-)}x$`w3SC%+tyg?!@MKl;6vpdwF)!#6y?{
zL~_9Wd5TBdx}!MzmPffF4h;*ki^AS7S-xHQ+1y#j-&rY}(O=rtp_K*0#RBud>(M;$
z|1eMNPnudh_F4UG%PH?emS<;8ObiP76H)L@`A0vkU;Kxk?dTup=T8so_kZW-y==!S
z{PMpqx4?fz3pB2`Gd5VA80AMJ(`KfvOfyUem<}=>Vmi!pgy|^LF{a~8y*+heRWhw&
z8e`hXw3%ru(+txArh`m}m<}@?VLHlmjOjR2j}yC+X%*8L(?+JvOk0^|m<})<WIDuj
znCS@9QKn-|$C-M2v;Iu0n8uhkGHqts$~41tfaxI9A*RDjN0^Q>9b-Dq)Z2&oGp%A8
zW7^2HnQ1H24ATLogG`5*4l^BLI?8m6={QqwU*^xWifN2#BhzN4txPja2bc~r9b!7n
zbcE?B(=n#wOuhY>Khr9vF{X`7o0+yU%`hEcI>>a0=`hm~rlU;9n2s~`W-x!IRZL?{
z8<{pUZDpEaI>2<0=@8RlrXx&8nT|0XXX@?G{FzoUjWKOx+RU_-X@=<l(?O;~Ooy3{
zFdbz&#&n#icL4KeTE#TRw2^5u(^jS#rUOg|nGP`>W;()jl<64Lai-pZ%%5o$(-_l6
zrp-)SnP!*{Fdbw%#B`YH2-8ufV@$`HdIvFord3R1OdFXtGi_y>VLHHckm(T9VWuNY
zN12W>9cStt%>0>FF^w^8WZKNMm1&0Q0MkLHLrjO6jxZf%I>vOIsdotTXIjNH#<Y=X
zGt*Y48KwhF2bm5r9cDVhbd>2B({ZNWq0FCY71J2gMyAb7TbX8<4lo^LI>dCC=?K$N
zrejRUnR<sYf2LJTV@w;FHZyHynqfM?bdc!~(_y9~Oh=iHF&$^>RWX02RZL?{8<{pU
zZDpEaI>2<0=@8RlrXx&8nT|0XXX?#l{!FWw#+WuTZD!iaG{ba&=^)b~ro&7}n2s_X
zV>-^%JDmA5tzsHu+Q_t-X)9B!Ej4Zl*Prb7xLj@iY-YN(pPLcENuJoPnr<z>0iM4N
zGj%@?U99<Jn2s{_<RZH~1?pGzDW$8tgY^2V?7u^_M_Is9oyz_9e-_T0cS6-MC$H}B
z>g%tH$B(Z)K6YHZ-+YXpUtN8CtoHcoWBG+;5|SQ;jo^;npa0{2CHYPDUg5=3n)7B7
z+)jL$_$kC|KqmB+tCfBZ@plt%C2sef@FzF)8RGJ86L}<VrSKVa=-*F#$ic^m4-=oG
z*?TqEKntJIS_SOB8khJD?zJo4&W-V1#GfMlEG>>VMm*E0^ef2!P;88ae^s3V^GTl~
zJ}NH`!85>(ySNJoeLLwdB)ynV1aHnL{awTtz<?n5uSsu@Uq3*+@|++&E`=ESXC3;v
zn1;e<)S-VH@oDwS$MV0A_+XFX|6%zLfI&g{>__@nv78%;5B4j)#O0LdyTnI{-%mbO
zXm8=u=<*>xv|0JcbxwJ{O8jinKd7)b0|QC;SU;IplYay7DTs(T_EwcYcHEv%{L%9i
z*IYbYAXWNS;+FnCi#vRFf#JsRKR3vK8Sz#Jf0w0yOOXEhy~F3-PeCvB8miO^?ZNuq
z1Ot)a!^Gzh|0i)zUdn+7#Wv4E*qMaBlDMTGB;M-aKO;U&+>Yle7|Mi?x0mv_d@ds1
zOx))CPvWh_*RnkC1PkFaO1z(V11czZ^WMt;`^4=$jqRiOUujUY^R%`>d(h6)%gLvc
zc$$0=uFmj}9jJV6(4Tn?81Rzs_eg&X>2D=Ic97D`I|}4EdQYYQJL&CwxtVzD!Ajq$
z<leuC?>tL0wDcP}o<~W~YG=p!0i-{Hd`{U*^Bq4_`R}TUcRleIhyEGj-b^K2hyIYK
z5#vqj`!3Qy$#%Ggxb-u$>i1`W%QznrL4n843p<al9SF{&XGvd0+|t|mQswYjxG;SD
zp8&(Alruwt@k*0BE;8fZ#T0Lbm$Bz&#Cx!C2>p;le?1JMLO<Zpzioe|A9d*OhXGjV
z2OauP!SE~mJ^HiR@;nYZtk4fR^moHBEcDhdj>Fbl4TE$TzT2USTfaGL#~mTw%6u)K
z4TmYciyv5}c!u<r&-aKA61R5p^Jgmkkb@s{xZ=YO{&V6Z4t~uMN<T_mZ-;r_8AmE!
z`86GfR-W{t0i_*UiDR0W=Uc=Fh%Y0)|54;md=<y-GsI)xQ2t*def`l&j}RXQvH32>
z1ebi9iI0%p%AZR5DSn^$mq>5@7j3;RI#&6_zN!5G%<{~774dH=F4wu_`4jQR?<u|-
z`h+|meYMgL|3L9J(zm=u@o`*`l4m~opLU$$JtrtoYraM7>Ejh|L_}D5Fg(ojzSk<=
z>fm3C$+)fbhI%wZkeSEI+X2#F%JNuwdlU1u^7c32BklP(@hb9>*KwJ4{-u_ui}*i?
z+jDG-cT_9=zsbicqQ4>@ds_LdWIOCzqxAN?{;h1!e&8}6$2cFY{P{ryeJvVL%4yHv
zEuX7^3!hWh1m&K*9yN^qL|pL}+@9Zmj^+6{aqCZD@##>I44;2!y{ubng81OS6;H6;
zt|UH2{Cup_@|@nF<=p=aEvN05DfmP9|NC{?VAh=RF!AACwSZ$haJmlXL8je!u4VZw
z#sVaK4t#_1na1`%>~)G)y-@)>FItIP{|9;hw>*CZF5|6{ejsxc_GZ0a`B=XNOaCBn
zp+~GGLuvW%f^#$}PmBj_@(y5m_6C9AXR$ncvHfShDV+YeM#Zf^#>0k-=iNa(!wm$c
zp?T_2VBv55C@lSFi2sj6KYOmykJ0Z!-mNRo?}$G@dU-#tJVW!8zLNeTwtZGYa7n(_
zk3`-fE6;8i7=mAzm|{NvTNCd=;@1BIku}V7{sN^RSQw<Ah4okXSif?JBc-R_J8+SX
zE3)@a0=<kA>o2oE>4yytdBXlWllUHsl-~NeSo-w_N8#w_a~kQtLEQTP{E_(GO*$T|
zAJC79zxPDNtv}Gy#4lP*oPI%%5P$O$Esyngx8<3>ROzkX(8o#tDTB|*`ng+rc?Y?S
zPwP)~Bk32c3fE`;L;5}F&tlti#&T`{^P3dek9_VX{vHQ^W3$rVLY&>}T~7QS2Y-%u
zrdKB-x1-+aE0oVSNe{KkJa%0eqd$&a7w&6O`k#~jaMJ&W_>>d1{GT9x;!2j2`1gq4
zLHyjsO8;6m#8beHp2_jXY3;S1toR7=S>*F0;4*G+S)vTMUGW;w{({^45$<??&Kb`u
zBKm7>1pZdwQce$19p$m*yovZu#3}Z??*NzbT(?}yc{J;L@hRcs<S*n?>F|FQ=q2AO
z;&vR)1upy_XjaBUrc|ESPu$)MSwr0F?X$>-#qn;AD9;DT$KEHwG&hgsbArR?C`{xq
zJJRRLXAdfFXhHLIoT}x2HE}EFXMm5?%idd=!h+g-U+3ieCenA1Pd)h$oe^H1PS8tx
zj$fhuXxsBW<TFG*50TGTh!0EfBs@Fn&%Ec!r|KOFV7i${-lIIlJIEX2e&4q9$p*)M
ziuqdl%YaLLM-EW_DP8`(`$#`P{xR}@8u&Ex_Av!4|GyEp<I<+bpdEz&`cstB&i6IM
z2Z&da|5e28eIdkgFwdRD2T4Cl`riN#>vwrqsN`$!DOHnxDi#Q7p8>XyE$3|DVSHW%
z+_cY`S`esV<~h;uIoO-+jEA>G@L5kj&2LsdwmjEF(0_#VnN~Ic^Bp2S<luJ^A9L^r
zfy;OsamK?B4Za`VDa!#x{{JK&dk>7Tx6_-2&%U1ZPqKJa`HwdAdwI2o2KmeH!rHkG
z^wQ4O?`?Nx&}aA@<Yk=x8Z`L9UdEBzPmoXLxjK*!r$GGeS<1h;PVt%KKRc;-EA8eV
zCf-WCa+A^{^qhIVLVWB`+Ar_t1U@9Ce1_hxg#RF)6N!(;l+O&}r>)k0S>%-e6Tqb%
z?7beHm7dqsM!Z`2Sp3(-hrXf}{UyKnN4wIGT&CrDjQPGgE%`!UqW|3y#8&{9e64?`
z<$o!0>sN2_6V@ob^?SGYr-_fZXgPT-;>}n~dit|NqczVpz(udK{^wR69<h$)zfbE0
zwa`5M#O*!1&k+9<aqE9?`M<hD`HX#D`wL>qJR6A*|3vw}jrbpdn{}4`V&&5Ovz3qa
zFSY#71}^k>U#R2U%9F2<-rg6yh5Vm0_#U41`y=fAoAfdIz3##<j#;ny4)NZfwO5=D
zT-t4@P4hj5^xr4FJ=ba`esrhuiCwAV;b+<$-ZjKW2NbaU*KS~aYl7o#&o0HSKksz%
zZw4;o%HDVSB=M=;N}s9H5rW}lo>v2x@>{=u+s@s@?R^eA9%g2gkN37<`TL1me_`8h
zcM*5T=W*vKpJCR^_TyWLS6!_2ZPUcNs7L#^)#=|K0hjqQFsSYJUbg4!dzF6Jp?@E6
zp>I7`%l~n<!xxEL|5`hbo*{1U|5$vVKINaeAUNK-fE&4Sjq?93`CmbNY?F?2dDpr;
zqvR7KAKU&%^s{{~)C$>r_ui=eW$zu@ez_C4<m>kDPYivfXZ`CB)oi_gk<T&CxV>YO
z@)??`Ihyx>;Wz&zKDI0<PtM$|^d8SgZ8^U~+}>Zb{rjaYN<Y4rw!=*vxO=`uaqG`)
z`K%y5=*X)+;4*G6<2@qVZciBeV6T#PW{bb>T<x!Go&NeD@nNU^e*s+D)7}fdh3$6m
zTZyy2wq746KJW#t7q^|>ABo$0&(^K&$@7$ty$5RfeENLF-S#{Rf>Fv7bL7Ami8sDY
z`-R7T-d6@lPycnx{|~@LPCh{Uyp@xQ3zWXmk$dL@mv*-Iwd{D?=R&0)VZY2|`>Z8C
z{ylAnorzyz@c|vr*oK=YbCL2HJxAxuEaE#}tavN56nT0$PMU}hIO8FHiPGEq5cA0A
zY~WIUdymhS{~e?sifg+yk^Zc=E1&VRHQ$}|YwvmBB3JEwZL2@*d#U2~-m~2=JPBO-
z%ibHX_S?fQ6Z(Vk${iH|w*7m63%$KxVaNY1hJJ=;@5kHne~a|?KD%9ir(Ul77jb`K
z`OgO~{2MRU_P70Yt)Yiqj`ut3*$xjAxA$XDB7XE0te2y|olD%_Q?T-X1h~|9<osZL
z4|<2<83%7CKIY(e6MxESw|(EK<;ghn;$Og}-#7FA%mXa{;qOxV;peq~*Al;+xb?@_
zo&Ek3;8Ok}=4<82Ypzs2<J2ctk<Vf8Ry?*;8CbpRT;f%D_lP{J$p06_?Y*TQEta>>
zAnR4B0L`!7>#tJW#yPP4zKM9NBbS~bKDJ!>*cSk=yqe|b_+P;CA9ank+W_y|TuZze
zxQw67f?&NqMI3MEF_hNM{2ck%dpmaA9{wKX)6Dh7j>EHnOMgA;$n(2MZ{sq2ljZp<
z>GyBfadiyw{`V?>S8w0<ea!a*TK;cPp0oj%{vCJ5)fY(bJ*@4voctdpZtrp0dD{Pe
z;d3Bfn5g9;x%Uo(&+zQMe%lUjLBmRWT7M<H@XkEnwzwmQU-1ELAA5h*^0^$i@Ui#7
z4r0c46Sw!7to;8cagXxt4Wv(hQ26ifx&H25z>VCvLJPRY@;CHoH)kE$>qAN(;{<wI
z#f|qA@v-T_@w_WGdeZ*GobPsiy_tBcqyOIoT*@=@b1l!eSf2lUL~(aqz3!vhZo{jC
z<K*xm){FPlY(Flt_&r+i*RcGR*C`)+KhU=SY=fixYzJHZD@bqeVO#oht`|Pg&zEYB
zcE9si;8HJpkJ0v5)eTB-;~C6neb*Tr_CU6emGgH2H|@i?BzC?$Mcl?uxrKa=xl!q3
z+;`6;zKQtA1v+1nns^s~OzB;H`*+0cJv7^%N8UvKpVNMyuZedSaGCEzjCWz@?{^K3
zeKGCiN3%T76SwhCHV|L`apgaXmpsX1^`Bwj!oQODH)5oJ#Ng<qC0fEmh(AX@t^ZO!
z_Tp9D&BEtkZ_tqsml+)Pf5t_@@HWrUpHTWCXZ<=Cxb#aELI%rY<@wd<2*K?=R@-i0
z0xtPBI_u6Jw<`Vcca@Kox5oiD_5{X}fVys;n@K-7L+R%bf7;L=?Cs=~^Mu=!kG+p-
z$3xQK2YHt{_Mc4#KiHe)@cFvIp(Q!{i|*UC{H?T~EMa+0`=sKrb98|4Ywv#IjTcNY
zly4;c19vEWV@&Dq<+zFuYrBm(^YKF9ru`9>R-SH-=Z}0!=y9IqtRpAfsqJ9nfnd2b
z&nR%=Gd`jjevSAi?@~VYey5%9j{rCHj=gBbrxnle-sW!Pa~W{yM;jmKY2ur(&<h{$
zG%dK*D;^=9Y18t|)n@XZ1upfnaT{!TKKfbZWAB@u#C97c{)J`A;MdIecf@0#(eZOF
z`JeeYmd74ovOWI{T*hscBhP2w&H6g)+9u-T^3ruYYnkuk#H;pD#^*BMFMnS7*!$cV
zVdnWe@yt0&k8U;3v-hyRO<K`D&BnX?3yP2RDgHNGe&W`DcP9CqaIey5{dtLB0bJ@^
z>FghWM*1=KubtP^zo>i$KcofuCF^wx@e#)!awG9dyjxD5HKbqqCFNt|WLbHBIq_k}
zD}kD1o*#W#>216$OTW*E;+4+6U@h^{CT$oy-tGr3<LXh)mnu)6o$eDH<Cbx8YKb=*
z{2<Tu178YU#&hgR<^Sw7{lQl)e*F{!yo3ER^DCO~y0etft{0W}v;5uKKKqh>6>ur%
zI%i)0n)KFBdnU{I>aPkt_9>1&a|!V(NB;ku_&P`K9S4CR^=f6DgkO<=6L8@_?${qb
zU~y-@JWW1h2Wh?Rda=(pl>cC}R@C;(8jCZ|ft4p80WR%e<7inq@Ojd^_T<h7gdX|{
z@8KWC^1lzb)GITl<zd(~?{VU;-M->MrLUs>p@Mv3#9cdi;+sl8&VwdI@;1*I4=Fx&
zlh*fDI&HiUd`s(P<7(M@t^2mpkF;w!?fUpu;L;A89ev~dq>u6bz16$!BVOg~E8p-P
z;d8L}oYRl*050t_;>h`fzpJ<_pI=A3YEiHshky(JQD;2-o%A*?g>BCxMwS0C^>gdL
z)B)Vg)3bHH*!CGDedCH?zP~o~&^Int`kmNcv%aVNs~*$#Tub~q;KJX=U9<f6dYJT%
zUcC;u)GOx9qc4zt)RB8jzOQ`7x$a{bGS59fP`r`*mm7$`;)ja2I{ozo@y5GUo@^q0
z(~n5Mi`Hv5t+w}J;x^94a?<}4xYXCikF)h%^@#Erp+a$oX5)R4_!#xBL5}CY1DA8p
zLF!SOyXP%?RQWeM`qGoY&A57l_SXkF{*QW0@n)yKqrjz|uXW_op+8po#!4MIYshCI
za5FD>&&1ZNhxC<>J>f}14||9+{*U{K=G*GvR{}S3;|tm_x3hhoG4xnRT7%>B%Ac~F
z_bK6Yt+w}`pD8}Pr?!}thu?l&@`WDGcxUIazn1-6@PoXY+0KN$(+!UK<;eNlflE0n
zpU@27#BwekBhHSscA@Kln|bQ!=YJ!;jT>X-$;@A9c{V%x^BaLnzBYc8o!2A8M{d-D
zWY@tbl-|an#PBxH$;1Z+g5^Ksms(D|1jNwUc|8nV^0o0RZ8^VBe0Ylv5ZhnJKB@H0
zrzn5B?p#9LJx{rfxGVo3vGk4}SoJIA@1B!31DAGgbjI@sNI(1wEhpWQy!$PF=Z>bn
zR?m6H@WFo28E+eYt^BK;_W26&n3M0`ztM7L=4(6rOS{Kg4_wBfjYs$~?G|sJr&zB`
zm2f@jW5jJdGu!^R5g(YT<-bF-@ecm2@)`c9;(yp#GuTW#cBJAbQO?Z%oz`o_sn_Me
zO}{(#^Bu9#5WJc8%q=Yc8-B0#vT*{hCH|Pf5B8d!b*kzQLO;VBcjVXy4Gup6XTCg5
ze1PSwB>#DT)bfux>&5lJrJOcCE7S(_JVLy=OB*c1b{_ka(vLgq%j$8(8=ZFeJ8>JI
zHbee@`m@$+|Fd)+9Yg%eXIM_&8`_EZqXx%#pj_HWe8peb&a^9@O1ulWjPq97?Pn4H
zq``6S@U9Ajyps4Mq#ttR*sK1k{4?|$--YxS5+Aul=L?o0^L&E%nA1KB{+I3ZMjcm!
zT5azR;4;qf0%r63e)2iyS*0IX9Fz~20XK5e8He8^eHF_&hkOqG8_VykYtx=1exH{A
zD9(#M;x=xTt?y01CEo#OeEz}Um7a|wdN%Vt<L}Df#<AL+_|?Q)smC@Ee+amYa~q#)
zf8yf?hyKic&?e&RpI82)Kh%La^+5gN*Z)x5#y7r&^auY_%irqApHAR1{%ySD&yfC3
z;zLhse`R*nFW&qwrMGbmwbwlFQQ*=(G1?E&qvo0UZ^Z|l{<@HOEA_*t$mn6>$2jx!
zRsYd^y%V&fo@aku4P5daap-?XdK+hXPnPEmpf>I7oC}N)A9wWm6Q?NsQ`BcRlTR=4
z&5m69BXHB6j^3V`s(fr*Pust@0~h*1>bG`0{G9mMRa#MN_fGGi^y3$5zuWd7BR=qT
z#XBgscdgKTA8_h*32@1Gi1vhiS+CC#Z*}BYYMRow($C+{uOZ@<Ja1h@K8H?M`T-tz
z9?$-|llbU03cQhc%T7w)%=4qA<a6xKijUIJd#g@Y?*qhJ<-#nU1BgEfT>96>Tej`o
zwF~*s@91mfzh+nB)E{hneiXQjtIM2m^*7SH`+|pd(|i|kf3%ML{|elU2lnqm;tjhi
z?wzR(u!8u<iQ72Qwmj9ZP<k8B{DY)F4Y<^|nfEp9yxVyX!7Du*S91pGj|6VU8|5UI
zeQ%|q$2~aC3)^l-S86*9IPGu&aKq;YEq^oH|AsxaJT{K0<#Qw)6@)%BSLv;uvzfS!
ze{SphA>dN4aq3sLUOyzgyDwgY_&UPBxkJap`&i!_flEJDviz3M{~8?jCED$0lfHT%
z<>PHoKK8u+E{i+s$ZPji{>_wocD;U!!4LATb@cNafJ-@T{P<g$@9zvfdWmx87~&QC
zDgF4nwcV^7xY6RAr!z_a7vR$GHjZ+f_%SmS_jtbWcH(y$9Qv>$pLg3|=x2D(rGom&
zalobgL(V#T266WsVGD8hJXhk>NWY9RU)wL!5hq6Y+qmp!s5tYEAYOTkwuAMDJoP}O
zckL=aAnwkiBn&u4KCBMbtNLK&W8*H^_B_|%&;l^t{m%uYxA7_Mxc#`roqg`YL)dPN
zYiRZJZsMcdki9~y=N)>e(vRPv6D!W_F9dG-m+>H9$wJ*q`q(SAB<GRO(-!Yh-~*)p
zI^vN?`#<3H`#x35-^THsL;BNzOTUc&K<9-G2YV^$ZG3f@mCSScOr`hglztWk%^`;?
zK6<rQ=s4<u3yHgO;B4YHUjKTw!`q1uovih;`^P^JZ~nC6-=bXF_ejmx#__P{<DJ9@
zE>OnnS^n!@sqM4QX&>(><uh`=)(h_zG*2t>s`E6%@3EX$5_kQ$eh6IJf1G;09sh?M
zt^5b%f+U_{^0}J0YyZEO_~5IxJU3|Kz3mv~f2t$5A2B%efBNUV)8qoNbFAj;>J?8A
zw{cLdK781#lzy1|4!iEp2QKsL0Y@+TvZdce`-N`YUhAusPvhl^U#W>#V-Cvq^eW$P
zXO<Rp0dONfIlpGI{O<rR?b+<q_g4mozZT_~ZRZ1zQ~E|nZg&C~{*}F25bGcQO^fp!
z-0IcO5+A-$JIa>!*T*ZL!7nTR3HIZP*DCJX^Sg-K_c>TTZ;UCuYkyt=T;za#|G_WW
z4znRBq#gEm>h(V0QZE~K#<ttfiH|w$*-$NfV4rmK<hL38V9#^*5u?C`zm4B&%l{m4
z_dNQ@8l@k)L<j6{+`pVkd}N8@`*Qw{0GIOExa?N1m|e?!C7=o(YoA|Ay!n1@_!#+|
zSjT*Mo@L9~N!-RIw)imdDjG_=*^j>mF6~_DwDaP6<?o*79@{{i`u0BTuM}{@$7#=7
zN$<+Z9}=&+TgT@eENJa)ZJ#QxYlGx}4sbJHxL(_O-A{b%UCL)y&b#wZ(DDpA<@pY9
z;nVEw3odw_<~vBerIGpG23+VpM_)ST^@`i~8DM!b&qm^vJU6lXwPE1WKAW9(7KaDn
zKl(*2_(xc=-Zv`!bB-P3A>cwk_!k`r8Sak`f?-z1RpVJfec*S%g?{Kj9f!!yJbTYk
zyy`c~r+qiY+lkxv|5*8OjivvhGI*AJo+du{b}c`Tqr8XbvYgZtY=1pZ+_ksNoTue)
zrT+hb7RQ@sa7a7G)o&zzDd{~&ul_P{sjn-4-aTLOaq87}zwi+8v9z{N2j|xw38f$6
ze(Ff(dm8a7%EPt9?<em1+w8PJ`NW*@^HJc^Up9Wcl>_$>x9?f7c<n+R58m6ff31H0
zZs4ZA&OMZ05Fd2p$^RNYkhgpf(nglQdy(>K<bL0d+j&ijXPo&m<3z<rd9G^dTZr5E
zUo*-7THrGNn;kuQob>Mb-GPgzczf};{v-~3Cf19A#X6#D`DI50{Y8czaSu2zW|97b
zzz@eb$vmq4K1}?6;#F7c0Jr*pw<Nqgiz4tFfD8X2r``Tw@o#Isc0K)11fQdpDxcAe
z4opsW?@Zuf?Q>}a{v-17*62L_73=%g2>RK}!uekufqxCS)YltU{!6ub-k&1qcU!LX
z!`ExQpw5{mPW;y^6@c1ip0j|5)pu<Ker^Ol7=aH*;7<TI_2N0>-C7*)_05V`-LB<l
zxD&4*cvv~#5rKacxScy>ZITDcf5;gpKOsIipbcZ!U+<*w^2{baM#GiWds~QC-mK;S
zsCJuoF7UAOe;@+)R)o`^0o;s#9=uz7_$Ppe)%T~QAE3WeI~(I)#K&&Xa=w=1>WCJR
z1K8KokHyN#dBDTUpN_yEi@>L1{7Lz%-l^^UWzF7O4m_;<8-a)A`@snO8S)=uzcjMG
z8&1}ID<#k$o+0xu;;<1P<an_C_%QKdN6s9zD!e>#;MYNZ4#~U3@jOWWaz9+^JLcSj
z`yb+!@6dLr;C%eH;RF9qN1xg4l<@XB1o&RyUwMe;+sk~<05179^B(91;$O1(ceTSm
z&wNLT54~LhyYGA6@aK<xN&Tte<&n5}lJAHkS1%)8b%^F`^`+!#;q7@j@UZdl1K`4c
z*s-4<db+M(!_;T612NAVfrpKs50QT0_1bPVq<=Vq{tv(<-%6fS50QS4GuY0%DIZ&}
zSDvZ(DD5h*Cw({Yu<~3Ofq#R1#s+kvV0fEn_cv>O$GBeaP5ws$597Z$0`CVN*3Rz(
zZrb@e&G4<{|C0zl)6NRd_h{f@{1->iuaCgbkHD{rz&{;<e+RhKYxJ|)o?~p!*Cn+*
zhxoorYY$%qJgoeeM&Q>24{QG~N8pb|;E7g_D>{(w$#o<NJdFPpz{AS<7;us2i_TR2
z*4j6oC;vgtQ>(Whk<xrKS8KzpV!yWn56kzm2>dg^!^-p92z<xY;rw3{fv=9hF9R<9
zTXmK;*q&Tpt|K1fIyjH@`c4F&quavE^Jd`Eo}-M@6(gVXfSdVpw8{Y<+j!rI;Pb}_
ze6RNKd~1M9zWei>&i3Qkz$M?IOSPgOV|hLt!RPJ>{14<a_#c(ecX8i(e_HvEeM$LW
z!Et!%8rF9}$BEUK&I2CS?^gj28$TZ*pT-N7zqLRA9=O!^7~18m9#XS5oX-z|3w?&;
z+|rLn(C@WQ%kR<7a}wJvPP~%$Nxx5g6Y#L|-%k3*ls3#KN&f)xp+T+JZkl*cM)0ZW
z2<N{6xYTQq`q(V;|8xZXL%?M`G;h%IJV*K`$>#w_4tQrP|IN;R|8U@DTycMG_eYnK
z{sD*nHsE3Hv)lS`e02o=&Io+3&Tu~I2>i+j{1d>XUIUACoZrp%8IPc!u^~L)<0J4E
z;L=}>&OE&bxa3>;cg@%8Y3Fvaot<&=e&Av4`K<_ipKhVYx!A!ff9!eXn}AEc<COCU
zu)gm%^!(sU{t$s5mr*{0^yfK>zd0LtSUIl%F6}VnjFX!p=pQHlF~+;lT6o?+BIpl1
zN6XXrW*z5n*ECN(aIs^I)1Tb>Ei4BfR?hwi{I&@EKM{CsPdNXRfSZ12|6a;+9?~05
zzXiCIzl!^ab)^3laN(1=Nad|PAN+R&pSgY7{_C7^_(|Yl?eJ&>{x{$z-;FB35-iW&
z{o#BXfJ?tO^FFVY!)Hd&e;By%AL2pW&Xki!ZPa!c;(7Ej%y$KFL(f1FA12<pN%67A
zmBEjQe;jyNeMchjzeeCQH;1ol^MK2|%W&RZ#eBPfOF3hgYJb`N)JH7-RW0X7^=t1D
z;-d{Zo{!bUdzN@B6^fsd{~lYy>opU&=`Zeo?L1u^LH`!gkG)$9V%u{tg8mjm&qDZ;
z2O{YI6oDWAmhkfTMc|(UF7smO4JsMhnD?RQYJFX~bO!LSe!nUL{~GWxz2!ILKje(J
zDd&arKOVT)0je(6a#}mUnh5&aBJiEg*Lt<mZ>p2^eG~91U%T5sPXjLFaQOSWzMRP~
z&Ly8gt~*x#JY?wYuP4&SNgumQ$Agu_e<SYNF<$jHEoUX|QSTtDEyPE8PGR+ylc6ul
z_`J78`5&u^cPVk#Zh0H=(KB>Hyo&t)3_PrV_qjmj&oJ%MR{ks@evGpotphIQaqS6r
z8hZHoGhWse%(wAE(lg%9LB!9r_{BP}aZSKHL%_qz`6b|C<LyTg^iKdc>nZDH_5b~_
z9!U8I&>r&G{%QgqmhX>AKX$r~lLeGB7hSCLE5rI)ef~qh!}vT1T=E@q?2t1q38!y}
zz)uEl>gDLA>m%qd0WSHvdenlqE8fWaV#k`+@VxVYhn4@=5%@oWhxJPh<hW^P#v!rt
z{B+{pFSXx!9OzvQJS^WYl7964O2041|Kq^Tx=+1jCd<FWWvo|+cGRxgZQfD9!}u?Y
zz<(8i?|FGRpA#bRRlubk-1VqCg8l;1kI|pm_RD994_>70un*f|G=k6HBJhK+2rvKf
z5qJ}DDgUrDPu~Sx#$hA%7CXQ01YX6y@F(9PAMXQN@L?Vt{tkFpzx;#rL-%WatsH(7
z_FpNFtDnc-skl2%P696DWQ^yGR=#ZjF7n@_{BL4m-%UPM|ImSB_oqYe3NPo6BJjTh
zH{;*gH#J_V<!L=dCC@zOdnxe4{SC&yuQxc(*?GTj2IbFBiI1lg;59k#Z^SF#r}#pS
zlarvYgthZ(;8ITSF>U9^C=h=}eE4p~EuYzg;q$!<co_eo2>iYX{OJgMpR2T-L)@3q
z+~PG7Z@p8?S;hLk2Y6U{?geh-2Jd|w%65L7eB5)WX|U@_`;2}q*bdde!}u?az`ths
za9q)><~@2%IG>jHgyTIC_*KBAoK>II26&R~a2xT;hqe5#ApS6LX@>#c6S4N&KahT$
z=d%fpw`uPUFV7s{!l#jTb!*@22QKAt&&xg(!RK4Rt3t;APsnG0@rdkxeE0W-muJ5S
zyb-v_s|OrC=TzWQPIupNSp=UuNI(9#$_H!b`9%c%LGKSQ=bL~_yTvG%>{vP(xRmGW
z71~k9vcKLH!RO%!{P*PJ-5VUY3$G0?&uIpS9gOnu6!yiY2>M%qOF4&mK6oh0`5^JZ
z-BpnAYww4U`$9k9=mY;G?)pEx@`K^!yb!qX8KeEv`g`9TL4P0VtCHGZR13W4fXjGj
zrk|BvzYhP9@^7ZTaW328b--;O`G@I3(pTQC4e$`#p*4d365x{Wh;#4z!x8jfkHDXZ
zz+d-2Y-gU&TK)C{;9=!|7`R!-X*afZgWnmv67k)Y!%S9sj}I&VR@#B>_>Th*<KGHg
z_zyaI_%-A+@L?UFJO=PaBKZ6`0{>eCzTZc}%X3r&el~D39-RHoB@y)B0WRZk)RE`E
zfPo^c9i|K^y}M65koW-ieRdp<0T1J|<8@L_+>fKYwfl~Pfrshgvw%xED;;~<j@O6N
z9{^nFt9I7;@&=CQBW_T95PF$Bn>Zd`4?L`#=aAm@Tlf<3N@w5kzr;tlk8b6<v-^$V
z{O3pD?GgC7z@>kW;d^JS9(WCKsqe^pbR4ExuiGN{JVg32>LGUg{|$Ip`_KHC@^|+;
zrvVS6zZrN~dp;O}x7;N3=wIF=_%X}h3*7W0<0xAFdHKf`cl{jh1RhqNuaMsLbND^*
zuyP)7v+{TCW3MIdt{2OI8~wrYL+B#C>+kVV;9>dROFrY|W9P*W$Y+4(GFE>$<`dz3
zRsffFYn?;An&W>9a2Y@2&c6F&5qw4>@O^IKICSnMy#~1S<M<vbZx^v2Pc=9{_>%Pz
z_)jD7|Bb+B-m3Wy(jVp?&bx)c&G`Ag&I`LQ_^zS%6@-5td7IL&J4qYtShm|a#49%|
zpJzz_8{lF6xZCaF_z4mCnGtwT1pZs#VdMOePlo6Fh6wz8;4*Fp_#PH(H@=zp*j2%G
z^V`6sJ%=ght^ApFhvLm&)b_cQ?RG11*Z=b=;F9mydzAm>Y|s8-9arnPjt>%lCveF(
z!}WS+;<piZ_u~%$59^nmKNXIj23+`$U#sPRCCmR7;9>22R|KEOBJf@ARQ|4fJ_5M3
zgU5C7wXD}F(hpNUYb$x)Wx&Jgb*tsi^P1%>|AP_qzly*Q$GRo$;OeD|h!3#+_hUKN
z6R&dQ^TJPumoo)i_{ZL>?R<nCClT}yMBsmkz<>Xl@O*dutmf<5lbeByJlx;WV>^I{
zwe!1xhmD6XN6`N@0^k30TAtxsv>m!AuWlvo(Qan@{fEHA%JWPF{yOOKB8Qvle`xo`
zDd1+_F>ZjJzvls$ere@>WNWXx$I#=x58KD;bqhbQ^aBrS`482^d%MM*eeT`B!^;0C
z>0@tJJ?$xuhk5s~eH{DYdBDT?+)jFTz577~{qH02sxO4|pC5s50dCsu{n~-H{J%2v
z(5tx)+A{Bdua38Iwr3UR(M`aG&tOX1VT|pt*B2Fc{ZdW^F8$a_zqLciXA5wf7a^0}
z7J)wmJZwDw3AmZ3&UxiAUkc~H9JtWCcFr}od7=DY^W|_p?Gg9@aN$4B^G&O-ewes>
z&h-c2(mo@!kJ)j)^N5_&;r_nkXL}-WQ+~&v%1fTUtUA@#(49$l^`zIvtCI_tET21P
zNiw-C8LzM1)RW3&(ml!Ewp3@jJ=xyfm(28ZxA(X8CD-)#0!j7trMvn%y1SCy8`C{K
z9qsAV>dtg>b*gRsrc_URV#VTREy?<o>8{@Ho<vJ+a^k<fx>Qe3YD?4VB^zoJfv-9{
z+gFpWZ&r6kxTiXs+7o%pT32t&T54g#%D7^2i!bYz>SUsQ_3D<oWT8@2H`r2~)1N}S
zyd|BK=0h7v{R55B-np!%Egp-<TF|zso|RiN>HIZ}*IC}F)xF8?{=Oi;6)0ui7OKwv
zo_*Kbo=LW?>+Wbv2RUimhc$t=!0No&)Mh&(+XPyDUrQuYYu0phrIWpV{i}m{OD)<v
z^JWmwW+3gLEHKF8vvia-^`(1Kece5CQ@uG2x}>&Y<?<Et7A2Q7&0R5P#c9c=rOiu{
z9bFxL$z*a)e_!|7o>Y5tPsheoU;5<UbUay;Sh1`Q%-Ys9Ea_O?lj_;x_R(apJie{F
zvt8=ESVlyB)2dWwf4ZSBwRUkwSNq($w#}R4@x;dCJ9>Lk$H$7L%QE#%?<Ljs!TOu3
zbM6<eYH7a?EkKRa0Si{lS-3Q@Y^8PrOOmWBT$ZvKg%v9(_G+>drLDiWuX{r>#gUO*
z)0IqhcBFbcy4EIqaxL$0Ety*p1)OaqK}Z36(tZ6sU6>f@&NamsvK)?u3mTv07IbxQ
z(Pb+1<81aTf-|(XV?(C1sd;f*^8#7gvWrh$GLcBEXo)3PCYP^_B}HU3Bv+)@cJxAG
zEbm#B?&(;wCE=T8+k$Vl(EeFjR}-9-6H>`eh%nhyUwD$#))ivx7k1U#fnjL<YpbJ*
z2|3)A^p~-=h%#;!Hx|%pLB0K}lTFFx(2g)t5}W%Vf!i_}XTmS33(K;%FV)x478Gp$
zs#MET($cqNRx;VqErQtPTH_3h5QaIcQC)3FQ~G}xwa&Sj*Um1$D5Pt0csPa03VVNn
z>=3WZsY`vVKoRUu!)J$-#%Ngax!s%7?TgbrUFpvF><y{)&`$+{8r0P$+GW$JsrDuL
z6yfSvR*q%*lWS6)ouc5X<`S>P=r9_`qEuIVXFAc`(pSGR-3G0pN!AEm%@ZxDrk2?v
zf$(f>7U7fqot7-o(qLp#?fj13OscPK-Qx6?mULgDWnO-@E?#ZRwy`6<DX2}E`p=pp
zdiIL+8Wg84S>Li{LsM5DO5yZYa1E@^Dp|ehzGQEAe@|OFsR~vSQ*c73hniSO@60X&
z$<A)*x<)@w^(5Ovl;Zql$}UpLw(iW9<eDB?o4eYRJ?UO(+QF(riOtJwVs)8dmct>x
zrdyqDc`3oVR3@$3L6A+}K4_U$pp;ovMC*vQ_iw23^Y-mvr03(+OVb;=d$ug-?%9yS
z0La@s)p0wj(i<{;TcBpk-mX32<k+b@w?IzS$-tf_A)AI}7$o&eQd?x#G6Bp_Z<Ni4
z0ah0%Pj+La<=J4Wl3b&82`Sa26v-Antm#_%8j?w)n#ZeGLnUl8ZPc~4sja0i9&cNR
z5suZOqp!EAO(uU^A~CPJsig)Y0ICUQO-&L@O;>Lp_R|^F>RbA%tFhGRem9v(ZRzYz
zwPP#Yn{F}{%ud}|$)z&lARIOxpGo!f>Cb%{MQ|=p4E2=km$V~G7)P^`fz5D3GJjW1
zes*C>L$Hdq*?Hm<S1l01skO4p)U<oe8hk4R0SRW>wt>Wn&Sj7s6CzUvctid<x{dQL
zT=@y3LrCTGR5iZ<$WFM@n1&U1NllnC!ELu+B`RXOy%nrNi&lc}S&V)hrU;sf=BP^{
z+*rU+-E%3pY)Ey6X`i*urrvBH{3=#5phDNU(|1Yfx}MQpmqE}qBo|PLO0F~NPfybV
zpZ??De?b9)9gUX0#ObuF%(9#7S(XtNi+HlW3(E7xbhurzx?!H#m|MMX0hDvI>1~lc
zZ=w!1%g!xLP@GN_a}rb?lyr8w%UC8m)-;_uy907CwIQ9{*t9YUvP5G3%C^qlo|Z+)
z2COJzxLh*7E76;2>uhLGw{>oW`4KxtFiIq9`@~Q;KeN20DVaFA6M`H@&E{kxnTW?A
zmN%rjAaY>pLypNgU0agvslF75S|CH?klXro5}GSiMyQ^cgDaczwS~0cB(z06+QKZt
zs>4Z5@-|BSM5D|rrBUXyQL-v9lwPa)wk$(4iO`7{H-X)HJehxp#3p5k%$6a7tO~S-
zDDE60iBei)0b2t(VEE)6AkF{J1_+uomTxd@dm&Qm3XwV1wD<o^f)vpZ|L?>{5pDAS
zL3R|;4BLBX_yq^n793%9_jmQl;bgiq4NGoQv#VJ&%*$k!b+=;|Rd6xQQ{P1M0<qh~
zleNMNdQ7TmV`9m&*~#SEu70x-tWGZJZd;$sba!^NZAnyP$*4|j<a+mFFDcHKUK%Ae
zi)cGcWZgL4+h%3XuVB8&hQj0*b{md{NiXb6bwI?n-RPZUDYgwux{XFd8dk3EEiaV@
z))Bm|%RKo_U5owhT3FM#@WM2WO|NW^aptwA`~UGqZ#EjHqibXLdO4Hu8v>R?eUNrB
z$5J&NU41?My|8j5GkxZ)C?2B=3l6a5m^?oF<gWI#c*?Z<zDU#?+ix;!{Pi7KBeznm
z(hY6h8(>5>MXgOv@RD*Tk#~C%CKJD!gABUF?B1E|psNiI+XH*|iOqkF+5C&qTXxv7
zWL*-bk6sLT@zQ}SS-L&f_)%S-g56|es;9#kY<kk?^mm}fN%75*_MBxl)iArj=8{Oj
zfODe!2m1`{=k+}>LdmI6TU)xfH(S6Yd}$_X2Kz&)jg)FhEJuz39-+3cKs#)$4I_^n
zw>GZrx(O*bu1jxNnDLiFgU2~Y51a~{nsH3B5nb9Kof)GoF4&DZhbHkCxo#aVC?`=@
zOpN}nwf(R>Vkgsy8IR+mj;`KxPhY{gSH^HoOjuW>sbQJLp-ym$)+XVFmh7~%$aD+l
z*u)Hg5w9khOtqg4Q*3fwI+fX;1?@zEeewh+j3Q5RO53t9xiX$ySsg5PJsd7f(~8?F
z{4!stCJrri%K{N)fo3-4g+e1m#Swb<_HF4x&8!7RGf$NHzssJzY_T!(Pb_MVMZ2vt
z0n0Z2!|{#(Jon{Rh-FXQA}szBVkod~{<ohL<yl8@n6&C;vv}}et(fMjZ-YyDx_U``
zwp>+EcY3S|8(QEP)N2nsorAQxhElHzY@2%x*;%4~tl1OM66}`wH+S)d9@s|VN0Qa5
zdv$MIeujBe)5zR%t*^DSO6`H5|C5}()+S|-HE~ZwPJFYMHKLj}Uel7=m~LOiO@>kL
z{VUj3n2_^#1x>;$S8cN#%wR9SV+$0YmcjD*=NUMY(KdP5Z)L;e;XHqvwhGN}<u`eO
zF3p<vtk>#lYzS5-5>W5Oj3Ijv+<@4OJp+)rxGH0}{i`Q5KI}p4uhmt}=k+Sb>+x#1
z?RBQ$Ln@B5O(*J^yBxj4lVYV$b#$kN&nqNXd<1>Ks`K=a-7asx!z!40R<1&my~sDG
zVvTY})=Rd@qlA4a70N$X{)>iD{)(1y!nKtnDSvskf5T@7Q;67!$;HF<=01UOuME9<
zm}RionO~Eq*42pVwnHxFrh3-)s;6iBmaf!>?yL?Oub)R6Iklc6WFnbb3Lk~%?GWkg
zt*BQXNgdy&m5R<dWcXr1QOAL#aVX4Q7lAim;pU6Z39bVZi>3I*(<M*LaIkH|&^#vz
zU%gl|US}N~=XJxwVY6KLh25)(>$83^gw<A&ZNkBexoQIEM58K~Z0QculKy`-M0Nr@
z^LMLSBk!zw>5Ys@lVvL4G4>;;hWKEHOw#RKf~Y}JLNW44hV9V`Tb+GW6V$|`#AOl}
z>T#oen@UYcLx@cyA9Hpfa!?pDbY9#A-j>b0RZ^&omM&p-<S$yhUYFuc9ewNI!VeRU
ztciJLhPkZhYDKeJHl;EF6ObAS9Z~<1D0kFADWULrWWtNL#b2O8v$=zpbo@8NS2npj
z>JoY)|L~{ozO6>7VZSw{&(kGaUG##yx5|)_zumY0_ES$lvVtw-7R}0*|H2Ae=kBen
zT%NJ=U)Yr|!rEs0v1wbFK=V)5@@!WtTV66H-6n0CXAr%Z8g$Z&X*LT>;??VY_mItI
zA;j)C`^6A@Agk_$6tHHy1#8GxirMAg7<W$SYidJS1+Vj$e3w64a?JNwa#5;xU7z@w
zHZ^aA$xUo<xT%TbEJWpLPxqxdI%gy3pM*H<+XB1YEI6aWEv~QIkhS#HuEX#76U4p1
z1>bIjBT023>VSNsal)#T)v@IK^qN$EXP<uAZoWk}<|~Xm+(qt5%P;KhA^8dprrF}z
zSQ1NC#}Z3c$Ibt6*i+XC<0goGkOZ}o=o`s7orttzt~{Ya#Do<bcX160bi#%YDv?Qb
zi2pHuTAP$#Wl#8&RB{vY)d?KctypY4ZQ~8_UQWX$w^O}05$q6=98oZT0Iio>*tGFB
zYT@ia4iN;ib3^So@p1C}y0*kXLU~xlW0MtfByT49LipGpE?i|IojMXGUft$IbTYpA
z+1N~BF`Zzt_Z@E=&)Mn#o8)RtzxAC6EC;L83zKW;0mWogmCYI4{`!McAR3m&N^47-
zP&OuF1wk9)p_gGSq)(*1C&E0mn~*?yr!%oJQD-i9npP~1E-yR(Q1G*>uE#YN3TT#e
zVY*9$dL>ec+BKc2wJ5`q_$=JUFj9Q2{km7|ipBE^Z9O+jZJb~WlV(wO_j=oSfo$Wm
z;Vh383rj%<LR7xR1ecH#AQ28cU_wlacuha<ZnURy@dBYL;7DA$$9`gUcG;Vo3@(ZC
z84D;El*KJ`lW-$R*j5jgR|0JyM&*mOw4trv;xBHVUdo{hZ-PLq<s}Wc0+CK_P&=dv
z1e=`|HSGNsLs74Uxc+0q#`?!!&?ja|bzGv)EbQ!Fo$Az|w;(JbZ_eiVUA&qzQO3TO
zDIaybtA7JVpM>)&Ox%#^>FDl(LZ#C@-Ba)<)zwmqKr8ev??UJ>a{)L{!DgG-J5mSe
z^vS+X+#-@Y7Ve~V8ZBPe*5A_urB9+|W&0#gHqqq^@1q9D8|{q+h+)^<-QAh!KwwD|
z1vq@<nR4VCa#@0WZ04IDGTNbpNTw7HOw1+Kv9)HjNQ@c@367X{Sk2IUp#@tg%^Auk
zUb7ZTuZ#!@Z5@kZu&BGYFIb?qo<0P1UbE)-HcteV#%E3TZAi9tc6X&Aa1n+)$uhR1
z7QNz~=7}qW-2ZQvU}#|!a+j?GjuME4m_*n1Z1L9gNDQF%{tX+p;3vLM@Op!ro1Zk9
zMC{6?I7eDGKWW^v0~545?ibI9u9)sw17ElJ?1^Q?x1M<OPdUCPz2<lX4%oa|f@1ed
zd~=x&8zs22T@&kd`+@FA_bp9rS)E>hz?IUJ&3!!*P%jvy%xLJfaDDE?su_=UbYb~Q
z%3Vx5)J;^}nz%purG!uH@A8GhG!ZKiSVwcNiJyY#?{>*Xt-4nr&;qt927>f%LV#Ta
zg4Fos&RQd3p73Su+Wx-YSyB$shO&Ra0Z=?0@c7Cd8WhunshbFuNp(!_M!In_qr+}4
z=kcFR;CL;XldU)pK_sOiv$5agj$*P_zOAlp!QKtysk>`=PYYuC;RdEQy#}!|a!1YZ
zY^`|xN(mFrDB(>QKsNe=$#xd9^>v`ldLqyqa?Zy0L%S4H5Zr*A5P;jJe_Q(M`uZ`i
znojg95-s&f{m)luxkqMUaXpa_bL2lcpYwTVBe0ZsXU~;T`U#8zsc7UNF${!FA}K#Y
zIHm11W>#^`_|6ukAN`8I__e~Z9y-j%bYeqbo>iNWr*sihzah*QJJ&GgN}6OrEzaD0
z^G$wVABg|RL{sAnH*O_k%tkjjIuWgnAYSsI)e|aGLZ_zicPF8Y0>c2!p?YztVhk39
zf?})$0fhi<xg*yZl9)$$;RGD3+F{~A{Kq^4(vEOc5<gM>HeubrWq_DzSVH+$1WH#Y
z(;+L4q$m>tLLyN@Xd)zO-V)ZyH0%T=C9IV~jxENl`SEJ3TRox)EW=8efH$_N?-<v`
z*uOxQ-3XYU)I{08YNG95=Bse~7ubc`ztr|+Oic)>8pqvp<4lfA!%fKz=-F8!aq8ut
zY6+<pX{VA2>zm!qnQCpKeJxr9%^?8<*~4&~ls?Yt<+Hf#&xKEWx~qLD?1fNs_!@^S
zRXwSUs?-E6MRaQ@mJ*->ib$e)V=5p#N*O)w;0}x!e3)2L3QC6h{NoBEAFk{YD>2&~
zrZGw#@c!|RKEeeatO$ja1vBkT&WxyJ)>L;XFI2@c7u6E_!?d`$f0m;T)#Ptpy?;Gl
z?_$7%pd49})3ni}p_mpIhdVqvPyB`h*yOc<uKcj(a3dOgrgHhLf+ZUg%jnN$?8NI=
zb@cSXsy;D4YSWt$KN&`J-6KiA)$`ACXG%`nls4>mN5%}ht&iJbD-q9SWu-=S%t{Sw
zX%fj{T7uen?L_#HKDd9II4o+M>gw;r5e3v$6<!nZLXdAUi?Tb9{BJ1oE9@IJ7^<y}
zEqXjh;Bi=&k#{y`xNo13yT8ddm}C#;8%DY##h{kmpp-nYk;^T*hI=*Ye!g5nMLI6|
zP0BPbX{eg{YQ|~kmcwTbEN+t_v0DPWokI5uU{UYx?8N&Ik`k1^X@g`Fh`x(a-wR#1
zw_`2h+L#c$$boTnq21WwWQ4;cE7Fb>FuGC#%?kk&trJ6)l4!Vi<OL3;D#Yg3fjX74
zp7^@UESAv{T~=qG@^7@iB9>MB3je`yi^r;p1*Swizj&sSRH|N(qo?Pos;{1)%H8A?
z=up2Z!{x269S6G%vp>5Vmc^v88D>{4z2LbrUu<hEa<c8M5zQ0!8JU9q-J)6GF%HUH
z+h54$!62+Njs+oae=yfiYV?3XUtQ4aY(jC5CFI2ug~nJM-O;m!f*X~8!8HJ0@ipa+
zH>h<e(V5-=4PGomO-p3F%3t!%wnV+G)U%er<|PpBzt-b$e?d<=Z7=HZ0)BRVn0(7z
zP;@}lHQ+R+(C!i2cOjF<x8jCw|4pZCrDYBQ!DRv#XY%xmHyIE1bs}afXl({&y%up>
zz&JOrT6)O7)UVcb8UnnUc=32YM<6e=n#O&v<uN<)vnQe8@T#x{cWt)PeNNo>xW2!|
z=*d=AOWCaAmDL#oHuZR1or2m@ZR^q#_QHC-lM%0x`+=S5&5LmOknSPdvO2__=Dq>s
zXo_3`XU*?G$S>qh{m`t=!8(`fihw>TM$44R>3b`ZZ<`!l=u#K72-7JhqWRA(7Wm5;
zYWT17N>txh=9Q@W*|+$K1+~Ch=i?!nz&7VYd|U$?`!d(4?O|&xa=lhFfnOEL-DV1_
zGF9{!ay6c?Sz&~kGb-`YYHFU<-J{Fpk_|P8J-uWDEUBWu$SDCVX-jcDtJ_0)qc4`I
zti?JWwowQOKnPpzo3Jim?i4&C$vv4UQ|Xn`7t&%Sv#qm9y(Fh<AS_6I5f_~5SEu1~
z0XHy_e)g>^P9PjJy$Y`>)ioM|6k#^|H}rSHp`aZ{<*7cORui$lSOZt!)tp9Io)E}l
z=|<pCpP-Yl>&{EXYUK<R?}@Qm07i)dcBBdbO{T~dA&{F-b@Ho0IfjQ~mYM+L+|5d0
zy>WDaQrWLYVSUFZSyd>n;bJ=|+IZ77Dju*R6`IracnLH#Q8Tf9wl_5xHZ*lfgiP&d
zwHlgjNRA;<Ub!an%1SSch2Y7C#>488*BBv^aaVGqaRpAA7s%?x`s7kL?b2-k=iM8Z
zg9COn^@xrM*Cpd2VqPxRBprF88OZT?3xeZzuClLytj($yOU|xCjr>N4M8fwjH{AJw
zgKZ)SKdpw1${q*hP|1xY_C)*?=Na|J?-qyqy|QD)PPJ*niAy#ZZ(exsCO3BUN`xK_
zK8yoAywXT&o!w`YAcqS`MQK;QOA<<mDmD1m>NUo&fUU4^7R$GCj5|lqe$DGHqs=aA
znZUpE&DpOB_M5PpVmbBkw>!pPvFnQO#GzCb(4S*r^G(45sog@&@;PUY+hqe;-2PE!
ztzT|#>+Pc%-Pui0xG&CDn4s+oQv<@)5!Hpv$Wd14a?L^8aAajwfKa{A%z8cHDoz!Y
z<I`L(z1oH1a4XJX^W^O<iDqMq&(7+}HTyb^Rfn}DcR?y}tll6C5`C7+F$44Wx&mRj
zv2InxzBSK?5Bvj7-wb!ZY0(s`S-dvPn*^d>LAOwj!^2-em$jA@GVUW-ys_=e@uu{a
zErxkfZnpbMSY-TtgH8!5fxfjPx5MDCA%iii|M7EH{gXgB;_o6<u4$r3Q9D-_A6Y~f
z1~icZwoAGG#8OBB`6qgw+v;%5I6LjkBD0MrfT&hjjZwVl_1BmJjVPCKU+S@k`L&Gp
z5Ife3;u@Xwhq`lEYm``P%^~Z=SEVlaFGiKg46MMAjdEA5!%^w7^d`TnYPq&qI|~DA
zW^hAd7wp_K*LZ#JmaevSJ>6aMR&V;qOw1qM1mr6FGTQQOb?VHm9Jt%K#|dKXRC967
z*&Th=HCUT`9zEimyh*%Q#j$p_R!ofgSzPiJ(HG`g8r=01WjnD8=E{}@LFce|QWYKF
zn~JL)fe8iM4BiEffvQ@6Iw4(`VS{b8ciC3>y#{M?#bvgNVWuKTk*v$wR_Fu~*6Zq?
zbQ^B0B-0!R@W&FVI^hmdhOIidzxQpK9j&f(*6n`1cf#sjIwSwO7|yd)>i}9UI2r?U
z)2X=`X{Aa(f0>cn#Ap+PPGE>RCE+T)UDtA2h}A_SS{7xO<q%(;c-$u@jGdtr*Bm)7
z4!Guo3lV?A!QNk#2(dmJWekyQ9|R0jS;?JM@wTyo5*R*azHixp8wjXBtOv!YE;#4_
zp*-T;P=@wm7-Z#M0art-<RLp0ws`Zn<8&IIWnE{w5bZIUh6`$M6JAi94Flt!Vb-=U
zRjd{*H<p}mT9qw<iL7jEU`KehxctifOkK!i5o;3%;90O($zSwjcf5t&_dDs0adCEt
z+Bt*-*uo-#E9iLRlkwLzXA?Me@*Cu^eZd>NVK<9owZ*`nxNMIRopm<G-^-wsM_jf~
zL^~Usufm;;!7kL<ST5V^g?lu!T(;L+`UyB1oAZ-=`bNB3^{HI%v*2d8gxyj=AuwA)
zv8ffI$%|bMBO?EqLABm$ZWeFm9$e$hM)dZd7vzQ-HM}t*+SZHzOGD7!ILUHCR{n7R
z+TyMom9ur9B{kwfVRjmYTbU+Sivj{8xDuBrHH;88yA@2HQr~1IKJ&&P!`)~xHJTU-
z#UD9Zm-(h`p^N67b`8GWLxclSxyQZ%`5U~fQE(RGte<X#RwkFKUu}=`b-!PUZnSFD
zQ3NqhB+9f{QzjI<z{>2?7);DX#z;VI%TRxFl~tdwOh6s619*1Uq?l+f|3XEc9-f@*
zIA-$kD(&(G3tI33!<BR9E=d?6Q=Z>{0jq;Ok?}cxWT)}!B^zp?Klr}#`GojX_DT0N
z$tNl4)u3f7PBvPNw~!q^?u+!0yDp2IFRu&CL<nD)in{n=nj>JO$u*1UZDO&Rt3f?L
z9%0j|mUXCEZUf_X(uVGyEm?(6dE{E><VGXRE(oh+oGuwU!t71_^S&<9d{CETKJb$B
zPg^!;Y12FreAX|okfj+fa>}IJOe~9Z67T6-y`>Mz@VZ0-=9`r=S;`VBS(}GXSmhk+
zo61_P3|Fbwq<gHnz6CyX8&Yy<pO>I<CLA&S%&UpyoKdtQd@vSK{%aj~IQ@KIndzsW
zgVPVQ6!EJ1yE<^{n|)twZLnNIQfu9O^sT&2=F+S=fXUVA-R(#4rAVT2o5)a7e4OFQ
zoz-Bk&DPO@w+f(ia5chCt2lLDPG9Y1A^g1zX7&hmetxt%Z@vmw=fSQVb-v{Na-<>2
zu298r46LXyl>~;SWrTJOMk`y)nH1^C<L|w&&Wq>WqKOr<v@KdrpS}=Vv^rd8?M<8g
zA!4qXSZ3y>jJ}9w^GxtF)5u2ZmZjVVtVgBlaef`c?R>wv2j2zv2`_-1EW1uyH7j>r
z%5g=Q<UOBnk`=St)vc~L#+GdMF_(|reNp&DJS;%P%x$MTeanx}>TKtDxw?n1fnEy3
z2?ra286ex)3DqLUR$Q*<9~LcaR`6@*_DRl&jxveSTH+MfoWbX)LxIy2`04o8ZW*(Z
zXL0(9+kpWiy&WidkGbOh{jXa25@tV?chfwpA17{U7=GGYB2KTv&C@7vc2Gm?7Pfub
z<1NF^NX|+3Tnbpp)IomYfJ#3{AA6(o)bG&04sUZ0CaV>jh@-ioWMHyE+ivY`71QKa
zjz=7~)vIG~CY@UEdi2H{QoXRj+j|o&I2M&sO-$XuSimKkPK^38EiHZlAv!evm%$Yo
zC<Y|*NQS`}4N;TZN4ho3cWGtQF;B}~Hi)Dc<fL+PsGKkEknC|MTGaMqJR%5%b#4f~
z<|BNfc4D!tQr}yTmyNIQ&&XELx5kLCPjSM`F9v0BSYMq`4cC8J!*0Hch(NVHjAGJU
zWz^W$W)LPJ3x0@FZyX8O1;!a=LXD3+Y;m_?IQS}V8<umbGB0WyHZOV`X1)sFhJjr<
z+pwUVia4;M)>)1c#zoOD;3tmiojGcqKdsSRrAx^oBl_H;c=M19r{L|lns{43qG4}9
z0KC3##A0qwuIa)FRbPKkS1<g-JJ*O@vC%&<hKp;gfcY&jZXG89N7Ux-3=-K;ch;^}
zm+0x~?veRaR#|*DV7Dxj6l(zu7O|koJ$b)cCr7M>mL==4%?tKM?7r|v=mn1|V1L@$
z!*leoSu>mgyA9&GOHSLUP2IJ{*yXSR@DBrM_lN{GtC-+nSu@{U2_80&_pHTlw+#OT
zWo9`^BUl|WXrs%00jTxiHAgvCa>Rq@y2i{pVHrViEtvO2FD`oK`aQ}fwg{ms|3vpq
z`c}ZNx1i(9kzK_6E@4*qYO&3B1}6oyy;2vP;1(!&QaLH-STYdG)YzOpF`9iupU822
zu`<?f09>WYj4n4))k`nZUG_ZP)W>T7;k&rtI#Yr&?G77ra#3(%;AERu^>|k*crs-T
zXp6dik$&Uxp428>NtTekJ?f88o@y0w`eN?b8`F!K_Wl#j7t`Tqm_Mz5c@h21s-VT4
zgvw4hd*7y3_nQ8V2yR5m*OOptG=pRN=u7#Bd(d|+9`Ej%C#I|gczKJ2@`xqR#{aTH
zPAq8d;mL&OoY<$ZYY|T`;}VCvl!)-!*Cf|3H!hovd!k+a$+pd#L4&uc^x&rF#Mwf#
zgq)NP$hzL;<meuu#p{-ItiknZb(WBmG+Ak7s~=j374XCpWyjvTDXLJoOD#4?Td1wo
zyn|Lq+cmk`e-Zv~Ax^ct9aFbb!Va7LODRsZ=Gclhtchtr2^dEdd7R}Yd(9zkZG9WM
zxKE>5cXW469PO^`Np<w~F2J?+J_O%uU!fZ+QJ0JXDqgn+#?n6g#J-d#>Im2>e7;#M
zg2=+=WEdZC&1kM0CK&=%uFB3cS7l{s1CO|S+-v4}P}oN7mS?vi)^)Nyg#e|^8=Dtz
z%m#ljipAs}*aR056Q3k&e$CZAWQ8vf!^D_mJ7xc+cVCSUaH%S@pNYtV2yHp!P_!<E
z35xe41?Szy70YUR*Qeq&2o%<5UguQy3pJo!+rx0|1t9(>{9ffV<N*)j^){M}2B|@=
zJs@8AwLkJhsYvMMPJ~O4L0-rZDW2=uB_{XQi+{B#il24fDMm1%d=a{0z*3QwQBqec
zqQaFW|M6wDtj9@-J=&GVyCOPMz2+4V9f;SPtZ8pDXH(WIR`*0AZ~|pA@q(q5&Acol
zQdlhxtnuD42p;JNUqlUCjrfc8)hXFp8=;aLaAzWL;CyrSR~pWG&f9Uh4dju3zY*xF
zoTEnfxE;HCDWWx7%P!wnWO195E2fI(95d=>Gy2E^f**Gb=9&AY3nWg1%<BY#SeIiB
z2y!ae5rw?qqD6$%KdX@9LqTB?IPc357Ot*qo4PnbWOdz@#tX|%On-li@GLekWAIoS
z+ASPNM11(=O^vVi;$a}K8tLukC2nc<T%lY$3kpZ76A^OSyZe%vp6>R3#714y-wVWO
z7)F!m-k9#`ft-<75e8lFpitsJ6vuCMLaT~;P$%kvojUm&Hz1XZ63e19nyo^4rq!41
z#w@oB;fsi6p|`ug2Z2F({W9NsC($F5aPC=Pe^s1e#0Zth5mMrUU&a}jPh12RXvchC
z9%n8i!NWC&b1_PVe`?7MPTx_T!%dYy-H}X$FVrn{>2;8RP0>dhrIy?;_kxVG+EA_&
zp<k)BYUa!Lx^(@^cZUZvrzjz6b=#n0dX!f}h*gaH4h}hE&yhgzc1o9Q7!Dle>(0x+
z!1>oV--3gcK3Bmhylumlje#X;b#k0D3f!W~?%U-(Y{^Y{8x>wx(sZI2bzsvl|1p5-
z+&VGI#KqMbF!p-pKWQYp&x`P^@oWE9!H)2yx1`P_(y{W=f0Ih}R=r6@<cJxI+x`e=
zE2H=LMD8{_yz#$rY}>R-&?N^ZCCt!r=W&5RY`Rf_B|@I<(RwS9m8ekVUBxM&cltIk
zA|%{%0%K%ylx=gwyH#Q9Cd9Q7;HxDrp-;7@oHOylZ3G%4R<aT)^bEyb=grQ&5>9&j
z%7Rfe(HPjv7I3pOQdO2d`$nWvb(H)XkEwI1D_TxXqGv)e^<#KF?H?^r$RP@&!Ar}B
zTGIR)rL3KS;FrZ|4t8@}Y7_h0qp&WUGs3dbe`Fxq+n{z4E!MTb>|o<bpJUp@e6~48
zIWFpYTIgV`RQ*eobFGDm@za3IqQ2lAffx2#U1DRRx;Bc&s$U@o7;(f8;=7~GTo!>-
z$`M9RT#z#5pnn|5p;V!{E#nQ3)GTnMkbAIG9oLvV^KqPjmys^-^53#<AW9<!zXI3N
zw@1=)6I(prMk%|WnH0%4De}S%QaiavDiLd%)pN~35<Zy4i~l-T41HgjD~9^nr{T*D
z-vWw&kB4M}V=Eux<4RSboYzPC7#7Gv#%YM9rWWV6z3Wn$G+tZWk?IYa(u`>FYe|7y
ztR~LGRxd$;(IggT^oy^V@pW)9K%)nZ%%O1%3(j603fe0sW}(bwyJIoSRPMXR5U6;|
z2Du|QIdUqy)hcO~hz><=VoDYeMFox&$`qY}6HL{PpaT|R&=->_o_woAoc6;>YV(jO
z5KFD@<*R2+toQgV_2_THDW-(_nJYIWQN)6B9_Q#1dM(lHgv?7}5YIMo`EW*#tG}~M
zY_?>FyjFeN+dUx{ikS^Yk_37q+M1wN12NT@Q)R0FWH$%4KewT3D$>p;CUB@!vUf3O
zd#U=O6SB@Vcm^k64WmctvR)6fOggnew=%k0v)Wyhg)2KZwx%tMS7*3IDqZ)8h^1dT
zBN6z{4a9;5<`AtiXNxFT8*9pY2jpZ(MHPm5{5mjz3kggU5y$tc;^5erbVHIW)lK#V
zoloGe!>~Qu!}MVdM~;mVUSGO-dHJ@#u|Sy_6c&)JOqb-q&~S!!al@D^O>*T@iD@@K
zImr=9vOAvK^}ep5)br)!YQ$>Wl)}m$i17lcI`N;geeiu{6n*{dTd;h`(1o_{+aZc&
zP{j_&+iXjUQk-3_tbHx8bvKL0{8NtaNv}Cx%#oOsH9d${-?1T`#A$tZPq})*<P5if
zc3-H`{nh}CRjf({JvZ$6fBxI%@pNY@17C)fXchPt!u=^2d~4f^amZ#UZ=#$RK8zBT
z3eR>F<HczU=nMaz)OP|GJ#dZol<Q$-Ey9oz@tD5$ud`#l<4~dow!H7`tjW<?>`>3|
zWa2yRFK&!kBWKuFKJdCjpTRXdX~Rzst<5s1xYpa(Y4=0!$=<8F2Cjap#!5J+g`M52
zQ=QG--TIDgnw`uRG{=WKy9fup#>;+pqT_Oql~ASzbm8~4?CeZ`?>ZyH6U(6lO3x@|
z2db4jCgL&kD%FTM>ee(_#7%3z2r1{tK-9afpAjKuz~Y!X*FJn=AUptAs9irWS7F^L
zZj@UmfHJj{oQ4mS0v{$k@+M&$nbpGGEr*?0lVWs*ye{du$+fS><9*%YAKR{nGv&lV
zHH)D6`AVz|%dB@N!&a&lCs*CV>&-L$$u+neB3hJaP{z7e3;ANKD~nQH?Qr31_DKz_
ziT1vR?;2&4xh!Vl@3!MANN>DOqUt7lQyay5P@MNAyGimoH|5Tj%Q~P-bZx%_x_tII
zpc_9^8TLibWxrL!*;KoxhH?g$t4bNcRE)~b8*A>knq0fRf93`ztr&)JuI^Ky@pVvZ
zwd2vc#M;mHL_|L06s*uD8@U8MjivVz@<y9dZ&2iL!Wbx1qPe|>%1^#2p&Ub)&-2eN
z!)01D-71ziFlB<3S^-%J>}Uy99m`;tYrEQt1vqdgJUH3|<#h!^UiBv8Wib;6IoobZ
zo2^G+UP~8b-Q&f--#5d>a9zxF#jmnrHc@7D0LufTXrduX(DmHW>z|+n>!5};NEy2w
zDMzsTMW>lNdD^+e69HQ+bPnA2jbkA0b@Gjya<6mkOZiox5Mj7H8u=>FGR2u23d5W?
zqF?;u8=E=AwZQphdt!A$Zu&)Ee$nGf{5wZNVa#PHj1rWJ=pZ#&xn16Ua(CpHA-Bzp
zv%A#U6Y=!KFcR=WDdSC3TyQP-?3K2F;i^3C%rCnF;@GyRnUcYPE|@KPlcl#kgL`w`
z5}Z23sOSt7b9)7fiwqQzP(EE*L`7C9<>ei}p{vwmBey&Kf-LtkY8i(XR+vP%NQK-d
z<$Jj0)IV!U2noM3(I_kZ(O1gUxN^j)oN5$u2{N-(#7TBf7`iF;3jrBKRtG6=b8JmO
z#qQ!KlnBR}FyAnpeNI1K)+}+&u}w?jT6Zs^Pz44myo$|6&hM#L?&|4pZ0ThiGEpy(
zPE-Kj1+}2!oIyzCs%FJ>r_VvezSNSIwTa<o+QFTJ;n4^do6B`Y#6>{76)}R?5Cnxh
z%yMm3rK+yZskR*(PB2$f)nNDF`hagu+qxIe7p!K4@jPI<jJS;uk9YKHV1KyaC2*x5
zgOvxnc5%q{+>?1ci2|e)3r<#q7zsY<rvfiY4>_L=a+_U65Zji>v3z@|$T!xOE|VQw
zb}2i#;w|zbsI!B7_L}Y<Xz>WrVqO-JlIyf0+QoFIj!y}^K`ma>k7zCJ=^j|pHl#9G
zd`o|G(&}uO2E6D1Qeq-ZgM6N_k}ak}Tki7V>MSLype6VLxr<Y%+E@A@kaaD`=f!6?
za7INyC!Pe)>{8bX6xZix>@xv5<<x~yH;nP>oI7cvPLw{V=yQe<D>v8XmhCqXR3t(>
zGOr9M&#8N5(+eKUz03`5Kt9MDPQtF06}%3Em%^(d*{rO=mx)I2%_ghJB_5&#m*DMh
z4dR_AbrdW;GRnj<mX-bW?x}7(-flvR%<G1q#zc@BTi3!b^Kpg+revKlPKZNR886=v
zlV3Je&Q|SR%_3kt9SXm<UEPp#w#}2Sbh$<$b3)+k^--3si;ucJl%y6_lFFQyf%Urd
ze#%ZuJFK&UV^Oy+VxqkM%MHn2>WFeicY)BG{*4e$n_#>yC=ccQ+3YdJXkbkvxreCL
z4IzQ6&4rDQuC;P5Yc)4fp)mCtlJ-Sc47JkJgqUkl0beBKjOhma6T|Ha*QR6~fttm#
zE^l=0=-SwATx44MW+&%#X_PcbR@_{-3fxwRkC0Ub?O2{Lk(kU`VN!gV0>e|gFwbF|
zS<cEVAk^TA0-CY8jn7`N_+k%QSRe|SJTi*QI`%YX$_Nu*)N6D7(?=BTY^dd$rmjqX
z-%=brm@|E|bui7IeTnXDGxT!jP2BGj6`$STg>XWBcwcg|V@<Lvoo-Llr8CYq2-vG0
zWqN$)h>MWXTv#RqbfjfCG6=lL%*@W0et2Gl-dmbn)6vzQT%E>-G>zDx?Rb|qBv3p#
z@up}i;d3oBb*u6E?e%b=RHNj?waIceGu8kb*ad`bX0~SikRcr~Y(iuPEN5leHj7zf
zkde=hDf61HNnF2eD~GS6MX8a+z78q(Qe8E4%)Xxf-j0ptHI2zWyfio6XV$c>x&>_J
z$=39jSrNBx0$UHTe6}QGY1LH#!y)8+wc3Ba+O%;w7Wh~)UY}f*f&r&XRGjXf)Y^2D
z=zS-ubcnYg%x>o@2)b^>;lw`Kp0O`EyAHkY7gM;6W@7guU{_*e;^fZduzw{_gaN35
z=etcO%Gz#J04)D4$&Gy`<QkTG8b-OWW}`cq@scZw8|0Q<cYj~-6k!E?y@XYHp)33q
zc`;2dX@at>K-r_9;v~_eZ&3UVit(Q>{69{ji_s*EV}*#E?czsnUjk6xa0)7Z!IuCO
ztZUYZBfCAxas76OXmRI#)){s}pym23bL$H0Qm_tjjmYR%U#p!}2>VT|wr{nx3{(&w
ztk^;C7gl?WyAPHop0DLfkm`8SL}gwB_wyvYM&;%@PAG!{1^J*l<tPF3HO6W%1jxiW
zm`c`$7HqiSuG*R2knZXehuWqkQfv8ZF|?UPy{ud&FpWT{xHN8VUE(!*+W;3r5yY}R
zVYmm?16&yjs0Y=qegJ=fW#TGpwHHiR1#C?OSY3YaO8;%eE4oW^?+=U|ID7hx7|yGv
zvdT(zTdHjxdLy|GyM9D3fIcX<wM~8VEx1MA(nh~gJqlgm%pREvN*PwNKz;FHVy5U)
zTjqvRg2!cdykYXk_GodU$Ht;{PJxY?z!|ZAEkj5BWSyICoeCzyp{%7i_dv#X(2*4j
zYQbg<{&>y2Zd@rzZ(h{hiIcOasUNepoYlRq=|)CX?gbVGztRPueE)s2DOEN`d5!pH
zKsFGUwP2Zg$n}~RSVQqkP_w=iVf(PIpcusgIke4AoM}we?v+NfrmF35$H?wbvK_WH
zd2iiJwK`0B(Ubj}Ye2Zl&4Cwir=4AXLp{N8wHJjgc(ar*5a{UY=o589I?FDDxt5Ww
z=~DNA0u5&_jQSK2C|FzPwd$7<*;<6Wl+{mXN9AO%v;p~`2a{#4yotcdn``FCQ6jb`
z9)%MX)?UV5<Ujr}0}oz4?eMxt>wai<cNgMu<NX`@(i|C8)iv1tccnKa@nR{_XnPT2
zU9U>MP;Wdb=4LXFkjt@7V?;NUvqQ*#6>DWnbDr}RX=vz7HFEVj<aou@#+}#IzE=*|
zLt7gr&%B;|64a{~ot%s~Fv#Z<Y_yQ<6~q_*B{RNLHYwWdITBT^scTKQIkk|HU^lqg
zB{yU{?~A5s1&7i5pd9#r9=fLZWg2?e)nkiWajcW^gi<o;nq1IQu`OPE<l97UGv%!r
zJ!<D}M0W(PUKH2-xi+84Js2$D{fS{?H>8l3qCT1bj1<FPMni@y^J@k0j+Yl~%r)=d
zf?yW4Ci3m`s$%2c1a7meZdrgf#8b{4%N$PVi&8f@A(JGp`8C9v6HDR!(KO`*yyVUN
znXv0D!~VmztT|l`v>_IP>`<C5R<5S5K6_%+(%HSqn3gOczVVjY81CW?O<kRM4U$@a
z7Mi#XdQ`Bgqo=Pw)u|9fNFsr^b4wr7P*ilA1x9k9jqHXP&Ieg911ADgF!)2Le>|*N
z7zd@lHPEjRhaX~Dr8r`Rsh5p=2i`gJf9+jMY$RD$HGdETGk}l~AZVcx0wgR)X8I?A
z)b(+9d+KME-8~})6rD~-*;$pDk;+WF$|I154I4JFKp=LA4X}a52)469njN#8<scR;
zK|-*kIp^MYU%dAs@@Ko7ZbYf7A~WOnKKI?vxu3`Cj?5&l_WVr+%-toFBTdD!$hmD<
z7$3{;ReWr;joA&T+vYh&)e`^a9q1Ea35z0RR>y#2!Zzi91#>u(DbgG)xTVlNSe%Oh
zb8>g3e1hGJ?(}HVA9g3#53mDS@n$A^S!|u{o(ZZ9;2QCDIgNY4=WZA{C4Bi7kgOk9
z8`9<WRB92z&9R+st^{d?khLAnV+32fS+q`ML9O1vg@!*`8we>f`Y>IIrUnr606}-(
z9bc655(sm{E^&`Cz_IzePv6fE-pk*4e(%Xa{@%mq#<yz-Em$+!vnn2NGhlNyZV<;Q
zV<|1!+>4~O8qSIiRss<nj^s$flw>j0Mk#s%=g()5uQq&Mxn1NYVECY;1COY$>mZw>
zX2}{7mV<G{7nDr&f0H_L{`b9H>d5K$fTmf7dPym-;2C@mRHopU;HlBhGoc_4MIglz
zt(a!{%`+>=?=-tjUZBntA4U+`Id?qn0m)FYO{YUbGBGp3D08x?soqodcGbGy4@JEt
zV@u3FG3KzVh59=0K=ejS0*fkaJkBL-Z>192(EQ09Yxd(xo?I9Ls<pVRNwS9<XfAQP
z1AA_b5>#vQRAo6$<pFEsxDv1SKD2c?DoHaT!PS{8I8S#n>0Te6J|1>f;IRvV*rr-?
zbM{R(J-f+9s=W6#m;It}(FXL#b{K{v@j;?3iHE{&kOH=V3n7NdOp!Pt{AR%+p`M|d
z<&6@;)w?&a4Q+(lguUH4z7jT={0sj;!SjuBRe?hr;bQ|$9)lyA4%F;BHSQm2;&op~
zwmZEZ^)4pm2!S9j1~d%_l_7TeWMdZEFezh>*_AYwl{K1hnbw%ZLZ(H%GJmf-y^x?f
zCCtx$Y_BnaX!4rv@}_;(S5qTv94^S0Y(gWbV>F?$0o3~OuR~Brr27(G3wV5uc%Z4p
z28#Rc6jQgO^3tU(c}7(HZ1-d;B%jDi|3Fu|vv+Tp!&Vn2?LA+gLTgh5Wh(OS3&fQd
zz~e4YV?o1U)3bqRJS#`<fTOSpE;(BHXf=(Pw8{WUXEVv_#i-xVlq5z<J7m?xQ5DTd
zDmye;SnKN7$$*Ical-*wX1qWNf<Ep{6D>ZNMy=0LMZAPrw|6q3w?J7U*|k2&??{@w
z+lfb@0N)eS(wcc`w*9;qmy_AO(b+?|b7w3s4T@Ps?N0g9LZL-Y`oF*`Z_g(kH?3PV
zOEe>}&wzK@&hx<-eQb)-SCoCOGKC4}@hmB5X5DC-1Bb^MS|pBnYPQe7lCaww!kSB9
zl7PW}a^||MxkNp~t`dS&IJoKmw|{27mc)syPe*A^Ura~~D=#$UtXHO<&X5ZTFtCa#
zE~qw)ZJeH>aN+srGUx8cC|rmVC&-k^`);SF!y{l@Q2rsMB}gPR9ooyW@=!}iIcl;z
zV_*Yh9jkebIJCQ}#=dT>Q+&zk6w*zTElt}@nnJ&1nB8R|7pTWJaA6^yICxa3TRC)5
zOfdY1VUZOV%%?@mB$YI9@e+g@!{-I*x+=k^kwe*})V1Sm57}J#w7i@k+)K6ATy{1y
z6&8afX;&+h)RuLLojeRmImMb1mk1IsPbBL$vhwOC`s(5KGSv-RIc2_9_@iPnDs;nv
zJID=mk<`A{I58P4+oj$Y@&_{_)+MBnMPL>dxl0+Zyc*WZq8U|Ar&}@!oG#1mX2@6M
zRE6~;?Otkg0#$cmdN;CK!hs4c)w&J?Asy7J0_8WuDCTO1ivqm3Dn-Tbj_a7>E()$t
z2XMMuLG;nk$+)e8#Cd;oOlEf*0u(h3yb-M<NVvq81anMNK@Uit>y`$PyYFJ@)TC>S
zmE&|GU{~^Ztg_tH9yRZVth=Bjz6sHJyZOyA(UkLb;BjQq80r%~D96_*_PP=~qR(wr
zJ1^Bj5_lqV#d;8p%fWZdVvVEvxw3G|$x(?^?qc$8F}pWIFx2VgteA?l#}dW3q2Mi}
zAW%^56sj2V<^o-;l|M#0e?Be;{oeIKt75R(*3--3X))oza0eO<pKwcj>-eIJ0(FvU
zN3yEK6xqkVZ)3H>9oiSv9ot4@`aOx>8l9hBzBufmwOJ1j&S#Tue>Oer@%El)uz1V9
z-qSB}vVQf!c5hI0Cq#N$`C&Qbzgc%yy#4Ou!}|}u@|C>3*|9}n=ywMpB)DkA9d?k@
zw$sBtkuc!#E@-J#R<o=XHni!z+DhRWcs2NMHQ?jf<nTzStV&H>JpN(y{q_Mj?=pQu
zo#q_#tka(!U!KnJ@!5+5#J_-tY~@GCC3}}fe=)MEwKMKcrv+o>&^wOrXgbCWj%I^v
znOJFJJHtGqLg9<<<p4)MK4*NLKr;NMm##lFo$2fD7{v`cc{mlDt&HuJZn^ZhY-?7+
z$a`l{c3n}dM@e_G_AxgZF`v^8DvNM;6O8*7c2j22L=HK_?>g+b(lcnX-F{x2_orAT
z2pzh@AiO^<viHjJ6=H<(<BKlpK@&}AZO=cil<w)m5hG}fl!O1V!{~hSaPsWwvp}zI
z&B{Q-8-R}BGd(ys$-!1Yd|j02IPZ^gq*wMX-Ub2RyMLHxo9%{Kj>|H8ib2O0bZECf
zy?@ysaHPx06G`OUKqr{csq>V#Hd~w9c-`6H=xP|WU~^x`V!wFPE5<Bj$ybJxi}S~`
z7K*#i;3Rs@`U83SA_v`_BN1XaJ9^anT6p<pYl}||NPJ&c;+Olg3r2Z$fh?b4noS+%
z47WO(6g(?O&n6W|0vkWXSy*RKFHC1l$Vi9-YUl7-{KzU+(ry$cjA~0<ca9GZi{Y@m
za%{zqk;;<&_eWSjY^>`4hlo6bt7$6P%V?<AEZo`-T)fdP*Vw(sF(%Y~a%-Wn57`QR
zsD^cadOaK#AlRO+<>Zms_Pxtlc|Pf$xyU-BB#uJX(X3>SL@*kdw#pK^$)-b@Hv~9g
zFcuF^rJ8Z}un#5$(tlK3`(YEkztvGcj1R2UwLR%hG`=?5((`O3Dh?~HB)3${UP-wk
zto1oR+f6U7(_>+WW6CK#x~KiC3>9}EUqwRaGqea3imZfSHN_2ZhF-9FTHvIw@W7Z4
z+MAtx-hW!oNO_W4gmptPB10&YunQJ@9~;PJTjvZ`&QH0S99E?F(0ExT1W&*2`eOoK
zy40Yf{lv}-b+xFz_+)Z%CP(wAqm*U&$t5}_A)SYhhb+B9@-rV1U?lk~&onHSZe=^+
zXDOiCR;#u-fO@g`P8qU2lS)~pWA;KsR5)nf(~apx>Go;wq8N5tTf^=vaJ<V=k6hCI
zO7${y+hlo`A^7={iC_<l7Z3pumHyo_jM;v7G%6vz<tej?hO|QN&-U!$tTb17pKdZ|
zsz>OYR6JvEQjF1`*~GRRr_zhZudP~QQ9NM*<k_TjTF33xrR16A<ef=Tya$<#HnNev
zuuq0hvl3fBbSG07DM~*t4-T%5dM{CWhu8Jbp>=BCwl#at`??_|P~%|O?;XB8csxWx
zy2%z}kmyN5nOPHMdSw=LmGE8>Nv)lkl>7(7U~Pq7FDElbr3EM|C=;|T#VgQ62lc5C
z@oUH%3vF2A4=yKE@LUetv2K7ac?t2}=yG=@1*u7e^+S_ZX4_Wa@MwZ@E=Bd~c%4z)
zUn+HtnTN)`6MD{Y<EuiYP1t}d2q&!D7;0^-YH}*mqLZ@!oYYT3JQ4<slgX#5)W8b8
z!oDB5u3*N5qml`g!imb4q^mL^S*t&#zT7LM`jU};7(9Z#WVdAF3+Nmn#qA?z5fjj#
zUB|!&F7x>ZatjKK04Og}S4Gq#gtxQ22A<W*TU$98g6QLN5~P5$c8XE(1^U3>%!K1z
zK-hh3iy2dNN72j_Q?qGDYBW9BH5kGa3@%pFsIr6j0)eo)9d}TF6kmE<gGo(G7t5K=
z6@5+JyP0UKcLTV38n55VF%iO_OHq)!15rv5uiw^qeW>`b#$bE*E?!}q#{NJb3w=_I
zRN=n>Q^dFsbqjQwa^R2*!#P>iWUV}DEFR1i3y=v9s<<X78*92Z%cirKiiuenw1>}6
z;i;WN#rNh@v7BnAVmz*qib=sr$q<vQWvJxuJz|*Vbs-N>;FJ7y(S3zy0YHGa>S_Zz
zK6t2~5;cSBd?*Tj-2PfIDG@*5EP-ck-UG$6(!s3oCAGADR@YR4bfrb+TEj~TOw+wI
zSsw0+!OTY|ps15Ass={xbtHy(#hG>orZBm0M6Vg`In|C7sm?km-qX8ya8dMLy#q4j
zng{8=#6P}=n~AqV^DTTDA9a%fry@{|H69|=?zVDiwV`kzoZ_TZrA|bFXs~7$!=^0I
zM_BQ1`m;ch-rV&q4#yyD@uv14Dr2_u;OKpiRB;|UQ&c7up>ftdibib%Yj04DQgXmk
zI(a#swozhFAz4vdO&RqAtzNW_ux^2Bblkkj5h}1)g9H{E!~AhC&#fQmZ`6`HbU0jz
zu|jxb0I+DTse+P$O@b1nEs%u@*X}7MFqjz|3L|@AXof0=y1Fyq>UA6<>lo}Nw@jrD
zEGt*WQcGBR1Z#9T7)Un`LCY96r|B4gvk!F;^GZ=m%Li};j5dWUrq@7<hHzcOZtBP6
zQx|~i;Xn;S&=p~vQ;?HGC(A>Xr&jCr1xiJ{1}?%?$-R9=JyB)nMshiX9J8GnOjK4>
zJ1JH6aIC8`Yitb{&-Rq%Em+oKBrtKdOYD%v;rb&WH?(A=MG1Lvh|OM^PhlU-qp%Qa
zm{-@*Z8VNX?upGk>S7D^XX7rKYaChm%Jbp{wMJ(-dR@^OjoCg*77tOhm{!B6IEUqR
zMZsXR;SkH6S^2mG1({;l&hKAI`A@~#AyVPA%~<T?Op!IB%l=(DrNE&u#(-DEaSS|O
zw}!ciKxgx)@Vdp(Y71`{tOMh44;b)dmN0q8=e7FJBK2uyn`lXGljBmWnX@g2hyj0o
zT%N?hFwsSbl}xnQ{PY^`4D1^<j1WjK##))I*{4|S4<Sc^{wdtSK}M{?4fHa~G~>;Y
zNgeR%ngPgBA3*pYmns9Wcz_2B%SqT}0%3#W;f~<Zrvcv6e%S|tt0a0KOT8GKu^hL<
zyRNl-VFZ&TkO0dx2hYCq4ThkWV2CQOF26&B%@DrOpBz3yp#ce6I5<8QK%*Min@~5O
zOq^|=aOAlagroD2`#s*o?#zJj&|^SFAZrb~*QetA01Gib#>+f1cHkj%2R0nf#)J9j
z|NP#b<WT5NDAu1r`97jESO^#Ne@nxNyZ}=#8cMdWWiO%DmXqrog};Y51IG@6I;3B|
zx`9--#8`l5U@#-{2d7B9yAm!HR>HI&9=e*IfOEkB`M=nuiQ7{;D(|m_ny~T7l1lP3
zctr(L!E^-?_EDKs5EJ9aChJ#q;yuU|ynqgjh13?BO_r}<30a%COdnD%FV4=T6l$=_
zeAAqe*b}c7M^rd}`aqJM;Ya}Ve4AIdtlQ9I5<Op?YkvN>hL?j}@%si%6-<JIZAB_M
zFCW|m0g6T_?72x7%#I<QSV0?0&gmd&Azjgysr0f*NaPD!8FMzL+}(E3U_as~*jAZt
zP)2fd?@hR!sS^ooug!=<IMQ0b(h_N?3k9+YLg4fv|BKunbT>f>SPGfbJwoyd$Ye!h
zA3pIX2n2w)dve*E9Sj?QolpgaxpO|dP!%{qf%tw>yR#o%Mcs2&$%0===BPrq;n9R&
zHOr9#WQAhIyVD80JI7LY9w>NNBp_xzI9X>NPEDkc%rf}1r^T_L?i8r&50Gy0+q5{d
zHVmTv`6ySPIaYBwQThG=LVi}zNeL7Kw!-XMC#O+;NFJTF8@AVAioE9q$qfQXc1GP{
z5#T`Yk6v>#hv&r<?t{1AeeeKL4&qDQ=^-DdXo>1hogr(Jius9Zl7gdHQgb3K&0XSF
zoXUgPkAjtfs{pptDivbAGHF{j*ml(FvSPzYH6;PRnMrW?&d05<%qjSi<<M0gny&H;
zlkj-h0?cpVb<<4DIoIB@nEluZ_YQ<Dt+hrOPy@876^fC2jrlGsx8K$gwZJo$%JF=r
zg8jj`99$g3{@l<TD`FwC-)bF|*u|;WyM(9!QNIfTHy7ZiJ#GCbD{z}_YhiH2ykh<N
zt&!>sA*6z*aCJ2;Z9KKz8(#vGnZ+@*^0=_DdOPr?F#~_cX`0+x-nkbf{%J|zljz}U
z3~7rw06^u7EMpsr5gh=!4|po2unt5}$&8@kb!dqjmlE!U@#kc%At4lgcAs(7tx;Ky
z!&IWAL0j-iSbIdHQ20#6J75jx-xW#&^urdlsisyenDihwh0SQxp%Th(w<sKY>RY0!
zHNL>kA=6iR@uujhXIomdKKrFn=%TNl!_*66w!%Omq;*{nPZ_O-fDdJ7t&H_qbBqa`
zdn3dtPjdiRPS>Iw&f2PuK0AFG<4#-=m8azJGwH}NVd2(Gwp&$1%cs{v+AoV%BbT4K
z$+33gv{)=9lBQ%0Y|rR+KWz54OMyBF1C-A9f8#<3)$biGsG`>e4r@eRk}74wLSxL(
zDZ09CmlnWC4PFBP2CS+22B_F8C^!)bWqa(%QgL^O9l?Z%>9u>+N14}d6F`hXu`a7s
z(K@;Ykpe$L)w!AR*GF8x!9H2r+<d_~H$lM>IL~{+eoLpx02YLS8TZL{z@%a}p|cT1
z(-5gjn8^h@5oorY!ljBVEneF5RD2ulc{#I!H7_LtAdSY@L5JNvjy8G~zKr_VLRO)A
zDZNe@z|d5ZNBFc*Sgz&<7d(QUM4$@>$dfO)PP12S=zVHK+;D%MNA3Q6kD8onSR4Ad
zMvof3Y2Kq|L`TBPA8M>d-o1PyyBxvaX7L*?s<mPrSE%(t=t4|B$T@mugojrFaw<x8
z&SI6OMTv5RQf^WRZ~`mYg=2~wA??>3HA1Og2#MDQ*>L%we~NfwiL@h*4;N8!=G)M2
z*;0$trN$@;QI}StV)*Zk0S5~pk(qIDrck};Pc9}p1TkQrX{aLw)qyVs4d+K-D)T-B
zYzma~zU{1My^Ai*cAb|E-Zo83a(sgvK_kc%TXb5Z1jMSoL#F~=BLNt>`?!CK7`tnu
z5nWTd3rZ+hg#^Zz*jw&O3t%S3zc92teeoe?o4beHhud{e{Gf1Jia)0AY)yj`g>sNe
zI6~+SOdUqM^s6Y1=Q<14=xvRdf+_oqEG&C49$O>}4R5d@3lM%b`2dFOiz?=jz!T%H
zx&z-!+_4geQ-<^j*;wl;ye()3PT>_AtW5N558BkU57o2~7c_&aRgKs+jGgqBnwYH}
z@tEb8qv_?CE?k6T&p^nCIPD%FNQR?9e{y?*bOeU3$mNN=0;7zyvj1nwp2c9*I!nu~
zQH>Q#Qw+reMu}Vr9DJ$;zrpp=9Q@`1$Q6+#YVHb<kf21T!5r@7c?qluP7s&{;_GQj
z*${_>CRn(u4+RmY83!=Dox89Y&EC+s^BY9YZRCe13^~vgfRiT=2IZ(YA+s>PX;LBU
z9K4|&ab7qK+id5?T(?j)g7QEQzxeMl_LWZrst%O^YCOU^0mAo&V=tLpY>9BwNbz;@
z0o-HdXhaPszr8O4^H@0}xFVZ2HvYJ=8g6tKjw8Qg9KCPpp5iq}$0%zBmhmJXc1M7d
z!KuO>lF^lCDE7nPBt4CY4ayuabk046(E{Z-%lgxMR|t!#1j|C{ZnBX~EDWQR+3lFb
zaWEw=s)bGpmAIKl)3QwE<P(GuPqIv+iD$Cx)w(A;T{qrI*l=A-hA7ZY%kaosx}Mx9
zaCe4}n@FHqM?}K!A(8AWk#MS$i4!>0tjj=qTS@T?%@><<-l#5MDik4W;M*(ay~iCn
zrAiWUWUF>6vjPOZt;mi+SF-8e?4mzyNWJq0b93m(T<uQ>BGQ3?eyNSr(Ohq%R1U5W
zU?gzw3{|L4$3VopGpKoV6UNkz>#|jdohhvxo7;?jqw$DEk0yn|#ZW)Yh!))%&dr+`
zmkyr@eDTtOYI2tWmPvQl$H|1eX<3@#A*^Lb&4G8svFp{;036g9G!&tlGWCNlDaSw}
zM!>u&wosnArEcK(X+cPQMHt4DK`ftMDBsL2Vfetg;ozIl^bn)Ro1J1?I;$|$05~FR
zMAh8P1I3{iRa^suIt=xaX$rW<$|cy8(&AOXUQ{s*9>SN-DTO>}!By`e!N(6rSSLY}
z9ISkBS9+=V_=KZoTk18H%u9IvED6(7R8w%aBV)y>7f51xp$v{Ci{kBW-@k|DB~&>b
zDX}i=kik}uDxVsGvMku=-7j}1BUriMeprZp<7LT`$dDmAapz$nf_>~gL?TR~(OGta
zfFZg4T?B85sSWDPxCPNNAP!>+76(8^BNhkcIg+Wxo@(&cvag-j=&Lj0to43kB7A{L
zBz4sBwbxNIjY(g!bUH`}t*OqBo(I&$a5-Z=0Znhq{<3vDh9iPr!XVLJGO8DDw6d3i
zoTJwf+*|8c%EQb*D~xU>OP*v;tkTH!P*K&nd<5i5lLOmPE-*S$At?`kfea=;@Xk1(
zGz&cyjzK#Ng)Ss$Z)_rfB}h|m_|3~f!DZfDkz3$|YtBLG%*Pt#R5R9y$IZnWeGZE0
zP3&B#D<@oKeaEEZ^rXASfJU#L#=AB{X#g+_&CbZDvI))Mb*6Dm&czkY35I=L!<lp7
z;^aP?`4=5zI-kMZ(xP3-yw!Z#S!ap_tvt!kO70S^e2<6`(LiPCm3kOyqsg7~<0;fC
zJ~;-V-O{je_%^!T9l#4yU#6b1fK@is!4PHDvgNwtG2%HD<qZWJ!o@;?R`uRh2x!!J
zSvg?o20jA?A;eWuPTn=s;{nTqkm##d)kJ%g=^{EPG3>^stkM#)D~WY+ZkyoMFTux<
znA!Mfl)YQ!=`VwR0&|R09I%~vel_ebNpo=S@w&Si*r>Ui0oM1=FUw1U>SOgd<jCA9
z-T(oGkb0Gp#~jt9W!%N*pI|g%zaA8nM65ffc(MjGXVE8-0)oHL)X`*iZ^Ek}U1)Z|
zr2xg~%6TW|bZLuG0_BSr@F>a>rdLkX0F6<g8iEC4i8774WQz7z(jr`A_6W*8+T?NT
zbaWy^*B+C&2Re|{FAkomTun_^31CaYPibL>4QZda-92g3QX`z>P)H=X6Xa@Xh+&Fu
zxXyqImI`w8*$Z-f=5F9~1bKVDAeU3kf*g-)6yz$zT{0<b24J-uewYV`If_#PJ;Z+!
z7>0!o4GwV#1)@a;8i9y<JJ=ur*{ELBX6+~y3;DbZEtz&-pq_LXI)i~ta)ThVX|QN8
zVtb`*AQ-v$lPuZ#f(`STdJ<fmI<f-Ib}rL)^&BJz1ZcQMLF(#h@>NrV<6;d30<>lt
z?J5jY^t&!*xrc_N;+$R%$MB6!kbz8%g#vJGcqw5pN%DfzhJ}g+8EW^+*V#5>aZvTe
z`}4#`3S39|`H;mBkgqk%B?N-2B2WSE<pcau!$vu8vVO(;3HyWtMT4pHyy%saGXt%%
zZO2hh#0!jqUebMspoO-Mr@y5(06o2`M?8%ZSRi+i^U;vN;*jOJ211spGAY7$&eX;c
zqS-zGEv?CP)*t4YXT&?z=k6#Akyx%KPfV=|v0>RLorz6{#ot^MNo-N&PVXQeirF<%
z+&VDcvVC6@9>99;&}o@+<IYV17@o*w==VV~!}9FXtoVw#E>s6@pba3NQsr`75X|fP
zTjV!7HM*}H1(n7(b{NW|CbacTC_jpJX1~PpNTdj|yq;muc{<|jPr75x6~oWBFd%k-
zb5m|es0pCXIT0od>GA>tpDC%D4Zr!~f@hkg#R6er5hlX*`)@E7K29^jX*qNuLgk%c
zp|EYxtZi`$OJ1DWOwwF`sCDAPRu~PK?x9V{X9z$P4t#bt@fx5M?W%}6)PC%CQFt3{
z1D75u?6p?;$m5otlex-v+fpV++#7+W?-!lRwT>i9%IYJP9~x-HL>mzBeZhX2Oy1h^
zA5R`1lLbzvs6fOAIKif1BUU{+P$5E`JcT~-y1RnabmHok-^0=3N_^r{JVOntQYr$-
zx?NI$5ZXt7c5npQRpsu$0j|{zp&uoT2Zo7oic}(jRj-g@A{v4)&uj;dXBNJ(vUo1a
zN3K|)S&>RLC|Rc#?M4Wt7%*_juTm*e)c+thv6W3P&(9GymIFYnGR-uKNctL9w7~n7
zCgjYvrv2wAtu{e<rPs80oNUviau>@KiZu&^I+`F;w?`ZH7|V}N(WH_jeZpBg4m9YU
zfER8f%pgjI!84d?6qqN93pu@Pea<0tguV?}V{kac>m;rV?Fk`0rG(IF_*parvRfDe
z`T3hS+)if7NGoL0<X2naRGV0hLL^Kh`}7AWXwLIh`Ko{d5ymILVmp3hY5^*niZakS
z1S6q5Njb*x9F8GybUjj1q(VYzpUp7$Nc>chYK<LyR70=gYIy<fiZJx;{BQ_8l45w}
z>NkL{ycUJepIG&qC{~vwl~lhWBXy!3J0K(c$||j;vb_UOF&si?_Z~hju8IMQz#uJ{
z^5FD3@1MO<P?Bu)_T~0g^}FKjvEQ|(E}#=o7Xp#dLi9wH)Ix<JWT33)8c>RasjFG-
zoZ+bLr7rGR1KFdo9eIqL3KpqWa}+MGukP7erj48|<63O1;&!x~yQLQO;N)me)2Yku
zjY@2uan0PMJ)Z|eEhm_LBr>D-7M{J>jeBwu>ch7#XClpLWpp>1!?ZZFuu+*)$X3lt
z-_oq~LH||Jkd+*P{Z4t1TEv2rQFNn^JHal}_Ji81Skm|bR+0O;Da(vc^_E{ttv;=m
ztz)0C&i@la+AO8Y8euO3^rgl<*RT1ftH3N5nJ=e?*NnY5D#rjNQ7ju`#DV@1TMB@8
z(7mn@i}bcf2*spEcxFbp#HJmkU6plcF-a-&n^)1%D<M0~a2^yim68ST3NY0f2!{Ta
zE~&biV#`at>TY3?8>qXO`UgV&bUwU2E4)Lwp@>4Fxo#wz)Zr#X2OLq5C^)qmYgVtX
zI!k1+dda9Zhl8sA4`)Hu$2DBoRaq7UaiqmBAtM`tajE?)h|tQEN8!3Y@<|{g=cTzr
zk-J%@yBoGiAwdoeI<HA|xEQ#U$l1m2J2-F!dL6D5c3KKD9ZUSR0C&WX`!1H*PkJg^
zB8z{qlP@w*SMkZ!JbwD}G#w}t^twM9qqvp#0#jr5jShB2tIBoLmq>~x-&hbt)M{yp
za)5W<XL%uldNO)0&n9rns)gN&%kPQ~AH=#vqH29QXA}_uS<`0Y1c|yGAj>QWiQ$gZ
z$b5XAYt@E;gf^M?sF;f{`;4McSm;*lrc$SNp<G33Pr_IQlR-=utAj!1T^~Y=Mv23n
zMK!)@@V`WJQ-3XLhE12aYh^$fI5Q9Z4)N*^v!N78AM{ZPA%d!f3?e{Rfl9~0S3uz`
zl5H;XuY{`o4`4LxUV#WTZOz>|xiI^|+L4#Gj^S3bOjFP78~yF#)LR}`K&V23qbhH{
zhq5aq41sow_xW%(-tng>Xxt#Ww9aM#zICFcJqNwXKqtzFDPAU@#huWg_a-Fx6bh^<
z^2Q<j=&q-&r<}wE5lCYjV>4~zSE?xro<R<JM6-^vg%aA>aUGS=V$D(wLF@s(--+97
zUrTSjhWi(;%V9nR63A^a;8~WEx}vwp4e(eEE(?cq8|k2-YeJZb3=|CzMHc(09cjsv
zNyL)OeN_>@iJhjnC~OQSoP@Znn^G;;AQ7{EEW9VR)kc{vBqK_&k+@W+ng=SOl2NLs
z6CGYkQiE~&o;esyF%D>VH=^BRgf#shwtK`V$tC?5v9dkXhzJBFsWoAgBKa3>GD01h
zgG<&I2cF<bY73+*j<1pBwtWm6e(-_Rio8#-{pG>lIk!p~muC=!sCLLp6XHY=fh-JE
z{)B!+$rUs?nQ=CX=hIEvbUPxmx}^MwhqCSy)(SG$_#OSj;o$g72(#D^O09&#Tt-0O
z7TI^uX`DsHmW28ej2`(b55~!p9JLvMgtFB-6&^o<<1{s=FjKL`DnpvETXdx!LLg?1
zV-|{;CdCYrmkSWvWYD2L+7f3_$_qI^>PJR_OrZ|9wGzC|NRZ4??M8*aty?x_o*lf6
zva5xQ2nlH5CQ2D5m~mar@?T0AER8GVTyoPvE5fi$4Oc&zVL^2PfhiaAC($ga`N4W^
zBh+p7NN7?!MhbtZZjZdb5u&l1ETdCeMoEMOs1jauONvvb$&~2%KG<w`dOhl4!=n)x
zsRN<p!5@EJt2NT#0&+>&OSAo#vbJdHEeh396e#leaz6zXOBLoCG$`R=T&>M?$ul?5
zOVvD_tC>6PqbUP8i5J{Q$$nSQt6on)d59OqqOGLRit->K%zfGKEVjIa+(IVZ+Eshc
zVyd_~2)yc=@*zSS!Cg%Ami@LQQ6jMlWx45dLx=P72d8q;lu^?w?sSGYScb^9xFKDI
zVPK~_>_gi{h59|Kzo)C~Q<Y_U18RPL2WHlax%vpz6d4Os8~jZQoI1<4Ifqn)@j^BM
z;RpjdSh%903!RyY>o#10>jtl7t#zp`8tqdM3QJd`KN!e`4e&TsU|-cm3i9CK-Etwx
zO%C`bz34NTQxo#srebL#dTYhf^0qpgSnITyplTS*j+)Y?B9MHFS@G|WV9@Z3w;1lG
zLV|XnAp}O`G3oK$51<z%B%x-T4}32XfdU3OXZNIlp@mv(NW@;R7oytOoL&#HBL|MN
zi5@TP6L#e)HnFw+ChEl_dI1$cH~mki*k1AM*ZXH1asjJDY;3~*z{B)7o9G!tyi8e-
zKrVA%a-4$!+T{)I(|9o3*kp*&#wI>)o|o>!wCHVYBJBWQk(tuR)g{Hm#%6JmznGw2
zXnt{qr`b2W21BD^qA1?SA{d*q)9F;7op&iW^hx%aPh(^7U}<<4J7M7|<&5pXFY+>Z
zLa)UBf=DL0^`D=^6MhK)zSlkJyY7P>-*Ug(;fL*mJ8t3_-oNpE?u0vjF}eQxlkfNN
z{l7l?sI>S0io4H_U-=Qe!R|}n{|Kjl6#wl0uYXRjx8qT!-+usy@4-L2|7Y~)jbj}7
zO1uBtzo{qe_%9Q8p*!#y;h(SK%SS=`zwvi^){Z~#8gA{k_t^0Qm;ZNg|F^%P_uKJ9
z_d>Rx&#?CERT~?x-S76it-seZcHG$4@AiJX-}?Xi?*99F*~Zqtx%=JmLlOx7@cy60
zzx4h;#BXfk58|Kv`GRY(JG$HLtUSkm!G}M<zwrLI+<kUD`pNL>jg2q3uXg<7<o@w5
z>HF>YR(yNy{y$Id|08$59p7H^{=ZD_|8_^;Z^z&GPe;(M9oBAZ)?X+0f9}^f(;FLh
z{4x1f{b%!Nzx*ve@QpVA8(Vt69Y23=SL*St`0K_$;Cp!g7yeZ5x8pZ{D!w6o|34=8
zf9r4bemjn*@%=_W_Uk_-_wW6JQj#5i`<LuWJ*MyfXZ*}(exEz>=C}0zH(%0YI^OB~
z{}uQCIPb?l{z>ouP=2lcv-|D%A2{`S{ImPN<KF)r_kO$1o?zeY_z_NU{O$hly8FND
z?*B09K=O6tbGVs3wfo<GrTu>UtM2Fs!%>A_zxJFzN#6hUq2BlP-_he6PguARf2`eh
z{As*DZ2!hs@7wrLk9&@P>)rMHe<8X5?Q6a7?Y<tne0cEZt>801ei7%{7VG~vzoz&7
z#Y;W@yLHsv_m(@_;aA=Jx9kUZwEKTO`Q4fm9+M%GJHGAS|E=HE2mB%qzl=uOxEcTD
p@L1dRO8x)O{#D!md*8G->F@Ekjg6m}bN^5OmA<K!+>m_T_#b#)A{_t#

literal 0
HcmV?d00001

diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py
new file mode 100644
index 00000000..b3ab54a9
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py
@@ -0,0 +1,86 @@
+import torch
+from torch._dynamo.device_interface import register_interface_for_device
+
+import torch_openreg._C  # type: ignore[misc]
+
+from . import meta  # noqa: F401
+from . import extension_device_op_overrides
+from .extension_device_interface import ExtensionDeviceInterface
+
+_initialized = False
+
+
+class device:
+    r"""Context-manager that changes the selected device.
+
+    Args:
+        device (torch.device or int): device index to select. It's a no-op if
+            this argument is a negative integer or ``None``.
+    """
+
+    def __init__(self, device):
+        self.idx = torch.accelerator._get_device_index(device, optional=True)
+        self.prev_idx = -1
+
+    def __enter__(self):
+        self.prev_idx = torch_openreg._C._exchangeDevice(self.idx)
+
+    def __exit__(self, type, value, traceback):
+        self.idx = torch_openreg._C._set_device(self.prev_idx)
+        return False
+
+
+def is_available():
+    return True
+
+
+def device_count() -> int:
+    return torch_openreg._C._get_device_count()
+
+
+def current_device():
+    return torch_openreg._C._get_device()
+
+
+def set_device(device) -> None:
+    return torch_openreg._C._set_device(device)
+
+def custom_device():
+    return torch.device("npu:0")
+
+def init():
+    _lazy_init()
+
+
+def is_initialized():
+    return _initialized
+
+
+def _lazy_init():
+    global _initialized
+    if is_initialized():
+        return
+    torch_openreg._C._init()
+    register_interface_for_device(custom_device(), ExtensionDeviceInterface)
+    _initialized = True
+
+
+from .random import *  # noqa: F403
+
+
+__all__ = [
+    "device",
+    "device_count",
+    "current_device",
+    "set_device",
+    "custom_device",
+    "initial_seed",
+    "is_available",
+    "init",
+    "is_initialized",
+    "random",
+    "manual_seed",
+    "manual_seed_all",
+    "get_rng_state",
+    "set_rng_state",
+]
diff --git a/PyTorchSimDevice/extension_device_interface.py b/PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py
similarity index 100%
rename from PyTorchSimDevice/extension_device_interface.py
rename to PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py
diff --git a/PyTorchSimDevice/extension_device_op_overrides.py b/PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py
similarity index 100%
rename from PyTorchSimDevice/extension_device_op_overrides.py
rename to PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/meta.py b/PyTorchSimDevice2/torch_openreg/openreg/meta.py
new file mode 100644
index 00000000..c475e8e0
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/openreg/meta.py
@@ -0,0 +1,13 @@
+import torch
+
+
+# LITERALINCLUDE START: CUSTOM OPERATOR META
+lib = torch.library.Library("openreg", "IMPL", "Meta")  # noqa: TOR901
+
+
+@torch.library.impl(lib, "custom_abs")
+def custom_abs(self):
+    return torch.empty_like(self)
+
+
+# LITERALINCLUDE END: CUSTOM OPERATOR META
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/random.py b/PyTorchSimDevice2/torch_openreg/openreg/random.py
new file mode 100644
index 00000000..6817bd79
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/openreg/random.py
@@ -0,0 +1,61 @@
+import torch
+
+import torch_openreg._C  # type: ignore[misc]
+
+from . import _lazy_init, current_device, device_count
+
+
+__all__ = [
+    "get_rng_state",
+    "set_rng_state",
+    "manual_seed",
+    "manual_seed_all",
+    "initial_seed",
+]
+
+
+def get_rng_state(device="openreg"):
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("openreg", device)
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    default_generator = torch_openreg._C._get_default_generator(idx)
+    return default_generator.get_state()
+
+
+def set_rng_state(new_state, device="openreg"):
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("openreg", device)
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    default_generator = torch_openreg._C._get_default_generator(idx)
+    default_generator.set_state(new_state)
+
+
+def initial_seed() -> int:
+    _lazy_init()
+    idx = current_device()
+    default_generator = torch_openreg._C._get_default_generator(idx)
+    return default_generator.initial_seed()
+
+
+def manual_seed(seed: int) -> None:
+    seed = int(seed)
+
+    idx = current_device()
+    default_generator = torch_openreg._C._get_default_generator(idx)
+    default_generator.manual_seed(seed)
+
+
+def manual_seed_all(seed: int) -> None:
+    seed = int(seed)
+
+    for idx in range(device_count()):
+        default_generator = torch_openreg._C._get_default_generator(idx)
+        default_generator.manual_seed(seed)
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 34ba1031..1565a26b 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -111,7 +111,7 @@ def write_header(self):
                 inductor_ops = torch.ops.inductor
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                 alloc_from_pool = torch.ops.inductor._alloc_from_pool
-                reinterpret_tensor = torch.ops.aten._reinterpret_tensor
+                reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
                 custom_async_compile = CustomAsyncCompile()
                 async_compile = AsyncCompile()
                 os.environ["TORCHSIM_LAST_COMPILED_MODULE"] = __file__
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index dfd4aab6..cdcdd2a7 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -8,9 +8,6 @@
 from PyTorchSimFrontend.extension_codecache import hash_prefix
 from Simulator.simulator import TOGSimulator
 from PyTorchSimFrontend import extension_config
-from PyTorchSimDevice.extension_device_interface import ExtensionDeviceInterface
-
-from torch._dynamo.device_interface import register_interface_for_device
 
 # Configure logger for Scheduler module
 logger = extension_config.setup_logger()
@@ -174,52 +171,24 @@ def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
     def setup_device(cls):
         if cls.NPU_MODULE is not None:
             return cls.NPU_MODULE
-        source_file_path = os.path.dirname(os.path.abspath(__file__))
-        source_file = os.path.join(
-            source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_device.cpp"
-        )
-        hook_file = os.path.join(source_file_path, f"{extension_config.CONFIG_TORCHSIM_DIR}/PyTorchSimDevice/extension_hooks.cpp")
-
-        import torch.utils.cpp_extension
-        module = torch.utils.cpp_extension.load(
-            name="npu",
-            sources=[
-                str(source_file),
-                str(hook_file),
-            ],
-            extra_cflags=["-g"],
-            verbose=True,
-        )
 
-        torch.utils.rename_privateuse1_backend("npu")
-        torch._register_device_module("npu", module)
-        from torch._inductor.codegen.common import (
-            get_scheduling_for_device,
-            get_wrapper_codegen_for_device,
-            register_backend_for_device,
-        )
-        from PyTorchSimFrontend.mlir.mlir_codegen_backend import (
-            ExtensionWrapperCodegen,
-        )
-        from PyTorchSimFrontend.mlir.mlir_scheduling import (
-            MLIRScheduling
-        )
+        try:
+            from torch._inductor.codegen.common import register_backend_for_device
+            from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen
+            from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling
+        except ImportError as e:
+            logger.error(f"Failed to import torch_openreg: {e}")
+            logger.error("Please ensure PyTorchSimDevice2 is installed: pip install -e PyTorchSimDevice2")
+            raise
 
         register_backend_for_device(
             "npu",
             lambda scheduling: MLIRScheduling(scheduling),
             ExtensionWrapperCodegen
         )
-        import PyTorchSimDevice.extension_device_op_overrides
 
-        assert(
-        get_wrapper_codegen_for_device("npu")
-            == ExtensionWrapperCodegen
-        )
-        cls.NPU_MODULE = module
-        sys.modules['torch.npu'] = module
-        register_interface_for_device(module.custom_device(), ExtensionDeviceInterface)
-        return module
+        cls.NPU_MODULE = torch.npu
+        return cls.NPU_MODULE
 
     def submit(self, batched_req, partition_idx) -> List[RequestReturn]:
         # FIXME. Construct SchedulerDNNModel

From 468f41487a13f5438e3a3dd9ecb2f0a639ca0604 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 22 Jan 2026 14:04:12 +0000
Subject: [PATCH 085/194] [Device] Use torch.device(npu)

---
 PyTorchSimDevice2/torch_openreg/__init__.py | 11 ++++++++++-
 experiments/gemm.py                         |  3 ---
 scripts/ILS_experiment/test_matmul.py       |  6 ++----
 scripts/chiplet_prep.py                     |  4 +---
 tests/Diffusion/test_diffusion.py           |  4 +---
 tests/Fusion/test_addmm_residual.py         |  4 +---
 tests/Fusion/test_attention_fusion.py       |  4 +---
 tests/Fusion/test_bmm_reduction.py          |  4 +---
 tests/Fusion/test_conv_fusion.py            |  4 +---
 tests/Fusion/test_matmul_activation.py      |  4 +---
 tests/Fusion/test_matmul_reduction.py       |  4 +---
 tests/Fusion/test_matmul_scalar.py          |  4 +---
 tests/Fusion/test_prologue_fusion.py        |  4 +---
 tests/Fusion/test_transformer_fusion.py     |  4 +---
 tests/Llama/test_llama.py                   |  4 +---
 tests/Mixtral_8x7B/test_attention.py        |  4 +---
 tests/test_activation.py                    |  4 +---
 tests/test_add.py                           |  4 +---
 tests/test_batchnorm.py                     |  4 +---
 tests/test_bmm.py                           |  4 +---
 tests/test_cnn.py                           |  4 +---
 tests/test_conv2d.py                        |  4 +---
 tests/test_exponent.py                      |  4 +---
 tests/test_gqa.py                           |  4 +---
 tests/test_indirect_access.py               |  4 +---
 tests/test_layernorm.py                     |  4 +---
 tests/test_matmul.py                        |  4 +---
 tests/test_mlp.py                           |  4 +---
 tests/test_pool.py                          |  4 +---
 tests/test_reduce.py                        |  4 +---
 tests/test_resnet.py                        |  4 +---
 tests/test_single_perceptron.py             |  4 +---
 tests/test_softmax.py                       |  4 +---
 tests/test_sparsity.py                      |  4 +---
 tests/test_stonne.py                        |  4 +---
 tests/test_transcendental.py                |  4 +---
 tests/test_transformer.py                   |  4 +---
 tests/test_transpose2D.py                   |  4 +---
 tests/test_transpose3D.py                   |  4 +---
 tests/test_vectorops.py                     |  4 +---
 tests/test_view3D_2D.py                     |  4 +---
 tests/test_vit.py                           |  4 +---
 42 files changed, 51 insertions(+), 125 deletions(-)

diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/__init__.py
index a69151e9..5e404f7d 100644
--- a/PyTorchSimDevice2/torch_openreg/__init__.py
+++ b/PyTorchSimDevice2/torch_openreg/__init__.py
@@ -1,4 +1,5 @@
 import sys
+import os
 import torch
 
 
@@ -11,11 +12,19 @@
 import torch_openreg._C  # type: ignore[misc]
 import torch_openreg.openreg
 
-
 torch.utils.rename_privateuse1_backend("npu")
 torch._register_device_module("npu", torch_openreg.openreg)
 torch.utils.generate_methods_for_privateuse1_backend(for_storage=True)
 
+sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen
+from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling
+torch._inductor.codegen.common.register_backend_for_device(
+    "npu",
+    lambda scheduling: MLIRScheduling(scheduling),
+    ExtensionWrapperCodegen
+)
+
 torch_openreg.openreg.init()
 sys.modules['torch.npu'] = torch_openreg.openreg
 
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 6b6ece4d..0e1a15e4 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -48,7 +48,4 @@ def custom_matmul(a, b):
     if 'pytorchsim_functional_mode' in os.environ:
         del os.environ['pytorchsim_functional_mode']
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
     run_matmul(size[0], size[1], size[2], config)
diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py
index 667dfc66..1314e483 100644
--- a/scripts/ILS_experiment/test_matmul.py
+++ b/scripts/ILS_experiment/test_matmul.py
@@ -60,7 +60,5 @@ def custom_matmul(bias, a, b):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
-    test_matmul(device, *shape)
+    device = torch.device("npu:0")
+    test_matmul(device, *shape)
\ No newline at end of file
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
index 213eb85b..e2437904 100644
--- a/scripts/chiplet_prep.py
+++ b/scripts/chiplet_prep.py
@@ -64,9 +64,7 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     parser = argparse.ArgumentParser(description='Process folder argument.')
     parser.add_argument('size', type=int, help='Folder value', default=256)
     args = parser.parse_args()
diff --git a/tests/Diffusion/test_diffusion.py b/tests/Diffusion/test_diffusion.py
index 082ed865..85eaba9f 100644
--- a/tests/Diffusion/test_diffusion.py
+++ b/tests/Diffusion/test_diffusion.py
@@ -637,9 +637,7 @@ def test_timesteps(
     args = parser.parse_args()
 
     sys.path.append(os.environ.get("TORCHSIM_DIR", "/workspace/PyTorchSim"))
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     #test_upsample2d(device)
     #test_groupnorm(device)
diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py
index ef753a67..d517796e 100644
--- a/tests/Fusion/test_addmm_residual.py
+++ b/tests/Fusion/test_addmm_residual.py
@@ -43,9 +43,7 @@ def addmm_residual(a, b, c, d):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_addmm_residual(device, 32, 32, 32)
     test_addmm_residual(device, 128, 128, 128)
     test_addmm_residual(device, 512, 512, 512)
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py
index 123376d1..045c109f 100644
--- a/tests/Fusion/test_attention_fusion.py
+++ b/tests/Fusion/test_attention_fusion.py
@@ -75,9 +75,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_MHA(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)
diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py
index 4f4d3ad6..7a3060de 100644
--- a/tests/Fusion/test_bmm_reduction.py
+++ b/tests/Fusion/test_bmm_reduction.py
@@ -42,9 +42,7 @@ def bmm(a, b):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_bmm_reduce(device)
     test_bmm_reduce(device, 12, 512)
     test_bmm_reduce(device, 4, 256)
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
index 694f3bb9..6f3d5984 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/Fusion/test_conv_fusion.py
@@ -101,9 +101,7 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     # Vanila test
     test_conv_residual(device, batch_size=3, in_channels=64, out_channels=64, input_size=28, kernel_size=3, stride=1, padding=1)
diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py
index 2f1d014f..94e5c4ad 100644
--- a/tests/Fusion/test_matmul_activation.py
+++ b/tests/Fusion/test_matmul_activation.py
@@ -73,9 +73,7 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8,
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_matmul_activation(device)
     test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid")
     test_matmul_activation(device, batch_size=42, input_size=42, output_size=42, activation_fn="sigmoid")
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index df8cf969..fdd72c00 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -89,9 +89,7 @@ def matmul_fused(a, b, c, d):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_matmul_reduce(device, 3072, 512, 768)
     test_matmul_var_mean(device)
     test_matmul_add_var_mean(device)
diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py
index 0815bb90..96b49a08 100644
--- a/tests/Fusion/test_matmul_scalar.py
+++ b/tests/Fusion/test_matmul_scalar.py
@@ -39,7 +39,5 @@ def matmul_fused(a, b, c):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_matmul_scalar(device)
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
index b27312a9..850f386a 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -88,9 +88,7 @@ def bmm(a, b, c, d):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_elem_broadcast_fusion(device)
     test_elem_fusion(device)
     test_elem_bmm_input_fusion(device, batch_size=4, m=512, n=512, k=64)
diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py
index b1cceb2c..f85c6158 100644
--- a/tests/Fusion/test_transformer_fusion.py
+++ b/tests/Fusion/test_transformer_fusion.py
@@ -203,9 +203,7 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_MHA(device)
     test_EncoderBlock(device)
     # test_EncoderBlock_validation()
diff --git a/tests/Llama/test_llama.py b/tests/Llama/test_llama.py
index 889e5fa8..5e87b8e7 100644
--- a/tests/Llama/test_llama.py
+++ b/tests/Llama/test_llama.py
@@ -369,9 +369,7 @@ def run_llama_model_test(
     args = parser.parse_args()
 
     sys.path.append(os.environ.get("PYTORCHSIM_ROOT_PATH", "/workspace/PyTorchSim"))
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_triu(device, size=(32, 128), diagonal=1)
     torch.compiler.is_compiling = lambda: True # FIXME. How to fix this?
     #run_rmsnorm_test(device)
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index 58955928..c48ef7d7 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -163,9 +163,7 @@ def test_rmsnorm(device, seq=32):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_rmsnorm(device, seq=1)
     #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2)
     test_decode(device, 32, 3)
diff --git a/tests/test_activation.py b/tests/test_activation.py
index 49a9467c..20cfeed4 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -89,9 +89,7 @@ def test_SwiGLU(device, size=(128, 128)):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_ReLU(device, (47, 10))
     test_ReLU(device, (128, 128))
     test_ReLU(device, (4071, 429))
diff --git a/tests/test_add.py b/tests/test_add.py
index 118632d5..a9d37d5e 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -58,9 +58,7 @@ def vectoradd(a, b):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_vectoradd(device, (1, 1))
     test_vectoradd(device, (47, 10))
     test_vectoradd(device, (128, 128))
diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py
index 251805f5..19b9f29f 100644
--- a/tests/test_batchnorm.py
+++ b/tests/test_batchnorm.py
@@ -37,9 +37,7 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_BatchNorm(device)
     test_BatchNorm(device, size=(1,64, 32, 32))
     test_BatchNorm(device, size=(1, 8, 4, 4))
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index d90410db..65e5e64b 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -46,9 +46,7 @@ def bmm(a, b, bias):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_BMM(device)
     test_BMM(device, 2, 256, 128, 256)
     test_BMM(device, 2, 128, 256, 256)
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index 54225747..ecc452fe 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -53,7 +53,5 @@ def test_CNN(device):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_CNN(device)
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 97e5cdea..4d989a0f 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -40,9 +40,7 @@ def custom_conv2d(a, b, bias):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     torch._dynamo.config.cache_size_limit = 64
     with torch.no_grad():
         test_conv2d(device, batch_size=8, in_channels=3, out_channels=32, input_size=32, kernel_size=1, stride=1, padding=0)
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
index e60f8407..a3a706a9 100644
--- a/tests/test_exponent.py
+++ b/tests/test_exponent.py
@@ -31,7 +31,5 @@ def exponent(a):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_exponent(device, size=(32, 32))
diff --git a/tests/test_gqa.py b/tests/test_gqa.py
index c5f2f6f6..ba262fa6 100644
--- a/tests/test_gqa.py
+++ b/tests/test_gqa.py
@@ -301,9 +301,7 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.
     args = parser.parse_args()
 
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     
     test_repeat_interleave_compilation(
         device=device,
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index d103ee1b..dbb5f2d6 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -83,9 +83,7 @@ def vectoradd(a, idx, b):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_scatter_full(device)
     test_scatter_full(device, size=(2048, 2048))
     test_scatter_add(device)
diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py
index a2e842d0..5c15ad12 100644
--- a/tests/test_layernorm.py
+++ b/tests/test_layernorm.py
@@ -41,9 +41,7 @@ def test_LayerNorm(device, size=(64, 64)):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     with torch.no_grad():
         #test_LayerNorm(device)
         test_LayerNorm(device, shape)
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index cd30bd30..0e04738d 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -94,9 +94,7 @@ def custom_linear(a, b, bias):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_matmul(device, 32, 32, 32)
     test_matmul(device, 128, 128, 128)
     test_matmul(device, 256, 256, 256)
diff --git a/tests/test_mlp.py b/tests/test_mlp.py
index 423d6e8e..b6b70c02 100644
--- a/tests/test_mlp.py
+++ b/tests/test_mlp.py
@@ -109,9 +109,7 @@ def test_optimizer(device):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_mlp(device)
     test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256)
     test_mlp_inf(device, batch_size=8, input_size=256, hidden_size=512, output_size=256)
diff --git a/tests/test_pool.py b/tests/test_pool.py
index f5505dba..37248164 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -47,9 +47,7 @@ def avgpool(a):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_maxpool(device, b=1, c=8, h=16, w=16)
     #test_maxpool(device, b=1, c=8, h=112, w=112)
     test_avgpool(device, b=1, c=512, h=7, w=7)
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index 4781112d..93caba7f 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -47,9 +47,7 @@ def reduce_sum(a, dim, keepdim):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_reduce_sum(device, (29, 47), 1, keepdim=True)
     test_reduce_sum(device, (17, 68), 0, keepdim=True)
     test_reduce_sum(device, (327, 447), 1, keepdim=True)
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index c83f13ba..2459cd58 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -49,7 +49,5 @@ def test_resnet(device, batch=1, model_type='resnet18'):
     args = args.parse_args()
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_resnet(device, model_type=args.model_type)
diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py
index beab1c54..7475e1fe 100644
--- a/tests/test_single_perceptron.py
+++ b/tests/test_single_perceptron.py
@@ -82,7 +82,5 @@ def weight_update(a, b, lr):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_single_perceptron(device)
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index 005c3ed2..82218518 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -67,9 +67,7 @@ def forward(self, x):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_softmax(device, size=(64, 128))
     test_softmax(device, size=(64, 128), dim=0)
     test_softmax(device, size=(256, 128))
diff --git a/tests/test_sparsity.py b/tests/test_sparsity.py
index a2493673..eaa7c63c 100644
--- a/tests/test_sparsity.py
+++ b/tests/test_sparsity.py
@@ -96,9 +96,7 @@ def test_mlp_inf(device, batch_size=64, input_size=64, hidden_size=32, output_si
     )
     args = parser.parse_args()
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     #test_dec_inf(device, sparsity=args.sparsity, block=args.block)
     test_mlp_inf(device, batch_size=32, input_size=784, hidden_size=512, output_size=256, sparsity=args.sparsity, block=args.block)
diff --git a/tests/test_stonne.py b/tests/test_stonne.py
index 04ad05a8..ac26c273 100644
--- a/tests/test_stonne.py
+++ b/tests/test_stonne.py
@@ -54,7 +54,5 @@ def test_sparse_mm(device, input_size=128, hidden_size=128, output_size=128, spa
     args = parser.parse_args()
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
  
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_sparse_mm(device, args.sz, args.sz, args.sz, args.sparsity)
\ No newline at end of file
diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py
index 38c2f4f6..b930a3f5 100644
--- a/tests/test_transcendental.py
+++ b/tests/test_transcendental.py
@@ -73,9 +73,7 @@ def cos(a):
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_tanh(device)
     test_exp(device)
     test_erf(device)
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index a3ac55d7..bfc31233 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -119,9 +119,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_EncoderBlock(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)
diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py
index af5aacf7..60a19ed8 100644
--- a/tests/test_transpose2D.py
+++ b/tests/test_transpose2D.py
@@ -46,9 +46,7 @@ def transpose(a, b):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_Transpose2D(device, [64, 156])
     test_Transpose2D_2(device, [16, 64])
     test_Transpose2D(device, [640, 256])
diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py
index d6c1092d..67d4d88a 100644
--- a/tests/test_transpose3D.py
+++ b/tests/test_transpose3D.py
@@ -61,9 +61,7 @@ def transpose(a, b):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_Transpose3D_1(device, [62, 34, 44])
     test_Transpose3D_1(device, [62, 134, 144])
     test_Transpose3D_2(device, [62, 34, 44])
diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py
index ed895171..ede70e0e 100644
--- a/tests/test_vectorops.py
+++ b/tests/test_vectorops.py
@@ -6,9 +6,7 @@
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     
     # Target shape
     seq_list = [1,128,512,2048,8192]
diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py
index 148fe8fa..ae8a67c9 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/test_view3D_2D.py
@@ -44,9 +44,7 @@ def view2D_3D(a):
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_view3D_2D(device)
     test_view3D_2D(device, [12, 512, 64])
     test_view2D_3D(device, size=(512, 1024), h=16, d_k=64)
diff --git a/tests/test_vit.py b/tests/test_vit.py
index aeb4f148..6149166d 100644
--- a/tests/test_vit.py
+++ b/tests/test_vit.py
@@ -202,9 +202,7 @@ def test_encoder_block_with_class_token(
     shape = tuple(map(int, args.shape.strip('()').split(',')))
 
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-    from Scheduler.scheduler import PyTorchSimRunner
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     #test_multihead_attention(device)
     #test_encoder_block(device, seq_len=197)
     #test_encoder_block_with_class_token(device, seq_len=196)

From a62540913e622ab7577c682b54d99a261fd1c5ee Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 23 Jan 2026 06:17:46 +0000
Subject: [PATCH 086/194] [SDPA] Use math as a default

---
 PyTorchSimDevice2/csrc/aten/native/Extra.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp
index 129ad621..711d114c 100644
--- a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp
+++ b/PyTorchSimDevice2/csrc/aten/native/Extra.cpp
@@ -19,7 +19,7 @@ int64_t _fused_sdp_choice(
     bool is_causal,
     std::optional<double> scale,
     bool enable_gqa) {
-  auto backend = sdp::SDPBackend::overrideable;
+  auto backend = sdp::SDPBackend::math;
   return static_cast<int64_t>(backend);
 }
 

From a053314da8b29abab746ac9ac66525eef6b2c2fd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 23 Jan 2026 10:46:54 +0000
Subject: [PATCH 087/194] [AMP] Add amp interface for OpenReg style device

---
 PyTorchSimDevice2/csrc/amp/OpenRegAmp.h       | 15 +++++
 PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp | 28 ++++++++
 .../csrc/aten/OpenRegMinimal.cpp              | 21 ++++++
 .../torch_openreg/csrc/Module.cpp             | 67 +++++++++++++++++++
 .../torch_openreg/openreg/__init__.py         |  7 +-
 .../torch_openreg/openreg/amp.py              | 33 +++++++++
 6 files changed, 170 insertions(+), 1 deletion(-)
 create mode 100644 PyTorchSimDevice2/csrc/amp/OpenRegAmp.h
 create mode 100644 PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp
 create mode 100644 PyTorchSimDevice2/torch_openreg/openreg/amp.py

diff --git a/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h b/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h
new file mode 100644
index 00000000..2f81e9d2
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/macros/Macros.h>
+
+#include <include/Macros.h>
+
+namespace c10::openreg {
+
+OPENREG_EXPORT bool is_amp_enabled();
+OPENREG_EXPORT void set_amp_enabled(bool flag);
+OPENREG_EXPORT at::ScalarType get_amp_dtype();
+OPENREG_EXPORT void set_amp_dtype(at::ScalarType dtype);
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp b/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp
new file mode 100644
index 00000000..fd650026
--- /dev/null
+++ b/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp
@@ -0,0 +1,28 @@
+#include <ATen/autocast_mode.h>
+#include <iostream>
+#include "OpenRegAmp.h"
+
+namespace {
+  bool g_amp_enabled = false;
+  at::ScalarType g_amp_dtype = at::kFloat;
+}
+
+namespace c10::openreg {
+
+OPENREG_EXPORT bool is_amp_enabled() {
+  return g_amp_enabled;
+}
+
+OPENREG_EXPORT void set_amp_enabled(bool flag) {
+  g_amp_enabled = flag;
+}
+
+OPENREG_EXPORT at::ScalarType get_amp_dtype() {
+  return g_amp_dtype;
+}
+
+OPENREG_EXPORT void set_amp_dtype(at::ScalarType dtype) {
+  g_amp_dtype = dtype;
+}
+
+} // namespace c10::openreg
diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
index d54ae552..39f019c5 100644
--- a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
+++ b/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
@@ -4,6 +4,10 @@
 #include <ATen/native/DispatchStub.h>
 
 #include <torch/library.h>
+#include <chrono>
+#include <iomanip>
+#include <sstream>
+#include <ctime>
 
 namespace at::openreg {
 
@@ -105,6 +109,23 @@ at::Tensor wrapper_view(const at::Tensor& self, c10::SymIntArrayRef size) {
 void wrapper_cpu_fallback(
     const c10::OperatorHandle& op,
     torch::jit::Stack* stack) {
+  const auto& op_name = op.schema().operator_name();
+
+  // Generate timestamp in format [YYYY-MM-DD HH:MM:SS.mmm]
+  auto now = std::chrono::system_clock::now();
+  auto time_t = std::chrono::system_clock::to_time_t(now);
+  auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
+      now.time_since_epoch()) % 1000;
+
+  std::tm tm_buf;
+  localtime_r(&time_t, &tm_buf);
+
+  std::ostringstream oss;
+  oss << std::put_time(&tm_buf, "%Y-%m-%d %H:%M:%S");
+  oss << '.' << std::setfill('0') << std::setw(3) << ms.count();
+
+  std::cerr << "[" << oss.str() << "] [INFO] [PyTorchSimDevice] [Eager Mode] Operator: " << op_name << std::endl;
+
   at::native::openreg::cpu_fallback(op, stack);
 }
 // LITERALINCLUDE END: FALLBACK WRAPPER
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
index 38c45633..052a9ed4 100644
--- a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
+++ b/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
@@ -5,8 +5,11 @@
 #include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/object_ptr.h>
 #include <torch/csrc/utils/python_numbers.h>
+#include <torch/csrc/DynamicTypes.h>
+#include <torch/csrc/Dtype.h>
 
 #include <runtime/OpenRegFunctions.h>
+#include <amp/OpenRegAmp.h>
 
 static PyObject* _initExtension(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
@@ -73,6 +76,65 @@ PyObject* _getDeviceCount(PyObject* self, PyObject* noargs) {
   END_HANDLE_TH_ERRORS
 }
 
+PyObject* _isAutocastEnabled(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  if (c10::openreg::is_amp_enabled()) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _setAutocastEnabled(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(
+      PyBool_Check(arg),
+      "set_autocast_enabled expects a bool, but got ",
+      THPUtils_typename(arg));
+  c10::openreg::set_amp_enabled(arg == Py_True);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _getAutocastDtype(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  THPDtype* dtype_obj = torch::getTHPDtype(c10::openreg::get_amp_dtype());
+  Py_INCREF(dtype_obj);
+  return reinterpret_cast<PyObject*>(dtype_obj);
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _setAutocastDtype(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(
+      THPDtype_Check(arg),
+      "set_autocast_dtype expects a dtype, but got ",
+      THPUtils_typename(arg));
+  THPDtype* dtype_obj = reinterpret_cast<THPDtype*>(arg);
+  at::ScalarType dtype = dtype_obj->scalar_type;
+  c10::openreg::set_amp_dtype(dtype);
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _getAmpSupportedDtype(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  PyObject* torch_mod = PyImport_ImportModule("torch");
+  TORCH_CHECK(torch_mod != nullptr, "Failed to import torch module");
+
+  PyObject* float16 = PyObject_GetAttrString(torch_mod, "float16");
+  PyObject* float32 = PyObject_GetAttrString(torch_mod, "float32");
+
+  PyObject* lst = PyList_New(1);
+  PyList_SetItem(lst, 0, float32);
+  //PyList_SetItem(lst, 1, float32);
+
+  Py_DECREF(torch_mod);
+  return lst;
+  END_HANDLE_TH_ERRORS
+}
+
 static PyMethodDef methods[] = {
     {"_init", _initExtension, METH_NOARGS, nullptr},
     {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr},
@@ -80,6 +142,11 @@ static PyMethodDef methods[] = {
     {"_set_device", _setDevice, METH_O, nullptr},
     {"_exchangeDevice", _exchangeDevice, METH_O, nullptr},
     {"_get_device_count", _getDeviceCount, METH_NOARGS, nullptr},
+    {"is_autocast_enabled", _isAutocastEnabled, METH_NOARGS, nullptr},
+    {"set_autocast_enabled", _setAutocastEnabled, METH_O, nullptr},
+    {"get_autocast_dtype", _getAutocastDtype, METH_NOARGS, nullptr},
+    {"set_autocast_dtype", _setAutocastDtype, METH_O, nullptr},
+    {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
 /*
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py
index b3ab54a9..81c2fc60 100644
--- a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice2/torch_openreg/openreg/__init__.py
@@ -66,7 +66,7 @@ def _lazy_init():
 
 
 from .random import *  # noqa: F403
-
+from .amp import *
 
 __all__ = [
     "device",
@@ -83,4 +83,9 @@ def _lazy_init():
     "manual_seed_all",
     "get_rng_state",
     "set_rng_state",
+    "is_autocast_enabled",
+    "set_autocast_enabled",
+    "get_autocast_dtype",
+    "set_autocast_dtype",
+    "get_amp_supported_dtype",
 ]
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/amp.py b/PyTorchSimDevice2/torch_openreg/openreg/amp.py
new file mode 100644
index 00000000..0a9dfdf0
--- /dev/null
+++ b/PyTorchSimDevice2/torch_openreg/openreg/amp.py
@@ -0,0 +1,33 @@
+import torch
+
+import torch_openreg._C  # type: ignore[misc]
+
+from . import _lazy_init
+
+
+__all__ = [
+    "is_autocast_enabled",
+    "set_autocast_enabled",
+    "get_autocast_dtype",
+    "set_autocast_dtype",
+    "get_amp_supported_dtype",
+]
+
+def is_autocast_enabled():
+    return torch_openreg._C.is_autocast_enabled()
+
+
+def set_autocast_enabled(enabled: bool) -> None:
+    torch_openreg._C.set_autocast_enabled(enabled)
+
+
+def get_autocast_dtype():
+    return torch_openreg._C.get_autocast_dtype()
+
+
+def set_autocast_dtype(dtype) -> None:
+    torch_openreg._C.set_autocast_dtype(dtype)
+
+
+def get_amp_supported_dtype():
+    return torch_openreg._C.get_amp_supported_dtype()
\ No newline at end of file

From eda34ffb26692a1b1950759590ebe22900759140 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 23 Jan 2026 10:52:27 +0000
Subject: [PATCH 088/194] [Tests] Cleanup unnecessary code in tests

---
 scripts/ILS_experiment/test_matmul.py   |  4 ----
 scripts/chiplet_prep.py                 |  7 -------
 tests/Fusion/test_addmm_residual.py     |  9 +--------
 tests/Fusion/test_attention_fusion.py   | 10 +---------
 tests/Fusion/test_bmm_reduction.py      |  6 ------
 tests/Fusion/test_conv_fusion.py        |  6 ------
 tests/Fusion/test_matmul_activation.py  |  6 ------
 tests/Fusion/test_matmul_reduction.py   |  4 ----
 tests/Fusion/test_matmul_scalar.py      |  4 ----
 tests/Fusion/test_matmul_vector.py      | 10 +---------
 tests/Fusion/test_prologue_fusion.py    |  6 ------
 tests/Fusion/test_transformer_fusion.py |  6 ------
 tests/Mixtral_8x7B/test_attention.py    |  6 ------
 tests/test_activation.py                |  3 ---
 tests/test_add.py                       |  3 ---
 tests/test_batchnorm.py                 |  6 ------
 tests/test_bmm.py                       |  6 ------
 tests/test_cnn.py                       |  6 ------
 tests/test_conv2d.py                    |  5 -----
 tests/test_eager.py                     |  8 ++++++++
 tests/test_exponent.py                  |  6 ------
 tests/test_indirect_access.py           |  6 ------
 tests/test_layernorm.py                 |  3 ---
 tests/test_matmul.py                    |  6 ------
 tests/test_mlp.py                       |  6 ------
 tests/test_pool.py                      |  6 ------
 tests/test_reduce.py                    |  3 ---
 tests/test_single_perceptron.py         |  6 ------
 tests/test_softmax.py                   |  3 ---
 tests/test_topk.py                      |  3 ---
 tests/test_transcendental.py            |  3 ---
 tests/test_transformer.py               |  6 ------
 tests/test_transpose2D.py               |  6 ------
 tests/test_transpose3D.py               |  6 ------
 tests/test_vectorops.py                 |  5 -----
 tests/test_view3D_2D.py                 |  6 ------
 36 files changed, 11 insertions(+), 190 deletions(-)
 create mode 100644 tests/test_eager.py

diff --git a/scripts/ILS_experiment/test_matmul.py b/scripts/ILS_experiment/test_matmul.py
index 1314e483..b0bc474c 100644
--- a/scripts/ILS_experiment/test_matmul.py
+++ b/scripts/ILS_experiment/test_matmul.py
@@ -52,13 +52,9 @@ def custom_matmul(bias, a, b):
     test_result("Addmm Forward", res, y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
     parser = argparse.ArgumentParser(description="Run matmul with given shape") 
     parser.add_argument('--shape', type=str, default="(512,512,512)")
     args = parser.parse_args()
     shape = tuple(map(int, args.shape.strip('()').split(',')))
-
     device = torch.device("npu:0")
     test_matmul(device, *shape)
\ No newline at end of file
diff --git a/scripts/chiplet_prep.py b/scripts/chiplet_prep.py
index e2437904..2266d74c 100644
--- a/scripts/chiplet_prep.py
+++ b/scripts/chiplet_prep.py
@@ -1,10 +1,7 @@
 import os
 import yaml
-import shutil
 import argparse
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -60,10 +57,6 @@ def modify_file(dump_path, name, address_numa_stride=None, subgraph_map=None):
     print(f"Modified file saved to {output_file}")
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     parser = argparse.ArgumentParser(description='Process folder argument.')
     parser.add_argument('size', type=int, help='Folder value', default=256)
diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py
index d517796e..917628e3 100644
--- a/tests/Fusion/test_addmm_residual.py
+++ b/tests/Fusion/test_addmm_residual.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -38,12 +36,7 @@ def addmm_residual(a, b, c, d):
     y = addmm_residual(b2, x2, w2, r2)
     test_result("Addmm + Residual Fusion Forward", res, y)
 
-if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-    device = torch.device("npu:0")
+if __name__ == "__main__":    device = torch.device("npu:0")
     test_addmm_residual(device, 32, 32, 32)
     test_addmm_residual(device, 128, 128, 128)
     test_addmm_residual(device, 512, 512, 512)
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py
index 045c109f..ebbd3037 100644
--- a/tests/Fusion/test_attention_fusion.py
+++ b/tests/Fusion/test_attention_fusion.py
@@ -1,8 +1,5 @@
-import math
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -70,12 +67,7 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
 
     test_result("MHA Forward", res, cpu_res)
 
-if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-    device = torch.device("npu:0")
+if __name__ == "__main__":    device = torch.device("npu:0")
     test_MHA(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)
diff --git a/tests/Fusion/test_bmm_reduction.py b/tests/Fusion/test_bmm_reduction.py
index 7a3060de..45e31dab 100644
--- a/tests/Fusion/test_bmm_reduction.py
+++ b/tests/Fusion/test_bmm_reduction.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -38,10 +36,6 @@ def bmm(a, b):
     test_result("BMM Reduction Fusion reduction", res[1], y[1])
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     #test_bmm_reduce(device)
     test_bmm_reduce(device, 12, 512)
diff --git a/tests/Fusion/test_conv_fusion.py b/tests/Fusion/test_conv_fusion.py
index 6f3d5984..bc200ff2 100644
--- a/tests/Fusion/test_conv_fusion.py
+++ b/tests/Fusion/test_conv_fusion.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     message = f"|{name} Test Passed|"
@@ -97,10 +95,6 @@ def custom_conv_bn_relu(a, b, bias, c, d, e, f):
     print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
 
     # Vanila test
diff --git a/tests/Fusion/test_matmul_activation.py b/tests/Fusion/test_matmul_activation.py
index 94e5c4ad..232ec98d 100644
--- a/tests/Fusion/test_matmul_activation.py
+++ b/tests/Fusion/test_matmul_activation.py
@@ -1,7 +1,5 @@
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -69,10 +67,6 @@ def test_matmul_activation(device, batch_size=16, input_size=32, output_size=8,
         print("CPU output > ", cpu_y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_matmul_activation(device)
     test_matmul_activation(device, batch_size=32, input_size=32, output_size=32, activation_fn="sigmoid")
diff --git a/tests/Fusion/test_matmul_reduction.py b/tests/Fusion/test_matmul_reduction.py
index fdd72c00..9b09214a 100644
--- a/tests/Fusion/test_matmul_reduction.py
+++ b/tests/Fusion/test_matmul_reduction.py
@@ -85,10 +85,6 @@ def matmul_fused(a, b, c, d):
     test_result("Matmul+residual+var_mean Fusion reduction", res[2], y[2])
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_matmul_reduce(device, 3072, 512, 768)
     test_matmul_var_mean(device)
diff --git a/tests/Fusion/test_matmul_scalar.py b/tests/Fusion/test_matmul_scalar.py
index 96b49a08..d5a159ed 100644
--- a/tests/Fusion/test_matmul_scalar.py
+++ b/tests/Fusion/test_matmul_scalar.py
@@ -35,9 +35,5 @@ def matmul_fused(a, b, c):
     test_result("Matmul Scalar Fusion Forward", res, y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_matmul_scalar(device)
diff --git a/tests/Fusion/test_matmul_vector.py b/tests/Fusion/test_matmul_vector.py
index bf1bd513..f87f9432 100644
--- a/tests/Fusion/test_matmul_vector.py
+++ b/tests/Fusion/test_matmul_vector.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -41,12 +39,6 @@ def matmul_fused(a, b, c, d):
     test_result("Matmul Vector Fusion Forward", res, y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_matmul_vector(device, size=[253, 123, 47], dim=0)
     test_matmul_vector(device, size=[253, 123, 47], dim=1)
\ No newline at end of file
diff --git a/tests/Fusion/test_prologue_fusion.py b/tests/Fusion/test_prologue_fusion.py
index 850f386a..ecfd5fbf 100644
--- a/tests/Fusion/test_prologue_fusion.py
+++ b/tests/Fusion/test_prologue_fusion.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -84,10 +82,6 @@ def bmm(a, b, c, d):
     test_result("BMM Element-wise Fusion Forward", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_elem_broadcast_fusion(device)
     test_elem_fusion(device)
diff --git a/tests/Fusion/test_transformer_fusion.py b/tests/Fusion/test_transformer_fusion.py
index f85c6158..1581cd97 100644
--- a/tests/Fusion/test_transformer_fusion.py
+++ b/tests/Fusion/test_transformer_fusion.py
@@ -1,8 +1,6 @@
 import math
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -199,10 +197,6 @@ def test_EncoderBlock_validation(head=12, embed_dim=768, input_seq=512):
     test_result("Encoder Block Validation", res, origin_res)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     #test_MHA(device)
     test_EncoderBlock(device)
diff --git a/tests/Mixtral_8x7B/test_attention.py b/tests/Mixtral_8x7B/test_attention.py
index c48ef7d7..57760370 100644
--- a/tests/Mixtral_8x7B/test_attention.py
+++ b/tests/Mixtral_8x7B/test_attention.py
@@ -1,7 +1,5 @@
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 from model import Transformer, TransformerBlock, ModelArgs, Attention, FeedForward, KVCache, RMSNorm, precompute_freqs_cis, sample
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
@@ -159,10 +157,6 @@ def test_rmsnorm(device, seq=32):
     test_result("RMSNorm", res, cpu_res)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     #test_rmsnorm(device, seq=1)
     #test_concat(device, size1=(1, 8, 64, 64), size2=(1,8,1,64), dim=2)
diff --git a/tests/test_activation.py b/tests/test_activation.py
index 20cfeed4..dacc102e 100644
--- a/tests/test_activation.py
+++ b/tests/test_activation.py
@@ -79,10 +79,7 @@ def test_SwiGLU(device, size=(128, 128)):
     test_result("SwiGLU", y, cpu_y)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, default="(512,768)")
diff --git a/tests/test_add.py b/tests/test_add.py
index a9d37d5e..7a0d23d9 100644
--- a/tests/test_add.py
+++ b/tests/test_add.py
@@ -48,10 +48,7 @@ def vectoradd(a, b):
     test_result("VectorTensorAdd", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, default="(512,768)")
diff --git a/tests/test_batchnorm.py b/tests/test_batchnorm.py
index 19b9f29f..065c0870 100644
--- a/tests/test_batchnorm.py
+++ b/tests/test_batchnorm.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -33,10 +31,6 @@ def test_BatchNorm(device, size=(1, 16, 64, 64)):
     test_result("BatchNorm Forward", y, cpu_y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_BatchNorm(device)
     test_BatchNorm(device, size=(1,64, 32, 32))
diff --git a/tests/test_bmm.py b/tests/test_bmm.py
index 65e5e64b..02a6460e 100644
--- a/tests/test_bmm.py
+++ b/tests/test_bmm.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -42,10 +40,6 @@ def bmm(a, b, bias):
     test_result("BMM Forward", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_BMM(device)
     test_BMM(device, 2, 256, 128, 256)
diff --git a/tests/test_cnn.py b/tests/test_cnn.py
index ecc452fe..e6b01bbd 100644
--- a/tests/test_cnn.py
+++ b/tests/test_cnn.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -49,9 +47,5 @@ def test_CNN(device):
     print("Max diff > ", torch.max(torch.abs(y.cpu() - cpu_y)))
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_CNN(device)
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 4d989a0f..533a04db 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -1,6 +1,5 @@
 import torch
 import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -36,10 +35,6 @@ def custom_conv2d(a, b, bias):
     print("Max diff > ", torch.max(torch.abs(res.cpu() - out)))
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     torch._dynamo.config.cache_size_limit = 64
     with torch.no_grad():
diff --git a/tests/test_eager.py b/tests/test_eager.py
new file mode 100644
index 00000000..7a2df6e2
--- /dev/null
+++ b/tests/test_eager.py
@@ -0,0 +1,8 @@
+import torch
+
+if __name__ == "__main__":
+    device = torch.device("npu:0")
+    x = torch.zeros(10, 10).to(device)
+    y = torch.zeros(10, 10).to(device)
+    z = x + y
+    print(z.cpu())
\ No newline at end of file
diff --git a/tests/test_exponent.py b/tests/test_exponent.py
index a3a706a9..20f0a143 100644
--- a/tests/test_exponent.py
+++ b/tests/test_exponent.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -27,9 +25,5 @@ def exponent(a):
     test_result("exponent", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_exponent(device, size=(32, 32))
diff --git a/tests/test_indirect_access.py b/tests/test_indirect_access.py
index dbb5f2d6..95167d1e 100644
--- a/tests/test_indirect_access.py
+++ b/tests/test_indirect_access.py
@@ -1,7 +1,5 @@
 import torch
 import copy
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -79,10 +77,6 @@ def vectoradd(a, idx, b):
     test_result("Indirect VectorAdd", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_scatter_full(device)
     test_scatter_full(device, size=(2048, 2048))
diff --git a/tests/test_layernorm.py b/tests/test_layernorm.py
index 5c15ad12..3db27dc5 100644
--- a/tests/test_layernorm.py
+++ b/tests/test_layernorm.py
@@ -31,10 +31,7 @@ def test_LayerNorm(device, size=(64, 64)):
     test_result("LayerNorm Forward", y, cpu_y)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)")
diff --git a/tests/test_matmul.py b/tests/test_matmul.py
index 0e04738d..a5bdf422 100644
--- a/tests/test_matmul.py
+++ b/tests/test_matmul.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -90,10 +88,6 @@ def custom_linear(a, b, bias):
     test_result("Linear Forward", res, y)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_matmul(device, 32, 32, 32)
     test_matmul(device, 128, 128, 128)
diff --git a/tests/test_mlp.py b/tests/test_mlp.py
index b6b70c02..e3f79561 100644
--- a/tests/test_mlp.py
+++ b/tests/test_mlp.py
@@ -1,7 +1,5 @@
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -105,10 +103,6 @@ def test_optimizer(device):
     test_result("Optimizer", model.linear1.weight, cpu_model.linear1.weight)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_mlp(device)
     test_mlp_inf(device, batch_size=1, input_size=256, hidden_size=512, output_size=256)
diff --git a/tests/test_pool.py b/tests/test_pool.py
index 37248164..2848e04b 100644
--- a/tests/test_pool.py
+++ b/tests/test_pool.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -43,10 +41,6 @@ def avgpool(a):
     test_result("Avgpool Forward", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     #test_maxpool(device, b=1, c=8, h=16, w=16)
     #test_maxpool(device, b=1, c=8, h=112, w=112)
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
index 93caba7f..07f8fef2 100644
--- a/tests/test_reduce.py
+++ b/tests/test_reduce.py
@@ -37,10 +37,7 @@ def reduce_sum(a, dim, keepdim):
     test_result("ReduceMax", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, default="(128,768)")
diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py
index 7475e1fe..7d3401a3 100644
--- a/tests/test_single_perceptron.py
+++ b/tests/test_single_perceptron.py
@@ -1,7 +1,5 @@
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -78,9 +76,5 @@ def weight_update(a, b, lr):
     # plt.savefig('result.png')
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_single_perceptron(device)
diff --git a/tests/test_softmax.py b/tests/test_softmax.py
index 82218518..2dca97b7 100644
--- a/tests/test_softmax.py
+++ b/tests/test_softmax.py
@@ -57,10 +57,7 @@ def forward(self, x):
     test_result("Softmax", y, cpu_y)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, help="Shape of the tensor in the format (batch_size, features)", default="(512,768)")
diff --git a/tests/test_topk.py b/tests/test_topk.py
index 0d5c08ec..c8565310 100644
--- a/tests/test_topk.py
+++ b/tests/test_topk.py
@@ -38,10 +38,7 @@ def topk_fn(a):
     test_result("TopK/indices", res_indices, ref_indices)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, default="(512,768)")
diff --git a/tests/test_transcendental.py b/tests/test_transcendental.py
index b930a3f5..34546539 100644
--- a/tests/test_transcendental.py
+++ b/tests/test_transcendental.py
@@ -63,10 +63,7 @@ def cos(a):
     test_result("Cos", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
     import argparse
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
 
     parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
     parser.add_argument('--shape', type=str, default="(512,768)")
diff --git a/tests/test_transformer.py b/tests/test_transformer.py
index bfc31233..2b7f308c 100644
--- a/tests/test_transformer.py
+++ b/tests/test_transformer.py
@@ -1,8 +1,6 @@
 import math
 import copy
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -115,10 +113,6 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
     test_result("MHA Forward", res, cpu_res)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_EncoderBlock(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
diff --git a/tests/test_transpose2D.py b/tests/test_transpose2D.py
index 60a19ed8..4e9807ce 100644
--- a/tests/test_transpose2D.py
+++ b/tests/test_transpose2D.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -42,10 +40,6 @@ def transpose(a, b):
     test_result("Transpose2 Forward", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_Transpose2D(device, [64, 156])
     test_Transpose2D_2(device, [16, 64])
diff --git a/tests/test_transpose3D.py b/tests/test_transpose3D.py
index 67d4d88a..e4d4e952 100644
--- a/tests/test_transpose3D.py
+++ b/tests/test_transpose3D.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -57,10 +55,6 @@ def transpose(a, b):
     test_result("Transpose 3D Forward", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_Transpose3D_1(device, [62, 34, 44])
     test_Transpose3D_1(device, [62, 134, 144])
diff --git a/tests/test_vectorops.py b/tests/test_vectorops.py
index ede70e0e..90e9c0f5 100644
--- a/tests/test_vectorops.py
+++ b/tests/test_vectorops.py
@@ -1,11 +1,6 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
     device = torch.device("npu:0")
     
     # Target shape
diff --git a/tests/test_view3D_2D.py b/tests/test_view3D_2D.py
index ae8a67c9..cc7b5e41 100644
--- a/tests/test_view3D_2D.py
+++ b/tests/test_view3D_2D.py
@@ -1,6 +1,4 @@
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -40,10 +38,6 @@ def view2D_3D(a):
     test_result("view 2D->3D", res, out)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-
     device = torch.device("npu:0")
     test_view3D_2D(device)
     test_view3D_2D(device, [12, 512, 64])

From 3f8b866ff6885f56cdebacaabac501e4ecc962cd Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 23 Jan 2026 10:54:08 +0000
Subject: [PATCH 089/194] [Cleanup] Remove built libraries

---
 .../torch_openreg/lib/libopenreg.so            | Bin 59728 -> 0 bytes
 .../torch_openreg/lib/libtorch_bindings.so     | Bin 166144 -> 0 bytes
 .../torch_openreg/lib/libtorch_openreg.so      | Bin 569736 -> 0 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libopenreg.so
 delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so
 delete mode 100644 PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so

diff --git a/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so b/PyTorchSimDevice2/torch_openreg/lib/libopenreg.so
deleted file mode 100644
index 272fb567b8daf1c45b8dc0f7b3a557257a8b68c2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 59728
zcmeIb31C~rwLg5lC~+1ng{7D+2yog2h_SL^0RbzvlB+~cOq@U<q_Semu^L+vvgA0d
z2H7d95H(v%AJC>P6ezTWmeSC&C?vsQHvt}@Kr#Cj1`<Mnu<HArS+1@X3%c;W?|uJ&
zZX(Z}GiT16IdkUBnRDk#7dzc^3kwR06mb<Prz!<9gS1-QTVBlqtCcEcJW7sItPHm}
zTeR0*_4hbPriz8<sSOj|R`Psq`e_4)%hXtpwbv)=<)rT&$l)?I)*FSQ&>g%ybl>o_
zn=zv(U4%rJ;Ik07tlsY!R0-WbJhc!4E}53_tU^4}XHb;qrdEL`)7*NB9Fd%~T-1{(
zwL#ZteCR%3{|JXJi7(TJHzUn-pUa=jTd5K*EBmV`lFxckZ{R9kkefD(e#=z0cRK1x
z5et%En}}gh7K`@s^{-FJNv5P<bWyuRgNAqB!leq>N)b$omF`JpC{N|LEWdgC&Npvw
z4HO=?_(S#V?_L%;*p4D)5<XS<5I)IZ(wJWzdWdWPU%DK}{>;?<&gx=Ian-_6S1lf>
zElAxqruG=ZWfhUqbrMgMlaW^A;}H2;k<LZx#%H0(%Vd$fL3%nqi}6{4&r*EOz-Jjg
zXW{dGd_4FVuSSvaB5lSexboMBou&5Qzhcir=PtSM*RS6<-rn})>)Rr4&0U(g?VZZe
zTlaWIpS`B~2y5@c8@wl5k1Ow5Kk=3|g}=S(wcpIFEWG3W3g_!rt$)4hvKy99yLilL
zzk2o3BgXA_+;bm4IPQ(xjw#!EK*_}oe?IK42hJ+|!eVcGv-72^JRSFab(nJA`n&Ia
zH#YBsGY(vI_1uql4-LGa4Hh17c;OpQK0N!`tIsd7x79?iT>Of?>HXQie*E*!r)U23
zH=k5|`q(SC9b>uspkP<EwROt#fBRH)Wv`?T%C!GpP(^uVoT5w^0sbc_#z^=sOrVkA
zk3w-qg1<UuWO#8Neya1})3_f!0-yIo(MD?br9AYH%hRtYlyxLJ<$3U@V}gu?e{LT9
z)AGQJ^1yrZz$ah=jx=630REj3<nwpHM&k4SJaYap4}NnVetw@vZpY-Ie{CN6$vp7i
z=YfBcho7=M`dJKqMv~7-dFX_pKO@PxEYE!T6!ITQ&aOQ2S)4~c*XQZ)XL;b?$wTK)
zdEk|K;3awVIh?26NqOd1aUS{8+G52g_c|g^yGQ03m-;+#T5v{^KhQ^P_xL>UD;Ynf
zN<A$exCV#-E(gZ_D6EjFD|vnaPn1<`d`p!LKi~kMvFm0gpHijT%kw`4UAmqFpGPaD
zilvL^59W!|$LJrd>=FJ}nSc+DrFIWi26Nz*fbS1FgAtC{Cg@)+`B}~3djvd)ai@0s
zF5&QV1pIFpF9yGm!!O{8LY^+s57DBDYrT-ek%E4GD~De!;1_}(;a7@<7|pTkaKYyQ
zDH1NTX!kOKUoG&9gkQNsz^$}s<GM=Fj|w>q2suc9CnV|2=lE|yzI6RUz%4<Z-@p@v
zx=(ys$icu>!4qW;qi<2HVth{!@O45ysq;DDaly}tkPpeNehr703;gTExLds(U={FN
z1^sS8e};fh%%cy}gq#Qb9C5qACx3?cPYFk9qJRfLhx9}he)nVn-zfMQY~q0Z1iX>i
z0qAXj0~QN-3HT)X^<un4qQnKAP$NeqyG+;dLe7mU&u<s<d0gP{674<-dC~QmXg4Ls
z<#N=gYZAtv<XI`^#S{)!b_)8bGdbWEL4O`nHjXtM?hyJD6?7T}oh^cX3(yFEP{>V=
z`^(^q=5@Uo7dh`X<Qex>%)XT>R-ymj6@2~ybbbjs8?NF2sh?*F{M0HAUm@@nk7reT
zFyQHkc*7BoNAb9p)OnhHVc#l$N5mIiQdiR!4EUCKSGM_h*~lfHrY^6i#UJpt`Ooty
z!SFm^q}JEzZ}Qayqk)JKfCeLN9iCObh{xO997a)t#8o_JH@DSywj&L!X;(s#*09gp
z>}iiid|jTlVAE<8G(>Dwp2aIYk+9DPKAXb6c3&V;KMWiQHv2p)y&b+B643x7((Y?-
z3awSzL*Zb=*A!tyr&WTj)^Ko*r(-Sp-R|*)!@;m$X>a$2l=f(Vl2EY2-{nDVbaGWd
zX#occu$zK`7WAUd+Xm|B9Z6zU)EjR0c$?0R`ooYLgL%;lKUy;&gney3Vl$^({=heM
zi}0t}bust@9X<@6H_+?}`&N3}yn!a)$~+=zsPrsuz%bK*H?8(GwXXKGc>Qevtm^9W
zgnZ$SAPr(<t*5h6@hlCj@q_AOUq`f^hIKCWnYErB1ieX5OE3(%p+kAvriP`U)L;jx
z1rZdtb$nQWz-}BJYXcDH*$Yn$hI|2Nooy!cWr??A^^#x%M&H}+tgnLdu-k@6RZBF`
z6!8ZGuDP9^PA5p$2mJwPFf`E-iG=+tp+z0kysteNUb{3PM_)qfb!}L3#zGs^p{c9O
zyVBojvz>_hHg89VC$cu=^Y{ZTK_~b+dtpPw9)zSlEp6UaP|EtSFXRpTJiabpQ<ONV
znc>1X*_)sk9zt|2Cf4RVot+TGq9vZ%(4r-k9_P}wPN&D|T--3r<6OuxGdy#bE_8Ns
zwV3H)l|4j;&qCI;&F*OkdD^|JeV&e1G~4WncvsEzG<YI@XQvaDkOhGy4VW1tp(-7b
za8rAT1Wa=WO5E%xrahhBu%FtgwJ{D*I*ZRko6XY@ff9TC?V&c;!Uj*3#}()duJ(mp
z4Uw6VXsFHSnt%2}eN1X*fDzF-r_)(GjnRW}zb%S13Rz;aICr9%Ec`!Z``Z>n<+m%=
zS)MviI2w>dM;gf)YrOsl^ebYsMFakGqdsO!Y0#KuB{9OBkjgLvxDqB;P9~;xq==B`
zNTh9*fndbnvepCTlq}3@0+~Q?y622(wrRqeHj&|-9tf@>Bu|@vWs|4FA6V7qqc&mV
zea)De&Q4hJS|1cMxK?IQkNU!T#@Xf#b@-Z>_?a!@Gau8tKJ167ju0A$NgChGHkLM>
zZO+MzAHZxd81>j-Vx>u5FbtkuYDkE7*qe4Nf)QV*&W!8<Uk<@8=TS1f*c(_yCKUpn
z+3dsMU|e!+S%cTrFbx(WtJ>d)sVl7w(<0<)qD2@cg4RTrd!Zf1J`g1XXPf46V`My`
zV4J^bt#g{w3CjnC;_DK`AV#GDwwOkhFYauhYA|jQ-<keM>s;8sj{G`OL(B*|yb@P9
zeZ_6_1y)5``HEY=ph=fpwgo9H-iWkEGr=i$Tlmm9F*21x!$XlUnJ8ftsANX7uf-c}
zi@<4U^F<(L8$?=97MZaFGm*8k&{1aDu|{;Q)tv|ep@i0V9~Q+XZ=|VJ<X6I73JimU
zeQY-6bcT1}n}!dAR_6&tJ6bVd7#wf3%Qc^AfU}d<437dIq&?V4YJ-VeWt_O)aM-)n
z6Y#C^aH)z#1WOoW2L$1kkfWjx$zoqqFig{r>*er~=1Myp#_`T1n?e&7Ot3}9=BZ<T
z+CQOVR0V1dk^f6oXes+w)mb+6@6;Ra29Sr?;UCEpSkmyVtu~~KOQ5*l(wdt~PEH2}
z7PuqQ+;rS=6&=BfS(IJX1d~iT8|9h=ym@MyKM?Jj+Eq1c>Z}<AoJPgMBST)9S5xDe
zUQwycbGzo$c&1fM&)!YbZ)fOt)3dj>?CrFQnM%#FWsW&6kF8>Qg`J}>TL#KADy9w3
z+H3~YVTh8Ts5f0O#k0dOrwP^srp>ODFIw!H=UR9II(L$|rwvjeo--8Vp9!B)cxF?A
z{22T#W*^Fr!To3k8>@_C<>LX_51&$e2-nQ&jbVQYzXW%qu|pe!e+7u`QJ(Jpr~mN{
zAGKl{&!4ea0jmG*T10#%DAj<`Mde2r$`A1Su~oMys$OnV4iM#y!2iRQKfw;$tSlC=
z0~vm)vO?tVX656RRU&^4%a2pSA|E}}kDYjlav}0`6+jjrgCn_bUHkN%$nB@Z1^(JS
z6u}y+{6yrvEI&rMS>&I&b2WC}qm{cwzL}MeQXUlfYL+)CPl|kG%2kMP6e}-_e3+FN
zDQS`SvwWel2YI^Wb4XNT*9NhV?-KDK2~UZKN!>Z{kihTDfme!n$#psKZV}g6p9Aj`
z@LO}>iikIC$bqjH_#1QJ8w7k?4qWNu{VNlE$$nWxTw!t!yi&lca^Q^uKDdd~lXPT0
zCEBHM%7OUEahLJ6T)0Kh&xKp_z;_tv*KrouUm3?FIutXL?d~z)8{u+%a`80-{^<t#
zy9{ub0j?PM6e_|>OAK%kDq`?z1N>A2oq7X&wgJA(06)b5Z#2MH7~ri2c%uQ{{UFas
zxyYhE0mr1rOwC@`8Q>Hfl-GI#or5G0>8%1j0ry7u1_Pay2K<c%xCqs<(v*SzQ3m{N
z2Kq+$fPv022K*fc_^}4~paJePz%>KhW`OT9z-JlYdkpXs4RCD}&yZeCHNeYKJcF<u
zdtNMytOhvMk=NuDzaxBv0QIXf#cv6Y(5!w{rT8tuMaYr?sts^2M`3?e1H9G%uQ$L+
zC*-v(#qWqdedN`c;&%j-wtyh5Df-KC9sHL44W)R7@DUQ$udWooB{&uc{pwEfTY?Ld
z%>b7f;4=111$_p1DFMWFU5eij9kYN@vOdK#1UJr&TMcm1X?bl(@jJrbUjmVCOz{lC
z58$`#Zz{zz1pf}dWq-G&c!uBy^IP_JAjLBTpTlq2-yJEQA-Ies6J#(&f4Q6w;kWFs
zmf{)0Ka}6Hzq?XAL-1;T%l__3@eIKa<G1|prY!t$1H8llKf(aF7~m5P@G=AZNCVtz
zfG;z^CmY~a1H94zKiUAVGQh=>%u1^b@JR-I)d2sl0bXx_f6oA4W`IvNz#9#4@o0~g
zwi@8)81O>|xO^5t#a#yYaR&Tu1H9P)zsvwX-T?12z{L?RR(hQQUSYsrZ-C2ZQ&fDb
z0bXgq-(Y}GGr%_*;L{E8lmRZEO;PbS1AL|df4~55GQf8j;0p}!eYg80uulT}B(P5c
z`y{YW0{bNJ|2GMIXrB0)8hf`yO_*N&v7)HGsYqdFK#grF+043<nfV&PyEBtsM`Auw
zA^cV<*`0nhlgae4d;!Zpt>+6_ev_W34dm|hU3#82j=R&p)bq4q+?~Es&yQmHOY}T#
z3U;T@)$_E`+nx66c?ultPM@ylX=ArL?a=eIq1&Cd>3Q17?M@%1=V=4CJ3T?qm$H0;
zo;S1n2Vcql(nbya*YmVN+ns(|&(p?icY2earwtkUujeT+h5qY#3Rs~3dY(3FyVIBG
zdD@_%|9YM_VCcV|r;Qi-ujgsQh5qY#+EAhYdY(2?=)az)4HWvX=V{}#J6)jXX~VQT
z{lS;A|5lcNRnH&A@=xn|+8Ck#dj1%eze~@Tv-~ghJZ+57e?3ndBJ^L+(?$sW*YmUi
zLjUzVZDi1YJx>7_^k2`@h6erD^R!_>|MfftO3;5jPa72UU(cVw@*jL5`%fDZjK7|z
zfDOi9&(p>P{nzufA;I|TdD@6z{PjF-K+u0ZKa1rr(eo7GLI3r970Y{N9<y_6f@UWI
zDKsCEpMCl8{LGyE)SUdJocs|v`Ga!u`{m?KIr%Sh`1vR&|6Wf1&7AzpIr-;u@=xUC
zbNhdPPWhj5^1sW;-<*@bF(-d*PCl8F@6E|yn3Ip@<lA%dzMTAuocz+9d|gi7nUkNL
zlb@NBpPG}Ol#@RqCx1{*e!rZ&DJTEskA~~tM>+ZTa`JEH<X_IoKbMn#A}7B!Cx3rV
z{!cmi-{s_Q&dJ}HlfO16pUlbk=HxGwd3XF1Z6gec8sDk)VL@>ty;bD9kyjJ7<yKmC
z_xlU2l4|^YU}gr~@wYWv-}GX$n!KRgB5=OK3Jp5dK&b0|FH%B8I|R|=s21`~$P<;r
zkZ)0wv;X)*MQLeKwHIO4)qXUEEWq~(@Rlr?YMVsumjNb|;)-w4E+Lfo&IdN4sumFl
z_W`IK3`?Z#21KHs0Vp;8)#ua&C%lUb6*W0yE48ZGfvEO7>0Uufi@BoRO~AwjB#TUB
zl$scwsV`Au_uoo|=H9ztA*quWaTc_7NYf$oj2bx$trO3*zLC5vS#i4K@1q)|1u0nw
zr%vEHky70+S+k<f3u^rBtb~UiCHY*ij*IOk#x!xm#MUdo$t>6%&uC9lf6>fu07k>^
z_*>cn(c;y{miS=r#oe8v`YlL_JCCu>Ufk*SzQw#Vr6v!Wa2e#8ocSmv{idgo(1kvz
z4#;qd=`KLhZ6MuJVLq~l6}I%DI1Lq0lcv`wvG0i-1Xu9MUfe?|$7BZ0M|KnHW|0E_
zUWPyO50p&#N{#PQH@$zFx@k|5TJW&?^jDDsLEuI~pd>Tc!uXW+Nkh74_oA*6J#MKQ
zn>`koQ2RF{<JH9Mg@9{QK7}?lV{jiftw6qD1?uYUQ~mTB6bgPQPby>XA5C7TEw!1q
zw~>6(%`MD7rSc;i)cEH#pb7Mx=D{-%L_&!d*;31iz7ao`DOhu414lncpx*@aco`v&
zKtD0soq&#am8eBsm5z@$6-1Cbcwxdl77D4Bsfl{4ng~s<O;lSPnO9qk@+bb65^v;$
z_im4z5c{kkGAH&~VPt9Sv!dw!YT`6vf9wOa=4)R;G~}Cu50VPjdkoyC-6X)#Iq<|N
zBMeOQ5myrx4>D4}2B{CtIC^TH0H+~XXE9GuEx1`~HBTURmsYCrdVGq8t}rm%=Qz`G
zhGVH?33!gZJ6VmxD1kq7&-IWun|$s>sI0E{V{^|cK<W|~A%o&mkZ~pEm8tRC@)Bs4
zxo5nn07<o$VSbu>?nDpWiFQ<YFVb9>@Zsi-$i;ODcRB7JG56esa#!LkAio%yq9)ES
zbH`Dn{Q%0K#$Qnr%PkM6B%~MiVD?ZgAV!}@4Gwq5pVA&c5mooQ;x7#S(Vg(|BAT+l
zMiDi5ygUA8ItrS6oWX~zzEitc;5=LhoU^H2SgaD(s+tH|G!G;HUSzr&Z!0fJ9{~&(
z86<{*2~E8()9i!=F!)PFegVsxE@OxuiI>s7!w9_?DRISLNt0=ICB839KhgGl!h3<z
zzo&o3tR99zu;H<2J4E?dHGWRHCG8`O*p|u5S2)gbtZ;na;n|$kK+?YkHQ~W1z8E=@
zq^cc4_(p+V#_2_0UA~#kC#eIj__IO{dbgXeNvRm5YaS{Yh=Q+=)ZUMyhdCa^AS={%
zZG4ZRdCWI8IS<P9r7QlVGv2S#e<+6b09t}_^ge{OP))3`sEN8VcM|JI?~CT1S24le
zg!%H&gseWtpCjXb2v!+jFD-N1+pV4NU_jOQRun|vR_&jfdmbRwBH^o08H?8cF^vH3
z+-w*yJze(a%$HtA%of#G+1^uQsmTDT<~bTQ-Fp#AxIVb;o6J2=p<Cd@_M&znqeS(e
z4V$ShulX!$?r_91h31~=z@@u2OPw?k;3q3WEMR*=>_Vk5I*w$&Ja%$n^o%~HQQEqX
zu)fi__o?w`sPr;5QG@<9pnpy1pDVsYjc*=$v!xHDi=tsRf9MrZqQ}Wc|3vjR5Y^ig
z&N7;EFAzeiAnq(1TBO?pnt5-5sccPi&l*x~)IJ8aIW==H2`Bx16mU7P77-F)^&&q9
zwz&s~Z4}fn_xu@fcYHUNKU%cY{~&7bMaEJS=AM60CBmZl%)0BeWwdL6l-`c@POdYk
zU@`LaI;0^5hqj(YmZ<bTh8tq0w*$NlbrU1K#J}D?QEwsYzhsyw{w`s@!il95fI=T_
zFQ(nN>r2!yQ4J825LHcCdOG!4UolZ0+f!ifd72P3hEPQaQl5IV+U^g*A@#WTU?A9D
z#2~Pm_lTNKvQOsT5727*BmCiVb=kL>d;S0vrs*(Zw)aSJE!qL-0;wysfJ~dv*_az5
zONW@B5Jl02CCq`*ZU#0@q-GFC!)X&zHPHc#^94psU@RdFbh8vpN(|d}tyvT=7sVD<
z%$q>5Qxw|)Ci>)ca(%xU6>RUN$AG1HzjotpbSx`9u^v;rDgKqy_6TFvmT_!E^~}8#
zHqg!oW!rW&wkM;SYagOTZL@&@vp?z4plVqE<4?y2Vy_m#2st;!A4=Z_!RzC}{DnV%
zFbqFE8W@K94^q4%{Th%sf6~4!RpTFtrMaqEO%zX>3_LPalWi&UJ*HP-6?!nOY*2d)
zrQ<%N3M?RL<29+0hb~)EUJ^NhEdAtW^*CPJ+`FY1*s(2DwedlNMb!CqCuhC@y0mS$
z3%#T<{3&WM7PZsBB4mS|KzV;{c{vG9jc-zu&W!rW2WqmlKsDc))?B+X8OYWkR9i;4
zweM?(qw6l1M#$IPLoY3|#ds8MK$tck{hD92TR`?UG3i)hCpr5tBv>EKJ%1x`{8J%6
zS7J%Ig%-K8I(u8WwZqKl{T`CQ!Z!43pNdiG`4zJ6_^VhihMp4j+{sD5ypW)8Yd_=d
z^=u>^O`84yu<&S4!)?FmM*O8qO@{WsftwG<1>Wo*0Y!2j5_r3pNSwIoQi8-yn0i#u
zEeo=Av{nd+8Xnw<xibf?V*b10kJs71h>T;sFt7V7+Ix_B6X!GJR;w$XQa2JCBz0H3
zda|p(nmw3CN)DCc=&!D(6lBajQ=toWCqp3SOSc1NUUxYFZu_M7E>IM5G2Z5r`U-Hs
zCbiq%YBjI>9z^SkT~J=8(8|B%VJwLSjH_<A8H%}wmQ%K*y5p~52;A|9iA;(1z4sxx
znCYh=Uf@nNmzQ8X%In~RR63w1lR;H+0P%PiKve{)21Ie%-f+i{BTYM)1kr#Qi}i$Q
z@laaYGw1|tIEQLKMms}Ka=$Y%^AI%Ww!fvuBjpxz&mxFL^s7v35&eRlLLuv1?6+uA
zjdhhNZgcIR`C9OuwxW*E2kZoPoDYo1MC=~=aYvOpCXjdZ8AeE-A8O*DpP>|L6*Z^n
z43w-Ne@IQthu=D<%w>N9%kLaZwfU-tp^ecupm)_hADerx209AYv;rVpvVQG$^eO!~
zZvy^wUHpr55h-HgAe-LKwr^^uKif{m+<ODi-S%|Hbu?dkPX@K2J}FOIhN%U3>rc-@
z2}V-2AN1sT=t$3%sHeuCr!j@2f_bG)1amZ?(os>ZDhwPgIN<L}RAO=%TAj79D9<a0
z^oOj$&s6(=N8o%J8~W3GBQ<m)Q}t)lT~G~I{2y$camPPP&j(z)3RPe&{`xN8_QT&w
zn=$MLV{JaNn#CjKjsZ6FcWbGOpOe}A8}*jlhLtoml9OIT5zWfKfG(N6B*4&x+o=Zf
zJz<76^J<3nk(4JELM&?7W7(Hk!-p`ZZ+qluH9lZ0=y)5xDK?n#ZMyw&+z)aXrH>+~
zsP+}sHM3$DPA<gyS!wP`qd<ED-Ezl2h9l{;y{G+VD3js-HF~@jMQUv76fjai48MTF
z<+X85xYu%DHg!`M^>wFqFYi!~nT9QC+72+cZ?iAPTGwy79{{?rg<nWj$jtSd?v&8D
zX#+vyrn@NVH~n4~^qU?Nx3GgZ;TFm7kZ6~IA!=BmlzfbzW8Be=<txA?-vfY2xgU&H
zahD-X-AY8ONC?`^?_gGZfofb%=>x$#y8a}<!v>g-uh<hpMdVg?r1kQyHGI8v(7<B7
zWR|Sje9cyxCfB5@OSaOsZun}sT3;=5_7CplRH8@zWgOikg{jro$%DZrUni|9teurf
zH~z3ISzC^k@?&>0RNyw>`7wMo?S?e3%;s!j(rnOVtK7RVnppKFAx{pM)gAwe*}d#~
z=duGV*1NCJf?>VejVzpB<9cWQ;WSXxt<Fqv$mpsp@Bryzg1O5p;jFsI!Ki~#&01<T
zK8`vX0<Y=6VNlrqX{xuA)MBT07Vm%0ttf$hg#i}pXC_MP=L!Jm66@z!a-OD`Isip-
z0TJmpIV4WpG>0H@Qw=YumIYZl+GOxS4X@Y0&os^t#-6QIOe*Gelh7f)ex6LoBtcq0
z<1k<wi5K4CZ&*av1JQLds+wc>qZM~zzu%xkb@o?j<6mO#Ihrg2UrNQs?tO4XjR|{c
zSFIEXC^5%=N;)SOpK9~9Tc!RJ(_^Z8un8h+BQ36vz6F{w(@AKWFRtkt)@3n&VzaF%
zQ83By7gqW62okJLW74kzfc+cnahR=3s{I4GE)8EV{z^6NADDkg`za_*`Z?HJ=_lDI
z-3yB5o(70abk;(<*b?pZw|Qqv__hF@HLoMh;~s>$hn{ajDVd>T6ByRCowy^+*y+49
z{W%{$=(-(cwEWcqqOZ4X340!pbT#u2pr14%Z2fu$V7`8pG8S*6e(1{zMu8Tw674MT
z$-EM=+YUh*tni`qC@=w;^>=WYnTF2&C*;*LfOcK>r=wTb#UD@8@(L0?g2dRN>D)j0
z8o8%qL;!Nmpi5ZKhW<c{+D!1G-H*YBoqOxe?4mXwI+a~t*?!`C<bodAjqVEj=Za?>
zXSm}3jmTL13C9v{2xIS-Va5Avm|MLV)QcNBijJ{3i3y>x=tOQ|u_Y*jbsoMYsD)UP
zvmXHk?ItirM(#2)h>0Vv1x!us_Y+EJ4ZV>9Cdrwdl#jox#y=S<#Sl{cz2YMm?iC-2
zoJ2kxwq)bWwL8#rH8H-N95;mKD6;W<_)ue1<arqt={XM;jhY%q_K3$R5Qj0wDJYB-
z-wX~Z4)c_`Cx9B73r*Nw<g`bMntX}az*b1h5yRaNUxbjI6r?zmKunn_m~bR*t)qvq
z&eMX;VgZA=k^W$E_Lb+Nu~XiF>iz=bt;45I0i4X&QCy@wEu=w7(@6jeyV)NiyEzE}
zx`f@_j`a}+^JGAg)Dn??(-?^pH;pAo+%%3Cl*ocC9c|a^Xb26rB2DjNbDTO^6g`i&
zj6J8KgDm2~I%)2uy*2A!C2m;%zK0tT%XyEGc`RoE`B{v@2jJE~q309onT+EIs@DN0
z{dW`y{h*ew67?d#V(smrEJbKeKqE4sk<pGK_Oo%EcO@2^_Q}~tLulkHJ^C7YH<k5{
z^iPlD@b-I}(6k&zC@zH`Sl<>fUPT-yeWB=kZv;0w3q<~UB0rIk<DaB&K{Ld1?_oe2
zdR{my3QDDYM_lK(K+^4*jNdR(@%RlrM$^mnwzO$zh=ghGJrW4n_|1P)zfn`fZ$3q1
z=`Znz%ag^AA`uZgdQgXBa=s9NIAG>`{s;=-o>_4<KBHVa5lyLy8JKji>tfQGV?V@`
zBZMN+%}3}NQo<PD*@_xu4k%_t@s|%s5X0g-zZMv$5C-BqKR_oW1_C%eqL|h)D&B=u
zHi2TFC_V(Wi2gXFY(Dl}2SmhoJYWfi^8P_|JZl|A{3QE?pqic#lne`=RM}E><j<c6
z<$`^@zMX{eA?%a`Tk!Z++8R6s0nt=sPb#t=9PMRMg6T&AvDc_+)g0KY#_<3lc|@29
zCHGqnf5ZG8>@L-KiJEMKD?qW}WtG6_!-U6v);yuw$}9F)mk_<$cq5i73auqg#{-87
zx3!qtHc>-XZ3<-Jh;Jd+)QX)*&XZaK!`s#4I~4kWF|Rhncj);N;yQP=(QNueyX{pr
zc0GsEh$Kx90W9p!BHZ?yeuuww3A=MP)*l$0&440#90}~sl@cdzx{4ri)78A7PZnh9
zXkD++9Qs7t^>?W1O&CA$!*}%QQ^A9_4dt}1(Pq01%+Wn<=Pnlac-n(deFFyT{ngag
zhcrfy5r<?ve#LGum4AjC?Bfmv2ed8P4XUJ~PK+}=kJqVca`v-GxV=h6zGK{@t!c;h
zjJDD5s3*^UfUtVV4rzyh3{45u{*3uj3RJTwJ2J5skV(`}RujuA<4bXOe0*uK8ef5@
z;cshBbalBcg)NzQUbme2+1WgM<S@V<^ssdjj88zLRCf#%ixhMsWP|ZJ8elO#cm9CJ
zXFmYw665n1XbcAC7-D&f>3Ad<pAUe^aN?%jxW)K<NJ+nGmn_KA(RRE{<MXywk2FnQ
zJguj?o2<s2N}6JQm~IP!VVG{fqp=dt<M=Z~jFO5rXoyna0R4(1OXR(f2@Qy!56EI-
zC^@?Z3AG2s9zGrzb1@$F*%gG;GZ_`N1XzF_rk!@88l$o~jAxYCH5Tl`ea9%;Mn{gN
z<{o+p;)`u5(Y+c>gK9_u4{!#u@sNbGnq*B%1e*!MacA{VDr-^H<kYo(65vklT+V;b
z6h1CL0$7a8!MN==Me&y|F)l~L&A@;p07Y^Q5{UmSi4!-SO^~>0IWJfy3$k>yYU%+R
z-uNQM<&PK_+8VRXGGA!4(;y3y=zib`9Z8>rRQMU_N%~_{!Jz#N)%Epdr1840b=Y{l
zfGi2{E;LWnDL#S;^eccdURNXe#__s{kl6DW?M1ZyPmS09K>Vl1i#^4-1A@jXC<g37
zy?+?5M^=$wcWRqB|8l(k2Cx{fRk-apZN*=@#CV0_USObJ1Qf{|NHAWvNu0Rpw*-ls
zZs!HJ%7QE%t?zGqyh@O=@q(Oa943;&VCZby#kl<tT_wqy#JEjIih<ho0=Iqap(fVh
zBh|!l-4`}0QTWF@wWC?3=qYOA0A@?p%~?j^7g0$~oJ^8izCw-lPfo{ysLMk;e+QKx
zhgsY@v=NOYfNz+>Odt5ubSqjMwo>A_0Vq=x7#%;QoX!DUpk0s7!AiENi6Rys#?c9P
zvT^dzf!zOK@!GewnXJ_!^F7tedQy>hSo}D846RjV`dWp2Y>Sn49ERt?YJxZPtX2p{
zhn~=GM*_B=1wpnu)*i)Oc7E#5O8!@#2bc`?(jEcBIp+ZgPUiuBPhx_DY&Z`<MKrEg
zqX_%85zhng>N~Zdz{xoez(~+}fJKb_2<HJffw#%&DAT487L5$q;9t-pTn2q~;hFpc
z9517dVuW(f1Mt#MH0^obi=6WSoDG}@*e1&V+4BJGbVvMioJKi|PNTHZX_Pf|8l_rI
zoInnJ6;4i+kZ_L1R`f?+Md>~n=4J~d=RiM7lJ)SHu=9tZh+Oo5MF?~bxDZP+*{`JO
zm)|EzkEXLMP(isrW9L?M7z|&<gG535@V{wt2HANP_yaf+upLLb68nv#=Iu(PG&ZNC
zfR3wN&(`x@lpWgXN)(sFmS*ZLU@<a_*c&g{8&7tucLfu*Kq^ZwM0O)SnOzR^CUk|x
z+6{p5Lys6HFZ!X+-+x+OXE*+H^4cFA9wsmEiho63o1Yt5UN3t7KbIF7vNgh~bSG9Z
zO~PZJBdUot4Abhpv<QLyolKFmFP=3>1^dh41MJ8h;ON6TLfOz?Fka%Em;OB8Ft#N_
z*R=E*^5cgd;QJX?hnSv#bBWxfB3^C2CsmNzOyiHUF6{UV4#8|H*=Ft~*Nq*Dq7zNx
zs1#H+fF-4rrVkFt9kB$;&t}c%z==6a5L#S`@@W(z>ggu}i6bh3Q-_#!2}67hKqpNv
zN3y<v&9B7VbL@_DBIhvvoUkP&n6Bm?+S*bxG&$dc&am2Ta3T^*OOo?5@j8ySKe-jL
zEf&52$o+Dze7e^*a+#TXC~n_<QKJ$WZ@$NYWyRck8X&9!*@DZ_+(l@PaXfUon9qst
zbN1|Kon!88!Nekh(T(ZlxFvsZ7xg`zfv3dRuU(E9`cc5U79GW~f5h6LZuH)dzWqhc
zPfL0{Zh1Xd;=po9U_4Fs#WdMzrki`70KNl{sK=w@4n$4IlZ1LH{4uOT8DXdU0md03
zh3G)xjC5`Uoy1b260hSlHsvC78ovdY`o)<vZ(3TCz5*cKA0TE*YLjy^X$p9>^qKFO
zOQP)g6C*dbm3c_VIa)td&Bvx}sSnNHHyv^Ug&8M|PX7crdgsxn^a)Ie-QX?T-uN_`
zG`&62&;C(_fuoVaUNPN_6+$xlgPSjw!g1RGeZzT&opiEKo4%d<NJ&H%9f_XuyFsEi
z)pde9F}sxvcN3nlnOf-X-rIl6FZYx^I0jmbLkOEx`$OhSzl$blDcv{#qY70~9P4_@
zHz0LkU*1I}r6_SF?<>C*K+TI5T```)P#;DiP2l{9zzB?3>6Afb9paKzt_1e=>?ju%
z;1nWMhDud;(vN*VosWsbq}#R5*>yO1>`MNEe)oZ`3pj*`qizs5eL+1Q>fqF_E7osi
zUVUc3Wq&35vMaW=#BD#me9dNx$Kc80_;S7%Rq;UI5}C^5s&w9r=o|w&?BK*se#RoU
z)r#}RI16Ekz6g1{?TDjBCb<%GIr<yuDV{50JI>Huv4?@~J{d^S7hS^ln#cO$N}PkP
zC0k2W`^%k2%lQe1@B!+K8rxi=?R*LfYkzsoJ5-Y@A>ttpSwD*+%msc}yPxo4TX1Y;
zulO=`9=$gAz7FM*JvaYwD{?HVrJ_I78$50yy;6C%Bii0whHy%vucYajG5Sp%W7Ar$
z>w=EO+)Fbj{RaNfJmL06O%^X!Hw_lM3i|6XKy}F=cJXNzCgS6?dC1J?IgWYKxv9v3
zbPz850fn18ooXD1qc;z|jfeX%#4h`yGIMY2%S?tQ*7-<rmP%{DkvdoZJn|**lp|P%
zj}>`xl9corK%&!i<Wyi69EP<%4@d1*TGw2l#$n&LBPx;IAcs)YXze&l#nER9Sh(z-
zvZw=%M^72LM)o)U3QlLTkb3+{;sD1e5b@irEq?-!l9_7;rYcb6iUlVZu#mg`vFNt+
zV_$$3#LTftzFdtzt;Tn!$!sz?u~*hmZ60W7a~>!Aq5B_Lo=#Si!FoDMj9C)_W$-qO
z1cZYgs=cNXV2AnIE!q^+b{4!rPlE^b_y~^&Nf_FA9ptn2B62Vmxyb(gqe)!06t^g3
zPx$y{_XCU{{e19V+Zj3n`7}tl6G7Tw(NSMIZ$9@-EG<=vx#vBUIcP~x<A;~?oew?U
zFtbVaiuQA`P8&Ch47rm{llcxPwi!FIH;2rmPjdhB7`VTu2z#IR`QB$9HD$#T_2L;~
zMWmp&1Mem<cW^0lR_D-$YYE#T9mo!QnS0~FqN&3t&gMS3KiZ(Zh9*EXdi&5LQVv6#
z`dkSo=J!tFaKayPa({Cvxtq~n=*4A2S93p=^)|g9n>FxfbpOLylJ?WM<HMX$=lG%f
zwTT!M8h<p+H)FGL$6OGE>pp;Rm_9K3PC;gR9QBd#<{#{4J$bi8yA5^jEu|dAa7+(D
z8P(*47I)GIn~JlxNz=uPsnCgm{!qOgWmTF94dbTmVP|VWzq5^)de|Al4&B)WLW=g>
zBNQKgvmkl`_tu?gqs3(oY@t}{6L8B}ICTPvk{w>BQwS^`1S$nXk7f0T>l1Z}HbT?U
zrM>-a+sA3*hskf4zF}``IHiY%=%`#V!p6mn$q<;rfwsG#Y-&@ayg0jZ;Q`{+fV$&@
z>Xd=(8usv-u8uQcn>xuhxs%5i*4f{UINkC0vi!1B%CJXAvZK@Q<FJ@KI!$w2t9ujx
zJpOtWcC^xEe>eJ+J3h6N7$9q%-HIOt!ITLd<Lk?mhY{@g3d*Ya_g9pmr?jI%O3ZgT
zzxFz1!AAacw(NecCb6h60mPrv{)Eb~;>G44(+|8bdw%&8!h?XuLqOx1fXv&VSIm0_
zh*?VQ0t_tqBo9^tL%cUcdV|n;2_T52<Hdn$yfaW;iBGlW20nZ!tin#^LAFpO54;v8
z4eQTqd)!IH?T-b93(GT$vb&S*U8;%jD4Qacc^0bj2&NE{hoD~};><k3i6)%Y#EE$V
zy{k}%=A8BTG-`eje7KkV3WX`3+TVLCWpMTPlG{SsN>35W2&DZMoy2%w&vd-C#GPzL
zSRZ1*>jXH<Mlt@_LzoB_JOx?AQ1iW%bT}C$3?rNk=8mvdkslvtt-_)5ECmLP@!Oz7
zX6tpdkLjh-YC|p2`zVKjkCqlyt3fZ^aR!<!K$T3E2TdSO6ElycBssf?(X?pK^mCe*
zfhN&;=5$WybK;pc5FGrH0KY+CXoN+MVG}vI$y2J;O>Y;gMYFG20?`Eb=lqTaZR#1s
zfcImDN~C=w<49*&xpBm~b(~`rN~^_C2d3$chgotn5^eofZk%W)--5o#S>0~9zg^rT
zE}XvG09T-}68{arrT;@PdKv*A)w1_&r%8>-X+L-YtHm}tzoq>Qyb$L!-2~^Oz`13`
zX6;E7?QLH$iy{|0mbjCv>g5q&xKi3tOyYQ0n^;hdWff19P9Ie51Cev+y#waA_gAw?
zgEwmM?hMX8E``sxJ#rwuUPBNIS6?U87w@*!R9cDZ^$?PpJikjT#7%a;FV@vQ=4ZX4
zZQX)k+yLb37BZ#hGZ~EOF#9*MJva+`iW3R6OYNy+I(sDER)J}#S;ijVtOd-SnEKEn
zEEsfL7U%x~p+(rXeVs*sdIX5s0_r7ADArqjW|^lHmoxroDsN_nUs`>>={?AUCb~~T
z1K$<Y$W6yHyFb5ACgDwp1GD42cQ7WPun9gBr2}LL_#?kxV7SDSIP#{&lQ`pRbe-tP
zrweJKtSAwj{|L<TWj@YUd2`P;Kt$n3Ifc=g@Nj9)VgVN@pD`3##p8St$MX(>QZ_V-
z`%N@iXkru#ui-h=p~;~?1jJg`1DIUj&6|qY*BP|*#nbCb(zh^YxSPuPemDjI_jh@o
z-gYqNJN3NI3uiZS8h1E2tLEN=$n@##vhy~a4)-gFk5NF$u4fVscjE!(pOf+2W!~f*
zpC&u1-LN@3FMo{TkaP6UxF>w3k4v~z@Ot07doRhDT*uP{hVK6u#F#@Be_wlD+-v`}
zi32gvS_Qiwwo;PO!AY4xiWzF9sI>R|b)1RQOs|tVXCFlGJvCtR4~GsPPEywnLPPWd
z=WGxpOMMv>6uS8HPr!?`)aSanh5Z?Enp}c~B5oneyH>z2VLdo#C9*Kr=TnlLy##Ck
zPb+_j6Q*ZqRH<B4a-$OY=UIz=tSFgA<TzM`u|6Dr2#ob0jJ4@)XgIxRrzV$SpY)N-
z{&#cFW^@B~8mjAxZ7I+WM0>PO+c!twRoS3o^vo{%7W0p3fp=|sqlm1}jp#SM4iMk$
zvcDgh*aH7`F3i#=w7pd0HG8zj8N$P=eQOk_#WRbNkx6iq-qYSl!61#p`=i(_Q`P<O
zfJM=+05<03JN(Tb2yVQ#03gf`yxlo$?XYFn4)#WxoF6m@mw`5xohM-KR^S%N8A!CF
zKnapN6eFg6^dMI~GJ`SLMp8SvcNy-_$35R4eZ9ZrqOm69SWPzw)43BBuJ{)eLW2!A
z{dT_Co6t|-kyWJ6H<>{v7$#?C>qusB8BruNNZT5|*<og|TQiu!*WUz%k<H+B5X9G;
z!HZ!Ace_Xdbu)N1Mt!&$+yscQaqs68Mo-m^hBSj8G8BUuJWNL^W1Dks-$u5FTT%I&
z>|rYuw55;RyCL1au{>=JdwT)nr5EE)>Wj|A$oB7W5Y<u2*fuNI{?XR!pR#|WfvDZ^
zK-T6>#T=9NZz=Aj{e#&np*dcya)Z(OATa^eV%s0Q5v@m`xj%1s{KoY4fo_N^y+2yw
z<vd@g=hc&EP6BD>p;#i5)Wk*DOV5059%O9)Byv#qX~#y&v>KrIrf9C?g$w5(wl=ia
z?_)20(5;A5u!$4vuSy>SCSgBVK4H3$h{aNH07s|Kx(|#<d5iKDDCce%!_j5}2d0O`
z=vSoAyq{=MFZg=E%`l8UJtS-#=C&=frCjkhvJ$~ym2h7Q{xK0C_hK@;Np{ui&qBFB
z@ME+?)|!5-3mdT#77lPHJG$J-+i5?5{y5C{IN4zcdOMPcoA2qSB0@*R$ASpKSk>Nu
zWB=@Z12k4mjWHGy^UJ86O1xA8LAc9R@;MHJH+M8EhIy+tPe2jOTeW#gb9rTpIS6$8
zb(dphFb7`1E!GJ-_iAMv6EFpTX0v$;4ttxYETM`hM5GH(1+XIV;{|v*&Ve8zW3k4H
zoux&MX$Zshs|b#nd#)q?QKJ-iB5=uZ`mHS3p~la*L|at6NMwbP8H<j)#X+bHY%sBF
zMRjve-&cqyBgzVc&c7xheGwphe!#)CP|(QSOHpxV&p**#xfhW;if|jpi$5>EQrnKu
z^w5upud$3Mn4><NU&i8<TFqZ+64ao1ye6N-^SgM%WHtFix%KNO(P$9OR`UcLN1K7U
z#sfEqU~zsUdwm543YO_kSM&%Nm_`}jMCg|DwMjdP`1*_XIuhuEwj6!Hic*X$(`AZ*
z<ZA$%hnK*1wzqQs?TUXXp6g(-SOrBo0ekyM6|KMY*0LTC`1>GDi&xmY>7>NkY&1JO
z9^gu}v!^u^U39=NwgrOOxm=y{KKvWUiu8VrYs7`9L~OOP?1=eFVSE=<MlN%Z@giu=
zyh=pb@H$73DKFG?(PZ*v!0kK)%Drev50npf){6OBN0H=_;^jk3PezWHZ$Z4goW;v=
za2T&el$m>8A{}M%@-s2(n74k^-Z5helSeSZ3?cf83^rS$kGUyE5jmRTX(PD$2vl1c
zM)Va|;#3y(ph*qqMa7%)&?_D_KMekfF<$;493c@eKh>4!#3qaad+MgQisYN|=^KDT
zc3h8()A@LKRIG>QAJX@sq?uDq;qJ(#vTu_{?A!35dGtluzmjNwx)vJF+m~&#co^v<
z8CJ5C7^x$UpTokHk$H)E6se4#$pV#QxH`a|g8{K!!85Oj&FK9{|0W7?P^*kr$P7yV
zeqPDYJ;VL`p~v<0i=3gq(g6NNJlQF9oax#qAM-fQKwO%X4Nel@A2U`j#v_;+cnX2{
z)>sUBmpbJ`bqd0GFRPnAD|T)Ave;$5bD++Ar{<oLR_)KNX;$NSQGbuHH%YZb9LL1E
zQ;GP}AQ(W`R1Ke9$nRnK*gFN$XVhdZCJp(*1$$gfrS;h%oJsxrLR5R&d}*9apdBw#
zzoVLKo^uzxW4`nUv>f2w)ZQt$!CN;@+sCftffLnun)BmM&ME1iQ^MPP2>i?$q<eRK
zj)o8TnIrf~6F>H^xK+mK;h{5G{H`{k!Uw=E7SW_i^LMJTWtqRjoxqzH6Xswsp1;c-
zcVK+g$><s1M#tyl?N${!iQeb-8ehv|2Qp%`b8ztCk?1c8>z`rIz9)+FcArp0V?P(Y
zi-~V+(Kh}GDqZj%`KLI!g0GHPAm!;U*hNgU!><nW1bC&x1*>k93V{cFs(IdkJ9)}t
z7a~aHwx3R9nF`>@uPgp=T>&0=>`K!M;jRZ^2<f_b=s7q><I!Vn+TF0M+&_o?awpf6
zIF{mzBgDA?Qh%6Mi{W|r#32**C-E($V(eic9?d=DGS5LA!hF>xEIiTIdLD_M0bpk<
zf4QvA{#LZ}L1uXGMr-V?*$-;tFv*)K5U<h$ni{L7{eh;U6Hj5#TQ$DpFHi|Zt|a)j
z44M~iyRR&A8*Q0>1WM}3%dBgz2QO3uu9a>3Fm%{!dXekobIPp_deL(&dM@p2{2%U<
z*OWy}YGUHh2JsyPXJyN2{%0S{())C5as^d2v1T&aboID-Rj&AZ2vVC|Q$E5|GIi6Z
zMQZFzJTZAxjcw1Un_k1Tz`}c`RXy%Z3+A4QvRFshRfLEYzVLyxKXE2H`#F8VOk7i^
zU?NfEyT~;q<)V`@>#l+jSOiz=`Xdc2o{Mm|2agCS`h^G=pX`RH7ouGUf9Nm=4=k7A
zE@%;r&jc(#4d?^ZH}(*l82EN2_(CG~Yi>hWx2BSTc+9A#I!E<{{M^ZN3l}7dXVC-Z
zAJY53n8Ykrrkb4(xsw+Z7P{kj3ay=jZ&pC*{NyQLBEk1d(HJK>CS#Kb1oxC(>eHG#
z8QBUk)TqoKvG0zIQ>WldD?ZWIpu>QppLnD;gElXU79#@my7c+5bEDA7i$*O-9$(>(
z6V26XHi*50`FC()$ohV0z6g4z2e=1Owgo+{O8*M8g6Y+i-L!>PpMJ-kobfzaM6h80
zxbt8Yo9dqsGw-0`q06QJ4KSW*Vklk73ro}~&$|mg(Y662y$Ue+1FlUwXGcuh(z~!%
zY1{yo@qJC>!TuK9Gk=HF2}(=hf_>LM3G9==J_+oTz&;7=lfXU+?32Jg3G9==|6LL|
z_H5gXVA$=&uidvU@Pz}uHdoz3{EYKF_o6utx5u;4W3x@?zj0sZ4R}}i!p^yN{Q7b@
z91MFR^_}$#I@@j2*pJ>fh1M?c2b!Jp>+xI2iyNlnGt;xUo&6BK6TdfHum6;L&X3hA
zoxY|>FnrQU_?cq)tIFO`2tSIfC{5k~eiGb@U(3d?LtF79vnQRze%9LB<_&`xYoyg1
zu(o@<@LRg)`6f>p37=EXSK%9m`(ON7VFmbh`ibmz?`of?qtzS6&tylutMaz;@B9X_
zk=VER+rEq}^kKUAac%xf&d#N6old-%Aug1kD1X~o|IOwc9EnRQeo%Tk$wU5RIDSam
z;}3+|yiGor%_drMwTIg1SIH|q&aeM9aU*4MUmn48PJA_s{2W$vax)$WEkM5hMm$8o
z{jT+y%&kb3UuH5Xq<uGKGFB6Qdk^UwNH-wefHU&Dkk(_fY`Qs<*@E;Sr1dx`xfAK(
zW}G8KdI3`RZ!(!Ogs^TzdI8d1NN=F@mP}>`=v!{hWa#^Ys*#r9`AP`sWTYFAo`SUT
zHt>bC^7c&TampjrkgmTolbKPB-_%3uM!E}WBT@_Y?h&L`r1bqCl}K+uT8(r7X&(Y_
zwdhCa&za0MNE`7mt_1xFAw3*vH=Y-_BCW@Xf@i6pTQV8TXtakj8*Zf4+cKHU5$fE4
zbR*Ib&Re~Uv>N9nd&VG-!<LUDU60e42aW|DoGq?L8baEP)beyDa}!c4(v3)WJ)6mt
zj|1LwnanvzyOCNTPv!YcCWLel=^IFQQGP$<cc2_{9()mbr1akHAktk(Pl3F<UqOG7
zDhMKOLuz>qaLBv+4afuhZ$P>pX$tAxNC%K^Lpq4`C8U*aW-?zOtww4=zZ#L^ol2z-
zDSZP#qn623p?|xOhQMzMCy;g^tscr`=y!TIyqC$`0X|bWrgRPXqU%7OC<W&&RtmZ-
z1&56vUD5|7EC-y9lx+Ji@Psi{N-c9s%jTQMttshNPCN9ZW2cuNO;xFWEk4yh2TueL
z7s1cLrwZ^ssxL0WiQr?oIg^<vU}Z=z!{>0oDhM{e)DkN^yR@WsGvnbVfGUTTompBk
zZ!=-C-<sTlv>N5p83gx8ci__o7}ZsrrIvMtj?%JN(Y#Wte{5-4O{v9ES~7RsLc{%h
z{eC`VrlJ}hjNAbl57VTaQ)>Bf;hfU4D~jfnT9d_dN+(}#a+FrC8|5giijA%<Z7lj-
zL1`5rj?&4%048e8DaEj`-<fO$jdL)sPC%aci52=;8MQ-a(n2VkUX0A#5!$IWwzG0<
zX%zrD&D4TtlKB8=HDeymz`dfDT2`YKlDP)><$yyy*hR1}0J{vZgAFhX=KD1{c;$fg
z<-n={BYHxO*f<fHg7fO#Op``mzw}uA1g6v{H_FQKX+<3e=63NjV0fgyGMUzrOy`w`
z3Qr%gvlt*NXptVZ{(|dW@ztPL8j6ixH|lazviORk9~Uar<{<d2g?-tMzEXQv&=@9Z
z3@<N+2-ndV#zqysG`4hdtzp#Wjw`$tAV;ayDF@Ouj_7IlJd2>@TNtb2@8#)3cD$$$
zi$?4N2FwatRp9GxF?WjJg3-z81C7&7m_PORWXFzlW+Py3z=|MCE=MqWMUj(q(@DCS
z<)OB;J}@E+^T2*Bs?%3xTJU_Bbf@?S;32P^YSEof$Cg&sGSTQdF8WdoS?|GGRSlZO
zHr|&}>r9syCyTB?ky3?E2y%<So?nHwsGZAcUal*G>|@1+ckwBRSu<~(1NSIr_oD6{
zsJjD4C2kRQlQfo>7h#ar(OAYz^GYiVujC^Ms(k)A$FX)b)O9}yTlO#3r8!j&8Iuh!
z$34v{SSB{7P652+<xGZ#fG!%VdSr?Llk<XL&4gfpkzMTq><iFz;a;gDIfTZRmeh<>
z1?}rlemv^HT(gVl+yU5Rz~p=-yO{!P24GVN%T0D}nQm9EfsUvs+XCSi{|=UDq;Z9w
z#75PY_7xPJQ;3m;93fAAWF1fuWioxR-rfQE`KX<&J;k_%F;EU>PZtBZ62(hsZ3gZ=
zgqs~>(2W(>mR1)=$Cg?#KB$DzT`-Q>+RISqg4BO*YiVwfZ(w}|-+Cn4PZnZs81)2w
zCd)!IzE<kV{Bc~Vb;aRx!;1CxZulMat$MkADLhlx*V=J)Tny-uI?kovlUZhStQB-_
zz}c$*gbvN04WQEl|L6DAw*|xJ59CTK8(5e-E>v2w`QgXwN#yH_E-z%d`7Y}2L0v1=
z#dQk(fm#%A$z*;?6SPUEC&sF>@Mnghn$IX!3W{9c$?s@}|0VUb6=cRk{*8b=3H>Wx
zfXt$NYnsaKd}Gmb1+Y*xd#!5o#uaw6jy&<?Q*%ojhgo;%J=#j4{VnkGR^UE&d^-3r
zILVyM)9)SQXfEwRy?0UXyHpQXGd^3uY8U3q;~4AW8!)k;yJF2}c6Ed`|9Ik?FZy#0
z)5bLp%lnAV0P!5bB0Tta_}tVmn@iBb7}>z@b1vbV)QoG>@8>{?Xsntsey!m7V&YHX
z{Ox4}VeJba<x7{Y${;{C=O)y@?z&9oAlw%}3WHc$A2Y2hzPu<|sMG_t1?89FeTD5*
z{s)v}-1eF~Zp@uaK);45hP1Kue3a(Gm`IA;e=LKX?P!x^UQFMDYQP{^H<pQipiP=5
zbkUlA4Pe!f4}Hs?au$t&TVLDQ7_haC`3PiAE*(kIwa`^4japi5RGOli?06ZSG{-oa
zTjRk*7kGIe_cZn+$hNxhfgIV+9j7z`T}9n5U~|R@J0bK2N+H(ctY0HcV3lhxb$5<i
z4t0VMbDTT0!NwnYvkm<1A$$HkOs<i6Y%Gewl9L(N6~Hl$jX&z&|C3DSZ{U~J$BdGr
zWd&^fc<2}QMtDI-=q23ci^nqCP78qwa3yfcv41&-#wI&IvOclV@2DncVT5S_VUfIN
zV4Rnsy$!f8{v!x7dj?xA{ejiep3!Kkx-Fx=gi!Y}^o8lwI@Hxm`CREk`5mAQ)29@Z
zBIT1+A2fm%0$o-vCaVF|KOgk#h__RTepXhK0jq++dkL>2hjCA|FE5<O`K~Nniiumx
z%rJyTKI7q#UF85CYZEV@K|z?@T>1ABp`vR>)>GFwE9kX>-tpiKuS6-u&w*Z!&q%t`
z2b{;Tk8Pti@8xZdpwEp(-Ff<t`iIE=(^~|F{*%nB;NZAXPWDV*nM}d>hYFr4{L@%v
z!&sOn<cr7gblm!4)8ED@&yC3e`%PPll<fs3xU;}6ITXbwbCd~J6p;<?GmXdJ2TcGH
zhH1-aUQC%$Zxot8EM(1CICRX>NIMV0hD~q#`q>53XPv4jzbG(mGbx)2OwXDMHWiu%
zP0G84rgu%sWyPlZOv+2erVS?L7bd#@4K;=yvnzA`Aq5lc0Gy3av&moZs{+%hq7-e6
z`$>W6rUD{9th?iXTwv-(3k9Zs6f;&%;N{<&h4iNdrca8L-xr#Oij*%4O|fF-$s&S0
zN51@9MFb}9?Qrj?g-+98vGNS~&QcP*?MwDaV4no`NnoD@_DNu$1peQZz=o4}dnr3l
zLn1G^sJx%wb+6%RpGcK1o?kEW8zfv-?2|X&PHLg!uZF8KC*PQpH=us)t@MvRhM!26
z%$u-3Fy;@)$sZx|GCnp<<d5Lhl+hxPAizAJgfbalc$Wor;F}8k(0%;oFp($Ul&(X0
zqI8RL+B49F?I^qOOJza2q32n2!S!MnJtw0Jp#XN>iY+T$*bcDkFu{oEHjC|3fgpo`
z1VdBccC%}lAbg<6V~fqMgGH(x#cw3MTL?n(*(%z{){R|o3fc8lhVx%7I+hW6fv)Tl
z`8FY#PeoqJU6A=dCJBl8mfNAqN)F+_afSG-$Tx~KB+_n?_K9@8NH>TyCDH+r4vKV_
zNEI7z$0AazNGnBJEz)|CHi|SP(r%IViFCb4H;6PP(gBeUigcGqm1&}Vky=GsDbi|@
z){C@Jq#==Zi?mOq>qWXjq$!aOh;&e-yF{vphi?{<T18qZ(rS^`i?mUsA(3{Av`?h#
zMY=(xgPVqr?7W(qldO}Mu8amEQLD{XF{7e#sx8X0wu`1stEik&F>Q({7+!ODj_g*k
z@<C=e6tNHh3YAi&`ea@r*iiZf9B(PH>n5HkKL?=#<zOWx?2Sdh>AT~IPUR^az@8=I
z_O}AAU~5VjdoGCdsXXw1NIC+aJu3vh4Th%x(LL)!L?vtA0s>ww;LCWiavR{pPveOk
zAo={Wgj0|Q7kgfY^br9M<>0?4;N1dVO_gxH4|oAZoBMdamnX_*7`Tz-+zEq4{8*NA
zM4!N?pBE%}w}7)}2`Kvx>?y&k&*6wtp4e@3I^XAT7H7xp`+(E9ScQF%{oM}+p3xCU
zLL~kgzzN@ygMSr+AFiZ?JShf5*Utrfy?{&myiw3k3HeaWm97B+e=bL!?@Ih1aJ=_<
zqLe^xG`<@JeQDnh0G#-LQ_!K99$nJ~oc-hg^0L3@3iwl13{9!yiP9tBhYLUEbOFCn
zz}p2};?ubVYFGL_GXC=<;56>i-;i=h3%K+<#B!z3w=RvO4+j8F^pzZc>L3BH%z;l9
z@Ld1vRKQ7YijW)4bGqg;_;+-_fI}7KTtP?rbrOCd!#@aTP$Yz>Hvm2o|F-}>N<URW
zIQ+`j-$xn!{gv)t52sJLk?8MW_!dR_ZHq*;<KQ5W-d-m76vM1s1USjF@_ddM6ZrQt
zhhl;<aFqavWFQZ|84eH|ml}>J{fOfPT>80~ZtS`Q@B-y9CD(3WA@HT&>k{}G0k>Sl
z0W9{9+eL7+h>j-gKf2GZ)qoc&W<~mg0<4_N@DEm`|1I@!3&Sr}2Ip{0)_vT*B;e9N
zm;EBYw2-{MN=ccgRe%@3>&x{w91MP#vP<wOYt=D2rOJk!cAEt}CEy2(3Kt2u+$TuC
z_wNE868Li7eIVewa_E~-k;W@`T$Te~K;Cgma>&zjC4M*0v*#GN{fB_d{fd;&65Q+$
z`t?gVKAjz)Yo&mP8d+BPJx`Q#8C+xx$<2Tt4LT{IpY-gFuBQN>z;B2B{fN;iRR+Qw
zOfhx3#+Zqpy{|I?C;FByUMA&yOCJ2qdEoS_C-rww*q;i~6rG<Vc(s=U*mG3e9w*@S
ztsG9z?C3g`!TG)MZ&M!l)qo!byV`vv$Cl&%SRQ;0aO!X0B^;kUr@?Ingn6{FL2>T_
zd?dMr^1yFlboQ3#Cx9OX`=g3=bF!d67sLs^N7yAfAFl<R$!!rwWan6L`*Q{tl?}<g
z0^f>n5~PcrA3-{h2miA?@cj<p{0!#s;{+U4_PVZMaP;?NUL?o;zw+SUn+N_x9{8R-
z@EHe=tp7g1zq1#Yit;1CN8<l>2FLt5lUHJAXK;Ie9{gQ^GyS=c<6j`!nh6JHB>MC6
zz#H?xw=y{F;c8Bpo@LTCm<NAP9{5-YVI+Qz%mZHzIL)K_D|vfT57z)r{jFca;pIG0
zUSf1G-<vo9W{_RG^3Wd#!@}%gnB$2=Ig-IeWkYff;2^cvbzvU-D*&f?G$8aA)0|zK
z7(V=YmFK13z723p$-S<3^1vrTAS55Fkk1LC=IH{S67U5Az8-Mu?}i_60G3g9{a(Na
zBRo&fAL&X7_%0#OCyO|IfWi5_@$c(-;1&daSi1p^y;xK_lEIA>_<dC#_?d!^Mffvx
zCXud20Z$1#Kaqo#a|PTg?7UULFB0%dVaKFj5f|{#SsZb_u>U_3@Klh)>G?EWw=g)r
zH~zgZ4}1sUq_>S-9GlK~(KQuI`$+S`!QiEeRjfzf<>*Rd9{lqG|0VQc!&Mwn>f3Jw
zodG|GZx?j#&qL?sJn)b6z^5KIGXM1q4m-ArH@HHy`+dNz;@+5?mxs=kdEmdw1K*Sf
z{yyL&|4JeMDWdUH59j(>J)Z;UoG4uj0JpNc;eXD~1CIbs^mF~9p9}gHp+6fWxprTB
zFub;+DHKvXXX9-O+lijJco`<*4+cDVx1_nv7k16<>~#K*yeA{xk!d)Qy(d%UaXK4j
z<bO}bX~zpP{y=ANwa>YDK|{puaRk<Sn!OS4|9;kOmEgTyyg{_s$UAyc?}6N_JmLi-
zPcY;QduagBttGDi5yAh9FDNn1sPZgc>4}7WK39EbLu5wC>kqpY;GLD6*QI98X$v;3
z#z;oc=LTOJ-fgmVL|k|m3ZNNnzK#y|q6@w60+18k_Mp2RK4(2@Kp}V=o<OkK$1p1V
z9bs=pB@OY{zL!-y&Dr_QNVb0hX%^-LUgmO+^mZ3on0LlEaALEq@<m*Mj)*tV<a2g9
ztNh{~mH+Lkc*V@y2FX@51>2gHV0bPDq5^quo43vH?Er+AtHF+l#KC)O=!c_YZ9r6m
zJ~jNW(Wm^DW~4W;{{6|H?|rRrqG-@}{mnhR*++BN)$9vI{E@XT<g;2$_t0u2-=NjN
zFlhDH+1$Lmf2}>i8)si<Px29^+4FVGj5sX+2_qvB1QYHPS9#i_5nmVGPpe$z>44So
zws&}77x98*r@zSub$51R3>u)?t>It*FF#79XYd;KzT}zyNbB4-@2U=7M$~NnfIs5L
zlsV7m3Hw(0LE7)bZ^5H%)+$d6kU_eEwU?z)ANB{s7;Z^s;fXNO0aUlm^fosy@pi0U
z66AH9*_S*UB2_J7A!Q4wa~OmC<w6OnWt|$KGn}gaqM=|wzMR?-UFmTN>4+7a=64>k
zQc>QaZwf~`B2j#oaFf#P<AQ;(Bki81wqU^5p?EyaLC>nT;7V_srx`D+c6hweE~P1m
zHy3>oUo+X~kuW?hkhjMh4tv*nFkIoaN=w+=?(;N9+uPToiXn%U-5(i-%7<FICOO7`
z#;TfVI1HLxc1{qRyVz0Z^f(vRdOR4<B@N&5yO$t&#<xzA-e2V}=(-vr(>t(k7&S;N
zm(#dmLBb5oETnw3^UPbg)Z<h|&(+$+==Z-(Q7nzRZcoE1<_Y1IXL_}mFPq<P_iI^;
zp5~6A@H9F1-{uRmwi$3Qs2|O6L?U>3oB55AwILtWttI#$hzqM%?%UeVcBS^rim<Px
z!sA)h)kWs2BN%|KiLCW>VuH|IAjQN;f17=zPRK!<$$SK7eSK$p`?oL6+DcLbnx~4V
z_N;}DI#&&*-IBT*ebVWS%xaGY+Pxw8P>c`BUsZc39E`vvqyiEKwDsGR1zU}p{T-no
z97^U6%9dxe6FbA0nb86t3XFXFZ1CAB%=DtUa~qsXJWCvN+<1|kb@@MWEo`%(c<?BF
z;fN<1@Shv?xuDeKbV#S1t>My-_CxEvFxbp5S?R=A4}AOG<x=-yEe)=P)aAD!FvnA_
z)U~{rH$ICk+QI3bGp5<5b*$}(_}D_zw0e3VxW?I8J<si$Q?qQDXF8tG+w8o~O!mD8
zm^(SkD!E)_8HNA9#6<=$(h3Klc^3R2d~Jer7`<uJz!G0H3=1yWfF|1Y?S~LBHo$Nb
zu;`I1zmj$+`p~f*if1KuF4zMtZtxfz5xwMG*vMxD8y-IvOK|6@^0?S07Gn~b8Hr*A
zb<IC}q3&bW%)p+3ed~aeF3~Tzofye>e*lz*35d64t7-}cn&H}D+urF7`@JjMd^6T~
z{ShZNn>HI)6c6pps8Qx!4HwJwN{D5iPb{Z$)SPYJP=~L1iNAd$mx~DvhR}nK2G>9q
z%^qfda`}T~XBv^DU*y5=rEM5;g>43i($WnzZ^(Xa0*Tr64d0F6@icXLJuBhV1wc5`
z)LIP{a?EjgY!%bFkiJQU*whL|J6h3|rq$%(xaKq6a(32+*vw%~z>YHql8?A&TE(=A
znY@AFdPgH?)QSqI9-ICn4VJAE-}B*bS?htjl~c`T8%F&<)hE8rFF7MSPyR!S@V&68
zeT(Fz0a0Ma{aqex$06@kfno9vhU<g=0Hy?V))9dhzY;qJ_^C9FE2Xlv`2wpVt%xt+
z+gj=uG#Lj&;S1C-ntW~kH#CnL%^C*XV0c>aB{3|xk;ii6nhk6{^cl$2c~#UKCKJ@=
z!wfJgG^{3Od`k}h#?A}7$~x~icD;fr)G%A-58xYS+Tl&-VboZiM*EC~u=$t_UEY=c
zP8*qkrZ#U!he!9UoJL-0P;9UOK^Rds=gB%5NfJjPZY}xF4=Wqls$sojT7g*Q(tz+G
z_+VGb!Di-%?KA?uH6A|nIrC2}CI9jWvjt8J0v0$PAJUm4o5c(ROmmk%qQi5|0z^=U
z-I^B*H}5NpkqOg`FNUGDK`vPk*bMnyK-_obfrWaC#SRl4ThH?CSaqqA=yk@XlaURq
zX*bdVq`3`FnPJ>2NL<6zq}lhcsR`Nf5x)+I0%2Ium_pE|8_Q~a*cZar1$lg3zNRRx
zV>L5eXx&b01k)VXV(R;Rr?V5&Cg&@ZoLJ_tEn3Ky%^9A#OBXUfgo_0hLhj%(P#6H%
z_NLa=o))jaO_#|GEO{I~*DEkA`PpfL@UsC<!Aj-|<VX}qHiOz5OG)L)|I``a+CrNH
z)TyKJ2c!5+s~(R8qF-(Pl}-2-sKBZ=pHMGfvrc?Etvorimds>_{~IPQTZ;b`e$Y(A
zx4IM<;yL;d=>ZWPBHc^)piS)|z2Ev29BIj|!Ah7rBvJ`@@JGz+u#Y)7hN+>?E9OZt
zziL>UG8zm1bf2mwh(o6UunwIb^&uF-2Tu)l5FF83Shsq<KazMnPNc|f)38Q_X2BYE
zt<CKR^D~4y@*ghmwBHCv1KjSl2RnzWtn{VPO+J0(R9ox~tnvu~;;YO?a>-b@aU}aN
z)6?LI_~8bV+sHz41})H|dLX{{HrqHzsNB}Uw*Lszg>~5pZ*U})(7iAioxPRADAXB0
zfcvpGiqs!Iz8=>S3KGb88g|MQO<O|CpW;~>Sc9*N^DOptMB5?Lni_0(DizwM>YOt5
zWlIb91H4V=M*ZPoen4J_lA<gBgId920aDGG24ghZvHRc)9Cn%1_mO)}<Ni&)I1dhG
zBOCuxs1@gyI@Y#FyepAL!aQx2ckr`)6{`Z#3QQb?Xd-Ks3LUgE>TheF>TgyULaVo<
zRjFuR8vrq$M#8+L6QLb?K*!KT#)C5W2wtin?m}%5rGmvvDsWq|Du^uIboiQ-3Rrk#
zSx~4V9Au9%D}1dYB-+{xYBGmrnwqdMHu*wu9hml1L{@fmaN0a($|=cvqDIVsFk{@?
z?njGgpPC1Ro|#rK3JBqn-zOgj|EDP)Z!0*ui!+4$8WcA&wF<aQO5}}5>710YzW#!Y
zIQJpX*~n`0oUF_{aPu+VS&{9__jF|1EzVQOdb#b>ITbpSBJ0cdc=S{p)SzP@L`v4j
zu@iPR;4iT%>&y3oWGdeS;@F&qyr*<IKJ-qGtS{fgk?E5HS@JLOWJ<?0=p7%Km+$%L
zspvoPPc$U|f~>Mez-4{;UXe`YdquK+Szq@55>bDSASmBclBs;JiR$Ahhu*%dp8$+l
z#IbgMRSr){zl@jUL&DbK##rAf>dSP!&?AX2&!@<gjw>1Kr>1jEnM%BT_5UlU{(z`2
zQ{#8q5e-RCroYaqFW<M2>E&*!mAzy=3Aqh7M*ihFXPL_LXc9kHes_xca{T4{7&0Bu
z3pfR<BqY*5Bc=8wnmi{i(`H#9@N?VWnp3~(T8=1Fcd4i-5~+U@`V?-A?e~yCammzA
zg@!A){Z~;&G>b%mJii~hgICJM&&|ILG-6SzS#JH@@sj+@^h4A&8|usV5W7YD5)UZs
zl6jfZ3rW;P*}gp2f0?MS@oM^CSx*ASqcY7sSzn%45B-i~(D(e&C62x6^H=tZhRfJ~
z_k#>w=_bX()y@89|H$@bdJrlb>nkaNyMd>JRA{(lJ(*6-sW0Cn7`%&Dl6*=MvYyPB
zqdvi8|K)oiyF`6WB8t?Slb<5mugrnV`cre@vOC5!XNs^I)<5uygQ5dRBD)YD$-lIp
nv^?p&^D6xBW}ZBcb700xwku&1a5GZ<=U?F%({d{0WR?F1mZhy&

diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_bindings.so
deleted file mode 100644
index 144e6dc6d88bfba08b0424d1a4b975ae430924ab..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 166144
zcmeFa33yaR);Hb_1cCw`6g1;T7~`0TOBx6y0+NsfZc7IPArhC^grtFJHj-`#Dq}DS
za;NQ?;JDzp!MF^L<A{z6;(!TD0^>3PL{Ucs6u51a%@J?`zTY`@@9oN^`*ZYt|IhRP
zo@d&J^!=UMPMtb+>eQ*a?Oi$U@!jn9?l%3?-FA)5ZUB5|7|#p48Bd_4+0Mk@1lu6a
z+rvyb{bBYEf@XZ%;smBPV(4)?{lRBC|24lKyH;>ve9Psyy_GtbmhVI3MLov1T&^dQ
zmVGD6EjvT>$M{a`E6Nq>-$psgMdb>A5akN{h;qhvhJZHXJ62R<BOdAB3O(KWouKD4
zzOCgp>E*27iF!HXo7$jH{$5|{&uK4DFBiW_^w0YJxvm+;x6$5Ylq0$PryqUTZBz92
zqK(6JRtl(r{;iKP%aZCK@`OnRHi)N%crg^0o(Jfd(zg%Xf3T0z@J!uP3+DCmH`=d_
z`~06sw)MxqPw|h$+YA4eBrTiKcJGr3wv{&9AL<tx!@zlvY}<u@pX1*?{F6T#z!&(}
ziGN?>-y!^?&td#q{#n}M-<<o!$w~K}tQ)Z9-Sbnc7S4P7{4Y-C*2Uh@C*g~eFW&o%
z?LhUBu@67;hWk%>TQm2MFF3X5!mO(nojL0MU+uYW!;tMq-(22r*SRmQy+PLkuDZ*g
zcYq9;$_WS2!Lww7{``Ctde-@o>-iT18>#;HFh3&IAA`)1^56W6$oXetU?b(<i$RG*
ze+~H~wR=1I7m4nOlD`Rx5-I;@j>zpk4Spi!e;3LYiO+vWsYi>_?qp1&NcGH(LcbgR
zjg)_6lyN+7VC4KmqU0ZXcI5o+$R9}#-J{51E;7?xvwo_fSCQ&@2Qx2H{_c=Nr2He#
zi%fqo%DDG~;fPev-YEQ!jKb%>DE%4?dZhaMK%SBM^=zNW`3Id7S^f*6@To=N^ScWo
zxBCp{V<i3Qi}8)b=c)L}d?rQVe_<5+(Aqn4{ZB(dBFSwCn2f~d;3)I)U=%r=7p0!#
zQRc;3@Dr)s-$lva7DW!ODDC!-Vh?w~(npfRAES)>m(c%6d^SeW^FvYWk1tBQqoeSD
zBkGUTZqF$7w?^TAZj^bt_sqz0$c&;_=R(gT@$-DY$n~s`BDZ@`PbB$QqF<5b#e^vB
z9)dh0@$*;|{(DE!t2d(9|MxGn*?ws?s=&8Kk#i9`6^Z|+Fkd3k$4AlU=c4GtrBTM~
za`2Q8fzN?a=INX$c4=Z1`P><WpN=T@YDSd&B~jMFeNooIV(=ene4mWM=Oa<{ZFiLX
z?NQo&Fv>V?2R#x$*G8F-|BNF45cDdNK71Qxo$VWCovJ=J@;rJ!O1sZT;s5g}^N9Ky
zNv}SO(%<YTcIMtFdOjZYN19(BMH!c(DC_&0DEeF;#SXj<`9l<^eV&S92VRN7&xj~`
z^#E#)q$k6o*s?8)XR{@75AS0w&#h|=E=qRi7fqVS&`Ma~%Bi2AT8%08knTEC*`
z$(vF7br9`F>hEV!<os2X_2|_odRrDn4vVAM+ma~yRuF}sSWNav`migCe^L}h51))O
zPyZ6d?p+#1-?p3)MV?XS>EI~(kP>Bl--DbZ^|v#M9`1;u=Wj)kvp0%942{yS!YJ#`
zq9}5!isGNVhE7G&!<;B`n2B+Zl)owpKmI8Cwgml(v|cnsnJ@Mza#$IK&tp;O=S1P>
z?kMB(rzrD!U=%-jR+RZTGfKbqN0~2yDC^zzQRdzE7?eo***%K>H%D2oAC98uKSY_Q
ztE2GwW)%B#985;y(-y_f*I=GT(zlW*`?%gw_K8cPjPH&p^W`2i8mZk8s6P_DFp7Ro
zi!$$Si=zK~qUhWGQRMc!D0cP1D0ZnJN<Ho<eph^yd2~^f`MwzRNb=bNeTYPVD@wfU
zj3{=#6_rG?8|}zH*mjOB<4h>3v}QG8ynboxXRCjS$+qjomu(c<W%}!akfqaa!8*(I
zzX`&tIz1KsD$!fc!T{kjMts>O==G=Q^(%UQ74oF~wgDpJ2*H-E8F4D2Tk{`<J`;WA
zN>Rc0dOh1%&(ZEZf&ek{XSU9NhtB^5ou9X0mzHCOO>Yx~&3gVq%y*Jco-QA#2Y)Vt
zFe!gVd)NHqbUs(UDDo}P^T+A(*`@dQVV!<I%OCA-5cOQ8({BYo)UWDi1!2F=X9DvR
zXWOOcAFk)Oc|5Z!swzFTzTz66$7A!jrsjG|y*1uhWwkzU&D7ki@~TSj)Z!WCUXd0#
z#ZyvO?3r0sSzKOryVsUCKd+{$+FRqB@0mKk8U;PG>gqfo)K*arkmsh@JU5n>=gqA^
z=A60`&lG>9udKqGQ&Uq_L!_D-Prlce@2e@RoQ15D@_o)>H6oA4OPR8V=FGKu^5#<m
z9(PsMZT@PMb5)`vl_lO>uWxo$sc2o#BRIKnQmU`2rewBrMBe;q#Wj_vUMa3DE%(-B
zIjKS;J=O0kE3b8qDD}=QEAe{Di*KLrDXT2=<xCn@;!JU-WHX)W%%MU=9EmPhLOjml
zCH|TkZ>0}%q{65+Z@#;#a+YU&O;rUwgqt~?jcjIv#-rpmPs!}tJTr^S${|4#u(z_*
zQ&Q|JnXTg)7^O;saw9}E(rDRRSK_Vql~q+rT^gG1t0}ImEid+YJ!ZPcSLLam4`Eg2
zOqobhPW4EM)xH|%uy9^L?(L6fbEbH5JvH8PZ*eX4U$9?;L8+;lXY&+P&MQM#rg&@p
z70?cg_=XCxc}l7<-d%-PP+40xtI}IaLKK2zxfGX|S3xdPxAYc8d%E;l0rKXLtEwtD
z6oW@4vzERxD@(7N>#dntP7GQ&8V(JeU0gZKn=J;k*vJ9(@L_nI!&#@YW_wF+8xNe5
zH?bstsOMJv({=f}5+0dSDtMYIL-SA|Vv^8?8>hNXE2Q^uW{p?wc7?a1q<TK8s+?Cr
zv%{0^W#QOreKqCYN+h5nr$;HSo$Z-1-{-B(n>e=uIf<*0G)i?&r*r6x;@Ywjs31)~
zR|%Cs_iH>*pE6&qt0X7C7=LooF~lV`UdTOX3fF;*=^hWu+*4jP3svw;rM{uAl8S2Z
zTwXQ<HKX2gFXi+WmwI)fq?>xiQ(GT2F*rXqKq<9ebGBh{(C<mGTCg7e5}%DkWP@NT
zs^*fi<@<)Fm|S`*XZdD_*+eeX^z+40Tn!UYN~3AmHMS6J)Vjzx9n((8H|J+c-#L`T
zUj=(doy@5q{h?7z^T6z|QsN1%(8$$#Fdxe*s>@v!IXQFlN^YI%$)S8@voM~bw-)OP
zm1J}9Q*|Oo*Z1*WvQom1PpYb^;B^7BNEjtbFrLZGPpv5~@y>w3vnFH}`)g@A>5@CE
zs?z7J^Xaw2h+y(zVA&FsdMhwbX|4J>8m(Kc>T<A?Q&GXgh>9lu^tmw{YoXACpE<l~
z7+r4-Sx<dx4IOSMZI$2Wsha75EuZD}%&hQ%&D$Z2TyF&y7^pfeQ`bQcr+8=OR+ai8
z3?%Y{CyS@+P)H}=2W^NtwWC{ush+7Ro~cexPR@1tX`cLacntpwb>&UWPxVlRmT^m!
zEzu1B%;+2oci_gU*UPN`=f!i-@YL&Jz`{km#T8Lnq|9h$1T8w{Ufo3hymtR*g;UsL
z>DuRXA+joAUGNwEh%Pqn3VSre#o9*pYEp59cU&s31v!)QVLQS}IWws5LyNH_6ql4>
zeXl9T>~jvE<@IHTvB(8mR902%0YBa>cGg(JI)6xs#1)UjR9Tikqd)n+A8BMbdvbY^
zha0-xWqftN5Sd4KJS0{K6s}+`+}nsfn1Y<c!h=BK=Sh>3Kf(h&ALfBK$A;Pvj;^}w
zL;i?E@iT3*7&4>op%|>Jg6vEMPIE155Dlo2nvT@GnzFfA)(dLA&d8alirlF#xr(1|
zBgua#t|f;KYfb)ep8FCbPP9>&R}h&2O3cm6kDjQw^8PQ0U8<Eh|7pr*OkSD}!)BFN
z%_uI{*Ct4{T6i~}3O^PrxI<Xnx{~~rOzi5A)XvAQt3o&=Wzdsh!=@^E6TFpTp79PO
z+dH$^U+y!B&<QY*Zem?~e`E@zmcTc!$$_p8^|&xg@y~}LN0a|MD+62f|CQRfCMd{h
zi^G5DrjD%fVy@1`LMrB+)zcaJb3GlN#m-@#adW*TFoJC8TyTwN<m7-#yJ%xyA|@GD
z43X27SAuN$DOg0EBasmng*GsTQMneKrL>&6li*Izs`Rs44o|F-9Vt&4_T^-CU3tic
zEk;gGcJ*~rX{S+8Syob2>h)yS7EB$Vh8<15*o}{ar=M9<Q><^t^$oGBHs3$PNBeVZ
zip(6DwfSPxEfb}BYEgTa9JL-jmG<meq*bWzx*4}ZC>~_&!d6Zt@mFdoZ&Vm&ST8;S
ztBTK8V`g_FCLvm5+5c4MbVS%a2Up{3u*`YTL>5?*vAe9&JIP-$!&?J|`Ei%})(9@O
zRQot`YkwBIWFKKYXnz&<MI~ePAL@U4wVyV7U?a!aWB$k<pAB<qX^nR#D=usV=8)Ld
znR_%W(f>)PfH$rqD&eDhOT%#m$b%BXJ3e^trSmI`E3joODXygr7#AFl9UJe+g+at1
zVI$x%w&~C}#ESH+PT>yohEvaXeQ~*84=CxGhRUw9p37BP>aFWi)lh7d&0V3v>Wt!2
zkJw2ICg5#X)lir!%)*GW%DLF*AYaWa#5F3-ZVWB0swhUFMlc6m_!%3DktJ0X)x|Yl
zjuhn0pWt%mlV*5wDr^1N{eUU7P1-!pUsmpedje8+8K$5f_0VTj7Wy#WUs*z-A&@d_
zW>H4NZcu#6oNQbg6}qq>O)K-w7Jbc|pH*C4i%kW!l~+}Utr$&llb<Of<^$4H-rWl>
zk=LYne6_WvC`BeFiBLk<Y_u5<w>Zf=56Cbrf-}lb5x^;kB+T!U+Qo8=gA=M3)CEGC
zl+UZJ(J_gaSri25+J%D35(@8lOGRH;+?geF{AD#>Q4o!rQb3-mv&(8F5fGn}K)Q6r
zRcnMTEx8P9DwoWLfPLO95n)7rW+cCOuE7N@;OQ8Y3It<)UJ5-LqbpVgb9|+74DYBV
zBJBdGepvNsHCS^BDiQvv6@g%~ir6eu7|BNz7w}KD#_&CcR~MJnxM=T}H<yA+*iK@@
zA%!rozM;D%K4+~D+0(pp{7`aVl?x@P_-wS#&y(`Ud7RiXBjhXW0Hq+vgxbb=5bn`^
zCXra`p;RLYzAl!58ZXg2)m7zXCG$B6E3*O;ch87?xo0?Y+%r<}&xz%k17M<HPQ{do
z<X#}QL{S%ta>C-0C7?yO(mT&1YzjqV$((k{Lu)^WX{DaS1Q%BekK%tOTI@N5XfsK)
z6bFhT(GikF@l1Im3pBbsDJ8NvJwGl@mT6fX!U`jJCjc91hO#2p>l}^+veJu%#TW#H
zN)W*lfkuj#&+-@7U=YMC)gwfLriZ4x(28eN)%?uZAG1>9<-n1r*k6YQ8v(gmUri~W
zO6Znninp3<WtL%p4clx?I^8lE5P`Aq#MG;RN_MepS^7bTuqTEYk%-`-WHzmmKWEW#
zj-)Ua&UI?wcU4wV*okAC>`DKOMa(&Z>nq0+-?h51jYp@$tI9NQ@on%zJm}pN?@WkJ
zmjc^GN(2@0OOp<>ORH8Md!2gT30XL9pa3HU7b!+YOJJ4m31O3@FY8929WkIVQ8$0%
z$?v9xh?hj&+vVr1Dg@hgIt(){m8N~=3}RfS`m3>E(z6In`&@{{ddWp#Uzmc_KW~6U
zm=VF|pI1D!Mha76fnqlG{s{nyQm2eQP7-lfV;CSw@rZQ2u$~|MVE!5rj^QqZB#rYm
zD_uxloa11tT*huwKCGnElgoQ&3Lv@M@?K~t4*YOBrd#WwIUF^jBa)w3<oZdlhqSvY
zo>4Z}NoKhOx>@V-kw;NhIkSo@CSFE(@~g#x6wJ*}VCd(R>&g`!R@9b7VvJ_AWivZ`
z-t1yJG84h@YIug8xn(szteVh}sjjZw#V%kbjzZBPKcC0t*l;+5ePzjPkA9MQWo=cM
zCQb>vkV>lkI>zy*{#5HLEg3Qd;K~tHc2)_Fdk8rRm6&wV&J36rEYp<1HX$p^GwjL~
z+XT03T$X3(mCoVj<FKLTW0*Mf%2YuuEJV5CS31*dS%rm}<6Iu+mBX&oOXz9oa+0N(
zQ%)P!rOKfxa++L;oK1eda;Vdmc-<7&1lOdi(BG@|XF6=R<G*hFw>$p!;HMb8t7ykN
zady1hM(Kh|PvSqy-yNmIf3bRccUx~RBTB?@9{PI*@=;ygY-i$c_`hy=3y%<`_R*ih
zsr1wX*MY(m%6+EZLO+Ra<xX(k-R8i325PsvUdHGL<rjUWzv2I={vOPeLGNvH7|vzX
zLKIOuJ&B%(zeE=jm7hfJ7^F}eLL%0ZLY|`a7}O&E6K(f5IW@`}T#EGQe~nQtx=7r)
zN&q46SMzZLu8-|EpwWkVb(@}^2YmmM2XUjwVY^SKy~Mb$?J*q>C@aHFtytR%9sAEK
z!wr}owwHkELnAdwPa{10_(Qn)(%-gLr`2qE0qw`x-qEpoS2EK3**?;-pVQB@X*w?E
z@@Lq-(Qz%O_qO#IB>Gg%>Ah_Ibv&Qbd)g9oyzPbCa04aAc7=|8$@k%AOLtqEj^}Xs
zZnkV-`WWx?B9r*ki?0>>`@RM};Vwa6X`#31^cD+!rGD>blZ9TrRMgX9p~vYLYj#-Z
z^(RDr%|hR#*MG=Dum3^hKW?EX^wcln=;e%lwLB>D$64sRbR2J?cj);OEOgsLqMjrR
zeV3j;#X^tM^QT$pDY|^BE%bzqg8#)9dWKGKw$Nwmc$tMhUC+P5LO1fSve29L{2dnh
zN}aA*=q)<kroZ=X$l0hr#X_&w%coiBdAfWuEc9xfo@b#a==5C!#l<tDT>~eb)0O_s
zi$b0m7J9vojc=o#o-gV0x8zT_SB(2I3%!q?e}#n}7ZmwhEcA=@{F^NFxO$O)mxX?n
zo?o-jTbe|E+qqr)J6rE>tc9MiP~=ar&`0U@Ct2vrbbhKWbYpz$EcBK7_|{wKOLcmN
z{!P-o0{?JblRt_?Kd95_D^H@g0vq{__NgA3Zs?blZnSTu8~j@7o1(P4D+;|(q8}#4
z_0M#P{;5I4cb!DPL88Yy#5iJ+vdQ!oDL<|jnx7p}=q>%i^~m`*>ElBEmFb71dOFS8
zZMJxQ-cf$t^mD2bg`ObMub1jsD)B>g8J}em9alch&kBitj!CuIR!VeSZ8krvB>I;o
z)n?ly(FaNN4vC&9(RWDnUrF?3lH6qaE-8PKlz*v|U#4qPe&Y@)6+0x+he`Dum+0dq
zx=qp(@@tGwtVEw;5b+%+(F-N|Dv3{-o*?D_Ldu^c(TgN{ibVHF^fZZnn?zqBwJXyz
zr2OSlenp~JOY}U6?vv<+65TJ+r%Uv?5`DHrzg?nNOY}P=dYweSQ=&IZ{gUbRQvQIH
zf2EXPrZ1NA>q`!oYL@8tN%bt1=(<b6smmn#aw&g{)UHflA?1Hk%D+;gKPS;wN%R*a
zdW%HYuUK>HCW-!vl)powzb4UlNc1%leV0UkQ=)4U{T~wjkVM}i(T_{?of3VqBp;bR
zTcTG<`8y=~pCo#GoUp^V=8nBl_>&;f@rsH0Ns{PzRmJ?INOU;-<|j>})0$y?G9)^+
zZG04oZumNsmM77%;W0mj5*@E}n4jqq{Y;aJ{g6cOBhjlRIyPA5r%s~d>b?1?m+1RV
zs?D}oq8q*dB{xg-{!;#>68$WRzD%Oy6&&-kLZai<5c9KAq8slhP~s|yey*7tKfWW;
z<0bkgiT(?T-XYQPii!EzA<^*)g8A7c(Xn+jKbk~OFsWEyB>G^9eq5seQli`XhxPVi
zi5@G_FOleR68%z%9xu_8C3=EHzf7VhN%SESJw>8lF45B@`jrwrL!y5r(G`iFBGL0C
zx>KSTO7x);eY!+9-|ay9Y>A#K<*%0LyCr&^L?0p1>m~X~iN08(r%Ci?iJmUemrC>l
z5`CFOzgnWNkm#c%`bvpDTB5I#=wl>$i$uRhqHmJuV<mcrM9+}uJ0yChMBgRR$4PWe
zq8sl#QSu>)o-O4+F41!&y6vp6{!ftTu@YU8=y4L=CDG$0`W}g%AknXt=t&ZNqC`)T
z=x&LgCed>xdWJ-wB+(U#ew{?mljxHrdZ9$mm*~?a`c#QNTcQ_8^lFJdO`_LH^cy95
zy+prBqA!-{H%s(piGGVjUn<e3OY~(Dy;!2Jkmxfc`bvpjBGFe#^iql5BGJ7PeUn6=
zDbYJ5`YegQL!!@?=({9(nMBtl`mGZEkVLPL=*J~`r9`)#9oGLj5<OO;*GTj@iC!zw
z<0bk$iJl<Q>m+)TM4vCwQzZHViJm6We=X57BznC>S0wsEiJm9X@0RF=61_p9PnYP8
z5`DHrZ<6TM5`B?GuaoHaNc4J%zF49!mguTPZ<gr4k?2b$dQhS-ljy&d=qn`p?<D$4
ziQX*HS4s3G61_#D|6ZbRlIZtK^bU#sfJEOR(I1rPyCnKU5?zz%4@>kz68#Sn{kTMb
zM55aUg!TVXi5@G_ACu^D68&+B9xu_Kkmv~#eVIg0lIVYw=qVEYDT$sY(Vv#+84~>&
ziLOZW6%sv9qCYFq3nluWCHi!U{uhZpTcSTN(W@o;XA-?mqW@K**Gu%35`D2me@UV@
zOZ2}<^raI0Wr@B_qQ5H9S4i|#5`Cpae_f)llIVvddW%F~Ezvhg^fx4WheZFoMBgFN
zLlS+LL|-e>HHqFL(GN-VbrSu!L|-q_ZF*e27ib$KdaOj>DAD62`sWfoUZS^2^aP3C
zF42=D`X-5<BGETX^fZb7mPF5x=zAr)BGKQL=y?+T9f@8j(chKm(<S;pCHicM-XYPe
zCHhv0UMJDFN%VS&{+>i%EYY`1^k#|vzC>Rt(La#r%Ov^^iM~Rje<;ycO7xE;`YMV3
zu|#i?=>L-Fn<V-t68*oA|CPXhCGcMf{8s}1mB9a>B=EK4qVJT3FJqNJ%={PZHl?Y>
z*X>k?(y$>`cuS{J_keul)Fq$c%Q47C`FWpyB((d~sZ-62?dZ{w&=wQZ*5XL0)x_Ny
zzi8qfjF+1@hVc>;_hh`##N?zM3C%HaEaPGm_hvlV#Ah(hG%*G6kA$2i?!$PniTg6{
zZ{mK8?Iw0GK77*XZye*@ChpI8i;2%-+-l;p8NX=a0gRWMcp&2?CO(JpLKC0Mc#etV
z85f)Q7mO#H_&mm$CO)6B)5I4r9&F+Z8TU8wMU3qx9>n<Y38ViBjCY%OFyk#I{w3p9
z6JN~uMH447UT)$`7%wsLuNW^h@uiICm^g`Xv5AuzPd4#oj5AG4fp(0)i7#h7*u+;b
z?r-8N8QV>K730G{82wLSyxYW1##>B0lyR$xhcSN9#KRdcH*qTCB_<xhc%g|$GM;1N
zG{(gyPG>yX#8)%UH1R0LP7{x2JlMoz822~vHH_^hra<<Q(Bbcm{%0`WZQ@METTDET
zajS{57{6%ZY{tt?oWppDiN`ZuXyOTs=a^VwTx?<&<H;tzmT{(uDbS4ZH?f=XU=!yu
z?r-8rjO`}Aj`88|jQ-~_-fiN^jJKG03gcE2=QDoM#8VkBH*o>uB__U}@j??%V?4*i
zg^Y_$d;{ajCccqzripK2>@@Msj0c;zh;e@t-@@2#Vh`iP-x~d&&Um+piy3b*@eIbT
zCN5$8qKQixFE_E5@e&iyWW3PCvl!1YF)ctxLd7O7V?5c!w=&K&@okKqCN5_@*u)i#
z`<u9uvE9T~j1M0-`d`g>w~6O4-eTe!#;qo<W&EOveT<iz*w1*0iRUt2XySQ{=a{&T
zaj}W#GoEbX+Zkt?_zuQS6W_^ru!$Ei?r-8>Gq#)fF2;wC8U3$kyxYVJ8E-N1-HcmJ
z+`#xn6E`wmZsI1!OH90o@j?^d!+4H~7c(w4vC4R|iGRa5)5HPBP7~kDc(93sjQgAT
zw~Xy3{vG4PM~(hBGv00DC5*S2_&&z1CjLF+7fpOW<K-rPfbkL&Kgf8Yi63G-$HYq+
z7n}HD#*<C_2gaEueuS~p#E&u_Y~sfl_c!t5jO`|Vg7M*RjQ%fUyxYXf8E-N19~rlr
z_({eun)oTk%T4?=<0U43hVeoZ|B3M&6R%)gY~p7bPd4$N8E2aKImS*C|Ap~j6F<+m
zzls0K*lyw%7$5$((f^f<cboV{##>DM6601A|BdmBCVrXmaudJ8c!`N$WxUY9@V?hP
zN!~ZsP4c&ai=OI=3oJO-f^#f*j0LA!@D&z(i3MM1!2>P0uLZ|g@Ci$QzOmr1EO@U4
ze`3MgE%+S^-e|#AdA?#vf8K(hw&2Gs_yG$JT5yvEFR)<01y@+G*Mf^IxWIyQEjY)5
z$5?Qx1z%yomss$H7Cg{``&w{}1)un1SN;9Qg1@rhy%zk51#h?DcPx0L1+TH-S1kB>
z3x3*yAG6>GEI4SvO%}Ysg8dd;VZmMtF0$YP3(mFR919*}!KoH}g#}+?!53QaKnw0`
z!7&zmV!37fE%+-7-fO|1SnzfWe#e40TJRbRe#L^Hx8SEO_%RE9z=DGo++@KEEZA?s
z6&CEZ;35kyu;5$^&avPz7MyCqS6J{R7JQ)v547OE793;2Cze^p--5rg;Jp_7i3M-B
z;CC!|qXn<A;8!g8c?*8pf*-Ts2P`;f!A%ytz=Hi2Tw%dp3of$Y0t?Qy;2aAcW5KBw
ze1!#HV!;<$@IVXhYr!!VeBueq_*?K-7QEMjKe6EL7W|F{Z?xbw7W|3@KX1WLTkvBR
z{D1`qEx5^o7g(_0f-5Z8Yr#bpTwuYu7Mx?jV=Oq;g0HaPODy<83m#~}eJwb~f=@hd
z8Gj4@%7XV=@Fy0$-Gbk-;Efi%#)4n5;O7m@0dAMNK~cA9%McJ$)Pve=Y>eINens7(
zG-!5zhIR~|IwjnNh!K7FLgZL!7@cVI#S)AKSt~)|9Ovhuy|Ff%)`kLY*@;PZ?Ou?z
z53th|fq0ZbW1`p=#9y?_k&d>smyOM+&dUIumw$oYg_qlOUcN?YYWxmFaV1{vq)I<P
zWRH0Hlz6$?<i$q3OpM4&4i#I5V#G@lvg8%MnSotOp7sb*UXR105*&CHHug#|wfgsV
z^rFSt;#S+geohI-+;gAZhG&;*a|JU}Txv#wc0V?k+G~*0n^aaSAZ~-Hl_PNkp|&gP
zM(s)JM_}Oahd>FWR^b2ZNWCdSF;U`TQDSts#2itgn<z0#FEIoqShi}rcHP%3_B6XT
z5M2?}^@_S(Yr_b+QT=a`Sy4Y#RE<Y=sCFSrP{PxQ1=05vB=&|o6XP(nj6v5Hqcn;0
zw<OLp&Gu09LG1!Ggc>^=sI|^7#3)mF^ifpt1++=a(KY8C+H)2t>Io%qSDeyx%GXB;
z_!E@mliHI<sG@^wlOV35&;vkP9fsEEnhykM6uNdbsuCiyfjJ0uNmxXa&>aZ%(+60n
z@$>~AW6`K2jMpJq8s>!{OTtjJIJ7|tONIVx3Dh@J82hyZ>d74_?Q991I#cXb6_v#>
zko)&(Xevfr(=bq{31Yvt8)~JEreOt78<5AX9@Bmx9LUlX_2$HCw_4t#8YfjF1><gY
zuhP&Gr>KpIyGV&vqQ9NLR04M<<|(7@Oib|gQv%tE)0JSgT`N5-cz>JgyE8F`awLfy
z8JuGra!||HvKpCnYz^0ufE=CcNCK}4J<(1cGUo9Hw5`j~rM{>6N%W{}3{|Fn>Q;BL
zLZxVLqXZ>PMQ7>zU&K$?RD6PpwKNol2)AHnXn)6u30cXef!fz7UP;V6j4Wcn9aL#B
zby+HnE2i59t%y1m7<d3P60_)1d{K#WjY`$^oeruUL=2Qx#aw@?T`IaY+pc{@LmC&J
zEY!D$K_k|SNtw*PIhrUcqJ4Q#B>GrF2elulO6W+2ZVP{aQfbpr15KF6`@>#fxT=-(
zPyO@V>Su2Cly(_Pgyxb3)l+=^l|W-+J!<E1`U6@()=)mOP-i|uWv)PurUa%YCb-qV
zBsNn{_5L3G9f&!DJAKU16q_AfE`p#!XOS-644v*${U}t=M~yVS-^K8SzA&a4cf&N#
zL#Zc0R76Y+q-uiDIqi;~qE9JGz?YbTnM<=cM*I8#GN8sUX+SvR6lCn&PSVeu>QaB;
z5iH2lCZbxmx=*))kC8Y71FwVu#rVxyA|~uKR6+%LEjXyH(wS)1$zAIEo+pw~Uy-P9
zCsh<kJy)-<SJ(Px>-A-TOm+R64ANw6H&1`mbdXMod*4`;d2{HoEAYQYr-#p>44G4r
z@f(~bCzUFJ9+!}%Qq=6kcqK3a>tJ?bER{QhdLOF||B5=M)8;9oa%}!SBIP60)ag}%
z<%yWt+4Se2){}DMdnZmN42whfucU$)mROsAU8mC!iEff8L(t4%G<*jrG-4$*5y3!w
zsWL?Hicz|SMyX*#5_+_LrWtQ5P=Z%Hzf_3+0?0-QjPW-UJWzy1?+;jLLK{&V=U0Mb
z7E;z=%<c4*e%v<@2J7hW@JvEGwaYPxG}*RZ|3?1FoImwi`hvQshm_U>W0lt9-4%PA
zvgM?204i9ZR}gz@m*C4NPkpW*(-&oJ{vicQ!<hS!6Rho}m1E3F7)tH+PAF<Al)Wux
zFV;qT5z3nFQ~A(5B<lPS#yI9<6f?SvVw$rBwUSI<haS8+0DaH~eTkZHhn6Ugwa2wB
zXb0(JcE+=sVL38|i_vY^kO4b*6UkQl1|_K}R`ZU~%di$Y41J2{nO8an(IJ6YLw<yx
zWbmVHpn)bXo}-y4xM-o##1Kd2qN*zwt0gY>;@L4MgX6841wyk)D1}{wqQJ)?MI~GD
zN0LJz2RA7Oo(j^?cQ~3Tkgkjx_yn+{kxmJepf9lx*lW*D>_gqtz9hb}Mm8Kj<!Gch
zH>_Bnrmc>qb4byG4fP-?V@~0(<DM<xNoo4nv6#*a#0Wh0#i>*3r`r3}Q|f&#Fqok+
zSf|f0B|HXmFb0MC7-W(fWY~_@Q*hkw>#K~4IestLY`Ou;palD#d8OU9+DWYkQ!m9c
z6gBNCXwyv?W$2H#jhbv~aWstpR}gZarWXG>N_#s>pVHbwu$w-5>Vxc|Yjwef=IIam
zus;OYJTpAlsFzGlzM_7u1heAAy_6A3V62kf=7UMdjZ*^ou}b<seQAp7hXn&cEr7)H
z>6`tfs}yyVqE3uc+Oyz|*=%jSU@$hs-c+Q|XD%m~QM-eg&JIK0l=M%0qqvo(t-iCB
zhVSgY3$zKWNN72*08^_VwynpxXna)XA7(Z?Ud@VI+=~k!v2(Yg9_wsq4tGh|;>F-Z
zbE1}+j)N`ux7oS1e)^3kmG+DjViMnq{+&nPGu6Dp_6*oM8~o@ViH`m@NB@jC{7Z23
zSJLo38~^GskeEsq`H`@&;Pg})uCn{Ot#1x}zn^4m&L`sMJu=e}^7jyMTo-+g#Uw|+
zdn-*x9gX$WeDarWH8IiE@PlJsHoA-lyMLJD)g0U6UhY6*B1mqvBz9a|k4qsDnDel*
z<w5RoJljdna+^N)ZSTCGnH|b~&aGN^wiYCOtQCohnuSTU2PRef2$NE;y7NhB-H{>C
zx>v~3v}jmk!22QRR^24Ty0Nsl1DCpv9m)+}{$6e%7Y%K8G?Khsfn;>irEber{{c;b
z+Pv$#ghvqaxm5{V7wg>WQom5tO(^Y3-{!xb-0t*u{U0N@EjN9OFJEce>^nyZWKvGf
z<BQEz2fRn=s3^{*p3%icw?WkGpOcmRdLQOT!)@_)p?;Y+^Mu1Hu-P}POO>xsm2hA%
z!=T3L9li@(?K$8dYJqk-9~Ac9t#)+o#QY%L?%Ww3Pu*VgfHr)Y*JU)>I+c=j+OPFs
zYrxj2SY_+vzY~@z2G73Q)2tHPZ^J6l>8hCEj;1GJu7u^e82xjs{a$+uBclW^ycrdt
z9i^eWT~jdoX>$xyM#g6VdjkGK2qMqa2ifk_FDAP~ixBLN4a2?q66$?0^@U`%J7<yx
zd`9iT?%V_mM|P*N4EaOH$@Z^663HR-HITHb<bYYS-R|J5I9}1P)=Pc~l-;df%Bubt
zwTJnj9)lpYO*C|vcP@1!?6PCtMEte;Qys4s*cOwe$%@Tv>(Rri@6*t~c<LVpIy^qK
z>c7*Oh(Q$VnW94ZF$uKpP-aE#PA;6O|NGsfj|a35NPwWa0~dETYn%511;>1m1Svd^
z6v<%D<nYAWZ@QDX#-#ifp2$s~cb)Ik)RURl^9q))Ed25~oC9|t_1#5wn<R_IM2(z=
zK+I0i;V8$?R2-EpuHg7MdcZ)&thvW-o9U?9gnu0~9p!J4H$AiUV5}p#Pa@@kS5p3-
zBYA3KoX?w4f70$~e5;$yhNnJ`#&mFpr@oHHvEpffqj9fDJ=@XPDV};d8l585;b<Jm
z{lO*;cH(=o6Txh_(r^ZoI>+jM%9~)=rcFYk+OD<{EB<@nj2(oKL~eMQDN3;B2QEf`
zQZyTFLW6%v+=Zg<U@5j8n5Uz7V#Iuoo|AEQ9(D9@Ox#4`Ko{wG2R-XsOlQkXJhcd6
zEhJ&JHzuwkR`5{*g|J1lu}0mYZ&&RY(5o~TX>1_o?j{x?c?-e6C$2;RY7aXeN3y=V
zX=urCB$pFgVna&_w27hbW%$+{=q`;4_;$mQsH1LS891s08xuQ_ftbNp9R5)f@_|_b
zNZ5pL`m~9xHti)=vZ;w_v{6YR8&MCvU!8zFn0kNtmDnnl(_Db~rz5y99a8gQZ;h%G
z=uaMPoR(33xHEJoiF{l~BPK0$Ot79ygcKsAw>a*(6c0+^-bA{;;N0pwiVeOid6PDq
zIf*H5WC0CNR04k&Y_mXoVS&(|Jx*)~sk^)#4D4p<UI6JrmSUF&sc8A6H%8!q1g6Ih
zzDFw}mmW2^9j&|7!>cE9jj11BLK-yYPw2nqz#P@}P5V9Ol@hq0XNqam98GjpLs__i
z=9QTHnYUzmr1`IPqWaD@bN*}3?E!w(#7gjG?kmMQcA<&T*Vt{s&LMsl+JZ-I2nL{Y
zk(pon9{ml~!uElL`vBVcp8=PIaVHib-WzL6kdv$-OibuS_#Vdd8~QUMP@%Kos0)j^
zG4zQ<e+|#({^v*h?`OfG)(AG>$2wlE*D!e-TYPZaE}YN+Sz^0Li^4F@z>5O;<NBg7
z1x7PNUli&gN@&=VEvSIhaX0K9oXX*jR~tl~{`20X{~M>4#K9!^4YpHxNo|~HrveEm
z(79N9co!K3ZHnG~a1~O_uC<5AnUfZYQb5U~QcC7DvY0OQ8}9hRFXLgN+<{9TUT86z
zv7lpOKMOi-S2MV)rt!r5Gpy%bcb{%OPq&=fbnx#A6v54mSwnTEuXi-jRTa4AMZk_m
zx`#t+^=P>Gm~wQD)296a18(jh`a*VCc1P0=LW5<~zJwH@0Oq(j8ZSc@A;(=D?jbok
z)_#2|4|b<ry8!Bme$WPDA&Sxrr*4{VoulbGl!wN%P{ejFvxt?R_2p=Txu3XjMLn(S
z6QP@KXgLL5$nudzBW+(td<SA~L(js7m)9ZP@Y0+j3q}Qn;oU`)KujzN2PTayGksum
zA+wA)S73n_4gkSOx$WpnFKrV~-55LA!TJRQOzjy)6}A9Y*N7U4)(m^f+Z3!ily@Z>
zqVE=bYl~>4U=`t79dietVuMDkl3}CZ4eaqDJQ9!iio-wC9`7Q61P$Nxfjy?IHtm<-
za`jbsg7tlTFk3V2094r>D2a#n>}Z^f?vk1(?<9k@kF1@#PJ0VkXp~yG<fDo8^bO^x
z!?&@@A#{pn>WRV@dLGq;GSnxw&<F6q7J4sbfrVa$Z(*VLlZ9@h$s34y^VgDv-bGni
zcrWvq8bc)+Y3djo>3tt_`FVF?;1JbIG7K=uOM8~po&tDo3g8_J58=H%nyH3B%y=pY
zwJ_SyHi99yx_k9G+(PQ2K}ZjdaX@U^jnLq*|D_Ef`Oqq~7g5Ym3RVTyQ?jpTLo-Qq
zqLDfc>Jpfj(D1HZd;cTkeFo;YIT!wdFh~crJWL1}r0X*suVVZ#4H{eg7l^6!0Cuvx
zl_8Uq31L$*jM70Zfc8W8qHSTX$=>e*(=?EWFp#0A$;gv@F!`SNmoQDC61WVKje1mv
zWXboLN$W!=q~})YxhHx_#VNm0mds!>=#s6YeOcx-nhe52)FXWn!yJl=;3nx+2AIc0
zfImfhpB_p>FFQv)f<^Jdr|$-0peP<hI69ZFcc*tc8n;0jtWGdqNu9sseGIm!>tmIM
zHiQO!iD@qEyRRTm1n$UGn9=}thM0Lu`uqO(C@u&G<%l*G7iFmwGW)kF0pbCfP)UUi
z9%m{V(MIDgw419zN?fjb$UOu(T!^MQ8lMK|3TACmjuV50&>!)ET4Jvaj6P05O3{IO
zM`J%WG;Z>sjyt!y1Fj6$vBTK4fw>+XE_Ew;^~p*AR%Z_OmB$>7bc0k0VBfr%+><%j
zFk+jEGOqNkzMV?og*;T~3Qn*qD)zcfUpbm8iJ-PO%N*vWA9vhy8!07+G#!oAyuf7h
zE~*FbqTp~ajY7+qNa*^8!kdZ^Hl%F_S>ezfJJ^2!v+?dg89IP{O!`LOSly3t1ujo?
zK|ZlqXt2V3<7gx!fCI12empNQ%h01X-ymgFy#I)+VPmX2{qjWrCRandJ#-y-)9a!0
zn551#nib?w(hvCNVZ0#$gicblLa@MXKuVvw(tU|>jz&L8G&ml4(d6YU-s_Jm>N!gA
z`r|Gh#PsbD9X4Rev;|U<KhOrF8bvL#8B92u#-jaD4GD-^7xQ=&SAUc{aS^E#Wp_cP
zm~ltbnN;Q){BfnX`1`sV4&dBwbfTMdj#*aAVRV8O@k%fcqf>xB6vQ6An-(!BvO9o;
zKtY@<c=L7}Y2AgLMPeSi)SlcB8sxg5=Drl(#iKJxZge_>0tr(Q{TAALR9lE^;oKKe
zPvrM^P<o2sVk(WL1`Z%avQ~6O*c}+182XqGdW95NP&6Knrr|Ifp;`FDZICsYGFY#k
z(+5!c1S1_1r7Tx<$#NJqhY~3Z%+!0^0oKAy(S(pfhL;HqGt`R@^Hxw^4-EtCK`a=x
zh32EAmhgdLwKj$h!YLFMY-6YoR(j)kH|SyOEcUN3@O|gQfUEB-f%-V;1_!yr@n0n{
z-k6$-Bl{4t@2s=iPA!W^R<1WNAwiCZOYS%Ck2#L0Vus%pe<}}AMZDW)`qNZ?Z31;+
z9D1eirx_15v4}(VqOximTCc|XsRYufqR^LEDNgH8Fy?t5^`}?pllSFh*c;|I7!yP~
zg*wSSc^!E$1Z(5KMN#OD_v8}Hzj+2jjA&;-%4C1~DuIFT&qXuod9l6(u+<8-_~hsy
z#K>zJ7(_gR&KzJiZ5)Z>lMOQ(IJ3FH*-}6DGH~L!zeTZcBvsgqj^lfEuQCTt;@h+v
zz#j#R>%pIhhfmdmhp>OYj)l824HFt>(AOV_dZ^9US9=;O8J5Z(?qCTEoPy4DrlR);
z$0HMKgaq>4fmx7iFA5M|ggwJ;=u-VCoA3HevdqD}Q<*B}1_Cg?0XKmw@E=;Wjm(hH
z4_5$T5_Lbuh6*{UP#?WeABx(%WoC%)d=qk51+_hODAUo3$~&K(*(~8co$pHgJvS#Z
ze{aE(QB${^#@_`bybHh^@wqmhMib0-Ze@FHvcY_&pT_5UGedmmYgRs&hc&DVzis;b
zX~Nk7Em`_O1p??$6y4tl<m9;nPt(|u(!_u@yxxgWwe}L}QD8x=_N;h_)0Xi=5!Q{0
z1hQv0klw(gH>^+48aeF_O}hgRWC{1tBCqH4pVfJVu`VDpt>qUY3c>;T0~Cq&Yg+9V
zG$#`MpTXwMCU!1R0(J3Pl3)OoQ@%5JGHS0-?!bb0BfgqMS0I*wxRye@B<SA*kk0hO
zEr8HD7*xH#(0M%w$LAZgmvgB{A(=I#&%C4Kz&GrAPX!wX=>G!SWnt}q3**IMxPzqo
zMHo)3<;QBF<~_EfX|xT^QMz|`2hws$?sRwG6}Spfbyonf;@dGN=tQDM_yreYR4^kP
zjZYz)_PY;hRydmIeU98fFE?HD5Vg7BOQa_`8uw8Pa3b6kB6bIE!{S0=z-q9WS$_?&
z>9V)c?kN#$@(cwaMqdUdPX(*}v2R@I@A+^h0T=4>4yswrwXr_p4vuzH2aSV>OY7lE
zefL(6wlUtk0Qq)fLj?xzKv(KWM(6_>2pELXiQ1odbZ8aDTHU6laoetA$Iz~Sg)7k0
zjY?eD_oJu_)hOVfy`+8^CO4uEbfS)QJT`MI9UY!#bU0FbBlY%7lhnq9T=<meEy*i`
znjYm+vHv>}>Vsws``}hjin#jwzR^lM^$?tC)GFoN14fK_0@sDc!j5$UuI*;uCPf|o
zLFYO{uPN@nc4o7P4e+TBpQ5PSFxQgYfq8L(3Gu2sv4?xqyaY$%6&QCVSZ9Zem#ZGh
zd;`$tTEoplAr$qy9QAm^ew^}t*E2WR=Y2T0O7amUIJ^U4^W-+fq<@_y_{t$?fZjjK
zgm>UpKgw2Hm2G=S@rc-P5VRfYx5~C%u8Yrd1$z6gQ+jW5sW9~i?Edwus9E(xq++M2
zoQaV=wpF>fz*c`}BEr{>>~#v<jm(BqV||60@b7Zq<HuuFOOhg4%CQb*uu-G$uf$H%
z9LU$z@C9`9i_BGH(FEAhi6IcRH;EP^M;7vei$JCjo=+PNPvERs8ZhL$x&`_1xywCd
z<h8*bhv@vY@dpxPQ_E`VhvLZTQ0yI!yT2x)nH~8Q$Aoq8TOm!Yr#m=$BQ-}iNm`(n
zxq;z{bk2j7O>-yXoacZq9>PG-6q_@i%i8PjkgdTYlc|2>3i<|OOT5+5NDjJN{Q$~>
z)27#nfzESb4};g*bJe$k<GQ)FeTH)-41h0N-I5!KcdMI7&Vhce!RwXYtt7XLd*(KM
z47OVpwP(Y3WBu!0<Xl4zLLZ!4d3>l2|KH*HB8#K(OX_LhoJ{P6<K4YKavj^|zW5>@
zFP9^G8ysdAIE^DGIXHZaJNbQ56>lb2)HwkBHLSNc93Sf&-_XK3;B%?1ZgpKQdK_>-
zanui8+xC$5yLxY7WjNP8cnh@M<;dBhsOL6(Ki0p7TzconsKvipX=umfXJvYgc7Sm(
zeh5Y!^&!^c6Pxwqqgw3SBtY73*$?KbmnY`ZUeSfDaUjN%2C$QuNrf$=->d|GjTS<^
zG4c?B5DfPwbd)^P;FuNtv8R9LU1;2!$5}y6IYNVwr0b0_ANhPIEgMbXmI+P(U#8&;
zc;eJ;fobtB^&NOP*xq8YC_{R=)D9)sXB~EY2|!BguI@_mCTx-%_xu*6CItsv1`Tip
z$BlJ4T<dTVqo*r4d_4}}@Hmz>t2p>O3|+h0WnY)suxG3*8S3=G{8(4;_6&D$(s7!R
zA7-v4m2tg^KlG`0Cu0s`OfhWhl;n45*m7w9&GY+7>|DkCJ`$eaMCAE>NT1(lV}3gt
zZvt7G<8PAS<J~xP3+9bg94;_PCS~+{%EL<Q7gxK3!{ag=_JPOu!C*sMhO6OVY>s*;
zH+{dO5&zp48?F;H<)KQq`kgy@4-_rdhQ$hmwIpnKRxDDj;4NBhk7kpJ^bLGH16@+u
zdB$cZ!ysv{-n{}jSW<K2EUYNO3$_SWXwQa-zyR<=GTTbCY%#?vS#@BE;<D8O(vLLB
z{4^`847*^AQ8Bo7rDH%hxg1#=6??1W?i)dpGN!k}B5YC`T451FJ+W}VLqkB59gb$Z
z_AHbMw<>Kp&Z98T@JyBDL+<o_x?O-8`VyfrZQAn9ln)?N-7zsZdL&26J_5tet+{Gz
zuDVBPtGcE0?67~<&}w%zgzUbNxq+diZGm&iHV<x7daq-X06+IAOad%#uZC~O`a4_=
z>tGTv%QjOn|7KVb%vBmrg@$vJ()ulI+VFoWu=}mwQim|6ePS~kc0-1nX-r!&rUx*l
z@3_)WIvU?bc!11TIlk2`N^oARK2Eg6z&<4(;W1VMIA%LRp@iOOuotGDKHj8E(9>A0
z1ig2|3c(2gjZW2@{>GsPx_6Ni7bu7)l@1PA=T2USI|IM2;1jvd^YnQMJCxaQVyw@V
z*{~6o=nK2govnrtPI0N<=fd;awu?;BXRbh>L|5+?h{knsPxs&tg(+&m?wU-|YFEPs
zm?Emk{~9eIInJ-BF8|B4=T6AR-aQF3$#)hUjs*0<g@MGt)wR>WVYJ=q0$Y0y8y`2;
zbrqRFMo#dLUAhX5?W%4wo4c%cj>gBaujMtb2oJ7?PAqTVh1Wbmq&4rjSo5YKmsq#a
zJH)!e!v9VZK6M)BBfVQ)DtfQ}JJgI~#yUuRLn$&44S?g86BpV?b(%XigFiZ*{x_YV
z=@ay2^x2r82pplz>64VCJh)dVUohE7lfssnzSGf2zbv7&ucH~yrLvO`D=>H4H1Zwx
zD1$qolddy+VZm@+e6G+RI1euDZ%3je$6?rm`A*o6rj>{+5O1b3odfQ=DpTX%kf$q)
zAvDM=_6$9UL?Oq_^ev9Y0HA)gNat@eG>Q1zqHNo*493uczn;wBITC+8L-d;<r?=wj
z-7_?d>SX6e=znM;dhcv;ZVmmKq!Ao5@k}f{S8T%bDr4a>Oh{-IPM`#Tw4c{c*Bic^
zEuE?3L=+1QwhcGa!O+bJU<Rg(I6iJlz})VqBcl}U<~B4ytCoJ*z;VwRNTv%1j;2F2
zK8{xhBrc|isQMA*OKa=V4YrPk3ZJpc5Ri~N-mwU=Z`<l$pa?ePt*C@HNhfJ#Mmq^u
z9bo%#^g%&O66xoZ5fdE*1u2NU;~};!gD$4iGNmGcPSD#kc>3Uiu6$SeoDHZq5cAqh
zGy@BXylL7VHka@bl=OolFIvUT>o3+*UWh|FLt03!H*A>A{m{2-LZ0yU{PWP)kOSSL
z{U^LMnn<__utgVD3J2ODGLlCIoKOTkw5MpG&<Zvih^g_>DeL9ejw;-oO2(`YosSwZ
zuFbtMt_7$|tEOHA$1Ldu!c;1cd-^LmK%;K#>`e~{pm#zrB#5p87)YIhfG;GTkZ$et
zH<(NqW9`I7dGew^tS@xmmoK@|4=**G?Cn^D`@Imz$r8sRyeN(1wUhg?&Nvo5CxGKp
z0{0VmbK(t)(%#=#F<eoPu#3@oIrmdp`0dzn4&?fnM7!|YyB=H$(2vW1durD=F^)xW
zq##brWq5x3)k6(!J>VgniM&tXBE!73e4d3Zq`IH(oM{(>a}II10>cx73VJ!_P$}N)
zShfC}q?>=Y?wcfiyr7n?Uvxvgu4B7gsJda#53bfwH@E#CSa|Ovt!>@aO~D$W?r^Jb
zWvZLA9j{KEW`|l&8^3r!zwDsvR70rSI6Nw2sGswmSH@81=Z$VykD&jq#R%$ROON0Y
z_cK@>V`21mj3V)<G<;t&F9U44z^3EXY!BMFz_WNfc1Ths!RnrT@!QxR2CI98y0NP1
z_K<W4-Q`WB5}XjnLZ)zjFD*`QDKrBG)?zWVg{tWZ-iIx;20rOp@NEnI4Nnk`sE@{*
zqh$y_D#5HICB2;@cX*(=@h+SjBTyo)?2QG42W#BmgYu0Eqi#2?*M^|TP2u>Zc0O?D
zhh}@)nS@uHnDxzwfw<HY2!LS2n$pEkCDAR`QVN5UA{*{H*$u30duRs@8bv4hV3Buo
zG%Z0)AfmPyO(GZH-#e(C$Ez=vXIhzm!IMLqiyZWnK*1n!&y^sQpCE;Fup6U28QP53
zmO}No;R7_qF5}w42BW|8gkzJ#o7J9n?RDKxV~Z89U53(R1Ry&2ZrqGEQ@24$rVa9^
zK&?^`pfeAz#dv9N!4V2oVpxTKYi|&ygAXkejEMMU0i9)qLrDrQU3L*D-IWfy6Z=3y
z;+F4GXbxDy&EJhpTm1+WwreFU65J_OL*$3H(48<3I!7W%z-61uY_FdIxgesdtg4<*
z|6^096L(n1sevFM`c+rsXP#!_DTziRo-%iy*$fBDpT#zrhvQQ%fOcTUutuEnT`TUX
zl6_C_n0Ker@a-x8yL6uou_9dIOt8@rY<y=6*<G5kZD2}C{|CJ3E!y9BSSfiq?OS+U
z-CZp`?HO>mCEi}k*F{d3J$mrjmH$@|xuRQpvO>YsVLfP_T1Vm<GY-$%i8T=P4%A~<
zU(*a@k-~bWo7HDnc+gf02fs#faNyt8`JwJlQ9VZ|q;&K36!wGOs|vja-lTL)Fh}DB
zu#S8T%GW-yUCbADHB$UErx7Q3^dm1K2I@z|bEm&wXEMo+L!3J!#6SNI)<oiI@lW&$
z!Okx6Ps)8-{1e1*{1brUpSMu_^LkTXQOKr!(8{G_wRgosoVJl4C}7LXIvQ_B2{Dc$
zzWWqsKz#QRJh;=p_U{V~LY8J@FSedcPPT~u>-H=!Y?xfIT1prW1;q5{x7TVjD0I8`
z4dO^xw#$`3ZTwB3hW-hvJ`P$wo;%x{wTbk-(zIVNb*Ar2{IWg6`nko8Lm@1qN?;1K
zw;!B;1XgfP*8?l@VG)&e2c{uR;dG~W%snSJI2pR~zN3-u)8qySbj+Qo?Svw@f(WR%
z)B%a)8j+iXo4;k`nUiPa(p@Apg{=^IMY=;oo)Q)oI@4geh|3cH)E>oHMyS#MS2!X{
zT&ANxT&~W)hWo_+3CJ+8oex2OGFLmmVJMr22lCc&u7SD#;%*Z4UOk2=<aX^GD1#e@
z1ZIg%$w6&EWIlWdR9ku`b&5Nw_l!;Bde}SoPda~FbpHjOU+`wTjn#lJWgymkmx9%;
z&{*9Hd38JVIt_Zv!Xixa;2m&N=>kXhn?kF2`i0j=<Qaqd_&iLmFUP14542h?6ZJW{
zz6E-HIW}!JnHn6q%!cHOLboHAEMM}^UFvoY*(&M>6t&g%t)>#|WIs9we(}0_Mfx&B
z^CiMC;)wSriI2`_!tz7ZY0O!tuwZ@=)_WdLzMrcPXD+;5h>7m!Z_};;pP5(;NZ0ne
zVCIE09;l`5Laqv@$KIwTldUj!1g;^-g<#gaWRA}Bx$iM!tqBYxr&I&H3CgRle(S;3
z9Vfw72G}Bguu8db^o)3duOG$v5VCEvYrjMjydEQ$MZQzvj1v37)K^dsIT&#ErJL3Y
z47_-EXbcL6d8OS8e1PM{jz!xrcB^OL4=P!TN~om_VjIgL?(vG<3zl@Td*NG``lbcK
za?>fgzI%cA63|H^CP}KTolT1qyCA`svD4@U<sOOVP6aJMzcOB<ETa?YjtI;^yV%9V
zaatm$Wy?q7S7A9T6k-hfV?5q`JtkC!ISbJyoPcQeLMb&md?fkOu40h>AXqZiQNhQm
zSEKXsDAH&QpF)L(J*0`lM}7(do_ImXHz+(lw1NlD=d)#PI@R0svA+`m92m_7Hs?{q
zdeP<p4w-A!+{XnS2#)Pg+H*9F20Xr;UHlCe&n8b?Y2DZTrlQaQ)Wz!uNaE$1@O&1h
zBl+M4tz4zxB=Hv0o}*c5-IxO?ts6Cf#=7C>b>sP0NE8d$BOVUE5U|-en--M&DFJsV
zUZ4vt<aCT@O{Jq(CHVjyyw)HG;yIKb`R?g_pY62P`7+j*#U?-6$As@OF|Bvfk|X@R
zX)g5x1yhkf4)Z#F!<Sf3XCXi~5?X-61%79bjxD;;PRls9puqDtba>(1jN=Q;6vut7
zZXEVdM2+>Bj#TIr(fP3x8<}3Yc;zN|=ee<Td9+<odz?El*yB<HxO`vU>PUv#Hf_D@
zQmp-VjYGu8e;MvHV)0Uoa9h2AcGdE!UNf9(Z3ntY+0TXm;XWR7BW6F9y1j8G(#f-?
z(~=p`DAea$*|~%Km^Y4#?%qWjgw;{2LAny4mf98j`p^_4Q$A4n<51E!!BK2Qz?ZeD
zvmLtc{2sa=f~uJEWqZf>Ys~{$VeIH-BzWsMJDiRKfz0~RiQ=?#18MsRj3saMF|XUS
zO+fmgYyju5qdWK@mVWe&PA_xuIuP9?p|eLaFXHggH-|1J^w#>oSmNNhj*i>u1RD32
zqMUPLd!ns{lz0chwzN%~#4Ynl=llNATt}e5mJ4&k2fukR%Nz8wU;poDD4-aI2Zd|l
zkQcb?+=p-Y_53~$$wT;4W65E1!Z%7F{eJl+wA@bnNn5*klMQdQ@gA%_OYFefvl1w^
ztAix`qyNYCH3@yU>g!W4TJ`lMsMY^XU)Pbf`-%Fx@Hq4}9fRIQU&A`y<+O=TeKGkj
zb>osDLgDnnZu%xiBi=xv$<IfIP;KnFG5I$s_DwwR_c>nO%u}C!6A;-PO`qW5bz0Y*
zAJe_WKsT&RBi-qHaUGH-Cf?15ZGp-N`${VypCaPeK)v?3lCJr`bOqfbaog=1-?^@2
z><w{NfO1E)B~Vrz8@TO<XS!fY$b@Vr6GFR&co&<3{bWv1yOy((`dmd(<sx$Ta>$U#
z=fGzBUF`zhZ-X166;N9gguyNshbLqqaDovVx&pQ#EIzA=_&ZEQzhe-28%7)=hrdhn
zAza}9DR~bjWAPu#d;4GiXY$_rx1THT$G?HRug7c(%Uej<9juBoR&Kn1yYS1!(8(@t
zT%EI}Z2|(sfh+M^lhQsxIHS1D+IS`DJT23Kdsd(n{T?Uod{X?H-WY<Wa<nLx77ALB
z?qOFF*Bo%S;d{i<NNYG!KAkR0=25sjPqDAluH#S+UNj<ilrmvMu*1=q#zowT(@CY*
znR@L?KjCQn4xF$jK>nSJu8xT<9BuLW<pP{dqX8d&76k5eAvok$gy9(GnIT?q0dH{V
z^CV(iq&EW?j;2~_dEo|<OjpZp+6(G>tfT227D(zf*J0~J_k-aB!oNn1G~-RVh+plv
z0b_|(nPv^H>|nXl9?C>BaD4M1EOLbJnLrq42#Shx|J1^N10r0M0VSer@K6VqGi@j3
z{*cIGO}~iLBp7p@o8t5cm*B{ToZAG`xlQ1%PWTITm7?58%rFx(IPo$)F^>|bn~Bpo
zu`d$oI#LsD!GZ%%zE5ANmrfL~FT4OwjB_n8*8CiWqQtO&K>`A;5BK3_R7@j}{<?w#
zda?p&lc`P&K$p??T|+Ud@W^k0MO#NB-_-LY)!rkKH>*%N(jmxfIwTc}Lz>WcdcFC4
zdaYZ3A79<6mBIG0a`C2zR7Skt4Sqr&;FO5%67iaV@e};}?jW3^%U`r(@{VgIXj{8a
z#Huh7+NB&^L48m$N7KWoGxSHUIK!shhDt+U0}1{RxK5l-FO);si66d3qqRYDY#H%`
zE%=#?F-H$Womy!xMWY&NNPT}6i7AnWbef(B#!ny8t0__aghurRiUq+noCY-#JHiw4
zjxdgM9@OH|1;O3M&K<@>yUwrpDVN}wT4K3hc!$~fxRId-R>7P<L=;=~2vKSUuHeM)
z!3)?%uQ_f5H;!C6yf;0ZMxPW0YEJ2R!sKTc5J6IN7W>b`(w2hh%lA>S=pnBj;CNdr
zLYKmuC-`a1kN!Rw1<&av+QqE9xWp}4$Hv4uP$`f(A&!kCW}0Ck>5?q?NKk@}i7hY$
z1gaIhF0_&W-UV8PZ~EZ9krw)fCy0O~%&T^S+(2ZXP1(upu+4m=7gYiWJ7xi-N`Z`E
zOd~(FX=h?EDDzUvj67sxALx(g;QfhYo0Z_piOc94?b0`W=$X>!qx~6EntzSr1eFTt
zx$)+sDBYMy?=sMfT+{K57ToIJnMCiP*^g+Cyh%EPIj~bp;VrjFb*G;aFK%)vvaehJ
z1V&K}F{<UD6b)D!tVav%Bf0Nd2UaQSN;7hhb)>EwK|mdiY8x;kNH9r6UEIGfQR!{%
z-j3g<oCVNu9A{~%^u(u0jzwwwEKbr!(X+aaZ{;a#jT5!b%V<BdP(MeTOuaYG(QXt#
zoTH5+pr51dQ+G71?|P1=?xb-}AmN~Qbe(-Qek2Da?i4TVW>A-ithGD~Nq6FQ1m1`$
zAxRVU*Rz#C-$gDM>^F^*Hr{8eU+Cv-IJ~0^OnlC!;hYVFPv>k%*3a2A+^fWu6`ZrV
zC(}9G<nfF9^vn)sp)ERR>uH>`1#@r)a-#QKYzzAiM1wFrWp<pGIbQ8iwiwrD9!*T4
zPQVh?VN`}6q&|E~wk<_#<FON}0JERWX;^%)U&T>`Hf;rs=KYC<T>fRDF))}_WH1A|
zx<E)mgJ`ZI*n;<FbP8$5%TJ4}%mT7zP*tcL@gF@aQz)ydXC>*;%V!y)Dlq~`FQfEe
z%oM!jhATfTZLSK|FJ=;mRBfT7=?DfXbXgCaw2~icP8XVHKQd$LXdLPHTdq2joR1=G
zJmEhFZ?h}unDvgv51^ecyhWm<Z+G0Y7@)03{27$TrB=XY!Z}u3c08@nmbn(V8IrLd
zQPf9Sf?5Uz`XY#x?5`<SoKts|<gHJ^U^ZNyh!yV?CJ(OB*y;5YTs9tq?WGIXR%wW_
z^nOjpMum76>X6n(1E?Dj+R~EOLYH=l*%a+~0#$=!>)+<~<0nkDDfDC<TTc~0Ot$L*
zupL>0(Xn-2YR<2>X(Ai*>mLG$`SmXXtBhI6$JKjOtzl!AY4#dU!JrnqP$^Bb4t$%_
zY&wmoLi`h1yWtsP{AC6WU*USz$&z1tp!0*r+ikdCX3R1^s>Vt7u|0?q)AUkW4|Y@R
z--+oZ9b@B%BGiMJUD;xGWiRe=O>n`fhOfFU9&n95x1J4VVQzgd=GN;uJhyJp=hh9F
zTM*;)%vBeHJH#=9eZTC)!%=NIt*4lFHd15`c|OYTh8j`N+n`>Gdaq8SYa929yoi0$
z^KlY}bUs1hPk7q=!A+4yk)h#uH^(9`8BE9xrq;Haq70bfjo?Cin|OeSl_40omu-^4
zzz;ZNq=tB=@x76Ig`pz`pgeWdF>DC$O@ullC)Qwk!AW9C>+?sd|8&zZmD|-#L-;lS
z=`0FN;-{8VIblyDP4Y%PojOF0JkJ3kNR*zbr;<KUX+@;(;*KMIyq?b12I;Ym#`k`p
z28H3FIm7PmMaU;6Pt$s&yXg43sT4otMqbjAW$@&t0nl$uFdr~^Z)1>2&?~7axPljO
zj8zDp6qs2wOq$8>tEdVVBQ|r@l_*D7sc;LX?;#BXGzwEQv<1Pr%vA#^jekagQa6W?
zdLdF>4GZWJ=bbTVF+fWyUX2!R08!v92<-p2J8(O0B;h4a7rn$8Onn|Tpd~E{m2gPy
zZD@w>zf%Y6VdwBocOwU40l<21YL{yWBFc1WY0=lvk=58Hkw&T5Yf^>th^xT(Y7#^q
zYX+-{a5_i|>{`%qGZu8M>@Xw`Vg}{JycH$L^kYWz2qAGgSHnw3o1^hpFx%$xp}s6~
zA-AYPXd62v_v05bA*Ow_enFee@fP~P*=b1`@dPnk!G_!&^cseI5Sk%Fj+YG__uP$6
zw)IF#B14|5dU1RFUv1e4{-~d1wthM2*e4F96RWFZ(YJ8c8cx7OeusyJC&a~`xv{vO
z`NhH$!bIKz#Lh_P;D+yza~|Zr@H>(74t^H-?!xnFmeJ~ftf4{2%!7zCgTpV-t)f_~
zwOlB0j^owO;dZ#YKgFU2Z*#}P9R3--pV_Q_fwv>dwhn6QoD6^ClG~1yCLdA{sN2<#
z>@9eo0loqLjZV*8JbFMdpFE1p#XSZrJW2e`TMv(d912|e=zBA2vws!L;S!mPv$6cV
z+9Mg6;h~4eLmyr3dN$F~Xa6&gVZ5x$1~Yaa1ZSA9qj-0Ql<f@E1!aRR!Xl>|B%xuN
z1HX1b(t)uu)JoSV@fImvVf`s;^n~oBklC=P(ZNTFmDBuzO=!mS2k_fQUHk!d1zZhR
zV!5^<iRRr>G()Ba9=7i)I^P-T!k+e47$Ce~u?m@NPW+%4Z}|^ukI-@tl?+X%J!LbM
zKmG{1jdP#PO2gm9Uc+~zITf)jVA~W-?Y$Q><56h7F+zrvNUATRTGPic=AU7=rh^Bz
zk0+@m%qy-D0;v`PX<>7Kd9{<%#a_wWkU5%GlhH(2h5M^X0dwO2n3jf8#_yuY{oP57
zJ@=&E-`8l5-;Dj-y}EH|b0pu(79f;?B;Af7T{j+35Bz+Quo0mTzSsBp!NQo(8<4CK
z?+h093_aHc_X<5CVR2kPl-g#$QQ(hJUFIq}$q(D-v7bY{3wZgHjB?X$-$7)?xHuZu
zqNcTUmxuZ`=nv@l6i~DrngCdqXH)FB@jNOMfIg5a^IOAMhGK}Q{!=@H3LXCz0K>!v
zj@<I~+PHIRkPHL-5zKSP9&q$M56S5Z@_Ob}&mMKa`=T+^kRlgmJ(k~gN8@g+P{#7f
zs`1&wLW%L}G8~f7TS(`94eV5k_RKl#N5Z2lB@@dRAt%z4;Za^CfG~q+1DGD=6?$A}
zp}ueX71>C`uS^s`_?5qmWIg`}ex?0U>S9;Fau!VA=27%38AiPR|9k)P2DArnRQ4|o
z2L$fdbPTj(H)H~Ra`0%~O#8Wk4>AqM1)pjwn9Q%D^SZoKo4J&_cDhph6@H0n1u(z$
zA7cv7|A$0+9H{})ty{63dOCYYP#3ar=7x>|t-h=DTpeUKP>a=DTdJp<tGA=^{I98D
zUH_;KAM%J@PtzGlGslDPKRgHp{Pz)>qG(??o@`8R;dLLz^G2lNC_fKZhwo=DLl6C%
zDTSas%1&>n=Qnr-lW*%>jNG`GiqoFZEHs8Yru29-9(h#{-3n!YgM6aUbUfk8DJ72~
zmLUU{m6)PWoQ)1+in94c0OcTuUUjDzSrcS&G2s0#x^ud-xPB)lcfZHc(}xbAbxavs
zXwNY`VHVgzJMg4)+!yPF&T%(>WbTf^##s!A-NM~uojV=|dG1$Xf_E8={SSbR8$jhI
zzYcv2j#vs`hdTHX^7=Zo5s!S^mbZ$b3n5g#WV?nX*N-6azJ@z+3Qi(lyj{aRJJc1X
zgL^6SWdx}fQPuebY2-yieGDiD>l$K%GK&$riXb&91eZXNniLZ452z(!01%WNiL2Mz
z52TyHF+=G2ka!+M&!6Mj2zahVaq+ydbE-i+Y7*Cna6?_sus(!MnUOg}Gej>KI!}6@
zBt7HuGklonuGCHU)8ItkNR86dd5WI4k-CslRa_tuTkc4+^Sj@R2F%%Maf&{obSOE;
z{c<Uc2hfaOY8g6M41fr<mIB<FYUUkG@KdvI(75T;*VCMk`8oqUiy}B7Hy|B96@`+x
zknHPAC!u`7DU-^<O1fw`(ZO&BUhJaHzd*ju$4x%S3_6-_hjw?W`e2He2e9bU%LCDe
zqT@c%5L<u<)^equ&O?Fj((v=*x&ST#vBU&20gI56e+|Uc4&*SNuQZ?6hf4J4&Lak^
z9EJ~hG4l;dAN?V8JvSrq?`X_ok`$-wLoHtsZXw+DUzuHxI#T~!B1t-IK`aXMte(ph
zlcRAn*d+lZkVHrTO~@0XSBi{IylXt984rAuIP?xC`;Rj=A~Iu+#yq{@45Q&b)bQ1y
z>i#9Q#=kfkkD~(qp@GwWlePY?G@I{I+J}UHXT{O9hm^iy1LAPvC$i0Rv%Lx~;6p8I
z_S1qzhKYPiZ1{VAgc3+dro|e^fGMY4le>#2l6bcV(IV-Z9DbJrcT@J8x8&dqCm3-W
z3fd%T>)8dMEBjq9r=8zS0*rh~?$qz_BjZ_V_Rt{A2H}@U_u=Y6)P6v5F(Ml~2k1ki
zzz3ve{Fh?ZN2~iu)J?s8jweMn^%mnE1hsQ>_{4_qYl(}br@!P-e^<KAkFD-BghRH{
zPgixrPgnVIx9li^o&Yck6c3>vwbF)Snz!YogWBcvD2Yywiq>mq(@emn#$f8g9Xt+*
ztZ26ob6|2V{3NY^xU11R_3m#){`a9phP<Tr$>OC0Cf>F7LLA3`K5-OlZlC<+;7Gso
zwcbV7Eb*%{^rI*TwSNZrEJk~cWGDV76GW~fBaZU`1Z>WjJC^TP5g&n=hhXy1p=|09
z-srNiw<ovdyF?p>x@g=eEUa$pTme^dIP#M+mGrn9H{7l{wJXd}s2s-B81L(~cfk;T
zwuajYq-Ica?X3_bgw~D<wcpZ;310{ovk>h$p#Av?GS$W}ywD2?ftVxs6>{Qk2GT-B
zq*JkEu+gvPu)srfp-%IN=8~wLK&tDs<_1}s8x^6sAzV03`}%P-cL2O<y+y5CnUB=J
zomT5+W+&W<y(nqz#74ank5R2lQ7g$PltsO{8qN~+W}Pl*zC)wke=n$kfkmRxi|DJ1
z<fd90t&Y&>A-&NgYV=|>s$C;$?ImjMz%`DpwVq^N!d*EpLapCXDU8R~$587#u!P(d
z`hU}Cp9Yxe6`Uj0c^cDCSgL;mRahlOze7*e*HZPBs9sZ1t2mUPUyK`gEoFet=;FZt
z@8&YRNRn--db(V7eCQHX#C&#8jUxr0Ho<2ZeRUD;G4^`GVmMo_42G4|@@xF$2l4sV
zqu{d*tBkh$H{AO_b3!Wjqf4#JEw!#eNo(&P(`)rpt@KteGkrcWeYdE7BIPjq{z<YB
z2JQDP)!%|D=oJj@!|za6BI?q{QWf+%QM;DGsWA`GFF6HMFM3Z{0E*{?zC|#Dn#BRD
zSlu}QqTQ$4y7Mp{n5!m(y*rJET0DepGsj0Lr}3ME?%=$*%mR0?3a^<Rp+kTCKEnZV
znK|f5!a@5AOk&9dibHL@quZjWo_HLu#~~e^pO8S`Z4;94I@ARGJbo|js7igsAsihU
z-+(n!3FPU&>Q?V)T1BM-*W$;85KCb|cjo9-_uztf?H?@Hf&>b;#Pa(T6hqqp+X622
z*8@fXne~^EFCf=1?kA1IK?uiZ5En!^UJwy;qS~&9?C4DGDP~Rd%QWb?7L5G{;$hT%
zDHtZsB?Aic-K4-)D8a_WdSs#(_i&*AN3Zm!8W;TVYuc!qO#AmF3T6a<uy`GJIPSp<
zuXI9=pwTyY%H&s!@8_Cm?dMAB=#4;}kft_lrTAnwstcD9wYRCdRcM<&{Q7YrYRYgd
zdIvSF&IB_k^8(6H+jo)BP)`PVCT_J9i?)7&zYbd{!y#eMB!e#SE9gfGk<;kxNIo{Y
z1koezKX(-`{E`$C5Tar>!5CWoaEoX)nOF?6hcK{$IoU_{0o{+&*05s0W7oF9zoGN~
zAfm+d*6S&44$lU@nt_{GBrKdD)KiZj6b#!wZ7iEhN&%%h^d4-S%uh3&4w2XdKQ5i0
zhZl$W@iHyU&lM&=Svo&~j4pc9v<Y)vZe&Q;Mn3NlogOYVvcl5HCQ5^k6{k`BMfe%|
zPfRvp_2T};hv@zkcCHQoz`{A63}SGM_ia311{P%o{!CthF5X4l65LZ=ytn8r;iN|w
z9i$#_ZqZnn<hH%vggOJIIHtfmfO-#J;$@ZQ-T$HP&EuphuD^c<85rZzgNly(sH1`!
zGZ-deM08M=ZXJ}kM2to!(I|;aFw6*v+rabyowg&1xNo>5!LO)j4B`?UcK0Ag#Vrsw
z>e>n@3aC+j@6V~a-8~2<-{0$b{&?o4^u2X!Id$sPIj2sYI@Kc8)Ez0E1XdJ2UuB<d
z{;Y5l_2OOJPJKh!E{T!;(er@3|0L2FqLJDIU^G^^Ppve$1bLGHD!CgBozL(yTFcog
zzxpVEdSLii>25>By2B>=yj9VA42-xLqNsRSoUmsiVH#;DGy?V0<+szm;3gqh7(R|J
z;4NO&7T~zUvJeri{tF!-9@5+Ezog1UUVf8)SGg}`&E@w~e%0?w!NjJ4+$0=_7vR_g
zCiV&+3*Aek&xHwSs`xH;Awx8hq`1Ww!b4OY68*9>S^}x*Q~quJ8$A+Ug~|QnB9&G(
z$ZV`%;Rnk%a$_5tlREdenZdl6@A~KmGtsVXfoNVy3vaO`Nn*ugk2{(aL|gRzkfHc6
zq|lE=c9Tu%`tFC&<ueeP0zYu>>7>nzD)ZvhfONwk-};!ja3KuXs<J4xqZ+fs`%gi>
z_^c_O+G;|-G;fJpdm>$C44gCAskSfS*<+ZfJi)WCr!YLbyLQ!xa7zx++c*yJeut@i
z33~)Fd1L2=@_s(SW28xIn{W9e*DW^fFJXoh(+GHI<?Vb<$T!f^z9yh7s~1qLSL1VJ
zvgZ$%JMcN+E_w?v*}t)csix{GH*;y}3YHWWv4fsrzMf+&hI7wePROG*$cYcBL;`_?
zjl)!cv{Cv5yX31*sRe;yGR$pZVjRS7bVT^n@g5rmDlR3i!}*94o(xcB(W%hH6v+pI
zPiQXv0P;2$L%1C@Z@z^{HNsD<b^F2=G?wfb<Fl!_tbtzK#jW7tVG=ywb?54ReDVEM
zy|)+M3+v@-3sV3S{&+qZ`YT+>bpTzM9{KI!hr{ECDvkZdy*;1WmA=W|j?hQOW#ki{
zS&U%nsJ0TX`|%&Jw(G2eJ*gqdmj8u~ORdumZ_HNfL;>;s({{;3i5|1VmSi12UsY`6
zE?qd&>$2jnj3u)hFC?KmIViKh^%HD(Xg5)o_+N&4^FvDCpv^)>GexvC>GY0|+FYGz
zM^XUU!0Y!e!$g9-?%UyZ7tb=q2k$^BgP-C<6TG1wrlQ7N-QG~3Sm@rg7<(CZmkSbV
zJ;=Ec+SKMN%qfw+h$5z4<#<T7b*m|}5Ij<Cn%sDa2Am?)%$^mhVE5m#6tUKthjEvS
z&~!lvR=t~g!N4?#T1SX}ldgk_I+!MBQ&Dt}(GZZGML1!OcpuN4LSGRf_P&gUvb*=C
za&7DW@d+C1;+BZ<EfTtDnK-@WJ^B8`QYNr_NS`uuA0EKBz8z~4mE}FncJ#`cB-gVZ
zS-U3D^T_ICrYp}(sUva1oG{I*IdU|Q8op#^!gT%QCRB<eWF{>4Z%O}FrJvjcZQ!b|
zk^D1Z3OV#EdeN_pmHO=;e#h~v*yG7XM^^BN9)F~)YWmeDekaW2Cp^01V>L;Znwc<?
zU-C7D6t&7DKhyLfv%Pzho;PxB6}E+6F?&*(Nl)sFhG?0HQx5mDmE`7uRH@`o`eeu_
z{?dm!eAK<{78u^L184WvB5TTGiV>FPwabNE)J+<(ihEOo2<nAPne?1cSfGENR2I})
zZUQ-Pj%B+Y71&}ltkUW4Q6lznbh3-vnHeES(hRF+#yGdX8ElYg&g}1`)w=CTv-1NG
zmU~%!W4~^MI+xq#p$`EmH}Ec0xnxGZVMv#>|EQp#@GjR6;J64Vr@;Aw49XWNuN$=C
znp!G}`|=a|Vy-~%uq4spL^=Cvb<)<sIVBDgln<s6h>}+O{z;M<zhGshP8d>o9rdgG
zb5gO3vFM)JD}XeL)UY$<lt&i}&BYVjZ;82&{eVnfVrUrn=~JNQvP(R!o1kVx;+KLY
zSNA?2H8Y;S*rV<>D<l38Yo_mLwThTz2uxS4a!ZsVJ`Se-YUQWR6hJYCwhL`-4mEPx
zG1UBEbEr9kT2rzs=C-M$j{xY=k}F?$DYS2BiI*6;^0&71X-h%L%Riv*fXd<cGYL~*
zXI?25RadQ{OEb;QV?d#r$ya)GEqs!+<3NW-w}7r~J-Qy{qtJCLyB4Cf!?hYS>5;3O
zt%9@6{*5y1X4n&0VirLjjhSh)MIPB(_2X7jK_H*`OpVO44OfYJVlKE<kH9X6qRXVR
zvg6%Rq#n5fl{XT1qcJlEA-;@{A{|jjGQ&SS=E^U*MAfzR_iw*@Q0=ub|Mab4EI%BB
z>`(ubi`u4Z_K3VPIA1PmlQihf|L!gZQkXHNB{y&eJkr$#zzdk^2ZRvPZG6<0c>AwC
zX5J2Ly3*ZYOMao`dV4B=phkztZ|wjUL||0&xnECa%P*u$RQ35ns=^@jo7qg%DgiRu
zI0fXlz`Y~^lk2tePWr=nnJUdf?hj|Y3U+_azmG!>yn^Cre1C(h)%&IKJNMn_{ZRkz
zj$-^ZCE69d(ba{w^gsCQTi1Lt;$PrS_N|Zht>3kiZ@rAxrz97(YJ~I`?eTAvj=ZJ;
z^`?&m6~3nF#hSM9HSOnXIwREd5ZDd<(CsL#R3>uk_(Z*(Q8Ufn-=gEUED*Y}m;YL@
zt#XGWY{UAr1djWGqu@_blK!_Zf$x|YzQY+o4_`$QzVi|H;CbR5YPs%-9eq!CD#GWs
zipyV69+!Vo$P=4b;J)q45B23Q59MD0+*rSRMso~3=tAH26~(e2;l@MT-|}UT3T1Dk
ztXn&lLfP_RY~lje(z#@WQbx>*$7*P?mb-l|^W~0{tNXH-@4-nzSZ?5c^BC)g!y1oS
zSKNI&PUyT<1c$U}a_?&W%TsUnx~+s`ox@ZRHw=78@i@W|x6BQrb}53UN1z|78nFj=
zOISu%xv4zm?+KamdEfafg-egPPvB;Bej5+=&;a%=P;6YrG+4p>ojB#BWcD#54y}SE
z`wNBxd(m3v<`#kSSWT<Gl1r?QT#5AA8}uP0DCZb*MkAC;$F5PwtOswH2}jd-2~Uyx
zst6G0HU_>l2ZWBASWL9*u>&4Ps1`%#HL8WD{_YCZa(hWFr3Lm31^T<6g#xwiR28_8
z0v=_jiF`cD=2RHUUYn-6>$cy)kMy~aN{k>BH5SdEhSzAXVzbe};&Z-tod0|3`wh@i
z)P>J_!BIWX!HZ_%5p6uhQxtvaGn^MPG`R<qyo6*A=wG0b0BDs5w66#B7u$P4=U!9<
zv=j-ugc`@WrqJYQ_e0hFK>-OT7wdiukpbPjbgSU3+cwmFcCqg0uO&S#ZRY_IA7jG#
z1eSi$w(3xB;A2GKb=R^A@V)p@^Twn5c;C{wzNI_2^S$^tWE11}nSaN{W)+L=>WlUB
z#m*1K7L*jbsR&>+IXv6=0M4ZyfKvfDJU=DS{*7|UbGx2b^BlK*iwFNrr7tZ>Zzv4N
z-bI)uNlN8<O}j;P)xFx&ccpI;CV$I&AmfZk&dB)hu-m9#p~l6`&VZacU*oa9#z~>Z
zWz7WuJJtAo)o9b}Qq}m{7lmxmbl;ER>PMyT>xKBgxNiRWXL?@Gx+f;t(h*%Qvao9R
zQ?)}F4!^%Uy9}Wlw$}-<%7~yIWrkI_!#`39Q|QIL&j15N91?pT=C=Elz6}t!xVmHV
zQ59U?v|Zbz$rR-TmIX19{laZ!ta(7^FU3s^cX6T4mKn+H-xdRjd&SE9&6XCrPk7;?
zR@JLNmxpw@Bp8?D0r@M!{Sx6kp~WP!%hg@o7%ff~x=U9}EuSq#RIv@pF{iW;FRT1r
zCW4t7Fdgf9kL##9m|X^)!F4Hx^|Zy|Tjv(gD!pD{udCcFdu^G8Uo?hnU9f{LM>Xgj
zUGfhVOMCc2Jc_pa3^~nPmxh>dH-g62i?{63vb$TnN?~5(Wl~q#SV*XfQwehc9_K+T
z`8RC5RG+&8R@{0q7qyowLl?I_<a?^(OJQ%&z42cvj(*8zkJb;l^5NS;%Waoepe6K3
zr0=8GsmwdlB|Z^dg}^YuRyCflB(b?4M^F4IuXcrmkF9u=3Z4rbBv&VO{0&2yY^hjB
zkQjr0sV}r!uwb;D24(aH<BBJ9B%jZo;_kkHeiiR;bT{h#T9lSF%`@Iv-Xmxv)??Z(
zuk4a&AF5W{F0bNo0FM!ry}_-f#?XIH%=242A%OGtA#752n?*=q8A$-0z=m!+1NYR5
z){Ai`N!mP-EumVr1me=1S>;BH;$iyQi&e);PqHruRxsXLbT{$v`)!{L-jFL?mj$Kv
zI#w}LAM%RaL-cRc`#P*^7%J~*Qry?Eu(G*}QS!sq4j`hUHy}qeWG5m=PlS5ZhmFm>
zGtT$vScbHYi7lH=Hey%u(fm!mqK95@EZEN)^6l<Q#zF|r?`G^}6V(s@rs`|?=$>J2
z7bi$(dM!Nw%&t)1jzF9z!vMu$ONZ-BwM3kv6OATw*zW`Jc|-pE^VH8$;{6$~7Wc>#
z@7cU!xQI#noW~ms)vMfZt)v<&jJ~^Qv@JKCg)(O3OHM!6LDKy#$dT=*x!0Kxp{15<
z@;A6={JWZJxjKKndzAP5AKYNTHza1@DJ$H*@M4kH?wS+@=EJ<UJh}4ozg5=7PirW<
z#}k6p^}UcXxw>;65s8?)$Etq`%%4>}QLVl#8dAwm7q_MIWqbW<5ox*Ivm)NyHs?`c
zW2Olb!kjUJ|3_95>uE@(dZIQlSmbx|04~b!;UDJZ_jez_SOuuIDg^j>kc1l+UcN)!
z?prV_z8F4JnK#^_>hQC1osXCHZm=v^@GH#z#<4K+oMz%+h)hb&-Hjv9^O{DRdZBJE
za8fc6&t=!@r|wYe$tw4UfKfqSh;j*cUd3HG9_Cu)qF|m)&X~Vg`#Od$W&&TudYwDU
zyno=Tz?bAE5~A@{xU1-vfZByUf3S9x@j)&t;WD1*+<QN#j|)AZuR$weJdW^yj=aVI
z{XK4|YCeL%JVI^gj}Fl74OBs%#qsMD`K^Cr_TWxjjA_xB`B0%tE0AW*<*BwINZc%O
z9O4W2Dx3Fn4>B%hnL=(W+g2%mso&N}D5gy!t?<_3en<t``jLVd%#2|4p!IbZw+fK7
z1jY%%wpS`;lCRqI_DTl@1XQ)QSH_Oi)=JBfowrr|jtao^Fr#hr7%|b}R7$i<K59CZ
z?12dOx%oZ?c_3)3{r*8u@K~~cP%Q&aynpa6kPFyq?H}aF7l}@^+kX_J)#mw|=%Y99
z(Fz-@f&Z96zptgVyPYxhBU&NJ;AhzRny0LPr`m`<!N$KvbSFqWjA;H)U*;c%f18zZ
zo~F$4zDzchdHWZIGOA**O=iW&`t9XvZI#<DRGNRY2z{Urn=Oh^*U*VF6ed@eL*h6H
zO%?74I;N_6(3X@{nH{piF}$=|`D&G~x7#V?d%`g4t*&<{A997gd1Zzd*1BKXJeSJk
z-tqmjD&JOCj!|{p@S^hB?h*PK7aJW;0Sz*YJq}v?a|SrRp_uvQ`K18UCYQ(Eb)Y}s
z`^(RH%=OywD*e<Q*WKg$&kRj$FSt=nsQhu17i5QOG>?=%u$(I6Vd)t^>?3TyY_i_R
z!13U^s|-i%Pf)k@w%B<$mT&pLd*Hs1c<{u~<bnIO44=8W2g*Hg-=c!Jv8(iu|Fk%O
zo#oxv^|N~vz_*~Eq2$N<*@>SBEHA)<e)fACp;8g2cz-4O*$L|EGoM;PhI`0H`q^zi
zQ%_d8hxN-KmFQ__1wGA}xRkXP@_707#aZCuPlfsD@_H}E`*W}J9+A7VzV<y0j@Q=?
z<S_!F8(afM1|i5ceNRZ{2(}2n>qOEpJKZY=Rb#Te?09$cEWwJUQ$(K1>}G*V&ipR}
zm81f1@xjl%e&&wZM2#)?a;%L_Qmn!Jo4*%1+$pZf&8|Jhhr$2<czn1DhK#+mPVo%W
zXELmv;u-#ddG$^4;cgcb`CrC|`^o>n0d|HA3iRaZq3;q?;1z_Ji2j6UMYy4ZZ<SoP
zc6e@w7l_x9%5Im;EG8Z$_qb$bvTFI1&~XIUZXT#@L9TUzpcT|`ZHiDfJqR;PgkwU>
z?e%yy)5iVrr=mxHYa2R(U8ZRCt-=R(F8s6#r?Twkoj}Nm#mzlai!hB!BJQ8RH<@`d
zkMq=-mHEiZd-%~ObXP@|Xex+WN9JU<M>0FI|JsIAs+&)B@1dv>(M7vLxpDp7Se7rT
z?1OfeH09j@<V4*Qx=Ri?y`uRy$?W$?X!$s7c22Si%lpG|k{I?a#%9b9vz@EnZPQiH
zvU>5^QN7L6_11zj-U@e##2`Dh_(i{wpQ7`kE8M6;TB059=a*xlJuwj1M4CPT@2T3y
z&{Dc2zV)CJH#MH@fc!7V6Tg8?jHhh{yaWi}Jf6kE|LgJeE|kRc&~>4@{~G-x+b|9(
zz$Iv@b*ptvL-yp+M`w3<T8{AhCbI<WK9<NT&o%cB7$$tev4Vh0mdH;;?6=@W6@5E=
zs8`c7yMuG?OsY>-v}}{FjrCKNuV3B#LbCq!N{#=b(e5q{2c-?uR1ox3smv`M`In2r
zMs@?EN7E<eRU<P;34isT8Dg+-*UQIR$YEJ9zkfgU$okOMTccy{%4%_O^AYhI$;!Yq
z8>J_R-Pcx>^n2r~pax5{IXh2`M_`B#HOCEP+-%oXA>{BEMZI*_n>^AhN-1szfTtjC
zOV*&$<<3$xVyrDI+}zLf&b`3e^v%^>a*tYC=Pss3WotrhxZKiosgK*?CnU264Y{78
z`g7&K{YtZv?y~GbBa)wDyc!;DtKspdlaCAy`Z%KSafE$5D||$*$}=)9i9&_tV;CQ$
zSU#b}Os-e|jDG5V@+A$D8Udk(MGKWPJOrC)eT#2u{k<Q1&^FonQ+!nGgKQ~9d+=AG
zU`|Q9lVbpGiXUdi5A#b2$kWfs3L5jsf@43Jum4Ko_4ZF@aW>9=-(Lo3Av+WI%}qWs
z;QsCPA8{Ut-g_1MU(lw%bTG5V2k^A32miojS8Efqxc+-dqJ2A`Ezv$$5Aho0kx}Ax
zi@MeD1IInAMlKE*B~HOTX^^c_c=<^+|NI;a$e?OvOXa$KpG(6VNbI?2P;Vvisn&io
zqZ|0dYya??<aGq^4feihP%@;CAblk1BlMo=IS2wzK`Nt1;sMZ<=t*#3j;{7>s!sGA
zg^gtmF*9I%!!w@*zPM&k;+Lo>`q^4AzYfVK_+KIP@sv|Kq@0Sosdr8sk>NBsLLJwj
zTuFVdR{Og1&-$Sh>xfdU3rcBF9!e!uY6O*>Q_VlWtkpN`58sIT*GLFFQmjX$5D$;Y
ze>^m$pz=~LwR7l-(vl_KY-fhFYNc_=kBO(ax~H8g(Z#xjWPfR{K2@dK=8xiva|<n9
zPnO2|w-fDJx2JL^<4?yG_VT(DS!IZhZ2WJu#)~LF9izBuZWPbsrt?q3>}2M`r7(cy
zyd<AY^t|vo5|)$T5<P!CooC1MY6eIdSM$6+(UWW5G8e8_EZ6DDhXdJeo9wfcv;CvE
zXmE>U;=VbBps0ItA1AHiRj@TPz1w8n!L}xGS+^QF6nu=Az(Q#~A;9Z8AK-PZJL^YM
zT7IcLcN7h<t>{xjP+&V<w|DlG7!j%ZHxlhDFgr${gN5Q0b9bLY+LcN(Jh&V6epTVU
za7*LmxKzvD*IS&j+Vwj_9Dpae)%Q;6B3Wt0BStl5*o=#oXw6wV4ed_P+zt)g=Q$-c
zXT=uzne39hH4RnhhSN&n5f_N<G3a0>^SDTf=o$E>FklF~e}KZn676_`<L-43A!)>T
zSGn6y7n^xsZVR^T{S_E$%$-!LAlb1kYKU&e8IDDHZSz6N%tr`B#9MYyZF1cjChb4i
zv=w!6bL>vZ3?h^EpsTele85)6(Xz(+SDLp}#B}%0NIIDLF*4$3G8qsvU02V{n?g>E
z8TH8wHz!iKt-ee1*2iSFo{!3ypNxK@-&TGbbLYS>hS^-=>Os5lL-H~?CqJ;c+`Wr_
z=sIAe!Ur94XVG3t3y;i@-fn+{iT|*`y$)9uS3eUfr(VO~$#$`Bd{MHwv}(29JDI%*
zhsMK8B(Rj!4mS)`4c8=+hOj{4u+>tw+hXwhlJG0lfNcvR0WVHwk3rl#N8~=GXLr@Y
z8&n-r8?*~94PM;*&nLtiT!Id8i*jRb9H%=}4+!PrO(J>chE{StLn6HmvpShM0jPcm
z+D?GVe+c{Gf=<@_7I8VXXlR8B4q7<05}p7XA{#sm@r<j8=Si&jt7L1cbiV^JTsxn}
zW+Xd9D!VlZJGVAf_MC-q(=z>TL7|b|!jdwu3Xs6;;|eELg#NI@zN$CTuI&lNwlr+=
z6aknVZ~`ZUyP$A_Osx)dGaiJpr1@djC}sng6=3Y{=#9cWvIV20;G>W!;z~Cio5tZ0
z9R0x5SM((q@lY81QB~1u3ez6_N)e<F^w`E%0vAbeZKac$-Ci&~iK*Wu`Yu(t^G+_1
zh7Jd%2-B$Rsj#x47kuZ3YJ1t^HaL6R3->WG;fwsh(@&|0?L;lChZD2P$Fg6S?r8ol
z8(Gaf3U4x_#dyPCQ+O-A_p`9RYRv6hPZ43_U!V#sJ@SM^gnQej>u4rLLj~o%E1`_j
zN~t6P8QM^#-$f18x`dlo8-9Kpl$_|<Q0umNi9wMKs@Dy7iap5n7{gQUS8HHjjhTaL
z+%vMD<Q^SK8*HYzXEhv%b^PTDj>VORtga_%eD@T6?~M}~5GOO=ZpV#@DcIne;abtZ
z8KEF=z0c+zbfQS<?(fDV*g&+SX1xvnD9L-KGJED{xH&Z7XYssfG93ud`Cf3wH}3!?
zGf@=d#GM4>Mi4fgDA>6;1=BvZOtZwaT+aR%x`JH>5nlvx4lOCkPRA}9Vpxn>BDaFH
zG(D~%5Et2oDm69>R06-LZ2~#jh0DtxuOM4vChy)mfz^4sG5Z~V5ydOK!h)kQP@odk
zfSVf<WmhwndA|x>T_m;QdUfOb1GxRdhpR@^Y~6|e*g|usAjLGOoCn*~P3o^%7x3a<
z$2!uTM@QgH>$HR~-YxL5x=Z;@-YxJ|^H|+2u#KB(1e!=+M{eVY;vY9!(aqd3fXVSx
z8<fIb>L2i`*$P{Hx#g#xuWs&!fs9Rd@!EP#sc|J!>fdVB2deILf=oTYWYDr8Uaz^W
zmIym`2C(TC9;t21I6~ZA+@-m7m-xOm(8oFjF|`7%g|(o+k6t&>T+1906)ZE$B6^`;
z;!L~E6UlW9q-1~pL9_P;E;0U9#;o0C>U{1rEq-*v%V}tJ=J?+{SgHdr-s!)I2Y)vu
ze13#JjVw7J<|?rL#QoteY4Jff^?1#NaQ9<V{&7Dq7I?pTgjRm9IHgWY8kS4pf=fVw
z^VPv$mCpL;>qmL2!bj#k`fgz@#o~sNU*L13nXv_3E67(Z0wi8v4p)V;vK2qc?w3US
z=`Rflbr<t4J*$>*D7BNbOeHUN#dc6tet>X_FYOc%slQ}m@cKlbbWJPI?g+JwrLjp$
zjkZg3{)4UjQN-Qh@NOx9O{N0d(ir`vpX;uI7<9x|RB}{uz=PIABHevgC9&dOSdqjN
zRXI`;U&H-w#cAv{Gbp1|^>4Nu6Wt4>K`;CZ_U+Ot#}%s_)$?mrvO$INGmneZ-b|!N
zQA@b@@DO+IM)h#6CkCxyQ}y7`ZP$z?<iZIA7n?u89qt5)FHOjny{H}gfll{z(3`~X
zrXeomm#DX7?y(R)Y^7U1jUu3MguxT1OLSATL5X}EtzUVd&E0{AY)HU&*D{dNmrmDM
zuXs;Vz+HW8DR$B|)6{gKW7n#x&Nw#&WkanT&I+OCta6cE+A4g^_d!dWdzl4x)~EeW
z&4*8h>(wK;{Z>+f9J1?ka&>jLsKr(8L$G8EU)aD7*Z}qm=`WEE@}}aSO@Q)KI3u$f
zh>a4R9{6ip{?(R?btdT)^6VkhD2u2G70gF!O~1~PYZuOwqN7mOFOmyOXXEd!UtF9>
zU)~LY{l%q;^d9AU*f-IBnf#F;T=bG8BiG0nu#TusW_QDxo?0*3i|=6wMf^8;M9`yW
zKBMCr_y1AIQ!Jdh@+)sv=DNxHVJa;(@OcKx#R3`7l?FbNnHhGbMZKtU4G1@@>hnGL
z%3%0hAvY5#Y;@esExd6h3Y6_BVWBOER8JMYFo02qXb5dgw`OlLgQ~2VKr&NFKsTFT
z{kgd@0pDKjTh|xmwBKUu`_OtLs6g`oinvY-Z*}YKYIsHno6>9QO_I*A{#EXeBA-C|
zI=xElg`YN(l%4A77T(Zg^*vUzT)_O%q*j;`#aZCbPTG_ZQLZ{hRXi5jOZ$jAn4Zx8
z!;5k~6>@oi(>@?poU77WCD$Dx*G9d)v(h8Oa10<eA>zV?*>QF#yi_PFqUvNT%!Gzw
zPqza_X!i@tAk+3{DGfk5MJ2K`3Y;<NCqQOyhwIVte)q(zj<&bTbgvTv7z4xH<bWM&
z676e@KqMTXQqDnidr}mHDT0N@D}i_nkGZ31hB53=GXu`6NhZ>NAQoOS_hXpM5Qg_A
z&vB8Sy*<aJ9u9e}^ahDk76asPhH&<Bm>qwYr~0f+)h|p;S_l=`F~aSMP$oO0__4S?
zOD5_{l`Txwf0#%=roelBKJ2?BmY>MX#$M>gTUYXQVlP$}Cstx5m<wlqspYMsb6Yc+
zF%V&D=opUJ>YdoJo4;|v_VFrYV)-lucX<dQcJ?5EJxxl=IGZ+8nav99o*Xc6_YYva
z>F)p=fjOr0Vz*=*M+UbGC>fOLuwr)gdcBGbDF57wjNnDL{k9%{2GNN0C8lSnIKjqH
zal+}UjGJ#z?>D$!A^^z46LGIumK>j3PP#uCePBRxkD-xkU2PQ6`o#hMMv*~lVQUR7
zY_QR<(~|?b)g;pI1H6s}flv)?pT(FH%k9&|{ky7PEQ#kVFHdIsb;Tx-NN=Y^fYrEb
z%FUI`o$ZHBC+X8Q;-q#gf8xK1)fflS-SZDV{~>4{#yAbM=mZE9{Q?b5{$)h`Ed?ga
zIyL~rl|&Qmynp7I<eaz5IrqJb!Vz#?ND&&Y6uHhf@@;@pe-P@AnP~G?(CledELy{K
zXMw{-w#(dYj5EfkEed!g^1Zeg27|_{&*0m}%tN<sOgs>w&EeJlgLmAJXy1dE!}d?K
z?<h&m@YsZ<0*K#B*v9&=66u+pB<q8nC2MA?NmOC9(zKY(%gCl-0%~jMl~HU;=6+sw
zwkG87@$m<zLty|1)nw)wj-yY$@N}RzLWUFn875!@1XIuuTI9dv07<vRkf|o?8gu8D
zF-;Pa7BL!=cf&s5vo>aYQ40_3NAGH@9y=nhkX8r&u*-2nJ*<YYybG4s_jL<UC&R?i
z$gKiV2OP<v1in#%C^v@Dpgn09QoF<tb!#`5VqbItXV&^f5J({wWpPmzM?L>Pcz<_E
z?DK(t17-B^j1c?$jLF>x&E5j+g%DP6UhSvn*=Q#*#rZS9f!4w?;O{X+3M+2YHv}FZ
zZ9P@ch{Bvw7O!Sin#*A<wJ;ITCV27E%&m|zgj`Z}g^MYyKFOo{A^xjSsE7ar13yvb
zNw8ZGD80xN9d4<+)gT0teda0Ndt&1CqE(Qdn0TLPdi)nYit`Nr;^UXzrI*ZWHp0aV
z-Sc~xJuePj`WK!17$2@~?{58HY_agac6EDV;+qmy<1GN{*vE;n7(GjZ*Ta+95tVC?
ztgPnNlk6yX^oJ6@<u9+G^7<k4Oo8B^s0KOi4-k;nDbnso;c<852!?5GqNfZc7|=*e
zX}27M{Afcar>=9oc7KDGIbw*(MZ=mT8>_bd!KUFJ$M02SI$HZ;Qo^paAhdNuFI4&>
zcYzJW6*a9Yi&j^q`8Y;->Yq^rs4KkOrOk2VfWxuUko+bn;O=XGuQLC12QU=&K><zD
z&30U7_eZbtAGhJ7rH|wmf+2lBRK`(#pYlG?ZdnpAkkG8a$_%`_4Lpuq4$r*n?w4^U
zM~m*y_7qeWbh6%1hW-+>cHe!a8iGlPV^x!mYFw+&Fpk=?pyI-(rTV6k0K<`8$fX+K
zlT^MG!6Ur<oKM+qHR}EWQgyi!q#7=LBQ+HxRa$&Xs#NtGa|8?X{WYd=x{<Pvf=tM7
zh5Bwp;6CvVu<zMQH3T8=9Pj({v-+&htn%@`2j?2fh}zwKsQ%%agPuO(HxJ`Ch?2+7
z4FSI~Hr1U=opHeL?vn?p;K7s2Itw^sP8yDh$1(+C0XGr>CmH<%1phZNfm9EndUH%5
zXqo>d2Y6SqYqK2S`-A+~0tfIKYhm{9#CHv(M({>lw{#k{lOz{`$X-2ZS813*LkX98
z{eq^4+wFVn-09y~SQ<tsRA=zsWRxd1CQ7sT(#ZjGU`q_~RM(grg9JkTyP6f$!fv~K
zOH5078(F#wD2pym>$qZ~lH3Q?4Qf3cLi;06tpktELlM}o?Bz(k#IPsY_faAS5xzH|
zc8R>ZM0=9x`HYvTlBeliwBnKmf`DaDE788C63{1>TZMI7qf@*2$YNyQ&@xkCGV>DY
z%htnEFa1PfS8`DrSJ&pPyi82@(%54;*M5u_@K8+A0JS3%R3vnItD3&aq+cTfTLc%A
zR6&2qEVUpTwr>0cwjlA`0cbwX#v1sTkF)U+-Av5dVzOV-H)hr=<0xv3`+|RH?}AW~
z&~4(Q`>kmin-eZ?94-ihjuLA;G9NUr(T5Td^8h)c(@8aaXdncn+BULeg|_&g!fPlJ
z*9kAGh;CX{<o{;G!aC>%8DOx|=yxksYb(_OzI^Tp?6gn#Iy#!o*Mj{$?rVrJKggUo
z-1o%S0d_exMBA+^$#1dCrHq|9(8wIMah)4`1lcz`NA}mkyz?9xO8u<^JEU(xKAy<Z
zyoi6^)2bf?*$%MZTgVbk`&`}e-z9<A<wy5ob2B@x$~~hgMtpTqKb7fptiCm7MpYS&
zu`@H$(gFiP;D@NzV&i7g>rGcQX3MET3PEqr4p!?`3PXi`mvN%7P@I>nUjTskLviA$
zZC-^NEHJQ<trBbJ7)Y8c^7!bDdj%hqMElsSl}t1U4NR3hQpk7ek6Bk~zi`m$gU*Pq
zKdc1b8Xrw?!@FBKK%Mon<5X14HiQLYC9}XT#(Ekp!tTWy#No_vE!O@#X66p`ib;Q7
zOvvQ>#Q3!PD6VA$;`=CWv}d3HR-XU!eH590M#sVd0-rDf^3=p~VU^1tCFFXJ7NvkW
z=q+L#XTuGmxk@V>F(}a27IZaQS)%8B4V4E;X^IgTV-N46R>loMvI<l2r>hI?fk#bW
z8K+mevo%W*=t|L=hH>h!LNPDk2o?96HNbTka{ELU<qkIqba8Em0^hBLdFaOI{rJLr
zOz?&Ne3$R71D|`BW)khQ&@7F^id9Ds^A?YXO!Lyv{EGU>wL%^%>TKPaZvrY;O}0Nu
zpQ3U;7~fzO??aZ47I4pkP>)~In+Sq*r!DfLhvB_G9Up-i-lhrtaX-Qg(|Mx-`B0zn
zi)3zU%`^aYD@E1xTrbfn<e5Sw@#t7%TJd~5P4jVpyGk<Ewj>F?AtlZH``gp>O=NIl
zJy4Iej<!-6bHLNJJC_gaCPl;STNlEHM=L)r;v+;9|1j>aMKWI#y;*$kq1fWj#qYxV
za@C)!J2k@(P1gfi^eeC?{Ts%@RaYz*;P=s1##ma(PX2=Y+LLmwn+ko&a2Dw4F>fg(
zUx)Hzxmm2=5(A?<;(4KqZysaXB*V^1xLFQFP;Q!))7Vl-FZy!_XjRKs*{<B~yhPvs
zwAg<s=4jWPhUjd_e3IPQ81I}cP{Ku69nX0MtD#JZvPZhy@BDgybfJCfnXPvDQ^D!_
z&6Q^+`q!Dfx^Y%durR8gRc;9N#3aqK;82<kmc?VBYgMkv5iE-p{e_0jSQd|AQpK8?
zgq!GATP$WXv=_15tH1AaRqHFcj916G>_xeL8FF<AxhhNRvKQq_DObTzy39R%tgqY9
zMBO7wOWBJ`ZEvMwOYtJtW~B<Y#MrpaRIQ5s%*>D}xx<i(2pH0(%ym7CMCnWKOZ)_~
zafeY&bgmJ2E4ht+q;u$RVfQXvJ9Z$9p`y!>^oK3GN4Jsc=Pe)3#egucj1P%46pCfx
z@lV+qp1{btizK1cmZCJGad8audCglJ6gZe#FoQ;8{rXgHc$t|rMw94G?e#AtCLPkX
zOBX3T)snE<t0@y4wu@pqQK;=mGja$_0n$De`&KfGVNJo7#9VWAAGHI}A7QW}f4erX
zfvSs{l8ffbC`Qgj%K7N`RD)mFH}-Mzd`jU?GN}sfESY=IMyIh)Bm~-kW#s9Bwrejz
zCu=NQnXI3KOVN|qtG0FP&9c2Y4eii6H5jZ)!IBc){Gsm$`++CcjB;y(#2*!{ey3yt
zb<oF6O?*uxO-Oh6)2hZM)->M#I|k54;L<%~0K4%Leea_p-vRH1(es-l8lEs38orT?
zX>mL?l%g6fdn=5R22VbS{9J)3C`*TO;oyq0F3}x`Skq%nkDMB4c%@q-@?~65=BX^m
zM^tZSzEMpw8Wwq+CB4!pnGa8JpCh<q_@e)z_OEROMQu)5G>{@46%2ak<E3R049u!-
zHPHoXh+9sOIN}9<ipJ#dac48xzgc8P8)((NbqG%{zQ@y2Jsrc7UHcGS`9ZkxA!^rC
zH#!tu9H%|XWbG1NhO{WTBjT5ORX%?yL=&&b(FZ;6%5V>nC)4OS_<*}Xj6)0kYivEc
z%6&$Hr-VfM0p?s_oQJ?T<Kzai8mCXy4;h_ke;$=c+_r-zCBb^oDSaMSV$ASjg(cQW
zrwlse#F|su-a^y;fLSQ5_eWSt)~GNV@^Bbat5sCkJ@gUKMHCow@TcJ`#9Jm#&WN}2
zQZUw7ptDZK8sHhrYSqrxVMako|LhGvFtuVR$d}&y>aQR$V3J?6pU7aMl}Ip{tQjxF
ztaFV3Ax3sHpTgSs*AN23zk(b>#xMM9KD?gFD=@SGLo~_?IkU5|9Lct}3JN@IJ`&<a
zB)R*!TK6g~G60EkX5S%h`xPs<`^Wnj;h}F*I@*mFw@^GoVCkkQf9|SH!z^<+=RfBQ
zz+PoPHaHJOEfAc4_^w_DmW>6q3;pw_7xDave~$EguYbOkzPQ`{^9xG9hG(-2nYJFS
z5_ifCoTX0}^GW8dT-}vGP3rrk;stX@Z734mXVYci4b9~KF@U27L<gW!xTd~E|DnPq
z5E!bcC2*fxv`EY;B-zyPG`g_fU9OCqSRK9p9m*hMTKO(*Sy)SDJ{VsAZbCOsLf3dx
zwau?^djU({#STovVkO|~2HNYa#xTLXf%akz2?JFS&&}kqV_ib^wNg0fLkG>d*H~Ld
z$B^H*w5S*}C*cmJlzW(wiw3-_3O6Ssb}WS;W;v|l+s9F_MTrOTKhdb|hB^~>3EJ|m
z3&}n!Wr;;ju|It<4Y8Z@BFI;RCb!h7jciVLSiV(mSAjnRGK*`qv%~l|<^Q#Lc+Xbh
zZ>W4|_+Aa+YYZzB={__ejM}7?!m~GdBHeo6>HCg;g4^i30)Z{YD**+a@BAZuA-HGV
zanF&UY2*F~_4#t=7RxQ3qjJ4;GZ2~gw#=*CZ^-(!zTq0P$}N^E5VvEq5%Eyvw#3L8
z{YHX>JTu4><#_Q`gox#G_}MQ;kE$50j>h_;y{L;ve49!@lc9lJ*7rD90TDZgTwPQL
zxhhNRvKQ60<{$yI%$+;f*J<NQ?h&QA?M1m4SnhmuK!GmjMPIPu35bZU3_U8`+Y;SX
z^7WG<IYbv0)AM$SH2NMXLe1~Rq;j%nzrvb3N)|Ily?OS-^DMG33~S#G<?nU$Nd45c
zJ)^N)<z9yj0&b(VqLJLdX_r9g-yLYr^Yy$x&%U}#eRUiiac@0g)qQcYRo9Q|<T7?e
zNp;T^y0IW%8~42AW}2wJFs}0qQ?$LqrL_0ZVd7$Vae8kD_h)D<y4(XdKpCzfg9qpX
zO*2onKkxwk!UJ>%{^IK27mTjYI;WU*KcDpopLJZwx|pm1Oz&3~9fe?4N_40sG?diD
zXv4q~>DNA2LLfYi_=^m6G@sPCqq5h;wS6akz*_Gs**wwymWq_?n~C<Ukn5X?_HgBb
zQECO~?$?T)iocn1w+cl}JH1S+;#Hp&Iubz$0h?o()*xP#u=H<grPoI%>8I|1r#vN0
zU<$<DjsrMEzodMecvAfEi~o$27G(5)hW)sZYa`Sp+P7dVqW$SdLGs-RACJn4J<Hz~
zpb-&&->97Ueaf_YeG%mML#F68h%wk>IIrOSR|}`&zjlP!T{PXiP_yG26p@x48oAgH
z!;aEzF=|Hh0eDH;A-)SawD)N<hc~z6E`#RnIRa!Z`Ky&*hxRycnq`N&#J;$%7Mt1f
ztNm=2Z+K_VYhnp`dvo>&*iQ#l$&H|GeMQSdjoAZI1J3W#a<48l#SEW46Y+IgP4%GG
z!)gvlOzlXO&A0gsO6@d(<|F>JyFniEXMM<-aOmcb2cT$C8o`j-V$yOxG-fM@bEYaS
zx*MLweHu@6Tr1H3e?<>*0eFLXgp+L}1(8ih@aB$$pc=_kDP8*8qzum<GMvL6bU3ay
zmHl30eKi{zeAO9_gOkKRB6g+0g4xV{b-efXU6ZV*HG)oLJvf<~#gt8DFT>WxejPiG
zTc@%Y;pFj^+jW2RsNRAZZFE~+Qf>FG=e2Cm+IFpl*i$76BL3-%MLo?Zl<ad}1gUe+
z7$?0h@1GjKXgc}aAx9{UXdM8yD@bSc!k-;7s2^I<Jk+1WY+1=wWg`OwEp|TX`u$Y!
z(uO#3guqIZ`1o~HAZyH6UZSp$oX*2;a3kjdia#SeEkDzbZz_v>iZoV)Co`o1e@;I{
zpas|7BQ&H*ozg)Wa1YVVB(k>BMpnZ*q_-c|wT8Gv`vw{_O`VbBc#7go$w}C}dVZmA
znNOm__%)0Orn622#K^w9#YegH2oW5QD2$K|wyc}(<R7TV{c3cMRoh+b)+q1yt?icD
z;hFas_vlzEYs|co%3Zq}G$c8B^WCs=A%jXQleC)Q*;|(@hX|#yY^fdMS=$U}I_$;#
zoK)r^70=Ii$AiSucwNMNu|qMP{L48FL(Cty@J?ZPo?H367epxiV-6H~p5I>v#Wb<v
zQ585`paY;_rd!Q$#YSKTF#LAc(JHrhjp|r6XakqegX%6`J$zQ|HNLuOh5MUc1$fDn
zwj2N2IvqRlH6+@nL5afl4}IP4pH^Hb)bO*dUI}2Pq5;efMiA}4)?_gVaE%cq-vwhH
zo;}~2M&WQ~CoO@g50z@_kGINiG@V91ne5MR)al0Txf^x*+xvu0zZ7*BA%=Fv2SmuX
z_|I;?{{n7nE|h0Sxr_UV5%33&S|$|cm%-hFzkDeUTR6N_{>VasTeqf#A3PX3Av7w7
zt+@uUsE+pMCI2_EcGl++J_IJB;M7rCBo5p=k|P?x2Qi5)_3~5ojxYaa;zU0QMWmMQ
zqSW`4dV&$~I`^txexsU-<Gz42WxQu!|NB?v`6+n>MQH>#y%*$*1tn<vAlZpziMqm(
zoOjT+@h(dLTZ5D>e^bgg;>H~W)hFxcCE9zNQSLyu>$4>AMcX2K3X1Hx=0<W@EGM92
zf(qfcXf>S=qy4ZYigWD<&`D^=pZi$07v=f<J|~PI!Hc<G2SZ|ijB*PoO#Gg`RsY3{
z+38)%X)Up73<gInx8M3@iS+kGn+zQiGsI`XQ`6{meo|*fif0|7F|>u<%6(C-It{Eu
z`>`Ni$C<U+;rT7+i(Y+J#0LGs<-`#`1g-vz(d8h6!)RYWV}yfCgP%J8#Tlc!rm2=@
zO@0FWXwu`e)a!uJYL$S>I&#E$BllA+Qem>}n66EZUo=uv?{uSkxB;;N3x|2CZPW28
z`ZYXzur@-El+ZAF`|V!zC8_g9-9#Y!(35z7q7&9yj)cCGMQqLU$C0iSb$M$Klx_5`
zlVZ}B;ZkS0Tfr&Tl2eCg4$fy?U*3%Twnh*C<X3?E9b3Z!?nbb#*i&7zgGbG^$?Tq<
z8eULpW?4S6N|xEbr%_eUUsBaOVHKD^IX<Y=#I`N0v(rJP$3ep84=QbFIlORC=`0%4
zAnyjyZXGi+oK0Hgj^;~r`UX|Br$33*pA-R>v13RF@RollrWc!EA^z9@?!4(C#yyJX
zO}hg7e|g@t;(veMw0RHp;s4rs(-z3fjet7~AJT9>*<B(Up`!Dq4>522hXw9jF%&r0
z8oJATH2ekqm-r1PQa7Rh-t6^X4!`RF=yJknz%oq#Ej?n&^>4kCnOBn8;T&9iHj!Rw
z?7`-`tS7^L1Pc{L8+h{xX>IOq@tEAjBkcfEAa?jLvE?`*+Idp*UTv%=dsLIAZFMqx
z!U%X%gZrTrqMQyOWL{3<4=q2<uEsEr>0f$yw9?X;K%BN_jwUm`lWj+hz?YP>K1kmx
z)$U%EVUOv_yx!*gN5c6F+=g+W!Mx!u->prj9D$!5+Yj4Jz@B$@X2j|6kjr@laNO5m
z8}9edKu|D{YKuyLH&uggmJK6@XYXs}O>?*$ZD0es?HN_Wvtz4?1JIJlA7J*-Hk}Rm
zXEPTV`4clMEmWLykM0U|{$tDA)HX`h>?Xsp8_irwWiKa%qYpk#&6ngR2?=2;^Vr_f
zO&y7kPX+?1BCqozrnT*G4ghM}c{yT{+L)l)A-GPn+VD+N=PGj#iw4(xQi|oMx0$q5
zx<P}4w;@%@>`6KtjJoI7Dv3$2sJ8^bL0K$4XZnDWJD)S{+mU}&ipER3Fv)LoUuot`
z2Ha0pZv}XiPKI@B^?f|%deslpPu;JGV<?XDG?ZWTPk^)-j;Gx}z7W4zZbu0lVw#QK
z_>!X8qmm*B4jLx`--d<2HVq=ybz>TmnS)+P=8j6*5#8s>@16o<nl_5RAy)h9`4G)~
zDK`MV7)lfrQ-OkSSL@q!zJ=+R?-xG|jUNsS56K(^XwoP@AS9`xbER@YpYDOc%q!e<
zGlZ;iwQ55IrqS)ucddKkG@nr6+C!FY$WlbcM)RG?KK=vAwIr_OGc_2QX7(&t(}6OA
zOq8f8>EOsrD3$3p;A~P<swBjK4?Fv(j5=a;NY&a2qV%q8S;VA}+ikH{roB;Pn)g5}
zOJ$F6?F@(COlcWHoCgl(<HpnSZir8`f^88QXOgr~qFH>b<ni+bafzsm^`3^2T)N&Q
z!n-$_JxzNr$5iCEWPj3#@rGnQ5sKuC<~zD<gMwCSuO6rplBn!&!O%D+q4}tzsngWa
z#>|~HRs1Th6QL6E-?4?TPHo3sO|%aqgOX7kd|#97OQ^34IbiOtm#HbE=y@%D@LZ>_
z%{9rEcm1t5?FyBuaejLz#NJ1|6#D~vqd$wApH9I#^V@nmK<Lk`&>zf)))RjvRI;8P
zeXTp3oMmst4_nw*Gz$x}+50+hS)wqIYlyp~(Hq7@*1=909&-H{!e)H*2_@9mXjGU&
z`uBH_$|?gQ*{!jtPsb)Xk6*zjeFvN-MPE2g_4u1kh@rtI^%jYC^;HJh_jjb@@9<*%
z)3#Yvbl*mJ_@Uw9k2a*8@$mbA2Y-KgKt+dWx!$8>6WFkAGzcu9B;HQ)8)0In{{bsk
z)U>LB0%F2BO>omsm_U=ZfBL&yF+C||l(5V2qvi3%a05~L4Xc(HWEb-l_VC3GLp>7h
z4}b;tOfS$geItCFZ1}ieG>ZT{wjxicFRwqx^nP^#FCD-d@UoOgwX38};$@UJ!;5&C
zru^(e*FxVL;bq}Q)G`!mxuV7j7+$u9V6O1qD)mod8UPFAuz&iyN$Z4{wBM`tWOIa2
zQ)Q@OQ_uUk$R{Fr;Sl}IYKT$u7ole64xr{Pp5SXRY)H)0$%A5wbmh2``9H0I4<RAo
zW4-!EV@lctJ_3N9Q1YaT0DAv#MoFUmyQXS9=$()h9V6#ave?=TD5(}o_@}>XUh7dZ
z+v>NOG*~F9Hk4EgN|p2|F(5ojs{MR|S{@ZjZr&c0+;1oe^iYDA`-ILZYCx#zJdds_
z;73Ra`01if(z22^i64Wu6MmMfdhpZmpYX$_EWq)gcm68Dk5dIP$*?#X%$aZSbK<8S
zKdY^Nn@xtFTEkDRpj1hZpXJKr@q;;4{098!0Is`qJMg0exP|#*`0?|}@Z-Dm4fqjK
z0)FtvF#ITK6Zk=dO6BRVZW`R6Uq#$l_(F8Jbge1U8C*|Zw}y`z9S^OF=A&j{c&mzD
z7tE(Hrm?c{{rDkP5t2E-A>giJ4W$<mrkfaEG;!Up!;7~0-Fe}qS}$jW7YR6be0ZtU
z%MiT)+H*{?xXEB)Uyy$_PO(VOGMZdfSikF2Z13>viSZw(&y587>ZDJIrq~AG?f0El
z7FgCxRu=udbsCP`X=U+TB7G871}nm5R~A1&uxxT=G3r08EMn)Gu(<dPU6gf&{MI`F
zT5F4Y%_~@Ii|12>my?G2H%o%{wX`7Os9#y2VavlpM5RA>sTL||HXXe9qkOsDx4ah(
zAm^hu`qc%iii3V3-Qz1P5?X46^~Ii+I124-WUV{o2~}0}5s?D;IQDZ<o7$5a{ZEEZ
zvr)|%lz-hyAJ%c6wbK0rSW?-)dA;DDZbBU@cQ<{}Y0_uTP9-jwkGl$SdsD2eDs@j<
zqK~gi$C4Jmuear@n8+nMber8Xk$x91FcqJte_oLCB(7EJ45>@SiX~gncvmO;thdl_
zaCofs--2<n4A?i;gLcE{eUonpb%G3$jr|oi-y5@iv>y4T9K_U8V%E{MYv*)Lq#xv^
z?Ozp%_7{1Oi%o4=%Z`azN7i<@t{m+n`VG4?p;`u3WjCC_Obxg~mWjzGW{rURV4Vz`
z>Fnn$exJUkmjHNnYm~+mtHiCJs~~T(%!5Q+DhmhJ=QCk`slLqlYztUgVgg<YMh?qK
z)SvVtI;82>_w|S+Joi0ikFQ$Wt!vkoJv`8f_8*hX;T=I84dL12#l4;L6KCAGRHmab
zY)LhYPO>GXttjRU0iOW=SeD|?H77$HNuUmKB;zfqV2?`7YMTN?iyh4nGvJ6sdV>JW
z4Ls&2knMYW#O*4YB6Wv2lhK=OUwOS$aV8fw&u$!WHgP6XxpAZ2pcxxBthHJhj~Ub!
z_oxl777y6kBWt@R+RwC<<AKg&EZRz)*0~d@#!nsxp<+qyjSXJ!qo8X2F0P~=fZ190
zVU|EWWAlO8Kkvpz%UZ<G>Sf4Mb&KhgEGQbcBzgRN@u!jng|&+--Vu;AA!lZ5g6Ocy
zGzznnZ|;_A>%l7EWTIdVixuWX`xlTciLwkPbBCe*0B^E>ee<qb0$j6F^HAQ!EeFR-
z0G2zt$1;(Am1PY+pht8`v};>3n#4Z^e(Loat!oxH6Uve$O=r1%zsYUpx``yT`xFNq
ze&iNpg<_)(iO`W+%d%zJi9Y&4AKJee>51g&M0#vDp~*tJa)hiiq5LzRkF1bm4bimz
z;FHWa1hrYwTfk1Th@E&jaiUfUrW+rRKC}Z!(jk3%2B`*>`MeK>DG7bAGMiplP?=es
zX*r>KFeg}5Cf*^TKZ=Ahueck(1q!e8D1?4I{=9rgFZ|j%YFf~HxOz8WaGYl4$mSoq
z@xSea!*QF!Ars7SIIe)hXNow4!)%PhDM6tr#o=i2iJ1_WXt{vH(VM}cPF&J)rpn0$
z9F8jBkmUZZ_Xi$_m^DmM8q-O)W;r4ps-BIt>M5Yj{b)-NIFY9?uT5~YZ;eMvv?u=r
zexcZpl~GTYww^AqL{&xqE`Be?d#hRlQz%v2g!KtQWtic@{Yrft7Zj?=!UE_Y*L;TS
znkhlQUn^~D1c?8~2u$^UuEy;k<i}x4fsl9n+8~5}4c3{S;yA>skvM)wVJKpixf4Yv
z+K;j(!bb+Ax;P+HYg(-zfv)+|-%Wkrk4ckwfUN^dG<#*&m`n}Yxsi6z>>+L4B_`7D
z=)oA&+Q^`1;4i5q&+*0@FnQpay9vF%xDFI<1DdZh@H|eNf9$TC)Ctr2x+zTe3z)Xj
zF{ZD8?FCHlunA1_O=bhs6f2e6=7T?pcfaMDndi}`)|Xb=BcI8Y-F*Dw=zOXT<0QGl
zSzX!hBwJRRY<c&O6dz5RdwrsSpDBG7Ba2*`mp5j!5~+&yob9Gy-<nwaN=QJ07^dZ~
zafBBbB+%8&lgbMpA`tMv+kb?>`xI-8wQn}VMr^avd8C{!8dfcM(Pt|OUR#yuIjnMc
zjxc_iC8_N8gcOhWcS_@m_aLr#BK;IF4F{55@L@A9(&Umjw0MukOqsbcxTCw%p|-oF
zx|b2FZBXl3HMOe<4nkDtgIO<dc#U&w&HIFaOdK?U5b3Ng5_!wrT&}{FrR7aDqeQw~
z_L;AXz8`4Zyy#<p3;A(sW1p241Jlm5Eufx&|4!rbAx(7}mxDTw%kVoHmnzcY^_5J_
z6;u`k+-I!(qd)Oa{9SgY=t1~~d#f9tj$viAsdlwt->KchC{jWXFWo_5dChG^NB3-8
z?mU)OCFO2bxskTq@_g<Ro};VyCmu&w1nPaeLIp-F#Nu=YGLc?S?WKdshA7!m7Vdrf
zWX|7<sxX-{fT#RGHD-H6mE;odQN_DbXN12}!S3gi4^#39#rE8q3d*R_F%X6~h0jFP
z!Q$n@)$2MQbG;5`mkv|-Id^!)=?O?F?gw^o9V>nI9nC+d`8@_ZxL;vKwQDH%-`c^u
zM|&xI=ILl>;A_lSH>2HocK0%d@#1jj_@P?5+MQb~>jw7$iz#iwzhjH4Rqi}d`$nsn
z^+SA~j*urw9#uw1wI)3uwNF*{4ar)72B+Hv`NNXA;4U{orFoHLbuHm#x?V<wmuY%A
zH8k?JW)8K#&ToJ`UX+>F8=T?ElgR%%L=e75*!2s~FpT&)Z_%*@))g@h3h+(RZ0oa9
zi<aMOg?3t3R$Y^dCVCYp+`mR>>~ksxtxeb51*8ov`v|@pR~wvzHcT}ei@(i$XSPu)
z6f}Q|dj{2+Ant|v#RrhZw`hD>xUVl>pJV^ed_i2<q?##NoU@<L%6!83cx8^E9yVy~
zbO=0DVPy~7^ve8MzkK^oEAxjKxm51b|IMBsCiy1mJ+&D5N8q~E;C@O+wdaSX$ethC
zh!*BIn-)Hth56sP9oy#143kh{VU8KGHy0jh%a3hw?fLZvwO!o144<iHVNovSJy-Y3
zTXf+X^NoejHa8#_q}5mXy}!{YH24sNm3ccKwe#n0Voyq&ezmsg_c%pk{b*9F>Y)pG
zybT)xXkqhjI8sELf5YoPZP_mW?O4BRTR_Un=H2lu!zXD&b7B7v*RGWR__h5%NO$A?
zKjr?~{vV0O{Xbsh=2Ow7d3ym~VWDgb^rL;i-Vl)X0jDs#Z43;rpU@qVaeOnB@i=C(
z*%NZ?2lm}pI3-%MHiG&*>6iF0!-*i&!YZXNr;64yYriB1xmv3FeyBC%&bI$aq?xu3
zjC)#OpSKNx3s$$}ler_kpMVA7&RtSo=Kc*)6N^u7TZ#0i!f>veBcZDGm|6z8vulc0
z`OK73M++S<X&YDFH8E)n6PSx3aV|Nkl6=I^`>5^7u9J_d%|ED#dK4Qq{&309x1dQi
z_bRzOC*IGPhjHTUANZQ<b>{*4seAKg&xz{*)8rRqkNM_lH+Y$t82gc_buI8=KcAvO
z2p%XM#a-dR&qtwSwFRYYi>`;tpuF0TOy!2KAK4>1gzWB62GF~UOw;&edzIZrFSB*W
z>xTtEwtUJob(?*#Ob<7si)T~u=4H3zq5d8T3ev~znNs4cuaF6Tw{!&G^4k{qT4D2Y
zSj^WB3J=K~GfA{jl|L*bsiJ$0yED_=C$dCAURCaWvq`OTzfvQbDe>-QSwoy<QOI%}
zS&FRD@aOqs-?baAm4ajvVUgQkN=Rn8_@8}`{dwgdII<IC1Eb}AK*xqDtqh*cMjOgH
zm_%c?slr`xNzC=W=IMY69&a3LOi&_w7d#z?il)6+(`N0Fb405D|2!R7)&*mv$x9g{
z36d{{=eb*FF9!v3u~Utmy&Posi(L%eCm6ytJqX}9aPsxR$w84rD+>(oF2o(u%Coz1
z1f|#znB8^u0cLmK|AY~Xhw$*j#r@91;mC80(>2DT3PbXIkW|Vs{;Ux!<ro{PRo`%W
z3qQbMX1Y#ews#|(76<FUr}`{P&DlUG=fjBiHh}@`%e4rzeptJ9E*7TSdGC0?tGCW2
zEt<%!BsJ!0-sFd1LmTqjE1v!$&a2|O)<&AzIXb&__0idG>yz2OTBjZK#aFmgH7_lI
zk$D<@x{Odo3MM$`a}0Sej_KBzJ!omNez%&IR^c#NVp6)m2mvM!pBdT7sFW5I^!JBV
ziCH`B0+`W>a2#_($(>eH>jjTIxZ}W3J7`Ki3p|{0El2xpRrw@CR4w1Xd2AX$%=2$j
zH*`*p0Gh|@S+aYgaRTLx!|2wUliJG9s7*}Xmj2yp!sZATGzg<Y^_xyZZ-?|<8~qkF
z)g42T{Bm)?K|Do`N;?N$Xd|j+mgj&qHqyocGgv0!fVYeN=jvW(;U`w`T{LLzZ<rqX
zu^aqa;D25Z4N=s^(Ee*Zl#uB)Vu1CZe-&$9M!@-90td?PWgI6a&f5m4$qnx3G8@vL
zkAzx~u`!aKT2DIPU*LOOQ$d-%$M_>NSxW4lripHFJB1o=4JBgcdhIbV(Eq`nqUvTL
zBUJb|CDlF>*MC>6ua)vI?VltwpQ5F`C@l>gr*klEMp`$Yq~+Dr$khOi^@k+VTkzr*
zj*Yn({uYf`*jc#dVqgDPG$SifLi#7;^9#%|`4}|Tf0Rgzl?C@=uBzI!)7DxB6W{qx
z3Y=+)qc^*?LVHbW7zOTR6u7Ys?l$9xMFr0InL;)470}S=l4T!BK1<eqk92sNxj$xE
z7o)`u^IBY0QH$FXSyj~HL=4j6?wVSn#r*@B1`(9%a8-e=&9P6T`@5xoR5c}%>705c
z5M5J){#%iaI%06?0KbT82bxZ0AD8lzlJe7UH@!>w>A7}@Ox*LBT<CIDcAZ6^zbD~i
z{=O63fA?&_>h)st{#<`H$Mz1g$1|UsUuLDhJ~2URMk4(-|FUYUf3f9N?XAq!KhLK!
z2XQ&fd3eb)eYr%0an0}N^$flwb3-aq@x-<}2n0&=LCOsropqynAeU~094%p6Q^X?k
z&liQTb%72heHWUK6Zo>T*1wq0op0!pT!m7<=pyWHbVjMei>YJZZ{DNtlKhfleaaP~
z!NluS>;yzOZ&!p&3w{^kHWa$yJ87GzI=mrO>a&+Lu~thx8rx7Lb8rzoVEWWBbKV-B
z`DN0$ZK*yrwxKa|(90O|QMt<Oa&-4w@ksL2d&YK*4Yx72nUm>WuGdqNQn|XWQ$1t*
z(YmiTxW8h`is!O?!fqbMESzaGewBMngboL2C1;!|+^f#QYK5EprDsG{?s-$dIKQm5
z4JD4H1huV7Qrl0dEiV0f{IG3!Fj~*|>_oNqf6w!~Ux&)ds4VD5sm!{*YYUe|5UXFa
z3u8RT7_!~=)sjdy>2ahYOzOb^9*U@snBVP;5yG|QGHvQDCa3<)IL7-@|Jo(q;N!YF
zHaM>P7WHMzcfN+7T;1_0z4doT3eRy^4h&RktwfMk?yGEwD5B=`rGzFuy=CUXQL5sG
zV?auG8U=6v0whNPN&DH_@X4M=C|p_d0Jx(B;F&xpvu&;Vp=gt}!;0XP#6!oNo_Ofo
z(=fSeo!RkzkLeAG^xnwJ*3a=RaLKmCgsznWY)q>jKh!b35M$NJ1G%_EQE0XKw|Waz
z4@lNFTbfOpalkN&U;XKMbjkC$lIPa&OqR(d$y18S*OesCjGq}qEnlstAx#LGrZgD`
zXXs(hyLe`C-Uae#WE=#Jm<tDKFt^N+aYj(VW;z-(a~nB+yOa4TbR`NgTF#t`WM<HM
z9C}+mkZ}`F)CX$>d1Gdgzw2nB8;7fx-F1{I`#`2hH7|T7divHBYgZlAcCm^QrWZA9
zL1N-IdTCp5L~_oD-D5B>=8$V+_Rbo0y^*$55YfYituTOTxv|g3!|Ok3-c@e<nMDj3
z27(=G+?fEu@YKR^YErr8UR^Rz_k9ZS*|KeZDE(=h!#eQ&vgSjFLvz9zV|94)#pS8J
zI+9y1O0HX++^0MJBX9@X(cRlVJEG;eRNH*|zp#R8DA_`k@CWVCCw@=8sX(_ru><s=
zbr-?GRb}o1?m`d-stwc`9{92kOGlL<Q`g9(N)+Nw=pt3!6M<EWh)|D-pE7jLkiPFE
zGu>*|9(4*D@Er!VcZ5+a2Tckh>vz%eF>KXqpRm6sj>6vv3IV9$)qL;JFG24$280ci
zw`Jzw_@V*JL}mkSs)vTW0*>=k?60<c&ZcoY54j`D0=S09%nO4aBU2ZUDKzfYy>UQI
zO@io`B<bvm<_U3Rm?n+&%M%m#Azjucg=#J%APq}K8tAbXxka&GRjZiYyw15p_S)R%
zt6)#Ix5}l|2kI+DbSCaX7>xNTvn88IzfN8YviP~pL+pYA0K-ZM5})V0XnlI;<aa`H
zYe+_*`%cS|`qq^i&c|%Zs4Dv??CIkK`*>rky3X;d_6Jy`syj_IGsAmmP?eZYBJC9R
zu>R9Td&`ai%0uK%g6wYMI}bj{Xw;twl-{0nOm^@J*-OB5r(rA%-!s<D@D(aeW`?d8
zh1xg@lZTKYF>82L=I?F)Dr;MTv1BuY`*Skd;3}kSad0y)*zjstVT@aKPp(^%+-Jvz
z^oC?&FpRQ$s%?Im4Rw1j#Jh-=Q3-CUFbdn~OgwZ{6Q=<+@RM$!$BhF}vJw-H??Ioz
ztL+(q*IkpjP!KL#>l`P7q<poX%;1PY!;ae@5NprD1%xlndRE7ikJjjUoHNH(3zz(f
z9j^C}_S}UL)1T5SN}emjb0YnokO>T`0cmiW!X<MTnanHV`;ECf{mJs2A5gaz6+y8z
zL^YswGCy*UES)>d$Z3}Rm^o6zA8)ei86v^P`<*ocOupI5C{Cc}<yNV5;%p06p;T}t
z`rKr#DTJJwaciy?(m6r)KJ%kMCiG!sDAilP^thH^{<G))dI--A;h8K+d(@RpK3+nc
zMOEp~O{6hd%d1IHqwrog0ft*ZOlHM~6X3(BJKXBit;W6kq$b0ch~yQ!QpAQR(S8nS
zmVv<7rh3J)6JxpieqQqoh1}hPxm=VnQw<d-?~|G>#mSpUr?eF$?~3*#cybfpXX#qU
z(-e2N&>`ZvW<Rhz#pY3rp0z`&(9~zD?11uJeg{_D@8BhRj;R~g-*!yp3H3XAM)iS_
z>-5H7GB=c2=Gk}jPd=JnpBu6vy@V;Tz-tIn4iXbzWCtvnE%JtIc#oFR%9Ld8F&js>
z&#olRrWQZnlewuLyL=x-gYL{j`TpI)cP-GTaeg;!zIE9gTtENZ!{x>(^HwzJ>vw1B
zeY?VY3`gIm-1Y2`corTFqV(qA=?^@wv)LHFYthz2POaH7qag{FGG8_KbkEU>zI^zp
zzFa)oUGj_`<a(dOBf%d6%FJ~N1Sys=TK6B=zf)^k!Lb~&5`C`lzFvK%)+{GY)}QvC
znxWYdh!d}c%7==Tu>nwi+!cIOP(Mg7nnKeh>koKre=6w`y`%Igm_DKojU8+S4w88W
zEl=#_1O%JHmWsW=uq90jqO-trKZAP<0A=!W6Q0YIv-g<yiiK^K*WN)5#%mYRK#|wZ
z7K!226^l?cVp9NWusH|W`MDPw)9?#Y{zQ_>>n>!O6|D^8pO|&bDIHNaSbt*H9EHuV
zOtg)s-{F1%ZH~P7R%)M<y0jeZ_3bt->((dQenKX>`e)`?*0z74oqg5aGN1Tqm*!Uj
zyNkIWd@S&-!2d{i&=?rr4w^ZE->FB_c-1WSGru#t6UJs+Z#+Bv8vFCiR-wJ7s2awk
zgY1d)7dxrawDD)tyxSIzLKa@9G)*L)r|EgLf1a-AasGLx_?@2h7b~lYo)^rf4>*4C
zA<^@)Csn{8&kS9v=jkd;!PZca$I0PQ$ttEl9@TyQDOokxb7tsr)iRTJrK`4(?vy?|
zr0aV~U#;{Gr3Vo&JVg&O&~0zwUAQ%ywb4384VShg@leTV`g=^4a^fn`sKo1C?-doQ
z`n{+Y&RfW0!O)azdsJD|uEf_yV5$wGb_6(|Y_o%8R`W^92T2YgN`kc<(Yd|P-W7L;
z8u%u*Q*B*|!Sig-J&47rP!W+EI#+QG=SM7b;RV2}exl9z#>CQFwt<}=Gg;KIuOJfx
z+KmU%=x8+tX=<3!q8sFj_Mj1ld@B2_*0C1^7YdDf`Os0PdmAv<5=(z-te+9I<xg0n
z#%;1ahqZc$#iv&N(y}(1@yxr--$kSv_eA=4l(uRAGaic5{y(c&3sJG8GNlzzRfayP
z&0DbQG`(2_R8V>{3%Dl493>O&U&+qeb}2WNq_;Q5Iiyl-LV2UP($+c=#AJ!z@Y#@e
z7nj-m#}Vs_*&td96_!$Zsfg6BP25ml8W2@|xMHKGUaF!gyN4s4s1(A_LViw~z8$H+
zD1d#@-tK`=behOWLwrPlQke?MN$7^@A%V0`!G5ga@uV=-2472l^Gwz)GAu^tB-%!z
zd&eW3%yfG?Zr{sw%1>b^15+!G?E3`Pw~@UE=!+w(X}mE**HkySio9H)cOFE~5!@eT
zJYin;b~o}QVHIuo1axLG(IjRyoHCt>|2G@i@}IU-M=boV!V%T@`Q`>C17&!<#ZSP*
ztfyjCz>iRtjiacIT(KSyCMx=gA)3pNn8Ty!i_iZkClRC1VLu^%w%44r0P2b~_5+~s
zq3PvdH<dYwQ11)yR|xYmFa!4x7WsIxMw3++LWx?0oj|GRKfb*imxCFoA3oGRu7tw;
z*L%ckAddERB97AOVY8wAV=6cN=#XCT#H43wBDzrSF07};n4%j=fq!C2wAG#w4Mb&3
ztDp$16=cN=<CKYws{h0V!~mg{<}pP&`A-}`3n<-&(^$VcG4b(_Im(;)G&Sel^2ROB
z<?1X=@b=@i%Y+ME+pT<#(Z$Ne{RD$g^f8K-#(Q)0a51I7ksdB)?;~m<*T(b^69J*Q
zHM9SM^Ls>#X7d9dTJJWW-I!Su)5XOjN)-Axg~05^Y~vLc=&$*tY9+zQIGM{Ld-r9U
z2mXS=^c2f62Ie=<0`qjy$NGHB9R*WN2RVW%=IwuqP7bTVKDm%V4?N}{oR6?c<SmzH
z<@v&#UQ9V19+-|!`3QEA_z*#&{Yc9ZAB5oYNVcX&)J6xA9P}n&;l6V3=OrHl-(R=p
zO|UNqs__1{-go2Oy{e~!p}FZqp{|?pT_DPp->;v<;MIz$Rx4QpJ+8J%woE43{_YO#
zT50-Y%fnIAU8On0lFW|0=X(7{<ufbkyh~|@Fl|Ptn#^n7PNqRZ2F!C`(0U`4FIMHb
zUj6zhligO=Q{|B5R*v#bR(`Pd&q6q8OHnYjiBw{moI%R$2w8g<vRaC=o*J@RXQ;J{
zvc?tKi*nV4T)yk+nubtevXIqMl=WMd6+{erj7&a$elo*#?0!75tk*OD4A--)8=N_-
zrgl*7yc!d5!j*f(Na_khT)3kpi95xvm)pGmKXyOMGncXK{xCNL{kvOyKZ{J${U@`#
z^)UngT3=zF^Rd<+Ddf3ce>=!V!%Y<NLYE_j21JjBL@U5s(1Qv?MBK<~IU|IT5^h$$
z?!Oxv`JVLO3%Gked>0~0Vux1iRpsxs@8DL`qxpx)Y_}T03@027@ho$DV4Ur{#Qa$G
znn06G=0^2M4wzw}CMJB&h<p=(!>xgC+~figkFIpzej8UMwhyd#(>NaCn`~`WJq)S|
z`oBV|skVP}+tY2>0T|#~yH(*wyo6g7hUhV|KO;D?mPf3_7N|A(_%WwU?EmtD^ZVs@
z3+sFCSGcI3zgyv={^LJanTrm?t=F!4dM~rIZC<zJocFilcBfnMV;Z|Sg_SG%GvY8e
zlH=h#-)PTf{fZPNGB=W738YJ88m=A&ncB(2P4fVCVwoHF2K~w`dl;1znKTz30M_c3
zeMeu>I$Z@wX~gn7La$A!+$mif>#y(#op^=qt@z2Ds*e&AZ!u$=|IDpTiME^)GEb+7
z$=&CDqR^!hLtoGZ@RP0)ugmpgQ*)L3*<0XQ`#yT<|ElmDs>q%v|7}lVa+L1#X0i{y
z4-2q9W73$bCMNY$O=ZR;cB7(B1Dr@7W)PK7;V9~l_ET!E?yG}+n}h$Qa!{5I*RY|g
z12>X~iCH60X?wfOeF7oKnj}44L4PMsq2pS|h&NeouX5bv)i}hNa_Y}zgkC9xpKVWB
zG@(*u(CW*1O)@s;b<zu{NwVE)`VMOzanSwEjQ^Mi20%?sLo8v_>`~@^Ee4|YPC@sR
zl7=@7TM@n<cj^VAgRf8Bw@bO}CZ_Ipm-@ENer1~9ZCJ9jeR#e7BEjaBx#xs_!bP>&
zM9bB2A&w2{yTBAJwt*;~vWicY^}}j|-eCOEJvv_SVK;n0QxT|}gmEx*oql;YJcAV}
zHA0NA72{gZtxDvN$I15S?S@lAva+Zu{kdalEVdDPyWxB8X!xD>X`w_UDIL^2cMm=Y
zSSGI9+YrF0iK801Ftfc;lO>qPB?z0MyRfDC*$p{)$0IWinF@@(mbJE;zT=fJN-LO;
z>DM{Fo2wgtkjK_TVz0Tnmhd{=xUF^-Ktq3Emz}v*zx1Vd=!*<=dX{a3WPR43%(tQ+
zrKzY-)5&0Mw`xMyy6N;-^m3`0y?s72t~BNbex)4^TiPBWCY-&=ZwoOY)dJ@b1@|79
zGBt5GNL1toE~Jcm`(;0T(+voJ+uMU+E*?Tnu+agq!tg0f+&rzV+O{Y&Bd3mDN?5oP
z#e2b>NL?F)f;X+4>Z>>!f1>sNoK3BA-LcSfdR5zsH)=LuCs%>Hu+cYW-cvnl(j8|c
z=3)D(?Du?_sfxg}&8c8tce6gA62eWbZM~b%M4bFw$e?NE)}OYstOQF2rMa|9@M7(Z
zwc;rBHXGg2Uu=2`HkqUi3eF~`;LLvr3N5<!3$+WmZf9P^6~?PK>gJi?rAXD1-1JM+
zh`zg1H`Dmr;!?48wTZ^ROXHgDcbhFd94*JJAoB#pL}Ts@AP@llzEe*~NE(q(`y9z1
z5Hd|5pG_rwmpqJ36I|Eb*2k02>X!h4%r+M;MmUM4?xs$65zGhA(vvSDKcu&q#**1X
zt+QVtwcLK&`^v?!-IfL+XKIy;kzB4{U8%L>a^XEBNS!$2MYvKIr509B8w66Y-ISV8
zJvqdCmFRF$hLwv6i?Hv&5D3pgglJm@+-0)+SId@++B`xuldHQSNZns)7YP>8gy=P0
zV-6%<M9LL#P6_ANHRvFvWrQ0omXYoCgBF1pB%)!Vd5EjoPJP=*Iaa6lTIM_5WUbTb
zUn~Kpt(2Op+Y!vGk)br5ztw#QH0K{KP(Pz(%+_0<db`nt%8LH5ijs`W7*iU;JYmq=
z%NStqvT~h9OH*fo`|@XMr`L{OkwX0M3)S2z_h<SP`x6Gf1RJIE$weJ{h<?k$nGe^2
zOZ|1)*%%}M0Iije><(Paj!bl3q_o%)&<X5X_wZii_?B|CcX;!1V#s2yXiiLcT{zKZ
zKWq@A>>_syOpD{p+U6c1D$ro2GTkaPPPL6(L6x3H5!?_^=EM3w_=(FGA#?P4^0a)A
zJW#pII^;mEW9%JrXTW+&DqakB#}Ayq?{m{?qnL1_51Ll8&qDVMQWE<vZ;jUI7J!xr
zjtY&?ml#U)Q{lj@Qtguvo`m4#4hkMh@Ny<jKheIJTjUR0T^Li^>OH*AyP8>Lnr_r+
z99q-%Q|6;o1Ceg5qa2s3S<&~yK-+#21hducvUlZzKvEa-yO>>9ers(D%G?CL+nv(Y
zI#LU1sx(oGK0xZsD2T8A@eng*OnkL3ks2fG%u079^OLRGyFFPAGKpfIP=CVH3$l9R
z3nQ!b7idgYxDimc_+uqY00_H_OF-5sTu3tWqLJG0fog18j(x#rtaWd)LQv5<F%XIG
zjrYc*mq4{!3n0dA7NTi-X#Qf@Ga|3@(De1t`~ql1<a23+c)J^I&Bg;EM-ed$hQp1#
zkfBPy4Fn~VA)X3jIFa_KH1nt@2DV4Bsdb5Z=m|pX5`D=s-Hl^9MPFj+<Pg6?+cai6
zqE@ln*_74a<uQRryO8Yfh?qx7UG~ep5Ibmw4P3$x-d5S?>1syzMU3_7kNEU4q`PDP
z{?&#~JC^c^XdeRT(dWwwj{T|3%lSRMW7xb@=B15M5KRJdjfKYB9kY|z=vUnm?Gmkw
zg(z>P9s}mwUC#j#gyyMF2-_b@h=#pI2dHHld0j8r)k^b-4!*uGzEbN`vPn*u?xSqA
z?iG@0IhlEr1COo`p3Rb{gMf^yd&`;!^KaMYUuaNw2Nt*f&m3`Th?{H7yp8f|BN>An
zcfo*gTY|LwH7dRAS(;Y><Ff}cY(EhN1qZa~H?J1G>$XwezmV5mMiy6Zl(x#vp`O_9
zu5g}f8@EgNd^?}rHsR&!Lf(1#hA<TQ>VkmSgnU22V3Tt36iXc;)|89K!i%sBe%y@k
zZj$y^y)d&klC*wNZ(>(c_OoZmMUTpW$<Lj!ny6ktym5v`vDc?q<{-I)&s8g{+(d}Y
zXKgHIz29ej+GpJ&WIdIvaR<BWA<tN>at}&u(g~aI>$EE2#gwu4c`-wmFNHUj;^_Go
zrt{+b4$3OQ=kD79;Igs?tUv@qqBo1ycBVh>PN@!SnM1xHQA%{)E>y$9ba9>oznfR(
z-hCQ~vJw-X3Dmk10ISh!QyPT9dWA6^#&4=^C5{9WZ5_SJ-Jqg`8?4rBS>>`I$n7X-
z#>u{#`BLXb!W4??nC>1xKzgx|yoZ=WZr~KEa9Q*#p3{21oac}|x&#Eo&)3s!|2!|5
zc_~t4OaF=igk?l!v};M`D@&fIQIN`yQ{{UXDi>3TPQ~aM*RL?X;W<AdPTW<(IX5uB
zCuQG*k;myL6*EVdt0Uil=F6TEnjiBPONf4Bd(-LRd%@n6NFQWXBKoa)$oEk4L^~D{
z>F$EDN9zmVt2c$|k_)!Y0<Vh3pvq_<$NWj49)`UNLuYhPp*}-KKQj6L+3U~fVxe2G
zE7RR#n`A+eyX{HeIRl<t263FL0RF`x*J{;Ct}g0VoXcJ`M^6g5w3Je=2A?aDK1#M}
zG$Bh+&I8FAecMt?$UD)leF^n5tMyjfhY;)nIwj>YS|kdqo3pzo*w@6TeukD4{k1TC
zVjK!>F%xxzR1>g>)x@N0OgkEitlI>B66veeB+xd+gLf%yN0)D$dX7?$Lomi1qR{4Z
zw({Oon@doy!VpE1sNS=udnB7YC)@5+)z|C5HP+j+&IXZ8&&QyiP)t(?I*#wZ@FihB
zZt0IyGS*&uMIU=<l&jm*DuGM5mO?cJCL^5X`xMY1mYzsoV3;XEz$FXm>PS)<n;40O
z-d@R26dINa7kz-nPz2ZCVgf?}bdBv9%AZAIrZOM8+ikI5B9^1cg~AJ>rA2fJ&LFSm
z<p&AnwnmD@`;S6BUmxZ(s82X_Qb}mGw>#MKh9&l1yhQubwE%EFc%{$FKkZY*^YX#b
z-R9MKs*v_)G8f!iTEOr!T`w<%7cDB>ND(YtLo%SCr}z1ajh{sNHrjLhFQDaXL(Ar-
z{E}~nXOy4lAXCEDrBGonxgFhTH{LzHTxzn=w8pna8ohiWTVAfN-&vksuAuDzy=wZb
zfJXEl=~eMTAbk``x?Q28=tlulkBg|#_j^Yj|6hCW0^UY(^^K1%>r0$C20}<epb8|#
zfy9yId$`%L<=BbisIdd#(ynDmwnc17mE<G_^45?LZV8xj3wKj44U~(aTmqDmmQq5?
z*O~%f1C-VjXu}IFp`|q3tnYVbW~G(9n-JQM|MUL8?qm6^&Yn4Q=FFLM&dkngaZOY?
zK~zbG7U5N<3~c~aqgzq?BYvu#3~eIn7v+9KaYmSSr4ZFCM7@|3zrXtpm==lpPBg=r
zX&#(3u34I9dGB0<Z32RQJk5UBsaz8uyw5dptYm+_WdH6q*g!U;-7g_sd8aYOZI0kZ
zjl6_&OEvNaa21Vwp+jrrF<_0*vKRM!cVCJ`fw$uBv`0@F4O!n=lo*knzzyxb7X$d6
z_fW0qqo?25imnAK4{qn~u92)TVBS6VsUpzpSEqKL$Ab+`Pp}&M%<l84Ul8!II`jVS
zXC{EnJ_6apKP4d!z3(ps%75z<#-hRwr643B&vgnQVdJG(2-|HTSry19Nz#LGpNZfS
zstO{-Am6(WYwY2>zMdXx?>va<l<3DFP~;Fl0#S4d@5i}+<wABqNOBsF|4b7S_C+Qv
zlO%E~?;cDw<#A2)!2K{tOJzPncfRuoOjx7^8{VVG8S|coM*nmCM+5)Sz<)IG9}WE9
z(ZKRG!B}TcWUXUucQEYe3rBta)uC{gqdvGM<PSE51HpB?;!t>vuO}36_+njsy}@w8
zk%%~g>-^oma92>!3i%eSjl@>R!6mrB+us(6`MX;~y}DCD>t2wcOKKz<499|93;gkz
zf5Eaypsy!b;*Unbs>2r#`PFzL24NO1RF|p#NEqMx{0T?6ucyZmjwBphBB>TffD7zM
z^hbmIqA%?5`TfD3V9bZQaMxEH3nJ0P0`Lm>7W9NV7DW3K-H~u<iOaPh9!dn~Q}caY
z!MKobK`89+=?erG)V2k~3+f`_L~vcAq?<UYUBQGJ2zL7VdJ;Gfi)x^fXwaXCJ5YNl
zoS5h6=u0@dASzR{lx3#(o=CWh87Vs+Oh`Se>C=`-ILIJLZMvn6k;v+J6C{mwq7%5B
z^+L?Wnv^U<`6X%{Q$yiU!lX?an(Tp5-DtKr7b>80^iE$m!PcPdk=O|xkx0);ywgJA
zcrcbweSPc3U+WC@^q^iM$m(rh6lM`%3mq*HN2=Q+F?4ZfC>#vT$|8sA<3ZWIY)*Wk
zLsg@ZxTAP&cgWxE2*n+9&hDG*ILVRd4#nr66c3#pES{Tnm9dp!Uq?@ntQs|9Oe7qB
zAIvOe?%0>Xk7J0$%dN%{={O4wb`&2i1ao*#Th!E2?{ca2p0?ViW>3B9S+QdI3U!&M
zwY7G!r`3Tr_#?g15Go3V(VULHE=O-95Oj3%)+KzQp7>m;DXza*5JE)!nwlWN%CIo8
zU?5%HqSM+ut!izn+S<0Fsbw)**9lI2u^_BoPv5_&KM|~r#eDs0ZM>BatqfXfpiU3I
zctUF_x+&oc`-6^1rx;<3)KGWaapK!CCm-`~*8<lzV<$-EIaapd`QfJK+C|MCNBv5V
zqiwmPwq<!+qi2P~v#QSHZEITIauQ<`8bGvig$JDjb%x26^#sGkstSh@jH~M0g$~X+
zMF-XQ`Vt8uOc9*{KKrOQRUdeDz+uFDeDUsBFxo={t}oF!zlx<;u?eYDM=>Nhk<DFb
zBxH)}4E9a1HEQ-r>;%s;Z`)}ODcVVF!i3ty!U^E$V0^@feLap~EEb6^bVU0*Lg9eR
zwQyk|NOrYwA-|w@1{3hRzHk5vLBo468Wt{$1>+dEG%U#IVOl;%JV;K`kqGq$OB~Br
zG%apwscm*}dq!=H)%p6mx)bE*`ogQjk+oq*7)($Uel8eIgd*4(Kb}T+5j;*jE}5)2
zHkllqn@rN1=`O<OJe(hg&m+ajWIH||;76keK$qP<mXe7%R(QP4wRN6)$D-3nG=69;
zffK-JlLMBk07EYj@>5XYaRLsFf&(8i5FchOEC=oafyCD<Y#!&4Sg0#R8us}Up*6vD
zL-`oc0*sDNO2M2}b}3mvM{iPZiVeEOyyk_DWlgQiYTN1>X^f_2MIY{u0X!ySX7(Z?
zFyfu6QNS4?xIhtcbxoPt<@c)yr13-maa;ATTj%Qtxty*FRTcf9l}g(oMU6BI6zYsc
zdR4zK8b|bp3DN-e)Qi%Lf?Jw;A>3zQ549-ai1#7p#p9hw2Kv#Zp+txqGuL$j-YOAv
zw6(T6RtNiM5wE&bLGDrMvS^z0;+zO{y}s2!q#eElvX5vC`AIC%uToM`j8P`+M=~If
zK^H{HeU_8%n4CV6B9u{A1bu=2;<>X#eOeyTi=pa=Gl?5kmbywc^7;(R?V~cSoCsVh
za;)I~+NPIRvsE;MS8v1yRq(^O-$lB`^&3<%3KQx6)(T`RKV_L}M_;H1emV|YM9_q>
zz{XOgs3+3jgk-7jn)CDZAfN{NDTy%&bPRnYL!At8F=Uy2!G&pjw=eGKMb;OJ!m-4H
zoxxZTkrDaEzc~%Su<C><_C=!+WJt&Ye3UN9q4z~cLQljO2*xs4Iog`j!$PMbEl$vg
zBW<Fx&d5TzcP%kO@&*5hE6_z}gy$?rp)Y9cp(j+*Dj#?Fa_l?@iwAo;8S*FQIMKlz
z`4c8hEXYdS;(hQNF*eV!0Hwd(=j$PNi_#_dDo@j1hr&@GYFgt`S@E&p8=vPm_Sn^H
z>1^%<M)@>vY{1dk7lxmVgi9t&U?mP|kD2iUvmFWxQC}>s4P!?<)D`w6kTp4qU30VI
z>3-<lbD7^4Ca>g&fpOOoSh!GR!;T(E7>vO$Ay)SK)~O-^m0}@A@^TR=KB7t}&XbXF
zBs^bhiI!;|#n@4fwV^~e0(ANsQZj_6IE5<Gn&3>{SR+}uQ2&j5i-9-Ph-F=xe41dS
zCZ7W?s5=r6B6LI`hwzk00|My*B8cLYc?(C)EjSoB-tq4lA-zadkUaBjndw{hh1!`K
z-?A;zRd}?t#Sx4n79kU$utZZ1n&Ih#easg|!iV&MJAQ)!M!rG}QxPz@Dh)KgHt1WO
zb|5kUX$~zjUh2jpAAlJ?NGmcQGiewP{87L`u02i_YZq1NtOH_3K}OIP3yFkB;>d9O
zXgaPj(NsmEHPp7%HZOFfh7rxnh%W5!clf)5{?#%#5Q$hR@b?6LF*S-visb@CG@g-Z
zX%f?TJG5vT38y%OgKN{)L`@Ax99lcF8_69KS~{o1?R3UVG<SsRdiyYc=?JD1L9B0(
zHOE2$k?c#y#k(h^r?lQR2^TITkI16{&zrj=J#eWDr6&|N?n}wXHP5F>7?WMd6KNqp
zMwEY3gy0q+?H`UuTNjDMm^OwPbuZQ~FxQ0oX<o}5IGAJpNe(sv9TZuOo+n3w)bDr)
z6J8YO=d3@Gi3t%yMz?SwI|0+0m?P#3#ZkaRQwz+o=twD=-26O8Nl8g+^1vOGwuFH!
zyEhc)UbdY`+f!3&!N*`+B>E__3VW5K!Z3fM{2@h_VjC}Uv_bWq5lo!da<7$Iu|nv=
z$_$)`464BZCQtI1Frgl+Tm6wfifia_nh3NXE!Te>CurukAfD(c={~9bWXJND4ojNu
z39iGHc@xsKT6g%U$UHrHcPtW)a3rQ<C63kze7CPJ5y4C}<ez{pg`LI`!i2b3vcfsd
zJm6+y!Eg74&<3nT#e9x#<TD^8X<_P8+`t6Bm?xGRNCjwCEN!78d6FYhH{X}*9-7ps
z3DGc2t89IIVZ^6RqsRbfaV@7~=2~BT0(Ml7J6aQ14_VfSg)^RRHZ`|Yt7=!ckERMP
zx7r->uf}RkPsrczaeF-DiZgG+8gEOLrv3_H_U>|?i5pF3vw5KY3bhq86I!4O1`b$=
zblt8>nSjz#o|xi7Ufp%4A1fv*mn_|o^mVs0l`LYmGoDhkJx%Yb6php+tzheN%N&|t
zWDM=A*0(Q!6;FhKrUO$NwA@;%dRl!=Ydi<mL&fKafx3P!ITyYjnbC<(r_<$hJ4>Bq
z&T?mkv(j1Rtj7G%<#M}9U1hFvSB0z6RpqL7JKZj~+g<7|bC<g-+?DPscXg?=)K%&(
zEiEl8EibJott_o7tuAwxxysySrDbJh<z*FRm1R|B)#c7|SGl{qw7jgmyu6~kvb?Ii
zy24rEs&H47R+LqgS5#C~R#a6~S2`<QmF~*Y%CgGx%8JU$%BsrhDrc3e%3W1jRaRAA
zRZ&%0RaI484H2tReKlBDqgXXi8I96?`Ttk@22Qr1ZTxe%?fAEBjGcq;d0WSl%+A<F
zw~Zx>0k;Cy0Os94mfWE*b`{_>JnG>X9!pjM1^}-DJokaIBs~Ih8(<<2Pmeq}mRvQ7
zu^B%E9`Js^;zGtsA0A77h94s72K*U3@^--8fO(IMCD&mCZW-XYc*Y^`QP3aB*j~Vk
zQ13<C$C9Ifd5?`H+fdKC$H$T*fQ?UpAK-StX?VtE7hp5s)*p=}2LQJ{IhH&D^6dQa
zSaKrj+4j^}vKx=-?F6(#t~}hqJ{NE=;7fqJaU*|_<ioy{Rip>N9>8sY1AyBBuL9f&
zxD#+UApMTlUckM8dA~+Ikh2PKHsC72Dv}=%zqHGC1I~k7(?-UUKLT6@_!i(^z#_=G
z4I9v|B6+d(?<K%#*z`FY&UPF2-QNyaOdF@rj#YT3jX$4<ZJL;Dv7LbQsN6Jcmc56*
z|7k4w9^h_3hW1X|J(jEo+zNOu!FR@zHxmB8$CA$jb_4DP+zHr>e(HV~bb!V0q2B?w
zVRynFz^cDO9_+2idw(q14!CL$>I3Zl0Q~^C9dH`9N919vQ~+?@hhxcMz}^4Ab^89v
zSkjAU*YZAvod8xHl1%Od+>Y&-4E?_D&}6b0aMcVvqYO9=8~pbI?mar0tim&PTisYY
z1YA{`Ozr^O3-}peV_7okz;5Plz<Pq^$z%d>T4geM4`5YQGWk3DUY$%99SS~xHGstn
zlgTJx;|a;+Fks$E*mnas?c`)~8th;fU_D@QE%@W-UUmaE1MXdvOl}0+UWamk>*`UC
zzBfSrY0%e_WO5zg?xo4(cEIA6WHJvMNw#{E$yI=jt&j_F)j6nV2I~DzGT9Bd{k&xI
zCBSJHK)%CqeG}G10UIyDySxFneHU`f1m9be$<2U`*n_kKFc15dr(hReF`yH0_YWW+
zVB>HyxeJgEG@l=qvsW<7x+2Tr6La$h;Q(d>3l<pMuyNQ7D=ca#oVH|g!P>k5cJhpc
z$Cl2<Y*du52V8XvY?To5Aiihf;{`rS<>f&%34HRf;{&SU2jMQlX9{o*j=?imfH&gf
z08Yy*2sez+OyDqV_`zkwndNM6GjBoe4WBPz>~O6eP&04KUW{R?&mJK~7Z$Y?=J7hn
z*?op<Fk^mD9aDfKdJRWyDF*IC;7Bi2{#04M2Kb4P2j<2Ps;>>WX~4}POmkt;CTkn0
z8FS(j#kG03)<)MF3X3*d>kFrCvMnxjL?#qYt1T>ARG8-}Xw*(w8OYwkryIU=6QKQv
zsDYQOrE;}}jt%zu!glM5355=D5)5h!c;C@0PiyXiPoQ$wqFilZ`v&_a8@)q`_@034
zL*RR^cHMw)qXFLp$|W?u=VbEb?Fau+TqnCgokg{UdEkut@RhdW-Ge%PfOb{1Ws7xD
z;k3=vmQ83&@dl;7aKK{y$%Mk<`a%bqxkze-`g#w_?F8?`sC*sDgLH$k$-ddP#Y%OL
zpza-ak0p=6In}+%>f^dE#@F|7Z3r>vnbb9CY=doSp~JdZC?Aq8E?~rG8e(7lLy(hb
zA;4y<^;^V<?4u0Vw&EIG0zXJD`pwvfur+Ef&h2l)(s<onsebQ9nT;s3gGz9Iyx-Bl
z4fe%_PV2XzXEasxpQnITf$T<<DMBpFL0hTJ7VF}ao@#}joYwC{P0(CQP4xxb-glvF
z8|2~BHLYJake!7Jr`2<XGSUgf^&ZgOOy`^qwztW~<ysA;2-0N*L^}_(qoBRdfHn@=
zCeF68fNX0k>JLD62ldl$;Vd;7B=7pdsC6i-&7{W>lsg9f`v%VKPorF6R5Mv&KagSK
zb7M)eZMK|rGR}6K)=x53Qv3AJpZ{MVZ^p-o^6#O13;N4`CHO<HmlKc6Y@R~L7JFl1
z@n)sA(77q6wy<hL?&3nP&2E`cSOpYRzNpXvQWU``Z7et!{A%Ec-ot*jiy^m;7YMi5
z7ZI;|;#E()8Zb<MZOE#&z<P?|g~kHgd&addf=>;g${ncxGvpa(Bi}KsKiw9n-1c)b
zZZ^NwP=Y$tL$+89{tv-^U&J}pBlQlQZL&*Stg-sCY;kD;s{#4#ka5>bW67gQ=SO4I
zz!voVCd&_6Y)*(*+@P~&j7H(BwxRx?;W3~;Q@?Xt;$s)~Z2LM2$$`^QaBk|nw!i^W
z2g=<Df8fQr{n;t9Y?JhzxDI?hXm)Be_m8KU4VrGylp)_*OMI>cji<0^gVnlZLSbHQ
zK?D9T5pDG%Z`%yI<x)<)ZFy3*1&?jZh1l6P)l@(+;wtdC2=dIsIq6!)2#m>Zq*~`G
zNO=oBF7i+&fZsW|3-#JRMKt0%M#z?amFsvhEMOj~Snn`u3&^MC!MDAH4Wx*U{HVeI
znZS(#cPzC9W9>2;CtGNoY^FZnM7F&l$ChK^3{wMx54T^Thy$M?$o2edW62oK?dL!)
zp|^TyY<a<Hg?Uu>M$}UUzpVGCTY;N`P1>L0y6A7~r6|Mgf%LK+*FVJd_44`_tB2YO
zd)Q>Ru7aAxAf;i#{rPUt*VArtqDM^LU|q|7Gx>Tu5{z}Yc8a{V$qHtmC!1*p{1M{S
z@$eh=r;v1l@8uM;FSDh`czTR2E*!SoR@n@P5=Mj2$wS~d`;M{XHB|Taz!QZw<ZjB@
ztZcDgX1m-<{&N>-He%lMHql%Nnza8+#e6g~9Wxq5%r8P)z6IV_Q@L}9cPixb_Be5U
zCgx9XP<!Y)_(~h)?Md6H^nW9E+o3R-D)cq_T<-uIwDxQS&+nkljbwWpsLqj$_E3Ly
zV}6oAe_e-jVVk=C>Pqz&xf|{?c7f+xpnnA4?W&Md+8<=tjF<`gYa;uzsxWAbjkNs{
z52|w}>ZE7v&n3C$3LcxSEh4@}t-DdH2yYm>H1255G>ivAF2p(6=_cyu4HOHl0nh@2
z{Gh1-NRiZ#UmQRk6EXjK2IuxSF=^3!lg!PmeJHt@2aLr9-`4yTQb^FBi~5F8b}@}F
z-Ve*o?5e)dYrWdgs$k*GD4qvix8q?X`U!KpD&)}jYdWs+ScJG{eZjC-kw4Iwm^Kmq
z1oO!{oQpBLo{txjyA0R1;@ZC|H}M((uN(2q+gYRoZ>BBs_?aGCl;*x+sDq{gu1}h?
z?!cV)U*$#PeieAVg~voMq;YZxyj0dWX-LI(K2DYt?y%Tq7>^TJml*dwSW9rcgI|us
zcVQ=s3u~-rOeidZyMg6Xy(6f1Gic_b6YMWz@|;!gxbeBeW?MDMuqXLgB408a^>$m5
z$v(1;jiN1^$zMtvMOX;y(|ouRo<p9Aa>G{qMk{#OY2wbu1j&3lWX4#)JcDH32)agF
zGWi1OYPsNV5Z`Og8W(x2mt>)k`6AcAXBX9tbtZ}<_WQw;$3etAnk&^<cW5K2nXY4^
zwk<cAd<-$#eub$2aw}rrWj5pzTkK8LS4gsD9=5b_)M8B<8d5{S3E<ZTe$xuEFNyM%
zR|UVztjHO*kk8*tbCgZsH)65=Bg^%83P_%-P=0$+GI<N2{TWRj_$RZNF=(~c8Ft}P
zo=e{jGsyc|GWlch7JY!W@myXvZ}Akg!qW)Hgh)fa>31kMh;mm^IU_sbxoJPzpm*!d
zJa3})nh#NSIhExx+l%nKw7`%5)JYDY&jo*nBbme#o6LS5d=95u4vPAM26&5kzz>0D
zCTJMho=@|SseXh{lRg-coosak^zVWG4x%s5pyzRKap91~`fa_nHWg4klt=`QN+u7X
za+5O3k+0h>bc3`V^*BLa1^PCuAH9Hc`}_E^jPfil7ySJOJDE6ne3XEdi+IulGBz7)
zI8^2j_yV2k?R+JQh_s`Y%XA}1yr92~=${t!vJT!BXeyPrlHENIl3~zKA-lUx(CcF!
zw>$EJf6p>INJzSyh-MFdJ(;B6C$kS`)W_RIMIY7Y42a;QyLyzHhP`%0RPMTrax~U%
zf|z1z2Tw+0vkakhFX*4Zx$t8=PBuc%h{du$rB7}L?Pl!f`&VN{H>!FMa(a;`CdkkH
z1BSwLF+R>=z~kmy^@W&G{D6EiVhC~<<gKY30TZib#uFas(t)~GLGA@Nk0r}-Zod<C
zftfx>BR!lzi$2)1nASrbjmss-7knu_K*!WK-Jrb+v=g9Xp)+o4+zzb0DMM&1u%1^q
z4R!^)<GLO~xgP9){33NcPx8!5CLaV4euDQ4a$@T;)KJHL_TmDXs-p(@Z^%Y=SC4WR
zp<F51-B01Pa8+-2Jl9{02uA}iBh{+o{%#|cFG(iptp)Z6MfuCE`^aUYwkgIbavk@-
zFM(e%)^txKdt5E}>BkvgBcer<+aV4#2u+G%Cv%}GQ&1jj!1N9;B#OK}m($vboa3kG
z<vey{SlBAlW}^3ni+KA2;5Q9x$3EgWQR8Ro&oR$FJ7siwZj=akq3?&luL^6>_Y%K9
zBG5tK`^?plTTC$)5ZM}R3!gkpp02|BH7(TGf2i@~V}q}2WaUzbUN+m@DJe1aW22>j
zPpRdo(Ki5z0mV1G%!XC@lF9q1zfZ>4$dW%@7tb|mspq^@O<DX%_Md^@)<80OC+UM$
zT}}CE{bjw*kY8<qt-)ANJg=a-oLKVtT_@JtN%p7qTi0S*iF_NCdB#nD;6i2^S4IJY
z#?_6eYa}G?_#9E!f!OiysQg*U<eMbxsV1`W7%ue^wMERF8ui;eEK5ei$>gbd;CGq{
zzjVI6xNy*Beb7+Riwj7X=YrqB8jLfTfZb=pFD)NDh}AY^H)PmQV0$+QS1_tEy7e|9
z=VsfG54Yj|2j5l<`;c)IzAf!M7ZtQ=(T9(-nJ6=WG7nI^_=bsm`t}Z3Y<tWUi1vaV
zb7Fzyy^BT8Lw35^x{A+>Xk2dw!5&=OPW6@Uw>}=9&M~eJ(Np+}QSdzTGPIj`o@m08
z>sZd~(1uRp30&5Eov($E9h9LB-MDA8zzv?28c*&Yp?Mic^?KQ%SZ_AzHquEw?oI95
zI+i>R=OXT$&UHdxlSvCp-Z0&gH}8<MEgoexUKa_k&R1<%vkTX6IKncqObIbnu_%FJ
z#&p-`@#`q-RHByS6`#Ir(SFMAaT4!YmV?x`=d6~;EXr@Jwn6K$I8RsbJf|tQ%wbB%
zGE;fQVsYc7Mfr=x62QsX%6F6pEy~|5>?KQ`Ua+gE9B&u+uXJ8ywOnaYuCUs6TPEQ=
z&A*59pKqDT6vZ-CS*Ey^TP>DGd>ODPPg>Y^%UnHMc7nz?X_<MGMR~!-F19MK+Sq?v
zv4HxBjS$b;&g6CH9jR2avYF$~Nv<j7AU{*tZDo5ct3R-^3vBq#<xN^jcUZiPoo`bv
z&12Wum1HjafqnXIdF(btxj&EnC`Y+AkL}D^#@G*X39%y=`45WP`@jPFoNJkSjPeU=
z!~eF}uCx^X*g~oWBHd`vB^D4Z`t?v{23j3sy_x;m#;>1dX?(!adS1Te6034azU@Vu
z@?^f{W4m%kzGaJ2{6N0t>Kx^{e9KRAmFw~?Be@DFFUV7Xe<ZK)<$TM#d35!&Ji_a0
z-&;a;&$k?Ym~y;jzT&o=s`!*M0k>Kd%;&aSF#vSrY$wqtEfW^C3|iO&7NMybC4grr
z!&bJ<qTu&)@3hkSKdBp}9tl<?KC|r1V^^VL^4RS*<!^cH37hh19{Y_=c{Pu{Zd2~g
zWxun(X$8^Kc_{mrJO!sHIcE7U<%+qu7p8n%%s#Nx|D~9HYNhjQZO>WWC}wE+@B;Qw
zj`Cm$dp<|GtAyQ_*Zbjoc3Zyk?tJ!-d<8UDPEdgPX@T<WeD>}{<@sXv{v_pz`Rvz)
z%Kh`%p9_^;#q6n~Q^54TgPyaXwwDi4ez}1C>QJir?WxLLbJ+ujIdFdCH07qbY{T?#
z<NU%I$^|$-e2fZgI)dtXj{hG$LV0Wf+k1ra(*^A9BdO-=9m<RI+4HlMKh9@w&Qe~1
zT(gu%=dmqEDfi7|e>s|H-kW`!1#P>(_#m)*wwT2G+i^mPQ%(bnDYskMPK$y>XjCe9
za#J;97g!Yhs?fu{H*OfVSbS~D9hT{rT9lV8?AL_dVx6_a%ATMypIK*JW{1jW?Zt4h
z1M#eV*3$~RNtyLXKKo3W^==OPZO-JOe0F>8tf4&iN$#wj`RtXv!fW%{Kl2WMDxW=&
zk3th!!TJg7^VjFC&t0FhURiHnZ(DC&Z&@!r73Nj*5YO>hhff<m-T0h|PZXbKd<Z`q
zpJIH-o_xxHh210V=_*Swh@Xmn%BFlWk-cYAwoYV^*p+)Hv0vI1;Qwe>ZlA=)?8*(3
z*hPx+-AU|vr2+WADayqY*=0Ej7Rk2eD3?!UJ93Erw>iou1?=xR%5U@8=Q+x+^4T4^
zgnuBHIAg%z{PkSruLbP7yc6CoU~lCqpG{yN<tcxkz#h+6fcfhLv>9R^Qwaz)UaYW}
ztjc#4w#}vtDC`p(VLr0c|JM{jdW$WJlM5{hb~apXQJy)B{mP>J=rH!NW#+~i>~X7d
z`(f-c+viq%`9+R${d9JIuJYq)?BU!>U<UHOXTgqzC-aqGAI>hGpa6Blghwn}4`=Tc
zD1Sec-8+#`PffhXa^DoTsZhCfCVQcf>iAQkvU4W;jGx~ynTWPeJ_P^Ynyg%X82f}{
z28)!(rn38rl%GyzKjY`G6)E8Jk0RyWDeUti<&RU?@IlH&Q`xT$Qm&ZF-s0zb4pR2c
zWS1SRoOcAf{a`vDK3Ku-kzX9FTs(!1^7Bs*rdACeqC9#CyYmn_fAA2Z**FENcy@~N
z;S~1z6q0|#RAT?hROM!nA4X!{HcfeU8hdOS=^!~xxqmvldb)DwboSD8y86y^<=Gi5
zIbC^T1{<28yf}kBGK27M&LF|xpK%bBb>-m-^z+c+%C5uNFAgWgZJMdvJ(JxylhpIt
zOuG8#nbh>Z&m{5Do!V#OLl)%?E4$FD;Kzo6FQi>ZRmx`;b{QXLh>O!^C<!)C!EcH$
zQr25;Aw#{#GGV8MqB9*RRWupMQl*78DW|eH&Vid)40xupmc>NunY(zm<uM!EXjAb2
zW}EUC8+(YK!v|tm?&jxwSfcQW^H8{K4qWM(<W(=UOvC92EXvCMYEkY78}4TZty6K8
zJA_MZ%AGcLtBpv&fIibPsD76RH*m3ScOLui9AzYrJ(pXIb6m7jLjmWN0X}C@uC=jO
zEXv(>Y#UH+wUa`&+Szxl%H{Y^QSlOT7@$|K&0UTY<S7SL|JE{SwcKu1?xJXc%R(U_
zI%qR#>2u3A%SJnUK$bjeo$|hw{n&EMppCt6Q9iY@C#?$3pSRXuX=j&7Du){-N32co
zA-0*nu(SVVQ+{G+SK1Y%irZ*_q3#cDmUnDhECbdT8YzyvS<eP?m3JE1FLRf@-pHQI
zYrK64yE9*TxrsfNubjV_{qKC`;|6xoB;|%W%L9cqpxiWBdAfz&F<E(GIlF(da$gI(
zt_ZJvZ)EQttUOlBc26O8cT82r8rfBcDL-svcTFeE=hNr>qmjKaqvqx&_Bjk{3A_79
z<(Vbyny)FZ*0U=d3xT=zs8&$^<tXKsjqHP?mD}ssYqLqjOXeI6abKQ8eD=<H+YZu)
zk5is%WZUKuqu1vv8|$$)FdOIJTd)vBo1Mz@E$mX)G^{MX=$iIu3mb4Nk2bR_-O7E<
zY}l>*a~Zp?O!*!#mCEQ+_Hw0y3bs@!!?kQzHC1!P!VMN&xZ;HQAo|6L%16uDA5S_O
z6}){i728mwJg|seS3~D_)Eor-lQmTAxf)W`%QYm{o*HW3MYUAH&9x+MvX1cE>d_T%
zdXz^R*ro;rTEC$|d8&au&_FUg-lV*+gpD*QP}c6IgTV6QCCcy;cFPh4J@eKQ1?szC
zDOGhn{x4<2OG$)hmMVW+%3kB=SGFjxE@yYOkjQeFk@4lvK{||)BE}tZtXXp~fB&}a
zPP^rNt1_gpKUr<p+bxgVlvnI*lYI+{rfhQd^H$|?1;#w@A%(2!eLMS+UBUS;?Ngsu
z*t3d)|NF7)mO1T=J!4TG&7q;OA(y>sRX)gJx7n3Xa@a4(f^Slk-8t+DMY$%Iy}-}^
ztPuILIheV>nZqyUg6{Y94=rlj1dsBgBK9|{g5~cQZL{!y#IEd^%zm%T#{c(nlt(AC
zk8)?@|HgdfS4C{ogxUCiZGrN`B6jD*+4%qHB;~ou?B|7b_z&OC`Ce*S08xKqQFhH_
zH-q;P>>aCe=}fl8rkp<$(dSW+T|HsV)*}&0lubvnCnin7{|%Fs&yQe_79EEF*B-3A
zkFtkcW&zkU<qY`Ev8l@CN0O(;BxZ*4_7Uu(8OjGou+4|l`7Ln2Guiz!aq-2OaCzI0
zK-VA{!X>n`WiG^lo52n2Ypmpm{$N#pj4aTuJPX%kKZ~)S<|q&6vC$mm%{=xIKff@S
z$dkFpVy!oiE?!Le$365998W)H9b<p6AgN{Fw<`b4#hxVvw}R2YUn4!YJ%AOG3v!eT
z^4LfY3cQu0+?h{-_WpdfHJ8r6PkA!CJ8upWBT6~1$S3Trl6#{k$k=~dlqYi8kKk(a
z*an;O{XFuoH|Jq$0s04X=;C>~%4jaT0q!RkNyXQ2{u=e{mOL!LzDjJ#Up=H)1{F7^
zBiGsQvHa=G-1GAOYI#e|eR>j%KlklI<!@)^?w(xu>NoRWE}H)O()^E$Y`^+u{=mVb
zMD*a4<M4lM%6!JYH?t4)FCX!y72xJsRQQ!y%2;dej@jinADweF{*M&T$Nx8v#X`s~
zN+5S1r##-q&Y!2e+RBFJDO=ju<MWhTTiLB8%Fo-_0}GTtwX$axDA%>I*B2=FwzAiq
ze&DYuYkA^yc5OM)JX)@dp27ZBuH62uNq?+Re)6qJ1C`1jzA<Sl{LMEe{l4-DTpg+^
z-1?3DE!ER6{YL)1)wZ3j`4=sG+wyJ;d-Mcl)W`OmFl(oeZ8%B!u!Y@llJb<Fy?T-|
z?4LAnvhv|+leV6$Jk~ntcPA^LAFeqT>@Hpe$**5@q2+sR?1?&R;?6o{?-}f`b;|2!
zvW@l1OJ}l?`eV@0n>@;UXXNf~I0iKLG%B~9k^4-ea{n2*ukrJTo0OY<?8PSK`#$y#
zKfiJbY5T<`)TE)MN1%~EYbMOo%aolh?1C2M#uj!-3u?c&MR}@)J>D{#u|LuGPnP4~
z<zD4wFT2C5pf!(smEU{W2tR-J8_M5S<c_RRo><9VUO~JETb1ii$-S{vd46T?$F0iX
zs@#X$luJ*^J#Urr<Y{cvDr)O1rzs!#C%u0fsr1Uzm7krS`^f3ksTY4s`Qf)F-SRC`
z_-Efzey!%-q7wcgRoSKH{z6q?5LdP<_qFHV)lTPcv@2KpayR;jX3$41e$A)gC*3aY
zP(JTq*K{Z!cChC=s2^VQD;oooKJb(7ZA@hCcDPhz@3{ZuAZx`skOS*8`26Sij|Tpu
zf&W1o7!a~Lz9Zlc3AGjs7a%0!h;9{_BEG>0<Nb5-UHiXN(hYnjFb=+{7W+l6(I^BQ
zm2@K-{#uEb-;3mTr>rlt^!~p&o)NiQHf4>(cgy;<>+L4^K@&d1CiopD`0V<$dd=W~
zl+P>m5S37)cgXKrD<>T(u4vz9WqmKp`d**)y(R1WH?zL4mfx|8&5u6${RsKJl{P2f
zm?!`?gozWRar_v(TS$f_aefR<5cYJq{9Ys>mbm#bEUzOM<Ogo0@B@#sMCfFXD9#X9
z`QeoXu{DJsLr7uhm>>Y_lJe1dKOMB5PKPAJ25`YpvSr0c!RWv(M1Ej<B|i?6u%=O*
zX!wy6#CJ_j?O6<Luj7YSAAYBp%Rh9yz$fK*O%A-%gkRr|<vBX=Tb=wnmaPAC{14W^
zs)?FqWj4auF22`D=#{Wt!l;A;5)Mi@B;l}xJ0u*Da8$xQ5;7m-N0EdM37rzwNa&TY
zUBak@0}>8OI3(e)ggYc0k#JPPJrc4GS-*r137rzwNa&TYUBak@0}>8OI3(e)ggYc0
zk#JPPJrXj%tY1QhgiZ--B=kzyE@4!{0SN~s9FlNY!W|NhNH{9t9trtF1Q4MpAmngJ
z=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&p#t`z3Tp=#;QVLa&7F5=JE)kZ@4K
zAqj^i+#%tJgrgGfk&t!D`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&t!C
z`XzKo=#;QVLa&7F5=JE)kZ@4KAqj^i+#%tJgrgGfk&tg5fe1z2LJo(7P6=xy^h($+
zVN}8a2?r${l5kkU9TJX6I4a>D30X+CUqXk3P6=xy^h($+VN}8a2?r${l5kkU9TJX6
zI4a>D2^kh?=qQrVA)!;k8VS7;wo4e5a6rOA35O&cmT-rJBNC2ExJN>^TGlV2Lqey7
zH4=IyY?m-9;edpL5(;kV|D%Um(_akEECx6u)IT1`q(pwekAbQC=5NGHkIQ*b@O5k!
zSiAlr&L_(2MO(y$ysYa-Yu5+Fh3xVR<n<jF7?<ys?Q)!FeEoP?e#nHsC}sZNV#(B-
zmTznGmaSR&zB#7@yRx{hZlR-iWe0xsqR-)Sm6Vk@=ezp&SJ!{J-6hVl68BtrVPBm8
z4u#v<2gz(2YL%50GRInRMRp&XCGq!U;Tt5rYMr337l4IL@Ha{Pxsslq_oU;e_)xu9
z_4BXn7y(!(t`mOne1UjI;wil+eESB0@Jakz5<epGSjgrFbqCS2t>XJ!iC-Y`dnEl1
ziN9Imhc^m@oK~{m+XSB{q?~U^`Z0+gy<Q;bIYc_>))>|6q=w_*&l3WkCGk6k2+J6M
z&JXZzi67m>(d<P5aK~Hlar~D+%#-+%9J*_O$%q`Un)_V^Jk>jTp&+E^oanem;%_`#
zeAlMKO0MKDJN#{Nnaz=S_c;P_wZt!xcy0W?A^Aro{#i-C5&c2O4HDlh<^PpHvqNEr
zykEX65VQxFj<Y3x>3Z=U`&Ibymc(o82FFSK81NQ$ID6$Vp?Fy>o0Ko~qxrK@Nk1KU
zBm18XJjvtDT95d;#1CZQt0g{r{bD)rB+sradCuVYLzy<;l;W`SB_B4RVFbL4(@$aA
z{8hVZCjY&h{$O_IpEM0<C5+qq1gD?DMx>ljBR?E-NdBoz^Jm(85?d3vexiRAbo6Wn
z9rSw}ydUK{fi`d7DDj$~c1W>qk@!8i0>Sr@6Q2U&Gnr}gcg`ID4hEj|xjRcvY<1@N
z(JVYYgGKU;GztE+FO-hEIQ<N^GmHKwl3rUsqCN9;{8r+zTZMnWjQ_`;kcmP++vU1J
zvBWP1p6YG?fj~%i$<CJentZ|M2}yrF7A$D|4rR&n2=K&b$HRi~LP>uy6iaw*UF>Fw
zzgOZJ--wQN2r0ihS;|B2Ov6EY1L?R%;+@0ddrScKB=FSl+Io=I@2^PuXjZ?|^Zz6d
zqaP2zLHil$Xp?yFBjWor0<g=0x8P~}nx6~$LnQtftbcI4T+h<#t>bvC;>h(9NzUk<
z*i`R`^bcNn;U0+}{HZ`lx6bkoCjN)v`7hxgG(HCbPkh?X67tZVQaWlSe)K-^9TM;(
zF7aM)qvHkspWy~1@t?uAW%c939ACt=^}eGd{cDm>c0VR1pJ*1Jxlj~u@9zYI$0eT*
z;Hlo>Ec){${qQG(@W+z=`<(u8wkM1J50c(_uOOu7jp?Yuf)v&3l=i<)pxLPs?~ntR
z_IT5Ap~P$Jh+xZ)+kq!JFUqR-DM?>r6&2DmGIZo4aP#(Njo%81FOuuM%LO^}NxVZ2
zEZLoG7w}Z?U{<{!N_yuqQ7=7{K*zy{iF%o2pb9iQR^lCx3WVmbwgFH4v-9zXCH>%!
z1idB?{ay^U%ln$Z)3bYYL?oWwFTNiu@efLT_Bee-<8!T`*Yy06#1|C^2AZFliWGsy
zwYFXkGJebjp5$ql`uv*2w{ZF*HuN*Wr&QuEFrmL&;<fd4q$~V5>TucKEI;r~iH~j<
zbo6{H9f6sGUR#GhNub$3fhRp{@l>Lj?Fd1?<2r%izaN3q`y@X4slZ<)3t(@7uq&DP
zXmXx-q`(hP6N1n)9CXAaUb{a4PsfiBHN31>x(PN73M74M^A4YESCz!K(<>Hn@E_g)
zd;xfB*Y+&?{6M46iX*cflK<1hg)w@bl#V`!=-)g!@2KKf><R#$<QbeO>MfP+dVQ9_
zYxhYI*7@;Ii7$Fje5Yp}>6m$xppUi)K5xo++M?m59nKNtY&Gy?Z+oRYPKm!v(mQ3`
z*X;A`qh-4!pV^XrNa9BdMZNSa5*-c@P`gH@e)#V)0N(&S>2n}UpK(b)&Tngcq&;BR
z@?$z0LVVU`@u}nZBBtGU`L?9rF6nn<(T{NaR5qHGC%g$f$)D|?^S&<dHQyJEkG2YU
z1n?9ewfjH_$NX5v+dGwM^G$7BJ0&0Oz8DJgqe0>aUl-r$*<Lyh<9H-x?fh%%-&K;{
z5fKbvmi*Wz@kImT`z8U{(-J?hULdr%{HDYYN&0goz3mv$-e{fRf1m8%=@MV_kRYUI
z{pr{!@!9d{HVvN?bcaZKI}$U}v-2H+AC>xCAn`S_UG!`R9g8G>NZMhWK(o^%p8Z%L
z==pIv{wnd>eaSmyy;J9kdUu>I5cDh!9reI7b`YC}Et&e`RN$$99WwvX^nA9&SIK-S
zPZlaXmgr$ua(rq1zCq%PWuBvryIX)KJq%>o`41(&eT9&Ro*|;6;5cFbTeIx>7>RF}
zc-dX-REh7-qR&BKB00775lznXK~M73OcnL|WWU@i@!5Irz&t6xj8mFDUjsbpO}lTZ
z>FqH|KO*DcgQ8jNV~HP<dT>bmr1^q>yIsgb&)U+_3w)6{&G^3oc%*9Mj)zU~IVB8h
zC`D{k#?vxc@+9Dm__qRo6xpp5yiw8*n$X_}ypcR_nb3c1LO*eVu{<+P@biH;s&|PA
zUNymAWrBYQcq9Eh13c+xcvvV#hIw|7(^#H5;Enj4Zh~KJ!sk0C^p~38Z#Tg|YJz{-
z1pfyU{JSRj&w;-cEv|W181i${e#%|O`neGJB1xkMHv)f<9wE;kGvV{534Mv%xZX=m
z@b8%5?MOt8?Cmh%jpUzeLf;C!QM(c*^iP`5|K5cDQ{au{IS2#a2tNmSqjpsRZzN~P
zgwKEp{Y}90@$!J^KpCFdyTBXOn^$g(|GEkO8^9aMIRre_JMf^W_Y`SAubc3(Rv7E&
z5a2nV#{`2q$)^T*Bl)i|!9Q%m|K}$7*MO&bN2UBS-Dh(vjpeK|!8ZeMBxlHkzR!gI
z*CzNsneh401aH9v!APD{P4MTL;I{&AWY2e);2-9Eu&#BX=)gl|KR#>1e@3-2|6_nR
zl7BJq+)jQd7(XKUUt>c5kO}^I;Eme#I}`eu3ytYrz#GZA1bCzI(qqEsToe37Cj6f=
zp&v29{~mZu&Bh&{nb1!@-dO%}6a1;bQ#@*yd8`(XE(hL7KQ9Asr2jWe=ubR>F~>OR
z89NntBmVus8}Yxy1i#G$zr%$8izf7cF~NUqf}eUK<&9IBH!II?0p3XdZ<^ryP4JhP
z;BPg-KLosy-Tu~uemC$&`akF-WBk#;8}V;5p-%vBq=!u=^gl4cKWW0}1rz-1z#G}y
z$0qzIoNV0Qc_#QK;El#(0C=PJMu9id+Z87Cw*YU{FFQ=|@0jpmHOBHBY=S?=1Yc=_
zUuuFs6L=%}*O=h%Ho-q<g5M3ik)A&T-bl~0YK`r|3A_=X<4y3(fH!Ja+ysBE3I0(N
z{!g3GI~Ez&dlK+Q@_S9_?=_)+!UX?I6Z{({{693IpM-?aNY1$?_!<*@Gw??GzsUsu
zIPgaH@LS-?Z`Vk_t;sf_-dN7Bncz!+H<D+W3H}-rK0h#_|D_530~0=<ncxrd7|U61
zg1-QGO(#ruYz3b3w;cytI99tabAN*||Brw-;xl=%F?|W}M*X$kg#IQI{I7sFs`n2j
z_zz6@6k=h=h|dwg8})mY34Sy1q_^l@LYaRS>SK4B&_8B^|BVUW)g<_jJSO;S_rI<J
z-bl{JfH!K_b0+v#P52~%H|np_CC2={CioEWMsi+kLVpABq|fa2w_PUmyG{64Ej6z9
z5)=Guz#Fx9O0#i$=K@dbitTb;5zAuyI1_l1pTG0~m<p@Fhn5-hSr5FCey%maKV*V`
z(FBjjzYW_px5c>Le&CJlXV3(HhY6p*n$Ul0LjSep#`RX3;Ol`m(%W}U@b{VU`MC-H
zMc|F>fvKmvob<AK@$z_YrF;=!t5YrU<4tySy*zy#uVV`*42i1S8oetMctN?^7mnhk
z*UflQeTuLIZ_-SZIKj+akwqu!Z3+j1>kP<IH{Pa&S7UYs%>`^}O;o9P>pZ>j5pVNq
z#M`zL^ioolzdNuAZ>?%+RnbbifY(|Ff=#W7vZyZ<Yg+2X+sEj2W#T1Ot%(ZpQci={
zV^tFwf9EN^2CKDJ^>{qZ9ZvlBcwANLGSsiVIhfbs@wR%@l2AP6<L!owl`4O!8D4FL
zw=d%L!Q%bGO@4X>8k!hW@xr!HBHrYu3j7|Auhp}>wM@?=^`0}ATmN25x0AnaS9>v=
z8pFG}V*$?^UF}O(q{>!6n}MESWgzGWWoIT?Ya-pwYP?#p8!tu+1T$z3ydijx+cNOM
zYYt<5{zQ{^DH^w?O>M1GTN7?4-lvC`v<B1=UQ0%=%~TV<E}i7ij_5hwrA^r#Ci}6~
z1zumc^Qx%xs23Nho($u0s;ziGnm^%+Cp<7Jyh;x*)lEd2z#iYrNFn^ZrL|tAPLBjs
zyza9rsL~a@Ne|rWRku^E5AxT7ii-h$RlDSk;g#u7JXfxKvmIwwsWV(@31na5<03O7
zD}#>WLCE5&q?d?x#ULNotS_8s>S*o&-3q*^Ev-K&5AWUG=h(^6FR5EvI!0t_oWLE7
ze`8$QRD?Dwzo<5yI&I>-rM22jo7Lvp<d=-K33mTJZMt>Zgt2Hw)&cwJz|b#6r{E2q
zcn`Sh>sv=XQ>re)dx$l2?nw8ALFqD-?hN(No12Nn_~N~p<qy<{)VZ7J#eE)(xfM&v
zFu1YMi_3*qN_z$2KCAhwJn1Eco|HF6*>oFDC&LRRmPdo(6~QhH0K8}0<Ep@m(CN*v
zYMp%bHavZc+n1=Ui@-|PCEQhrFtvCgdyg37jgiReIDc1cr_WF3T&2CkvZ<xikN5vM
zg~hhP=XyNp+W+l5wHVjzDnl295}tK<Wi7o~Gu3)nJ{q&n>nSD}`8<;H!M`{d7LCnj
zwSOv&Vftl{+KVkqh4WU!LA+Zxkbcu7BATlr8`&J9@YJzlX`8xYDU1)nG+jcCCStDA
z)T=7#t6tIBhT(Q2x~1RyjdyZJ`|y_IpeNHIuSgFNZwpkXa}nfk%VCPT-u%B_iB#m`
z=H-iOoACznz7DmCyf%!s1Q8S++nQ)b?huMl=Tkpd%C}Q9e++Lc@9XR=@iV+ZRlf9C
zP4ue%o(L3;7u*ISYFAIB10Fbluob7_kGJj8i(b(+G`x%`FxtZ*)fbET`c=I5IM&ZP
zWAs+@KwocfKZ@wSaf8etRn>+SwaYy4nDuxKdi|*-h~Idnao4(aY7}pIrFX!F68-8L
zO0>q?j`>Sb>s;g@s1S_gD}7yPz0<P>^_oQ~A(G~DGfj=`31Qg-SHBGLlqX-(jW#Xe
zS3GO<FO!Aq3`b(U=q}AQH+d;-Aoo}wUEYRaPJM$a$Lr2jjd88^MEq1*zPvl`S>vhT
zC>X=(i%<D7rN(uVtJ>g&;q`7PN$c&?+c4IvQ{$F2l=;qAsnV9E*`uq<2L~LFM`B2a
zorsXySMh$`P%vJF1WolN;0Qdty?!6w+|B!^?Ub~nhVPv{Fng<aajs--`rJ;-HoQx<
zSqP<yt$05#y1F7Wxu{l~0vJCqcZweAM2*O@8}2HV@AVdW8uE=QvwWk<Jm27#jPnh!
zGt4(!$lk;Uei*}#Qa7p`5ugQWwaDDk=7=_?=`%ar<&N|v)JUhuD8;bI&|Nynz@){<
zfO=EYA~SW3%WF%`G*fD>8GgxFGhk<^8Om!@n$gT5)o%tSCTVnoEOQRza*_uR!2zBf
zY=R$CQxP*WZqYavcAe=>JS%&skO;3Hk5gT<gre0Fble*F25&<fvifKn^i>~4Ky|~+
zR`#rDX?3fv($vHY?x6<`Ni9<wR<?Lrmr&-Aw#ICM1EDxxzm6fM%eBA{e;LJOL$a#j
zK0$>u(DBcB!L+_+O&9x!0B$V;q&h(ss?uIh?{<>zvfp!$Vp(y%Ls+NgwA1n#H-yi_
zpiYl$(LNORRrhdp>h<MlQd?`fC9X_O%k*M-Lqn^lO>L`P)a=RZLRYEwf^xMJYZM-y
zjU8wz@5!)=GI~Wg9QgsSDtJUzC{n_d!85h~w3gasO?4TSWSfAioJU!teQJ*nFO5$K
zH*RV&E-W(ayM)0=uNxQbTxI0HJ9{E)#oN=fdc*9+>G~01P)~ML)pf7Yo80x>_CKB~
zV@S{=F^ZlUf!kFP3S$U`0?6yS`g#$;G8FEr>I(I!aX+S0pzwdYFBA)MeHbPs6*M4I
z>4+MQA|r9R^@t4SsqyFU3Hrh^mk)U0#17nuG8mU4Z$JvE^Pd@)HEoq4+od(GaGWL%
z>6r);sSJAk=*sRSWDG8`;v(K}k6uaVIV6XF&6jCTmIRun43jaLyQGZXsL#@iYt*vz
zoG5K$TBO-$7Mlv{6on2fm(8>~cOcU13x!j|2O?!=4_V8g|1YhoRrBcsEysC!S1j#i
zDOWAlgtc|u%#PgWVi@%)%_<CMCVUy@3-xEI%IC{+ZlVU_5iBIq(ru<F2WV5MpJq)5
zYEKyhO&ihX`Gm{O$ERH5UR3U1x6U<g;AAXfR|ViPX?@s(g##^nbCsuZE4ir6vy0T&
z(*|m03N}8U)WvJ<V)h7Fx<;FRkb=hdZz>NGYn@m^#vn@N*BDJ-bmh3#EW@rf&#?I=
z;|v?@Ofzh;dW_Ca%{@3H!?{Nl<{lb746Lmwb5=<<mr}ct%hM9lR&T?a@!E5nn~Enh
zu`%SGo-cm2s}reu4?LSq70qz8WOe^<EU%YgF$KwmoXw!!Uun-_Id>VY#lm~ja$F24
zxBk`^7TVgD)nN$JI$B0<l{HDjsMltdV21_WkRLbY(20HF(6{@7v}#H>7d1aL&Rx;m
zsI;_6V}e^m4SivM0t?J5_)WTcDIU$fyi2_qLiW-W)4Rr_O(!v_NW~_sQu^q|4z1Qi
zL_GLv6)cB)+9zl#C?&0@@0F6?^(HT8hhMF*7=m<v7V#z0pOxwvO$_9k<i2h_XS&0T
zK<4`Y)Mm1$8`<{+G6dg$9M`%zon-I7jN6S{y1hRAS^=_te^8$Vw4L%*dxVUJm-S*s
z`K4yh9t!)_!7$%n5bA7N6;8xsX%A2p4tDuyc+(9Tqg%AAA`zm;ilK{Sz!rtIzW#Vq
zo4N*3nlG-1%ebdcmqVS{PJvZ^+<^%DgMp^jx-=o;x-8-W+EYe@&a<3k5t%EGt4lM(
zc-h)+pUJG<#Yb*x_LwzJOon&nB3g!yGp2E3NM$Y~cl3pN5;Q4U=MP3nXPJq>mtWx+
z*GE3w9goMz5ryI^HeUp>lB##x?r6VE_ObVXI@nba3v&IZRF58#8rF3a-GbhKH|?Wy
z(ixR5L#>8i7c(`{2Zqx}x!WZ(@lCCz8O}Tqioa0ak0r=5T8|U=MD%H&Q`{Zyj754g
zZV&67PC6od$#og5B=Gfsui8O~Ab~a62v+GcmTl5aGKdyrv~8_lwXk}P!yk3{uw2xG
zn^7dpDVak)b5I&BFp<JjnS~q(l=twKjc1mL(dJVrQ`S61wZT#kA6xso3q4MjaBwX|
zLH{7Ns1L3Q`Gcv25pj23*Fjk;t>U)kK6kLCtK&(PT(Qj1T4^P%wu&(cf6RS<B0{%s
zur5jW6L4!B>v{i<bzQ!e=7yehOSAeRSSjY!z8>814D?e9=q~M5JA-r|0s8@BF>IjH
za$em;dSCOejHYV6XEJ|JnF-ewH9Sjw=%Q3knr#w%IgRIrUpn;SzNL#Nm((Fac_*_B
zK`Vya6xdTo*V3*UF_o6uGgO(*WDExLzQSV}Et|&Z4s*hXSe$l%m`p5gS*dy&<=y$l
z`W1ZS=&aIsL0R9gnJMH~elx3_6i4^5>F)AZo19an`rFI8Na#r$7nGhr|H8bC%+>9J
zv%y&8k-d}RCq?<V$Gc|S?J2%3CB2yAZRns)0Jv?O=~7+po?y5u(Vbc{)pb^-*sK(H
zx#j*35)LuLlVYpq)dI&w8$W#=p*1erf`jZBsT};sTACpnHOFMT0g<A2^ta5@j_`lr
zZC|?4gC=Fg6M@aR-8SCo8LG`)rrW8bjZ-y*9XVtOG!;*E(0I#&(Lfk|FZ+GbI5rBE
zmh094)`nmbYa+ORgPFgWQew$6f;+fNVQPL_`-0P<dAqiSAvJxkR++cINqUcE!9+JU
zkG5gd$zI)0RHRjEUm(CY?1~ZxdjGIG+K27D9oU-f@z*SFZdz2gYL!}wAAWVaz>udf
z*l7_(RKpf;*e8vkre^(J2e;ZB@vj!!MfyE%EZlkf>qGv8idBrJM6kD&yl_{RbmjWZ
z&K-T7=KGz+T+`bRc3Lh&b1hLm6v4)57=PAIX-_%#%Cv9Ntu{sC^r6A1Zy=;x)Sn2}
z@;z6zaVaGh0|I?fL8rP)+$H6pqfP5-1JYYM!FRD$Oszw(O$6O=^!{}|wF8^M!U*~Z
zxi!#X?V={tRZ@y(2e4Yxt3#DklP=0(9{%p~KH3}aLU^V81-DPn?nfxWhQsmu9fbnZ
zTYN|_rtOgy)S8I)80iPMBCy#SYBaGH+^%-2w5tKDTG);a8>KCA?lM(PbjNVNRln;*
z?d(m!#?RKvQsQ<(P0KsZ!lr4gfG29P9wim*?I+pPrg&>#2Spdw8|<aYQkKd+v6$LG
z=?nOHw3#5WYNIs+GFNdyT>-Pi)@jg3VnA`jtBXXjX%01YQ##a2J+H0|BNg(m4h9xu
zL`%9>><W==mSJKS>la*Tyz3Y7v7Aq79rohK>3IS298G6=JZrrEvoQUm#r{wicIZ<F
zVOh&Wr3O|idZ9eVmqek$-rh_dx**%iaEP|rbL&eDq|Ciu`a;4;T%|Fg6%nSyK0ZB}
z9NeeqE#QlfNY8jyU^KqcFSnn;fcclus;h#uiUEZLJshDuQnZ#jUc=b^nSe2&cUSXG
z7@~HJ{4z33c>feaxPnD+f6}fsFHJj>wfwk@kLo(Iv<BMJfjTk2K1(cj^CT)`$CkdW
ze2BTQLqH6%)ba<Zc0W<TqxUZ4azst~R3)n>+A*!k>_Yam%-hO0UnBHlykYE}8pC$T
zm0_CM)4T>)F__duJg)k=<74oZs3Jo_JZa0wY-l*37vj3bU!AnzE4A6^i|f1u@zGCU
zv&8?`{TkNP%T;o2(tCIH(ZeRbjz}z_7Y{>PrztmSs;#M3Qx$iLNaIA0#rrY$=oLdQ
zq!+lv#Y1VvXp`|P0v#ea;(yTBYwoFyM_kI&eDR^H?E)P)Tuq`$TuntrsDetVi^Q`~
zA`yTK)1G!n@97cw+m{%$;^vU?I7R%yZUk5fk`;vQI5I1Y=5(OtfjS+~rE9Mf$(jnM
zFP}N!!|dM?6}biRC`0kA8ILaXMN`8>#GE>s8j?%a*$|Dk#wM?3LSnR~2oF4b@LO83
zV6+Eekw0qE+uJ)nk>eQ(H&dO;Ag!jK7ObTpuYvj2>5{CJfqlV8(=wmv@)*cLSww2U
z==~EqT|Z<P;x)~K)D^)v_CwX+y#z=~DXE|?j;p%JO>S7kZAzRno<JehAGk&-rxqU0
zvtoR}kon6BT4y2tB9C)bc>7QF#ljfNjeIjkOfG=^n>-cnmyxT=uq5(Ebt~(2-iYiB
z>-GHU18MHup||zn=>r-Al#YKnrO?i`U~Em$gYqe#4UUv=c@r>lZed8Y)`c=drmFUY
zI`C8@Gy@r6f>2yAkO~^5ZW=ROJ|uTsKJ;-U(xXj+N=xx%XzHOEo*c`^=u$}J8EANQ
z>N`z9bkPcG0)m1XjEQ#NNnxXXv7ovp6ieV~zRXdfE3AzM9#O|VN~0?V|0CmWX7ZJ3
z)n6d9>^06TznmQ-@P@j=bep@3=9$eQq(gk{Q0T%#u_^^1gdSSI!kVQ~1j5aF7?qfA
zNop*mVSYec@ZkhY)gFBtX?D}pMvq=}p|W5MH`M9Lm^Pk8z<?=R!b%B$F3eSi)E=9c
zw5NDIKFC<7-3k{L;O$@B)GXX7%&o0E6c@e2Q6LxoK#Nkf;Q((QayNf(_B2k*rEBA!
zFeTNP8St5@W`M#8%6x?Upe6}WwXv?$I@p((50mvR_xt-|v0wl~>zY8bWNbyKqf6NK
z2{#h-t;T#_g)}RIoj|0;UfI@Am6F+m+k!Nv7m532De1Cy#p?S8%%E`f6kkuDbeY@?
zs<jNJC(?z*v>;8Tvi&hBG}@mUwyrX6?fY58(hm<zlgSyU?@5q@%o0wEMrc(Fo_<0J
zm>z;qf=@bAcQZ1Jsi%k6W=x!9E0@d7!6G)uNxaC}$v)F-LKzm?84G4kMaP*e@5C@|
zA)zU|$aS*f`zr5BExrrCh`9)V1R<qAF`3|#RXL3iDZ73{X0~$V;t*MB6dHjaReSs3
zaIs{FTRf{XctdL1VUg*I<T7>^-;CL{mL%y$ohKZJqa>x0!PSc0$HIUK3B!itPFbaH
z^oUc?2x2~Sz=TSA-$1Yz1HUJzvk_q-`<v`wc0t|+sTIE=m#K>Jir{fcX7@sNeT}J5
z))d#);J9IvbIjh7ouM~8Ue_B5)8r9%ddRg>?$iL|NwqiSv!oeoc9WXet<ecW3q>M=
z((O80uSk*Us7OI(`3Ao~3SCn&DoqN*Ez1Jx++Xr$l)mj^@gUQYJJGS)h#)5}x?HG$
z3|5XJZOUmfP5yEV2B=P2-HKq8)_}Ztjv2wMUke}&{836cmF7@5*wWYA5scB?)!!Y<
z7&F3{Tlm9(SQtu;3vf&=0ItOCq^@u<KnS@Qs98Ma`sBOewBsnk$94-Ia;CQLcfAt#
zs<7ziMTe#5atO;}VHE3ddi@i_EQf>N*ho1XKKW0(*{qV>&A8NM+;JeNe}T1Jomcka
zfmp<JN&)exS~!Sdk9BTox?+&Yk(p(+2cEsiVb13@p59)X6hf=oAl3OoqX%8HYNxc=
zMb9<L=geJYT^Y}vdpgn`M)M%8#c*t?7E>oyVY!*ld&`>8UE=~6@8|=aGgbJaNQMHm
zQl#%H8e}>ryh8XpVM|aA&4a}+UqGw_trE-P+D`s*6*nemV|w)$-MEaW-%y<P#q;O3
zTkvBUCO>(x$b926zht~|8SD%<F4JRy;_*Q2R>|I+|D`lfOb|ffY&H85W}RrFhqdV;
z9G)R^1JCIFacd<={M+>X@UJiKWMs*1k!pxrFe2fg>#nkTo`R6Kr*ui%NQXPy89$Uj
z#x0!`3|daH;N8?#zF;NtBd$U5+;WgYiQX1<BSJ(L9;sloMt3{(dia9N2K**gY7}JT
zax&JWZM+&Q@$lW4O=2cWPCP5i7g1C$qGq{)FR>!!s^rVaouSzN){)6VwC&(tKMg6&
zkW8QTple^?SqnTqz@LHC_FRn%=;M5lw?7r_7Nu{JG|}%Ur8-&{Jm`ULjKeh;1S{h~
zmoA8$l9AifqD?%spOsu5C2QkaY{Xnl%fVqmx{LY?GCxM*WSsHHw6hhr@nz1LIo8wR
z5+<?l`-rrkNvEDV%m9TyP5I9Q8@JiIm$v#PtgMoLAHs*n0u6#dW^=#bh)Vw?(D(%`
zv9xZKXN<p%YT^apMzo0nxX~`R1IZ|Oi&BrsVDS5*VlnwkxDH(?!(xO8W%vy)?gGso
zy}_oAR0QKcA7B>1%;OhbF%Dl}AbRniV+yUqt<?&eI7rvLxDHF6baQhx(y2gy*w>4x
zIBjRja$@i@G(+eKW}Ba|<y4A-R6o;+3$<rtFykD(unjp3Kx()ilwlO$DrO{Zqm6xZ
z-!$5fJq@%3h3?(o+>Tc6G<{^<HBP1DG~KN8^~L$%#b8TWl5wB#_3>~JI~T-{Ya!NW
zMi^IBx>Dq`P$Sl;>6UML(SdIf2nW}yV#S)=2v6@ZrJ!(nDt3Qps|++BZs7aesXxAi
zFA(=Y{FyTHy%?B8AouRYQbHn_*|phgMA|AC=3!!GmF{z+Z)p>k{yEu<o@6EQ{$Avi
zfQguZ-P#%Mpax62!hI$56Hzp^VkIdQ?bi;>4+R(}pu~`sz<$9@K&X^op?z4`&7JW@
z#WhSAd{jW5MSBvggl0pm1g9lk5qT01`dJD6XcQ|!NDhHI67VH_tR&bie;ubg0B+hh
zRO81|7T-C5pN^wjn4GKl;T*w3J10IQ23T^DY3QMRBta4Y!>j3t#~FS%sFy#SEq?6s
zU-*AG%I4xDe>RhClP4N<Wbw|+`d)(X`tmzvc@56aqPJ&#$ImpgkCMrVR{tJ(qQPNF
zua(QLA3r0?3h>d&57|Tk4GvEg-xUD*saUQ2G=XNVfK)^)@0}qoXt3*Rl28KqbF(Sr
z892j>#8dKb$9og#(BNCLfF{3Ij|RIzJEki?gx~$ALxWCPp5!MvHTj8#j<xs@Z;jsw
zeqWys4Ms^hIJEk;@>=`Plh+n;D#mIW#fb)s<&Sbw`RVv%m%kJ^5-|rKeza#ovM>Fi
z$)Vvk;Y45F(JroNa6rnh(QChzs=+I=$`7p;lp56N%*tP%Reo5O*Py09v+}oPmDk=U
zuE9vD>_EvNTb{eJ%4_cf*5D+qVhOYLcfTyJ^*<Yy^+`BQ>u6ctA)y8y#R-+r%4_dy
z)8JcVbh;zE{%3GaU!FZL>zA-yjwg+UW<T2ZUuTtX|C1=M!NDwrW!L`($`|6JiK)Fm
zdgv3uC#!hMel<D%g0ho!<+b-!56kiz9Z2}0eb?Z}z*0?G{n~qyx6ATFLJUzD|Fv=&
zA_v!L>}ch+_x7&(Ocbn<9Yf=v4z0Y_uB(C5*T1z;6xcc@%6ALE%j(N(d?w?%zI^)q
z!-CI<R<Y!umD3=-MpR#3dq3=`BFgWji@L+1JEzklah-T+?bqIWx+_<dcMy^8(CXDt
zN6Y%18bU&?{QNAuCXOD8nlcIV@&tC*F#>-iz;=AJ`n6$|51b~upcMZ{7l`tI$L|{J
Y53ODeNAA_A{EOca<>zG;$ok6u59JP$EdT%j

diff --git a/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so b/PyTorchSimDevice2/torch_openreg/lib/libtorch_openreg.so
deleted file mode 100644
index dbfd3478e7e06650efbe50b2bd6de36f52cd3986..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 569736
zcmeF4349b)w*M<E8jvNZNL<hcL4yi*XCqP31QO`T){vlafliY&B$A|?qys@ggMbDR
z4UP-W$QZ_Pj0+l6G~<W{5seEP+y)(y;6_Jf42%ovDF1WrJ$3u^O%>qs-sipdpVp@P
z_pMXs?n~Xe)ww2n!q_e`F<lLMbv4d6VoU*_8T8$Kf2^VzMw-zJjtnv!!nT__()mkW
zPwCD4G~%QU((d>*jzZHuk3T_rH9z@z;)qk6Z5Z0;t>nl2<mVj$!Bt;~|H9{`(VxQ{
z0vl;A`Dvcl1teN#Tqf-^=4hX2fXC_6%q5H?zT1fFpAI;NFNgToF!b{bvqZGdA!KiU
zVi@>365jaTE`4Z&FUo!uUhXHMB0NuzpQ#;q61*JZ-$*<?bY9#YvK;i!$B&UGn4e}m
zr@(nK2g%_fKFs(|rSR<1SKD7YGC!$Si9A6c7|U^!rWsHi%@`LmTk-p`r15yq^H1HE
z`utIEcG)!K_jlZV)>}E}eY6P<8%Kh}uJCri_oLt)|5;pIOgG1{xW#c@3LfurJT!=I
z&pG~_8Mip*=%t>FK1R2tgO@ChS=GIpQ4n)h7h`bGSg=1f)-m)$&*qpr?;3oR5xX*W
zNZm;_N5pvg+|b-@X+}(!Zds1FE~D!*3hEmM_cSt%yY?S(eN9hCW_8k~#!}<xBix2@
zQgVJmhBJ0c!?j@EBj(7Sj<lGpmLoRZS`*vjT4&~IJ)AR*(V6l2PNV<Qm=g-R_dDj;
zgo}-%3Szq%&Wwz1U6%Cg(({BRwFNz)1}r`*)9BXqv14P5W8+5G4z3xr^xBMWy^Oe;
zK@W6!V{k^7x|qbdHAa`kHOWU9r^F}J^@z`Uv|sZqqoJmzCuD)hkM58s_&NeUd(d}m
z@<+kDH|f6giFW<y`_b?T0UQ0{eH^iP_&gEbCzD3|Q{cM;-h<#h7~VtReHy&Y*Xi^<
z0X~Pp`wV!WN$f2891fpn!`lgO7qMjeOrcNomrCE$;qx5E&xP+7z<V^j$HIFYyv<h*
zeZP=C$HV7D`aYRHr_d*!lMCPT;5`lA7sLA!cwY+d%iuj7-d=d)%Lku@@Gc^aY!-Z<
zO}Yd==fJxR-gDty3GWcRtKq!>-uSu_KChzh3*qw``d&ky*TUx#`Yz8fjHRTP!RK}G
zz8>DoiQNF7H^O@*X*}m<_+H2KD)|08c;621JK((<-uU`GeBK4`HSqo;vAgNB9zNI7
z_j~B`&-97s-V5LFgLeb52jFuZydNRm2%nF^`*G46=o8Ovg6~hkdn3G?;k^mo_<9aL
zH`Dj$;qwLhE|1E8TY$X;@0a2ID!jMC8(*)(XDht7!TT*@@4)BZ>HBv2d>20dLErJb
z9rS%CeD0#}AHwHH@ctCuyNRJ~7{0fY{tQ09V8`${zV>MBOVE4ay-(v`X?j2C@8JDC
zynlfAzv2BOybr)zylBko3ZLERJM!-I{RsFxlD_wV&z|t^MY=b99)14K*SCJ`J7#Xl
zmf6AX4fj01{`RIhG2hIeQ(v09Hs`$L(aHZRjG6rE8}n}X{Da5##BF|gtZT%Xmk%6W
zdBmyTWo2Yt@Z>4)uJ$`ij^CR(^xY5E?o6AW-S*kO@?(}YH7s9#!Q|(|@9bK&=E|c=
zPaM_!b=H^@JeOYcd)HN`j;_9aXODlx4;gvmslJKLH}t&p?^Unw^u`X~vS!1QX?bf-
zxu?hGn9UC@Jn9c$7u}ir_;ru<Xt-z6;K2u`efs>UJ&qBN{r&#NlKNYG>-O!9+x*=%
z>ubvj2993zmpcX|?EWMp|I#-<N||1D+YheG{+Yh?hJxggvBuMVOHXWi<d#L(JpXmp
zX)g`=(^Gdoy6w!jzj^kw4<|qW#Chk`=eL#J^@lIo-<o{OPt{M)&bt5g>cPWT4EXG)
zmv+Tpo^bN}XKssIzOHR$;vZJTUO)fCfn(0zU48Z6J{>#lKuYeHkNm55!#y!w2Hkhu
zl*Fl1CUu;X_x0&(HZA&c-P<P>optWg^|cRN^2Nhb?wGjwp`wAK)7$%;9Zb3GoULCU
zbwhaQ^$VXoI$`#`4?nZv(W!qM@!%=%eEjU`uMWQMyk5@?d-c7_(7Cb350Bll=!~nD
zJp99Vcb0!O_{y_p-S}+#3E$t}_mqkEzxl-tf1G|_)hnMKn3Hw?vD>G&&bZ+7W0R{N
z?zt*8;mtwrkI&wE;%7h2xpZ910>`lrFYJHntoOffdUb2B>*s&_{ry|t`SZY2%2s4H
zPYYdhan5m`i?X{fJm#v}^|gQNwdI=R3(uG_C$QjywFhQx{pPrY+0)*6ZtU(ypV{!m
zsqcLJ!;ROz{`NI(^)F@gdgh(n{Ego=4j5E0<^DJSQul28rtFt*UDxs8zA1^1{PB!4
zW~Ak=c>m0+pa1Hn-j2_HikT5O^~>!sb$=al?8C2YK66n2+b%yY<MNAN{_?%9&rVOu
zZ2ar<@nagFxunao9t)4@K78!%OZ}nc>+Wtix%9<PpBQ}0D;Galx!{AJ=2VvWjyb>X
znuls{Kkdn9Hr#a4hOMVAyyccrtFC?h>({={==N6E`1qUWZd&_MTJ1Nd_nffe(VfH3
z*)U+x)m_^@e{=o^gTr?X+HuB%o4S5Guy@UuUp=^Q(6UK)jXdJ9NrR@OO?oaj@q!=P
z(ms5-<d%(r>b5_XUG>D^yRRPbO~uf=?|!oN-+y0o$Ckd|x5SUB>UwVRvoi*Nv-|yz
z?wPgZ!CB28&z>;nqtm@#+|zLKj+@qQUbA)0IXyNnoImB6rN=J3?b_EjopRYbsc(Lr
zF>&FBbI%B#_tdWCMH`;Ja^V;0XZ4)$?2&DspZ(o~`;t%3d#+$d%enm_eAn1Z-YG1-
z^{1}+i5IM2*!HJc#g!-in0{OB4}V?r^80_-()T{k8<{g=;_sMy!A<GEJGPtmh!@j`
z4S!|ix50L3mp8dyD1T-7ssBhiuxI3%GwORRz2m!4f!Dt4dS2EA@9w#I^`EYKD){%a
zcfL7n@imYA`@-kLR}WtBnXAX_o9144$vf>kPTIX|RQ2s|{^R`-^?lj~+*ew8;+>DC
zjeUGquXDFu5cANODS2xu@A~t4+2M5l_k<kiM>=C;)&Oh1JKmZvyU?2NKh>JwJH?ux
zeY!QDHO`u^pJB~s*~D}AKx_N?)2w-FqBZ{r^2{pVhNM{Y@qMlN72~aWZ-+JS2@~pm
z7UgvUj44*}xhv0__rBbk|6#JV|5Y#<xAI@t!<x6+l<!>-d8_!JVdMXG8(v^jUdKTG
zTls%%l65?%Lp-hQ@43X<eluKDS=nD<Q?K5zsV7gug``z}9++s&&ppAKPqN9MVdJgi
zS=P(i&jmqio(thxrMKo%Yx|^tHLr&YL#y<@In0{hpKZ;%gA=Rx4Dnd=t8MD%P@F$m
zw4+II#bw2NjkM-ZKz>@qXS7Xycr?@6PrXgL_9a>Sf6yl1#^hN0cbsI+H{0a%yEggo
zb{}g$Z8rV?%V$~JzhYBQ`oOifRsQdW_*><3o!8p_`!lThl{Wq11Ls@Y-)A$9Y_Vyl
zi)`9y!v)s<lc7JiN^d4Cm{{@erdacDZ2FVI>DKnibFF!nO+R^@O?z2tQ@)FB#-lkl
z{wtvUTBY~oTx&jTv^B4{>6d!i*!QrRZ}f$6&@w*I|6B3<Y{vahj<>dd!6seVHuZDR
zY1V$SPPFFzp#Qgu=Ng;#GTo-1e9&gz^`6c6cmdQCt9I(L>6doe_<zMF-#&r)<Y0?_
z`(+#dTWrexbenPT1e<(#)n>jic$jrQZ-f5dDj&YIspmJ_^s8HXTl<G*XE-eKp^Hs@
z9&A&uuD9t=CPL(`;{Vr+t>ZK1WNY3r)tYy)=_gm)lnbuMS=Hy$Z1U{_n|$`$q-&?m
zxZH4=b-3g5t@-md^Xd|te&YihKQnFYTWr#Gp3OM^n9cmK#^(Ai&Zb@kZ0gmIW32Px
zK$<ll3;wOzZ-Y&}S^~ya_3%WS`Nkco*5O`cGcJ5<<7b48J+2>G)x#XPv1G-6vhlOj
zCZ0#w^glP)^kc`_jK2^~!vSyob+Ju9_O?xV?XsCieUN3H-oZBQdX`N;dCCxLKe<6`
z{yUp-^Pe{TfYZj$9GiS7vgrqw+wjE>>v&GKsm}qM{(ON=KQqmyUB7HI?q6Y3e=f4g
z|A{vB|1q0+X-pUE^rk=~wW|Nmz&ydK{XPl#Z^a+7X%{s%<+a8p|2@zEtm1QwO+9%K
z>c3Td&W7@`;`iIMyVVKS;jXjk#}?#R+dpnoPxji_A89lG{@bP;Pq3L!UNhA?KHu3~
z&z^5H&m3zre{QnzbBRqmeaj|Y-E7MDzV6oXKNBj0RXGl@@qf}3Yd<+Q{cw3-Yx@H*
zj$4Jh-DX^Pc%-$TZ*1cEh)uZTZ0hH$rPlt>I?|f&vT2XqY{va>dRhA!2$^8j9(&ld
zuNgM|{9kP9|8+LkBLSQK^UF!r@p;0gUZvUa56`sr6K@ml=Qic?M25AWr+QlRVi;$w
z>eZ1p_39&Nk5=~Yr&#+>u&IYHz&voU(Z>kRgN_AWkWBHKkqmnWi~&Z%Y^d<?f^Ja0
z;0xNvRZIJQgx6mrx#xA<8DbcJh5W_WD^Q=&Ps`=f56(yNwI048FP<LFt6_dC{NoL1
zc)db?HbQ?p2zKGLO@$5tUh(2z!>E9M9_=0LWPDzR@eE&=k0sti#%C1yndXta?S9Fn
zH^bP{U-HI8>A>VYARzSLm@2us$Kn#mXXG8LWw=Ahe;DRJ$XkPwLo*bwMR5I#ydYEh
z=}Gn@G9-7rDed2-fD4b4ygo_Vn|{7MR&wKP$)}M2OrPYf`(?TyTg2-)N>>5dn|_X=
z{Eyf2f9PPD&&}gyL?Afv3W3D<x8cSGc&!ot!d{6W*0(-}^CW4XPCR~uv~OH0)Aa%I
za{`jr43hScE#lRM>QBwq=yLh;9BFSnE&Yt6e5-+R3gc7bmiA5L=Us|_O|JAmfVk%r
zS>LJ)qU+n@L6X<ch~{HqeG>h5_+&o!l))HVr%PTuMMh*8#pjkQB=_`^;hObr2DCf$
z6W1W)f46jJoCoz2>;Hb*)dkf|yc*_6`?}uJ4^=zkB(iUvCFAKN`<sSH`$np7@+`x+
zoAR@s^7DR*=P9smiSem_RmP`>{4k2)I#;Ck1DQ~>p6tF@mapec$w7)&06Kq+e~{X>
z8ULj-q@PdoqVwk}oAxyf`WXzj@pm#JFbx*3G-_Y1cSw%=o$&P!81F<saH~w|a*2)I
za3cCKX2}fxg!q#aBzI1i@yGok_}UDeC;BN!mwsRxBVKPn1x7ywkIHZzWPjngl6y8v
zem(J%T$0x>m)vX@Zw;4RumAV^C3k!;<8wWQ8)MThJ|*6Gx{T-LRDb$XKk2{$4qhKq
ze0Dozx(sT+XN#k7?MU&-xK_sVPU3@L{SeDB0~a*l1>J&p-9B5!v-JlV&-L;&WAH?Z
zzgO}SiqBHYHzyg6AwOwU|J&}74$hXR8ONO}<Jp`i(`DA5E1}|H{s(W734&@YUbXNq
z^12$y&3-8XPDHNv&o@)Oa^5DxJ(cq3a_X1rmPvjZ`9FJ<^i#iCa<g2n0=X7)rTzjL
z&zH!48pWrE;v<j3-l9P=KO3n(G1E0x<a3<ixJ6omnRu<VX{VbZVrz|)jJhSVTw=-o
z=@QC^12Q8Y>m?7AXGv~+B;!xpfDJG8S8;99ei->VJzuu#xVI#KkNAZ&E@WuqLT}2?
z`a0>qm;5lknk>t)V~I?cSuXDsP`S`}Wah&@@}sxw`-`Q$Uf=#=ldiL1{w(GXKAAst
z6n{76XC3u>Fx?Wb(n9G!?mL;@O5&fxr>JjBWCej9M7+|6NM1+%Y7gSCl}p~{km>ak
zUk9I}9%}XTPeUc|AV21KmkJR>`)1sL2d@HZM_r&}N3QoLTZ<&u`;Ak<A=(?Gq=P2m
z4ffexCV9aQnO?Kpo1uWvzHNlGH}j_kqzLyq8J`=;&pdO!w?GznJ@FpUu?YL?CBK*W
zCt_TH_HwSwA2?mS5>J)&q+pulX8oU$O7YS9hkJvveg<h?v5Ne-kCA@rM@c_Wtm5?z
zBoN~hw@}7s1@Zn4$qVj~={5WR_0(V0P=5v4D_*0h|8Y?N<0gIy^?P+bnV)Skn~c|B
z-j3n6UMBsYLH7M9o;BymcqS3=L*r4~rP9w>d73fxJn2X8f9^U{^0pe8E*e&i<7qyV
zpv`AuGo*do8`8cH#q(|$M{u6j(O1UDOji#W&_%v&lks_z;<*Si1bH2`7qdTn2NHr@
z?*~@X_}f9_Z#Jdt5y}S#^`FJ$|5h50+Gsp_koebSvV4QL$#|Om$%8W~o?81&p?vUY
z`7n^$my_C;^kx`;uxY<bX};$vlM&xf@kz>(;kJDs^Yb0z7Y>zNZ`TVcA2Oy(KRd|2
zC*@ll^`9y~M^Juh?dT#nUF5UY9^1~ByyhhtpOfW>@h<hN9-1#*MdA7=T^Te`A0>;~
z_*l$;1{iMby6nT#rT=ES9-K_|r*yFFZ`){GOC$f+LHCRC)Z5FIG=HmEBO^%D7h@Fp
zY0<{V<u>i;MCz}C)L+4{C|<49Kg11`C0j|nD-6syud7)uxoMx)U&b>?*Fk1~eg=(?
zZ98SgnEm9bmyrE-$s5W42I#nif0|!Gwu{&BB5AL;mod=LME(qve$49voIWA1yFq5~
zeDae`<?DQ2=9{_KyEoKZ47ZKi#SLV?yRR&l4y_!^ZQ9q><iB-`^lx5I?WB2H;{%eH
z%j#=f3mJ&>kXmgX(#@uQy+r+g+c`4-&2fL=Op50fP+{QZ5dXqneQGZSG_if3csh*>
zjenByNuYXILG8U>YwztDGCz%XrM-C_eS#>j0Y)9ww>a{DH;rGOT4@j6j(Gi!t~VQ-
zBscTt{6y(rZ@<qCkh~^EW|%o{J_hl`@{PM)a?{V|$+8?9X}$!_Q@rMQW&U_*9&YB_
za8aKJ7#>=OG2=gv+I#b@GM;|&vnf`F+qhO{*h1pJzg%*I>bWc(!?=_BAALUj9z+}C
z@1zSIGhKb=%5kKQ+TBw!dBy~4FLm$AdOm>odo-RJcgpxQLj3Uc6xGkVB{KfUQu&@R
zLxy|HR0+)M>%Sf)^QT^`=id*N`I({RCob=Z{9hscbSMAc=Sv>c+I2gXV;o&@*Na%f
zK7S}6oM+b2b-LN_Et@6%IOzK5VX|)?B6%~-_aOP=^>0X*Xuqpu&JQA9O5<`Xjmyw{
z#H*K0Kd|Lung0e|kLQs6P1G)0w01FVFojF~6%5DX)j{>4QS0}Pv1yNekCW-Dq4g-3
zc8b@rHtlpOmE+U7(d9T15{~(hLGf{tpI#oB&k26nKbZCZ2I^0o)SsC7bDmB6{ew+=
z%(iJSr%`)veL~hlb6%H6;nvgo0Js&e$yEP6H2*g_j&E4s95>4JJ}ccB<7u9$uTMT;
z)4rauX)nJ!L;A0$`E3=&C$50Xg|4&A{^8<NWj+^NEbH4Cvahyj7dOGUzZS~Hu~gc>
zME2{cJ~(E`2HlU+^+(|!)`#i(%Pg;xhsyZ0Y4tyw#_@U@$1}+P8fagbZ><&5|9Qk)
zsJ_)FOWue0?KECzXxFn}4VLM3(0anjWPcXKU#xFXJQIj-bjWaxM(KZ|?6!=}7fBwc
z&4)LhOX+$>X21>dG~>_E5HSAwx^6zrH)?3U0m%`s4@5s7XLQgy0GNr_<E1j5%``rm
z`S~8@TL<Nv*`M4%^-W)o9S`HRh|gRZPjD+<GpS#yr*i2<{5`4<dcEzQFZ-EB>Sv(4
z5U&@<OaJ=1>(SKi8nyBYLA}EGctSEh{V3nEZR+#4G!DjV`Oux}Nrq24F#FqakWr%k
zjFfyB`M++Y<odd6vADh%U^J3{v%MHz@9eTbxwkUps|b0$hBqf~qPNIj;h$Yn8S+=;
zO&n7iDEH_2W|sQpG3z7V!fKy)R!O<9wB$-KymV4hq05=*EGZ9FR8^MD^LvA#3NHjP
z+aGd`tniomeU*N1g}*XTRZ-}#%${eS;&SCKD4P^0^5>KVOI?YDfpUmgdB_WnY>mo%
z3ugMgzDm;w(^5FcTUb2DJIhy63a*PvJ@d*egUE75+>XpC59KWg`ja4%5$a0LE%cT8
zD)30>qv>Ny13rth(nbdYrJc=PPDoy-U{XSXio)VFUsWhDyTVtL<TSrc3WP8p{AK>~
zkiTfGue5X~q)f(bsyu7tgp!#RzKR9ukXAZ8c~Y*|<r)(Vj;$&$43z}RvoYz?Jc?(z
z(3N*~b+tF>uc!=Q>4g?}=Q$1UwDS2S<wf49{>rK{Rh}+qp1-^@AhN>DByVZS%tCKv
zN%`zjzc*A|;rA8!i!i+0kjq(F?5pq>VcF-13c(#@XJ;kKlpKboh^VYo$(XB&$+;rL
zP@sHrMXm@1u?GUmb$XR&tb~VU&-}a6r<E7^p(^`}WNJlCbW#!_K#`ldp^;vw^VKew
zD{-c;vIMdqQ~^=SDTLIx3L!?`P=&7~RGCwlo$G^tvR#SZiC$5kE2;`Z*=2>0NtjD%
zUTAJ1sHZOY779SM^;J}QT~2Qalzh$=+1c|vg;(Txv(c_(Hh7eo`?G_0Lhu#-V5zUr
zpIuhw8DA(G#LrJhI{5JggZ}cOY;uwtN_0Zhiz@>2y`}#0*`Z>uzoH^g0f#ZJSO}>(
z7x_x7{3DC}kWGOFs@z@Pyeg>QGOZKJT-g^+O7w+NWyWXcy1ejzR7E+=6R13~ETThj
zT7>|C$XLq8nVp-2;T}5lU*e2n7TtKwmY`9zh5Uj%#e9rDIL4Jd395SXcxa~Z?PpiV
zY^ZsGAodQvQfi#8tdh#0FH~4O-oGFl3-^j&R@JbE6hcQ3@@G#iD>KUcWre{7;0j73
zUDcDoY-lsQny@6K@D??*YL<PYqKwPCi08oJpN>sA?;?m-<P^WJ%qB!?_o<};7!Uli
z@5BbIT4KZrbObg|WMA;>PR1b%GPTkl!t%l9<w`cY`rN`|f0@s$PnO4`YGBe8CD2BK
z(B=5$so7J<^E4f1=axMaqoNH-$<VRQ%ARZ)!)(taXkV_x$uRKvVECKlD}&lHJ~xy)
z&tC{S2m67fJl|~WJ#*8b6Tm@jBn(J$UNX7jB7a56tOZmku2gSs5PH`t94Ea{ef&8S
zCP8JJU0x-6=|t~@089yjfzpz~1=)$v8_Vuc%tNp@67!eYRlbTMudi@!RY`?k<t@Zf
zHWM|RLsSkqs$joGZlNJmmq1yuFlYu8#e6dZ6{<wjCcyMd1^CO#<Zz;yE}}`bh|j&E
zs!-Gcsz2jdzYbj!OloAWF0kxKMIYsLCBft^G7HQRvlBMU!;(#vLts*J#8gfeu?qRB
zN<-fAs?t(#2*&uF+(d{oX6~hvWE^B*>9X{|0d`$D>yv%D^qiO^=2#F*e+Be-nU$6P
ziqKdXb<r3+UIB0TKjKslSZSd^R-g*zzA|khyfg?;%=%m|8RRUS0$K&?N)~~RDfSo6
z(ZbRLN(}{Y#>j#h3Be$8Ib|p_@i3*Vd_E4|c{yXFMoXCZXsLE3mXy!(S9mK$uaM|0
z^Ud*lXVI*XC}w|JW<`Z>!Bqb&C>&^&uFQ(rFzqW3WkLJR&dyC1`J%P%6ftOHj5^OZ
zv50NjDz7$)NiZRSN}m^q%qz2T=D@OnW)3cANhRh6HVbhyy30?OBSt_@!r))%#_dN9
zkxelEl>{n9Uol2bL~$e_M{vaIbUZOHkN1ziDBwc`0*#;q>XY@b1pNvYGIW%39D+86
z7f)cE>h)s!A?0u(gw@+J1yjK?DUw^5PkG)vs5-f+Ue9=#`G}UoGKY5r)JzYjG5xYG
zOD0W?sE0ZFh$(4hWy$Pv971Gm%?%)>j3~>_o|+rT6V+JsPc-AUX`jD1?l2QhHfK12
zs*pD@%Uj_qpABuUECdbZN-cRmukx6ahNw<26ri3|Hj13FQ+byFx#1L3Xb4#AEySUs
zQaLy~BbT_MV+Gq%$OJwW7u_Qc(2-dqUTGE0gesF$ls!{4jhwN$p_CBL=yNP4^3b)&
zE?IWi+B6${!<-w&@l>xC3{?4|nKN@oVmOBv3>3HUmh6SvK>vqQeAvn5%$!h`Ec!Hc
zsI(MPA?;Y|WLVJ9R5y{F1MQO9TPJRo4(3_8F9b^-X}+12a8+0(vPR^k<Vk6Bt6)h3
z)=9h}aXF0_-{x1}EWGlxIxR5~dhSY?-BcFAOs6<dQW!DI%M}@Gz1||T>`(?5O@b*A
z&Xs&KOXj(t_x2W+`YJ2MkPFRd7VJO9xpI^hUiMX%SQ^R*xsuKK-FO&w{iQ4_mWOh!
z1~iL6)ewdgf=KLg4^sQ1T|@P7dTnce?k}tTx&J-2|B%7_hP7W@K)7;Y8snQSm*ngg
z??kU8X2q^_UwL^T1Z_YpD^|#Pf@srbxnVyM=|`X~{oEd3BoVqFtWa_mD1uI}Tu@f#
zhlQ9zHiM9na+Oq0D~A~2oVwFNn1_ftf29}K-#Q&mE%sH8t?>IV@>P`JLMW|QvLq+N
z!VawA;F=Nm?|gh@)bU(s`!Wz5<N1{fu9SI1Q&krL*$2OnF^A{hz!OdEf4wJhaa9D1
z34zMrES;`o7|vl~7A^=%{V<6a7mMkt{mYT{S7#=z2LD!Bz*oCOQ*utKDw_$b>QG6{
z1yptIB^z1vA&5w)Mbt>mQ0ttBGUJzQqB*3YRPLKUM^#3H7wjUtNLQUc74CqrL}
z`D=ODr4F;^kQ7zjv-Oq6pEvI@M_ibDsBYr$ms*k_k!D*v#JoGD5S9ZfU||fFYIESy
z0N&7x`J=lSSGru<y0|1%JjQJFaE@BHfXq6?>a<wWNTSY*or(FIYR-*d)gyZ8Qw3mY
zMkOqY$T`8c0A`=jHAx&v%JR>HdBfzHS6CcOo#-zMR4f=9s3?oRqaoHU;N}9XdxiX7
ze|4c>EQnB(gE@MUcfQ{Tt1F=jng!+MU?%bdU@R8eG$>q)m1FYetST>=TP1E6(W<0Y
zAF+6n%|!v(7w9?V`V~%iKJsyox>=Zuh>YZi#2AD$8CUFGiJk?Jzp#jvTN((Nv%*d%
znFnYxFYYj<i^;sU43H|P=AG9EA_pb}%4f3%oFXr(y@7fD3Rs$$kC*7G4%1mKrwB_d
zW5`o^L}1l}s-e0sNA=Q`9;^tI1)v(hor-e*Y%$TbbkJ$p#0(kMipB)+rdBmK>6}54
zuJ~uK?WMp_A!;Novxx{&(M5Vsvr7KT0T{DZ#6C3yy*e((=LK@by&$o&f(=cq(NhIa
zs(>}hk}_yVxX=N8+b_P@glUs2+8b|u(MM>ZaVD&umV2QY7Zww#@Kfs(EfH?&L>-+4
zlb)ysZ`OZj<er<T{_@^L)Xham8jToo?r*M8N8fmWPL7%l+^@`qD|lb1s=^P~XE5;b
zt^n-xMQ+QU09T4tWmijRu=ocR+PtmI>T6^z&%(s&iW$}j%}ZT=c`U9c;24IDr(uPl
z3XcWME~LpK>M|V?Cmh%u6W&JBBwQvhGHR5<`gkd<XODzB4{JZzOPi}67Xr_o$Ctrj
z{4B@rz0_L*Qz%?_&+(MXaV9t!mPg>OLK$=$vZ@|*z_AQZcsg{5U<Qi@a9dA=X-?p2
zp)b9p5*BIIG9NAR%GFp{&a9LjQ{?z$k=62eA}m99%9-r!JTG=*dAU%hT;7RQuo{0j
zcL+pD{`^xQl*A-|IbJ4s>Gt<5=rZB<C?rfa2;O*Pk^h2oFf~87V>FZWOU|PUdVS!_
zGw&U;YebPIvgc_P!FSFA#M-)o%j=z2X1%Z=!xNKO8BWxaf!Pja8@0IRH`^#2Iz$`Q
z789b{Xy;>{+o-5-zkX~HQ>kC=M)dAM=vlq<N-9ECKC#HlChW=3u&QR_05*A6c6A6Z
zuAE*^ZUD-LUtX6~ddmVuRZ+J?QpAdVV&eQ_U&uQjX2rPhiQ80Q{{w6_5HkolJg5yW
z$=DOSQq9#bh*fbQFo&+1t!%-23EciM_YKX9z7DrK1G@^`fl=tkmS`V~#MB_Ha`|zg
z4kx#8MmgLAuYh|WrL;?7yrugTaPJGqgwBV%hB5?w)Y48(O!UAdMP^}PRasRj1UC*A
z8{Cj`%aIK4a<Pm1sL}9O`<2t&yj;)Nc^=wW5M4Sj%)o_HVObDs01RKr-axQ&<h&C9
zeAsA}4Aa$8F>&##tr~LG-7>QhRY^nY;r42wSY=^lKJFr@Wm=j<9Xa@!*(v_AU<h{>
z{HoJ&e}YV>Xb|QeC9&<tYGH_7h92hqiR8-pz98IzfIPw(6=rF~3C#NOLL9aXWMiv5
z^p+sFnPIghXgrjX8TZj}13aY?R+-F|DDButx%0z3ianuRB}<N8y2E}Tl=fe^Y6tt_
zMAF6ODC9!aa)ep!TuxEWu+wag->T~Ir9?~$I8{6LU?!R3g}a)m-c)&2%s`g77?xnB
zvU4T!>tH$EN-czSd4FOWMsp%;)=Eqjvv+S*d1X})?_k2RJa(S2_UOyT)sjGkbmFOi
zYo$VQS07SCHwIkk>IOh$<v+SPSuIUdOC%>F@$Le2wbAEIg<*>9M2k%VD~l`xliU#O
zF0<XN#_FfGzfIgM%)_M#s2gc`%P>WR7@2q3pKBYOSP@By>hRR;GtyGDm>h;Tx|)x=
z7YLJLbuUoWuyk3&peukC5brFp1S0cI4D9^=c%s)_qrs~bzKWd)yY`}LUMki+uvazD
z1gho%fSO0xs^)d-lG2NOA)nkYHxK#>C+wjqs|tzfCLYP2mp!d?UN&T<Cl4+v;KnXi
zhfZD+N6PJb!WpdFV6W_lyFH>>%dt#!3b>*Hi+S^1INrgh*E~U1yOCKCBy76GnV85Z
zaV>`@BqqTk670|s`+>!om2!PEdn#@Y(5I1;^cLny!uEu14TGSL3Zs<hg5kD23zjea
zxExhv#z3u9B)UY>Cc%CY^O&}!AS(1j=ftmXDU)p)3U&$RR!95wJ}WD#*yE>Sme7?G
zED=R6i&%3M1&KY^!ImYlLE}y<ab2eN<*G*b^L?024_52oCZXI1EcRy|Vq+GpH$%rF
zcjCZw>Svye%PHz)Gxkxfl;!3@5`5@WRQe*JM6XzA^BGr)xd7s+@Y_!MF#E9>QZ4(!
zLrgD!K@chxaLdN+t1K2BNlsaAC^_gWsmK`*-+qHsVb3m-1-s2)bcpPDhRq3ZH>WV<
zf=xZZ;U*j`O~6Hb8K`8q=~fKiCgqOy!q(`Cfg;*VfgEQ`cmVD|!yYBEK}>A4DDvV_
z^N^<k7uqWP$jrTe!rjP7V$E&Z6J|QI<=X|3jhcB!jQ^7}kTY0kXr&3eeXJoYhdlFO
zOu{M*cV~H7r^D_IxT=EZ3!+LI&CIfvr@$y`UD}EEr7aFwmo~VwEo~Q+_`yp1e`i{#
zO>6y+xSopK#jrXmMsceH=FJMrW9$K%)RK9)%Rfcz>5qgZ#wdPBO&6k1P@R$4rOGMA
z&l$R4cLMBun+H!gm@|^-AXsEw>CsQuh-HlENXw-S5nI?9sBV0jSLK*4YcG@%djpDx
ze$AHXs;rtl8}1r=qh_LVceR|v;hOeL+{O$m@N_Za!u^6?oS+rM<(t2>(hECxXg99y
zNn(>>r<1%%w349}wJXuw`;Y@$fy@jzxRG@|bS35b=J|_EcfwT=W^C}>kQXMhur>=T
z*mI0%R8=k@^fr2s$#A<8*IzQTCiv(1OJT<?%-?aFz{~~SlA`KJJ#eMu!n$|qMPhe3
zULV7wHEFZ)k(FQopJbT_Pb%P5QRGpU$_4NMT$!8_mxyEOFrzJlXWT-5t5q#MWj5(g
z$*hu(dEwn-e|5+%GdtO_N=5rHUFq`CxA_&mAUqsY=B<R=eRww@5BD;||6*4@n+pDl
zYbomme=)Kj@)iSJBEs%Hw&1UoqUf7o;#W4A#}yuK$u64hSF<#1(b|cyQ@K0<#kv4y
z2mUm#Ty`%K4Ho8kuH>oy!azllIq`C(6joK>(q5jZZm^rssu)FN((p>YNL)`@1S?Ej
zNw9DTH>|vJxzOGh+z2~#!eA6l@{X3zKFGThFow>A`#MQ*U#Cdk*MWK`Ly6?lOlg%K
ztb!SicsB4iEb{2iG!iJkorNPkUOQM;515PNv0rdUC=%Ee5CS{|XX78%!Vc~I3K0u+
zx9Ko_|2o%V=yE0I`eymf{Q#mO+ve4;&#7Ot?qZX@&~OhYgFCfveNHHIgPtD6h1G2A
zpP=f%BKP4;62VMO62&RWc9X<p`$?iWWIag)cfWj+_<tjDG`d6@N2k6vY6>7KbM!3=
zHEYv5X`2%}hsdtY&3+}i57Io#odZd@XR`#lb*~?Xi|E_!*n9u%G(`u>PXF1<AeQY)
zEV3NB+X6Gu{#3^$=F#%W1+j|-8xjmIEPn7HmAn(K22EIO6D#`%Tl>MqhUk2vVMPo+
zbg74f5AEaonFVc`2T}VU4y%uW4kH;Ns&YfC3(+%F$d>4Opbv`&JzG_7%c%dC#8%8i
zp#G<en=SZr0EtfZHE%*L+%$ysS@X##oZO^T!g7hf9G(Iw7kf%wBXQd6gB?62<*-E&
zAK$W^lenC6*Bsu1#iuY~nkBws4Zs?fhIb`#K8255sVS9OUX2`f!2_u3xX53KmbmMX
zjQ0sGZaYZ->{+uEww8|J>o);i`XtzuW+rlGRT<8CD_}iA#O;u$CMM#k<*+Bcs0422
z;5<-l=C;nHB(IoM!wr^TRj3kI6(c)5wQdU6?C`iNsR}mN;7@13GaCL7?c+9Y$;nZV
zR<&T|DtZ+JH4KOFk)3;?cT#RTdnAvqMar&2gch|gSe2rP3BO|pQ*ZIVa11Brd13QC
z^eh-_%yPXUbjtDMO1NJ0YPZ}WwTa#=xxLxztE$eyrL$~!;Fccnr^H9@l+1%m5_zLY
z^kTA?8k2|*gPKMYvdW=pu}c6r1@Z(Ny;(P@cN^pS8RvDX7)d$hP^Nfm0VnvbbXc5*
zhe+V(L+}bp&cH>TfNMegMF#P+2Dp_~JU#@Q9?c&)z-kBi0DG#K4eq~EZ6)gNCe%Kv
zscZ!lt~M=ZB(Ul|5dwmGB-bwC_ZlMW88Yj{>L?a=WNWrut$@oYyup&3D}GsJ0`A$x
zUjX?flS4?Kx!=!ftprQO-2P*C9D<Er9UR}O!bPtK;5<(*Jww4mjmas4c{o-B*x1Ks
zis03H=@{#ank&NhNx7MJbHz;ixuQ5^Q8Ln<?OYL;ki~)`K6_}E-PG}-@H*GSsD+T!
zs`4QIwhLw$-7dk0K_dM#Yptn83(9?E0kva8bZwpmxUIIN)L$VUa-O9&-#~YjC^K2!
zYoY~nya!R~5}i2ilfe~r(TVFFLUj3b9$0B)jV|W@jf=o~nZXa31mJ0Mf5kX|C^H1x
z6lTKmRHeGLKdl_LLYp`DaUm~F7{kNmiAj@#J{X(T?*@pc{NU;8bm#-<exDq(;4D{~
zBsmZeZpg%?DR^QY@3z7KHM=TMW$x0jF;0QuAD*l`sKYeu>2QC(*kYpg^Q068f(yJf
zP-9O_k%Ml9){g9!4_%O0T62t!W^Smo5;y;AKY;_c+&evXT$l$fH3fDMM1CbkE)u(v
zVQCEZW0i;G=8#nSF_B-`hhc2Q4GLHjgaR~|`mk$C#Jy_fL%LA4DK7dEzOOp!6m47@
z3BOl?tJOF<$sLN>;whh~ZhB;9`2sYGJpU7!TJWL1b4M99ZpmH|2S;-O@z=WPd_DHt
zT<$>XJNIekv<PzoM*YKB#p!f86iM%4FG&3ci!9Iz&1Pa=ql&8p^N|$#X(atp$?i0|
zx>8%VaRbj`<%Yh88Uhe=Z<iDH7JB726pJZ1Tos84m^2V$N~eJ->UurpN`FNFCctu)
zBi-!Nvc%r0arlENmQKVqMKX+1{-`Zo`rx9^HgR~tjz}BeTuu*+js7C|U5ZM};5uI@
zU|;LSscVrwcNKja%_MP~sO<D2TPL(QCW-3?S~<p@i@{>}A-l>s@LYr+x7x##(W38_
zr*(?CxL_&487JJ+p*3Y}5$YluetpCIxwpuAH@0gz{ep27Ht*6?*)YKl$}vfME}M2+
z+fBo0nkHw6!m&Id>gJf>a(lHE*RmyY(J1m$x=zMTz%o0q!qcxTILiZpV3v4nGutEb
z3J=PM;c=-2pM;00EE|Bd3q`o25!Exo9M5X6RK%+s*ko0TjA<@;KO|yCvmf~oerId!
z-I;MR%;@M^p|k|HBpoWosp_Fgu}+l;*?|w=dFgj*VDSbV;;$K1$X_wSc3BE;a|4hQ
zE~ji^qJK1pnIsohYO|*)3PInA*DaM|w<9W<_2|fzh_e>?%pR;7h`4t0iHV~!N|GDk
zX8X%w(^jE;awoFu4GKW~UQ>F+qxhkiWcG_;xo~|WAm0NQU2OP+Pn|bcMq-iZlCgj+
zDS=-Rg!_1Qv4O$hmqjSLOp|d5BdXv<%~Q|XkabdFu<B5zrIR2|Q8@;ic%nl~gdbPI
z-wvY!gMx&gY4L*Z=&BTbl2}WLHt5_sSUg=Lu|`k^RxW_smD(0j?d(a=n9M71Xl5h5
zyvT4L68UzyWJ8(=KOt1%a*1CO3W=6OQ%^Y0{3S!1t6F&Y2CizW;oj^V`29ol8>#ZL
zf=2%wSae3fkAViOB0q8oCy08keqq85OJAk1qQ$#bSJZkau5-?mKXAnAE$a?tJ56R|
z8?M?!E~g+u)RP|4QfvUmpS=rI;4cTb1C=583T;w4$Eygq8y!$nU$|Ao+pFFKKxOx`
z#s^RDnK5A$^GkHb5eD;gs0}5M3je{jZo&ku(<3HX<p=-a0T`Iz&ZPE`|IdB0NIKCU
zlb3^uIj_ZHCc<uk6H!lzsz<{Qb5B03J&83inDWDLoJ4z=f3_D_XD^ZX!fXZ$@;6zF
zVU=eL{>&ZJAgD1^o2V(lwtjy(j47^UbGlL_x44Tz7vrTYoY?cxQ6E;EsH4f*)wq{r
zwqN?s?rf1L&y_#97a4X&UJRRaVJQJp4tov6qG5RmehUbesa&pUur-H{L%sk>Gq>@=
z3YB>P*HVA0OcP5Jn6PBnB7(mS1&+-aihhmt6)*~=kHhf>FT%x&6Hdg<I(TTq&R0*?
z9&Ilcz}%HO#xqStLfqbgO%Yf+&_JUf=v5c1&>>l#srv0sK3%C;_GS&Kgk?q<hNT-=
zi~{4ptXc3O&BUIi&Q6^;H6gpmT102N#HfJH*<k0C!*~l3vp=4W@e&VhVYy-<o0pVg
zrCs!emMM}K2KcmBq}sZYqH^7pE~fdFaPuBl@*}}OW7dAm5UMd$Dw;(d-p?1}&(_E#
zZMnK)A2w#GOya?E)LDqX&@F#uN7{)hkDY#^?O5g-S-GIR5Z1QKVfL-Ja_iAe{`4mN
z)--i%e1__QriH(IAw7x9LM(=;p>q<}oWvBolLd>E*v!Ndq_T0jWS=6sH=0z4GhiKd
z9uyaJAFk*#<PYUV>Imc)4zz5ND(i)em6}Q)DjsrIl~osbs56sck6@>M^H3+HLbS)o
zjY(7xs<aQ5TiA-DPZPbg49zyn5)<VG7c&@*t72v&Qd3{KW_|GdcVX0JoX5g5&iEjR
zyuLd`BwX-gOquiG$2TL-W**{<k?`w}RVAg7Ys;i0^Os;*N}_8AUb1)-Q|ymJ9|1p{
zcJN9mW{0ASJxE#56!PGi#XTJ)PUgf=%ow5P2dARLK2*iV#JZBAt;94@q)<PCNtXkM
zb_xwPBfaLPIoME-^XiJ6vGbsxh1i&@Ut&c!a>ujG7tDcS>gP_NvS+HBHJREyQPgnb
zN8Jy~gkN5OmnV0E7w$CTZ#&Kl%)$L0us7EWi>hLR3p2Ug@$8u=*JxZgDlfv@eenA}
z&{4{1z;D`*!|41!J;TIAyUbS(KjQ|g0X)Yb|0cslA8xvrhv7+I?4DsxCTa;hBK%)m
z@bFX!Ts*3WKj7wfbit>~f|pg3e#J5BDwV1TFtKF<V*X!WTeGxCqlVh*^5%@s&5{w$
zhK$Y5o=}zuw*(^-$^WehP!n8zp94+4*dVK#G)$qRN*1-qB%UF^k-td%_~R%Rk%N_U
z83*XD)RKkW<0&Eq@=+vQna4#Qo5xd<?C&OvL(u12Jd6bHEbk`EM^g^|c*_5cwAqx~
zFBnQ9Lq!TSKrH0L-ZTLFTvero@BeL^2C$gbrh#Pm>DMCYoJZ1P1oYW%cydiXllOn3
zuwl^!9vFvqgik2sc*|hyNyDE$5<mG18~k7#h4o55Tz!?|{cmv(7=Ex9_f*it<o{ll
zxPDcZ$h$K5R~!;k<N__5@X!_80{S5?e!+&brvJv#5*|p88VFEB`&4tgY;(lSwXFO9
zA9e=%ZWOTy#D=Sh@W>DRj63c%`1zCWNXDm9Pbr?v#lJdav#Sc$&Q%8q-|=F@`YBEO
zQ4o(<4}u371I^A+^Z_N+usu-RRiR~g7oXlk7vUB6jo}Z=6v6{=ID5_gA2fM|--Y|1
zGn1xY4A(Oda_oQFe7^{{IYU>W=FO?FbW~PS2u}s!AB+<HNc683q#X7sZ0^B+Y%tSX
z3L{ac9~s4UJDSE9`Q$nWnDO6bI@pO66fQ`ai|~nfI~TS%!QWuO<rDbbE%CQWvOQ_A
zwl8p}b$5Ax7#Ff(`5d;H`OBe?m$qVrME4OJxUnbP(BOI4`2sg8@lM@*FD%sIK63b}
zEBQ-TxB~2j9bV!IOJ7k@1^tR7J--Qybm9-)TnhU>B1hm6ORMd(IG{)625yL=y@}BY
z>y!(LXi95ehct~WlSP{zEDOyN5_{?+CFH?B!BXPGTQ4GC$3W4A{P;`2(60q5;MsOP
zQC1FS`V9D+c>#H=W<+HG6)c7p7UQj8(ol@_tuj<pc;=ZPMx>(QZ20x8N>naXg~W@7
zmzI=QRS&ODOC6q?j4(0cm4!5JLeA(h-oz0}#<($Kyh$URhDlt>>T6P>`Wk_WBT|ep
z`T3cnbG)t*Nh8vvNq#;!Ngm-!<Hu5>j-^K_my27Dj6N29Vxp6qrZW+BR-%hbW60#G
zIpcCBoei0D4t>Wz-x&k{brJu@7+vYJ8+>**jsSk7_=Yz4AM(!sb%pb~8nJLjjL`$$
z=&dJxLnhCY=N?62M0@IG^cHs9H^v36x~kaW(LSB*kec!CD~@9LGTb<_jt)(xwwJQv
zq3Yk!wjaE^P%NZh{1<)Vrx`XX9uE4KT84KtX!Acw<Z=AoEh>#N2P1x^MOP7;>G2ry
z)s4c4&Lz_y#@Y-^=8YKx{KT@t|4a*h>_6ZClLMFo&%@ug#=nQ_fcG&aCB5-40Pg{Y
zFFgJbJMKAc0sKYwKE}7|csJuIIu3uz+qiYr?eK@Z`xys_Jt}mZ(X&E^ajftkYxE~w
zbwWuG18V&#pz#%B^fK1axu~nRJp$n!W1LC$#ryAozx3PJNHfnD$9o&&NH4zz{}OO7
zBagIC*dJxgBt2i)_cZ2`_KV{^jH^ipg+9_)NqT{>Kf<_+^qY^&fxp1q&A6ZRTw&kU
zc${=-*e&pv?z<S9L5uibD*vuIK=E?;<lhOe(Re|=<h2^FyHxTzjW=E<`D%@~5wF*H
z+~v}~LE|3cjT&zz-lTD3y7bemaS!npjn@%x)p-33>1Vsf+X^HPYrK*EK75D93!ayM
z_<6s^>xdhdMdyEzc&x@7iN|TYmAFIWEfk*wjXTNSsquQ^X&TQUo}uw(;%<$1yd=}>
z(Rg!({5#wE8gHwVyjbG}vt_;nHD33i{DbJ#8t-^S@*0iDJs^3l#@il}yiVhB6wlQf
zHy)Mt^%}2ll)ORXZI4UdsBs7JCXE+7CheOwUVp#jEgG+RLh@FPJ1HNwYrK)--==XV
z<wIEGp7qjyhsNvHNxom>1rJMZTpnGYn<@Webx!Gu(|G*`=_g*}9c1s&xIz9CG=BRk
z=mz2C)c88$X&Qfqc!tJbCSIWNnsn*EPUFqQn>1c<j<j#nxN)xJvD2gTt&O--<IU$u
z`+SW%&X>GK<M|X`t;U<Lm;G3s#+}P0Z_s!x`DxU+u|(Q8X*`a2tHvA2&vuP>Tr2&w
zX}o~^?AQ2q@?*@<@{Rn&YTQG992z&sZ-T~K$&XXx8RW;U@dWbY(RefY$=A4(`~)@b
zAwSg`ZzMl88h4oK@<!)_F+tXcG>tomdo*4^yjbTGrJrhzw-K+?c=IG_U$60u$&xo|
z-0(==qVWRaZ5nq>k@g)LuOS{=5S<T>snR}P;|0VsG;ZWd`(lmP6R*>FMxL~9(s
zHjUR$llJ>Ho^g@nvA*c|*ARDT+;OqAH)clL$D8@0@%ppnxE!bPx@5@{H0~rnX&P_5
zRNB|-JW298jW-i-(zu)aH)}jd_AMH(p?J1wJV<`R8ZRI}9U2dkpV-3ad|OR^;xz6d
zKk*tbAU{rxH=FsU@eJ~lp>YrS$=7(8{1j;1Nq&koo<V+UG#*FwtXAU=@>8dAC;4g6
zcpCX>)OZ~EY0|iZ{IqJkfc$LNxIuo}G#*EO_G`SB{1`>i<=BxZ>tU?M4f5mAcq92q
z(0Cj9acaDS;_24-cJkxVcq{qI*LWNG32NM+c3!RVX7W>`@mBKFs&Qk9>>t`RUO>D<
z<1M7)>AJzZ9&9Chx5n#BrT+qr8)cGLYrKGXt<KA({c4S81SD_Jc~J5ujXUN_-m397
z;$e+vknYfU-50X{?AN&GTgl^RMb|gaN0O&$Ja~${uF24N!9>YD8h5xPFVJ{C#iLl`
zaUC+;pvLR|C3&63n<<`+8gD&ShTEj^davXy8aEOpkEQD~v!29JJmWN8aJ#gR*Ldsi
zBrn!@<8oP!H5#wKLGslauO?lu@%k2d9nqli4&sd(H(rwV%^I&G`xcEi65p=znw2u1
z9U5;Xy<g*=?R4E#99^#ph!<<zaWmzQ#tTT-YTWt0^k1j(`VS;ut??S-^%{2)Z_s$#
z4(X>+;~5lglg2&7n>AiQyhY<d;;kC5A--MXb;R2=UQax%@mAs;8gC=MU*mO@UZW(s
zygJA}R^tZQ$7wu{xI^O(;t3je5_f7mgLs<8J;Z|=FSte4pIVI<ldjXaV<*)UjXQ}q
zYCMj3SmOo6J2dVgZd?(apBcnsHSYLC#wShZzmw_mXxvFUU*m0`N<Re}ckY(FSmPPQ
zt2G`ZKQ$WfAYQBS8sc>tuOq%%<MqVrHQr3TLF28&+caK(o6Mj1InnvNo$MSMuiG#4
zAwlEy#4{E|`!RKa#*ORbb$zkMJ;Z|=H|?u6Zrayq+_bONxM^RfanpXa#!dTrjhprj
z8aM45b$%bMhiKgSpybUOH~qJ0-1Oh7anpXg#!dS+jhpsijhprz8aM6tYuvOqu8c1C
z8d`sd)p#TEIE|b3@ftVn9U8A+D)TKt<3Zw1jkgj{)3~ur`pM8a)kC+&&GdRSZst$E
z#!dSIjXT!MaEmn__lV>*8aL-Lt2J)+Uv66W!n2d$jrYayRY&VWI<L3kL7K1W_BA%V
z&W6|9a1YIMbpILD|Lfet(v^k>>1D28B5H%Tncg6?=iFSU)a}i6N1dDNh&ngd1$Az&
z^Xc4Nx6^qe%YPov78`D^-{^kK^%$L->nJ)m*F|)0u5;+zT({7<xelRob6r8_<~o7S
z&Fg-ho7eF=H?PZeZeC~W+`Mkqc>-Ne>wG_5@9NyVj?sAntLMBN&Fe_r-n?$pd83V=
z78_o`+7S=eyzbHco7WjSH?I$L?qThb`#0x<x_vy&7j$mUvvl5UV{gtobo*k~u2<th
zdezwQ?W`Sf`vh7)Ll>MoSUz)}X2V-q{JDKKi_ga_o~s#mGTv;%!!|sQ?gO9;?kCNL
z7ufK8mJi%djSX+G;jK1&zYTZLdYB%6w+#>4@YOcF*@lO0cpU2ocs|s#{67!!+8ke6
zY<Pzak7xPM{bbniVjEt-@{{|iwc(96e7g-dSpIPT2{zng!>et0y$x@%;T<+So|PAm
zr@2mv^@{UG)=oKZVCA)wwZ~S*X^I9O4P!s!bD6z^mDfVX-HcZ;9%P(uwF>{M8TT;z
zW*Z*1;c=|r<9^a?c!3SCwc+t}-vpz`{kPii{Wjb|>-M@Iw+#>4@YOcF*@lO0cpMw|
zdAibUc!3SCwc!pnUUUC$8y-vROM3n|>AG9zMnGPF;lJGfYF3V!coMxD7~jEo6XW+X
z-pu%ujJGm=JmYPQuYoYl{xi(@-<kb>##_xu!e=b&uTEw58I1p)@gVCTICrxCq$jh_
zXM7OjHH?45cmv~Z##<S$VBE>#^9AE+j89>_nawvi-)_U*%+DI;$HVwzjOR0c6XTxp
zNd4iwfZ5;8?3<W9=he)9AG3F|et`2DW?#w5z21iBGe6wEmf4rFa5I=a=M80%^m1Ou
z{K%Ct87VjO!})4vzn;b4!|XY4VD_)Ecor~w&KsHi0cKy!>^W~@_WhZCklAzI%IpU)
z`|XVXiSahZkCL~PM5<Ra|D1=J{lA%?dS=gg2eY5U>>HRp=c`$N+rjMX8Sle*1LI#Y
z-pKfSj5jfU1WRu-<Hs`l?TinAoH6^o4mJ<r++gi`Bl8o>IBYIfuQ2Q9Ij^O4Az{kw
zoy-qyu@$~Nj3+DahEZ(84Z7clt}kZx)iykq@h&XiYHWBM<1x&>)`mM6Ka-_*wGB^T
zd>gZ`x8Y94`!V|l8}4R&FSBp9;oBKcVD^oyU*f!t*(Wo5C#w&fhnamGv+rR123C&y
z882e?PBuUJfblfOn;Fkw{C39k8DGnIknv|ZXY(J<YnlBEEdB|seR1Bz?5ml5klAzI
z#_YE+`#2V#a>nBsKft(y@jo%1z<535PR2(tp2oPB@eIa4W!%m9y^MPpf12@p##b^P
z&(?oRS-H3we}nOS#-}n~!1x%(s~LZW@p{Hxj5jd;KI2V{KgW18<0BYvVf<^xTN%HY
z@$HP?!+0Cx+Zo@__)Cn(vHCWYaR=i+GM>SB57r+xF`mNgTNtOUx}r40jQ_;!3s}Fw
z`F>{q0JD!}*B!lCe9{=doY~h<KW2`9X4R@={o64ngwNHCpT+#tGkziCO^lz&{4_I8
zkF1EJEsRfQep(sNV|+W~r!fC*jF&U}Fyn)meFx*GGQOYj0P|z8d36G_k7fJ}#^V?t
z&UifI>5MxVAH{eA<7cq=I2n&;JdN>LjAt<JX8zrbzs~IQ89$Qo0>%e3Ud;GAj0YLN
zgz;*|XEI*H_-w{&8Q;Ko9pk+jU(I+S<MoWkGTy-W6)b-m84obt%=ndzSF`yZ=Pk_s
zpDf%~#;<06wlhw*NX5}M#{JAsnDOHn?_j)!@%@Zn%fgLi{m;wHK92Df%s!s+zcG6U
z<2N$<1jg@RJdN=`FrLBq1B|;FAIQS>F#bH_`HcUW@nXhLW<1FFw~SXazJl=@#y@Ag
zmhtCV{OcHhmGRY#Z)Lom@mHAt2F4#`ypi#4j5jg<59Ys_@d9Sw!uTi5zLoL4jBjWB
zQRb(O@t>G|2jg^$T^tRvcC?w<#|P#5z>&b1d&9vv8wQO8#^nx1>CDNv+&Lq88sjn~
z$uk(ou+5j7@l+Ne598*ZOFSo^ad=Qwy$TrbsVH3cF}|Pq4>Ar9w5nG%<GmC$j2gz_
zfn4>fWxS7~us?zEzKpMCoX;WZ8SlsJ8yG*D@kYjXvvf5vek`+ZX1qV+EsP(>cq`+d
zG5^~cuV%cB@o|iY8INcFI~YHn@%@aSz_>9tQg2UUJeKi+jK?v4GUM@#?_u$DFy6#?
z0^<(m$I1A8%s!3r!OT8`@ga=686V2Hhw;;x|9r;(#drbZr!hapjGxYUknwAnpK8X3
zG5Z?EH!=HK#?NH-b&Q|I_-e+_X1tzpC*uu_yBKd|JdyDx#y?{5Y-T)#*|#vB%6Kc|
zBN^Y$_>rumY-2o)*@qcFhw%=^&t-f+<L5JOR7C3k1&qfsp22t=<6p9Th-Z8(;||6%
znV$s4M>FnZ`~v1Djqx$eK7;Wr#@&o(Gwxx0EaUl%k7K-m@o6kR#f*Q?c#!cN=BJwR
z(agSv@e7%KE#u=EuVZ`(^Rt@q3CzBp@t&++H84(F`ovj{j89~Kni!wNcr)XZ8E;|S
z!+0y>QyAaQ_*BN*7|&%q%y=H-9gI(7d_UtCF>X{w>OWtnjb%Ka*~c+{3FGmMU&_kG
z!8x-}VEi)1os3`3cpBr=8P8yRGYi+v_zY(6VZ4Cxe8yKXKLw2Yn0+zhGZ_yuUdVVg
z<3)_uFz#o(mhoAP*D*eu@zsnMGhWYl3F8fncV+2qWPA>@Z(@8nvu|enT*g}%znSq?
z#!H$1?TnW(-o|*4@i60a8Sh{mex5|V_A^dDSt5^xBK1F{tl>T~<5i5uF+Pv+c*f^5
z?qGZY;|Yvk#kiC4g^Z^$zKHP*#=Eifx*5NQ*?SnD$I2z2@y8i2V7!L;DQ5g6W*=mH
zF|)5`d<o+<j4x%pmhnZ*e;wn?nEh(TuVcKP@#`6HV7!*`M#h&j-o*GFEI!SQ-^A=&
z7{8hER>tcX-_H0d#@iUbh4C=sEi67AjQ@_=?`Qm0#*L~-{lAUzSjKN>JdW|zjK?!R
ziN(jk_}h#pFy6|zlkq#5|1`#b&v*vo-Ff|I`~~LU!}t)!^BI4d@dC#0V&N7u{sglR
zGX7V_s~P_z^Han4-Hg{VzLxPi#?zVq)r|LI`B2aJI%eO%_&v;jBjbN&yovF@Fy74g
zeT=s-p25OxWqdK?+Zn%(@ixXAnEx>2_cPwX_=Ak^XZ#_?jd_v!zmD-(#vf)pj`2qr
zk7vA*aR=k4uyiFb{wT9|GJZC*Ph<QsW}m_MFlO&&{BdUQVZ4^v=QF;6*%vUL&FqUA
ze}dTu8SlaDs~LZi+1D`M#CR>^e`UOm@uwJH&G_FKuV?%&7M}*jH!}N1#-CxliScI{
zZ)Uui@fOB6G2Y7fbBu3id<KhW8{@Yy9%j53;~k7QFutGh7g@N*{7C)p%KD91#*3Ms
zIL2F;pLoV!V%)*_%Zw*5{tDwx#=R`uG{#?J_8E-7&bXWLeCEf)_$bEn8Q;eI6fi!X
z*%veZ2D1+`{wCwqjK9Tr4dZV!Ud#A9jMp*#cg9yUzMb)U#@}VUf$=RYU5$(%!*~<p
z?=e5kjCW`DEsVd<>{}V%!T5H@KVZC#@tur^8Q;Zt2jd?yzMt`PSUipDNc~^Hcr4>D
zG9JhHN6ddb;~z8bV0<^@35<sscQSq_3pb7Nc4nW!_-Bl}8ULJd591S<|9r;3VD<%!
z?_s={@qaQNWc*9Us~NwFg<HdT2eYqbd>`X=jQ@-A)r>#P{MR%76|-+({A<P=8NZGB
zX=3~vX5Y;CNM_%{__xfymGK{#{dUIpGy68izhgYi`1g!=F#ZGM`x&3l!Zqmm+#|vE
zO2%Uu|2Oj!$M}zo$1{FE^W$JVhVcZ(yD;u#Jjgf;>OWuqlLP<Bf&b*de{$fzFbDqC
z_vG*0OTLJ8ujuabb~oJ1nnPU<w7Qo(AG=wEdLZRP!21pi{Rlq#o?@VVF&^31zU#n&
z19d`U|Glq$o6>lFv9EoT(p`mKuXHz|*DBpz=v7MNRBd1TVx@7=+Sfi;=~$tCO7{?Y
ziqg1&VPAWu(nksHQo5JWgOu(qbU&s02pywzU!nK@WTrPx=v_+p6MCD{M+?14>0^Xm
zuk^7(uT{Fg(5sX_PUyu-4-k5;((yw3ls;bQDN3Ipbf(fL3hh$*B%uc>Jy7U=N}nur
zjMAqFz4u2m{SKjbDLqK&ZAuRodXv(p3cX(GAwsWJdZ^H=ls--9#Y&$p^jxJAg!U;t
zOz0^}pCNRn(q{_oQu-{R2Pr*V=zdC%5IRQbvxVOKZ!`T)p?4|m5_+4`i9&BuI!WmD
zN+%1wR_PR>S1Fw;^kSt)3O!fpG@*S;rwctr>2rk6RQg<@T}q!P^dO~23EfZW^M#I4
z`U0W%{$Qp*L+D*fX9~Sd>Cr-OQhJQg>y^$DdacsgLa$PKtk8><9w+o%rQJgNl+F=)
ziqaPfovHMAp<PN(5PFc(6NT=l^dzBUl%6c~-tW!ydxYMl^c11DDLqx_O-knqy<X`&
zq1P%sP3TohUnKNmr7sqGuG0BJ`;@*!=qXBHDs-mOmkI4s`f{NMDLq~2eoD^}I!0-)
z(0ji#(_bL;E~R}!Z&P}v(3_Ml6neeVMMAGt+As7frDqAfSn1h9&s7=+pndH=rAvgK
zqVyF)XDU5MXqVEZLJv~9Oz3_}mkS-EbU^66`_1$Rh2Ev~T%orqT_N-)r7MMAuXIT0
zwMthBy-MkMLN8W&zR+`(t`^#-^a7!$D1D{SnMz+Jv`guQLJv}Uk<k5=zFO!QrLPfs
z@3&_9YlPmV^kSj6DSfTbo0MK6^m?V23cXh8WkRn~`Z}Q(D}BAtbCs?Y+Nbn#p{FQ)
zgV32uuMpa$^o>FfQhKG({gl2*=oqDM7JBbDX8P-d-lg;^p|>f0i_n{t{+-b4mA+Nz
zwMyS6^eUxq7kaVMcL+UK>D5B}l)h8wDN6rd=uD;mAhb*AyM!L3^ctc2Dg8&GW0byI
z=)GT?>8}@hm(pv6-lp`Qgx;j|JwmTn`p-hIRr+3`S1J7$p%*KCpU`uaZV=k1^!-9l
zQThR)GnIZ&XqVCt2|Y;Zbwc-3`eC7Clzv3$y<eH>Zxni$((8rZru3siZ&La(q1P+@
zxX^2r-XQcUrJoRbvC>ZpJy+=_p?ymKRp=>7KP7ag(ti`$rS#K64^n!g(EXHtM(7x&
zpA~xVzs&SE3%yI}O+s%|`Z=LDDZN?f^-4c4^jf922)#<_7ldA{^ov5zRk}rJpVBW0
zJw@r4h0av^6`@^9zbf<~rMC**PwCf$j#2t`q4(}H)88ueE~U2#y-n#igx;j|n?kQw
z`YoZ?D*d+5tCW66=*3F^UFf+=Zx`C9^t(b&QTjchGnM{_&@QFl7kZG=JB041^an!6
zD7{nYy?f2{w+X#V>0Lr^Q~E=pH!1y*(Cd}{Sm?D%e<JiMr9TyVvC_MRo~v|NXrI#U
zLQhfpGodq;{#<C6(q9NYNa;O7_fz_xLdPimrO<mj%=C8%y-Vr6LT^)gpU|6>{+H0}
zmHtZTwMu_2^eUyl5qh!G-wHif>HR|cl>ScWDN27Ybf(fj2<=k(-$D;k`bVMrDgBes
zF-jj0dheHJ`tivJxc*lf7kuIRUuj(Eh3kK%apT#(_Vr5R10nm`*D8%Sl=ii+Qu+v?
z7b|_F&~ufJ722mX-hkfMK1J!CLT4&{l+Z4v@kZ*t_CZSHgIoLB`zhT==oqE@3cdHA
zX8Q5LX-L1)_#il>U+JTT-lR0%V1x84eXP)HmF_R}Dy8uODp>zedVtV#m5vwMr}Xhc
zPf_{=p)-{}QD~RaxRDFeuQWcu1nE~AA4J;M9-}mFXxP`jcaNEVhtRu}9whWOr3VYW
zN$FFCUT<nxO}ZEUd`RC@vJA1F1bUsO7r~zk;m4~qU8ZTjrl)Iqnx-deI$P7DG@YX9
z;hG+*>4BQ=uj$^J?yl({wfy*6(|a`iiKcgG`fW{b)$|rkZ`AY#P3z^Qm**PI{x(go
z)buh<FVb|Crpq+#*YtEvPt){7O=oL*l%`WOJzUd6H9b(%{WaZN)7>@w<M`<M|Fx#~
zX!;XP@6hzyn%=7EEt=k_=?$7*r|Ek&y++fwX?mrmmuY&DrmHkvrfI*Xr)zqerYCAT
zThpU7oucXCnjWg@ftv2G>E4>|uIV2y)XHDedo=xtrgv!iZB1|0^cGET)bs{TuhaCs
znqH&n+cdpW)5|oyNYhoCF4MGM)6+FQP16%KovrCnnoiO5a7_=@^gvDb*K}`9ch~ff
zIa>K^dXJ_*(ew^Yzpd%5n%<)6jhfz|>2;dESJP`WeVe9NYI>Qb7iqdm(`B0WYkIn-
zr)hekrn5CYO4BKt9<J%3njWa>{+jNs>F%2T(XExgruS(26HV{X^xK-=s_8A7-l*vf
znqH^rdo{gA)3<4QrKXo@dXc89G+m}?zow^adYYyuYC2ofqcokO>EW6ls_B86?yu?I
zn(nUYAIE9sujxIS{zTI|H2t=ww`zKerZ;MOgQnMM`d&@1(e!PaUa9G2nqH*oDovMZ
z+OO&9nx3ZViJH#V^e9cIXnMG&hiZDDru%EUx2C&m`p2<a`D=QQra#g24o$zU>8+aH
zqUnvA-k|Aqn!Z=lYczeErdMiunWh(Mx=Pb!n)YjYx~8XTdZMPYH9bnxDViRx>7kk)
zsOkQi?yc$Wn*K3cD}PPz(ex*p-l6HYHN92STQt2<(;GCsPSf{ldX1)U)AUMBFVplQ
zO;>5UOw)c%PuKJ`O;6Nxwx&mEIz`jNH9b_*12x@W)4es_UDH2iY2~l!J(~VR(>pZ%
zwx+jgdW)tvYI=jF*J=7*O|Q}PZJJ)G>1CQ;r0FV6mucE>>Iuug3~z@UmlKxn4!6Rm
z8+0u!<hxfa9O87R?+o>EugDsb;a(Yx2?t?;3wfM7{Y%Lmg69Lr^F4Up@-N-XcZ9QH
zFNic<5gOuf5Btfz>_Dh{cm#TY5KiME{5=vvA2|N`ixZZA8}4T9U=OT`p@UCgtzAWt
zIoJxvrGw3|8^a<750QgE+c>yUIKUwG!ChEP)s<)o`a{xJkbZ;orJym!Q!&7m3x}kI
z_rv{GJR^gSRKtx?)J~yyhZB_LYh-yG9hpRyM}USiFeiV2T@~myKfIQV+hLs@-QoRM
zjNrRcqw#CXI7G%z!$B<Lui)k+p7anJuY7t4-VzH>CZjvZ<Lh9A9<Qg9o~4uK)A2{>
z_-yW^CpnqQ;n#2r8iN}Juuc^et4l+^fOuHcrCz`>3f<7#k}u+)>IxApa<Aw%Gy%T2
zmuC%$bv3(J9O>Tl;rTC&i#6Qot&0Y`pMtGt_&UYCe7Ad0tNXxnZda>&SgU(^w;|!>
zSHX!;iDQf0XG%9v6Ng|uSn_=QX7`FkHQ`xkvV3RwCRoC+S=3~NdI)?KxI{c3Fn&G@
zJJ~UEvxWXL{0QlbK_fo^H{!7>oKD3M7v2K|^C5*S%HSR;7Ka}`kq;!kh0fdqdq}Xf
z<HC>8nLog)I{HqcGtZ<mKLU;g*@6Xm9T~rdP}ZBxup0wC6^Bdc;Cgv5+z$6C(c@a~
zA&ZP|1c*s^9*i)83nLzJKZ*1>fQ-Fld<Pj%ju>Yt<IiBX0(we}96Vhe+)4-glc$N~
zsXOUWq`$_F!%XdJI;o9%lXx(~1hsObN;3Kz8T|{o95i~68%-jk+sWuPGP({l=E5_w
zM+?uB$FYgc!A?@-Yd<>q5;}Pmot%T(jO!OmW$Kg3sFaL`aHAK=s4p3fBclUQN$W80
zj-wv>pIB>GoHxY1G7Ed*7iBLTCwk#G!f*_1Ulul<d+$f_xJ8TmJ#fsJTkt%n<lkS7
zmH0q-pZNSX{26?r-wHS`T4N5hrFd+A(DwS?i1)&CM2+19hWITu{G>Q__k@*2LxSN4
z@O;P)+yoF_L!tpi9f^7rOGvCnF^@zIiV`6f4yg`%;TuNvVj}tBiS(@?JVqQl5I$FY
zej82_pI?U0L<>l^BWL;h;geBdeu{&;!+*qHDAIXgv10eJ<hdBL|Hy&{<eqOB?eWkT
z)M4Q7Ei#P0Pi!oP-JanO@cfk<tMU7Dz}+i0y7Bw%6T0B{eEj}0_T?)!*1~r=2p-=K
z-yow0$&-WN_=RQF@UIcRTKZ^EwtGb390>P^e!{&XyAk&r_i?XWR4+=xz5FNGd%PmM
z4shS4mDpRYTvQV|)LsAx%kl__57g&p3ZOR)-~XvB@U@>q>`I0BHhk+RARd8+BsMrN
z-yAL!EpKP|DzL()@YrWCvi=SGuD`;#(|(cgygU35iUlAbZw`e2IG%-*N#UepINd0m
zvoM@-=zGP+49uQ6IONGBh?KGDn>ky&Fp{(9K?rcF$k`x{5j<y~-3<V8_C<)RnT|g|
zTSA8M$JqaXW7>+1ku1IePQti9ij7ZZao?p^$h>6P+<vi0@wedzp!nJch~pc1**yni
z$mjQBRf`RO12IFN4+FzMPa6fXhR+cY32AXGkk8-8V=rSW;PYs4?CBwRv(SC+!XfdY
ze(n`Zhd5E<>}%i0c<$3f@Mf}m<<cQ|Qx&@aVI>4wy-HSX&~nfUZ@q(6-(~m07Yy!P
zQIntxfxhf)@f{;P6u#BSS%DF%!eFq4x>xo;3I6`|3rodx!RY(s<DwPg@Vg?zfz5x}
zq%L5#;u_cwKft|WDzNAJF3VIbVt1Ni5xe6RJMw_XfqBpen+7CW#2S<H<42I$*_hc7
zC6P0bCXpqOrlrS-11yhTL-QY@T2bk@?}hK!Xop^D7~r=V6161UeaZeV^C22bhqQ`B
z!ki5!7HQ#A;6^WYRW*RaN5c@*{sEl-Y#hoZP(3Cr|Hi%iMfdV8;kQLL_y5b;@Cd+)
zl>0`&2Si`gzs?a2of%b-!yuBM$5RYH?<>E+TyJyuby1<89?}K@^1@p>9-<MIFQ0q}
zkxNmL`xN@-NaWIF<a)@+^`yuRhI-DzDul4w-=JA`e7M^J=;Jzsw6B5z2-Ttr_FX!T
z3PM%GY>c0amzzb6c5rv;Usi2wKL9B@(AvHi;&I@O_RmCgtVg?#cVUXZ48IDU3ugUW
z(7)J?j7?-)Pqqzc3&o7vg2Hp=VqQNzM2tH#P(TZIhX0It2)GVy=0oN{Anuh9$x&=1
zjnmhNt{vv)*qAENJDeK~uO1^?dxnS;R41(GP;cw-(`>OqU==?BVuzVpPgafGN(i+2
zstw~K!<RAa0x@i#M_GLElb^(sAqXuZRT-E<Yy;t;qUK@3&%|khned;=vA4xZWT+a`
zK%+QbAGCKBK~t(Qic=tz&(|F~DZ{K|(|75eRLfP?9M}orECGs5_*#*_luvMBD*8Q@
zKW~BQgyo;Nx1kS+rzqQ-15l}lig;quJkSE#hodwN%E16dJ?4QAt-poy<p2uZ17dTc
zvvLP|j?5LJrwOtv!Dya{Ue<FjShJLu8=}2X*xIB+h8%7!2UnNi35Tp6Z+{@Iv=Z*S
z^n7tTE2Rg)vONK*nF%Er%n{V|4_=YV?Sg3*r?X(PFqj9Bz9@$($ONcL-G;`)Cm`IC
zgD*Uc{W`wjYXZaA&Eu=@QP{vYmq9Eih+yQzp+bxm8w;>J&)pb?BUPp5%-}khWx)41
zNM`s3G4LM<cgeuH35>U*`fW7u*a-50-#@$nTkX#94cIF!*|;B=f%9we4XzkxLJf1T
zT-FFYXZeet-|Akm42OR)Hea*@s{XZb%~+31CcVWp0-MD7!0}2FBIt(m#F092#t=9%
zVfnwny-2`H=?e$7e00iufB!z@<QM_vr1KKdeVGwycd$c$RC5!{Kpz2DG{wQ@voRjx
z6qzjoC<YMD5nZ41FJ?+~=zR3=76C$1SHgT0y0b`fCky|O+?6mTr6V%ADdLFq{~h!&
z?N>rb5MVV182bSLd^!t@=O3c~+1QE)4phRFss-s5JZCI0I7dtyL(rDtyJhLu!T<}$
zMCsoMC*t(A+xfEeUxl#Xm?(WG`M{t)7BjS+;Ty4bSd@MdmZVwwFGQDq{y(fsKLfaR
z>7NEi4zKk8{T?Kom;PnEbY6%m{d(-*#LU#P^w)!{Lzn(|to|yY?Y#6e(7)TZ^rr~_
zR;52u9Fe8}1JvHA((j4&+bsPZqHfF5kEt+A|3y4Umj0rqzE4&)!wgi+7<Y!>7v-tb
z7mz}=>2sP~uz3PBR->PWO8pn1cRwR$c6YujjaLbHAe@eI0OQ5zWF@Y}%y!8EJwO&0
zV;x;FChT5**N}SXp(ZSUs2hI5L!l21?GAFcdr5Pgd%5iS!qp;C3x{NcPbNpFp=%tF
z@g$fTCHz0$-UU9&;(8xXB#}jlZ&1*9L81mtR5VmkBBEI#@J2T^0ztf>pny<8C~h>0
zU~qReuj|@qu~M(`(u&pgi^?S;mvBo!MG<dUMB)wJ*HB5d6$CZ^=Q%U)?z_35^w<9L
z`M}Qg%$ak}oH;XdW?nU$(by2}aIiv&X2r2+t7&C&;g@L(#tVP686}>i@ng@T-UI6O
zqIYC;*!NB((3#ctQ~654J&(-FnN=x4M^AJ`mGwnc?i4li-Q-EmS}D&Hn=2a!ntfNA
zc`J*pw}b!J9Gc`yEuR<lcMZhGb`9C7!4u4ADpm!V=799gC#9zZ(@pE`mW|E+(CIm;
zMq~!0rdXB@J`}hV1<Fqr>T{Kpg$_7ZC^bg$vKLdvp^94|8YlpZaDe14mH7dyXd^<G
zQBpxHPArNK=XA!UQ9_@Q;e9Z-F^7b<YtB?`oF`y+*qO*|`jn*Qw8O=D=axBYmEnP0
z*A{Zqi=64-!Hm@ONE|xtG1C5GQFZ%;X@n=TKW#8}E=%rad-xdP)7!7x1IZeF=d+&x
z9xLM?LCn91OgvsHNs!s-<CRE;=%%Zr$b3el%WC-}VjHPLS$h?<D7$&PXt8DatUo|N
z1~V}~Q45Q#UipDA$(}ClY_l(x@`Lgvb09L!=m0Y|h}%`=W}skSaF`itDm0@rvXPq4
z)OVUbM*AH$@csL$fo{41N3;n5a)$jFnN825dZxm!0=*FYvR(X^pv3;bKcezCvik#;
zkr-0Hk#V+gyULff(ST*7o#dyu@UKKZ>9p!Y{_eIfR?1Fje+Fej6ot?~U@iC=qL)DX
z5?Hzd(jiU0d=G#zLbg&-?1y&Cbh(8n$HiaS1k<?bE&kWAf@5NWF?pqFed!<RFAK!V
zYm2Q7{t?5icd_$6#<S^Ow7GAckylh{&)P<!lsy@+zD8Gztu^*!j9nwQtkN1i9}7^R
zUK)r^S=UlN-^eSgX(^xAQeI=5oKvGSYq8&+9l+5M4Tg0>+vAB|w8w?V%QCtuTXd&g
z0hoGu%xcM1n%<@T^+{7_mMFIBNv(&4YP;>XQPG4lG9tX;U{|APAT~KCKM)IIyEK+j
ze100VL8(tL#fm!s-F_M_KbW!fF9t9x<Ef+5Qi}T41*~sKqji1j__H!#ISW#_n^{VF
zH4M(lJ_GQjZ6j|mfYd|giOAO?`&a{VxmSrYw1Q-zA`yzyX+X-1a(gz$b2a39PDKtw
zrtN@See0b3a3}Hh?y&Hun^1Tu3fo6P%!FWoo76N;H#8n>dOj*N&1b=;1{1yD?3W_D
z8;sE~Pq81{3iM+A29g|B13CP6LAb|jibqR(DjvPZfo<(tUU&#PiD_yCV4Jal|M*pU
zid`#0hx0Ft2*t;>YMTQjGO>ALQyyeuee0f1pgQ)t>@_-+ZC`wEr<A4ztXC!?O7Sf7
zkAPU!4u@P$Y$ccWSJbz$k}Ak0_8@lwWw7(d3&_s$xu#WAiRi}{rh$J5T16<A-WgE@
zs}Vd~1?Sqozn`e2k3hVkXg)mcF^xmWcYJHiN<Z_E^c0bYQZqK1!c8&2fN%x;9uebo
z-*FUUCFj(n#a3F5yB<*Te#Se%HQHl`6zn(VSVhY!Xb|?d*jg*q^%6o=*|(9R)uiMd
zn?w||=Qs>8`nRE?yeRR@C{;)4Y)z?m2PRQ!JqYKa)K=yyN*zLyT19j1Nim_+sfdG8
zeUm7a(PjUX8ZE+<{>d>vN2xS)8<biG5@Cbw9F%9ruK}Suq|`fax24p~PZXsVqT&Hi
z>JL=Etts`qXpgu|%?D{blzN4^ic*a#IM=@LKA}_!N`g|ok|@>GdH<9;ivuc}zToJe
zqtxHu5=#AENZPL}%9B!OgU}sP>N!+tEmOle@F7!CR6GDmP38pDno@U)GL2K}LD0iP
zDNA&VjJ`!GIM?nG6-s@8I4E^&5~bE3ynjk{kwKMyZ`RLIYW@bH)FdHkzt4_RlsW{2
z?vPS5QKdDd@~Ch?sj;H+sJtG2me_Np5Mu~NV3Ba|{G33+4yE5SWxhLu7$E!P_OlJ0
z-C43R&G5J<LoC{WwH<8bX%A<Oc(Q4I4R|dZY3sfNIzXktFoH!}0}^smdz3lg{2U|l
z2&2*TgGD;DnTJe!Ao{}dJeY;SoZNtgDbBQlX{9Lp8R!YH?n<T(5bVE^+h}Tnkerj4
z>j_dQ{rmNs%->2g=z1y}6dlk((UVY=f_kjvht<IXXQ@Mw;grIbwJSLa0m9Z6Rx@=1
z;+u865pny4H}Kzgz{`G$p`F^$;XG5Lv8_S|IyC7l3dq1K`4c;PE&7A=NM<#pUxYON
zA$Xw<+E05Rjnt4XP>`O)EC)!{{Ba=-SCA@@rXiK<pgqb1sjG&R1KcTf#B&eL7W6Aq
zr};+)V*PH;Nl&3s)+c3irxZ?qy;s6KDl=n!F2fQN#pxYkZxB0i;MYfzSD#TDVtrmk
zO?!lpHN6NKDClGPSjM(4=whtjJp9xT&N&iL%zO~0KMTb&UP5vpmT~bX6y_eGBPc+U
zU)P<`9n+eY-}k$Tyo^5>B6FNQ?OYaNlS5ADbcyz{e}hh-FhpQE7F8)UGcLfoEPxuB
zfWC99K#v@Uth5}8rJV;bs4V9fyX;3%-(UNCR*Cid`b4xG%UCRxGVZWDq2O*HcQQ~_
z;L}9X?Jv*q!nm9`#QNo<HVqrM{Vaf*3{V0iWBv!76b!(tKVH!JGtqe@2~<}FYWxX8
z0#NOE)p@`GwBuE87K2vHv2D*TEhdbSNif#Q#N>dNPk@DJZYUHShL-dIP&R)I4ojDT
z?Q!fn;4CGa!vF_XGX^He!;wDPf3<(K{~D(pAmjJ_L|!^UDnwp3zpv%x0a$F5mrp(N
z@;F5;I&isgRLe^-^6eY|q@W0&cm3vOpmoA_qDA5X<+OTDar^duF4s#>jdT4=%qYpE
z7W#bhCApok8jxIe!F7c2o?VUZoe5A0&Oary63Mg}`vWL5W@QqY9sn?f56BZEd3K%M
z0eMCO)Py*FNQ5<~03^ioG1#Oaz_KG4{dzQ7`3)>ur#C7jmG7FX%1>71V~O(TvHZWG
zsguf&qj?<b(G!_6BnI9*xl@Y$mMHYE`)*p%9r%uRE`S*_;S8e}{5|kz^ePk!Uq<)m
zunCFiy9!bA!rmpT=Yg4MTt?T|Hs_-W#;!U>8`2*|S^l9zOAm(wWM2AA)()-lVU$lt
z5aQxo%qsCVMdtwa3Lom(_k-)pi5g7ztEAyouz_n1WhV|I9Os96Z|w=nG60%qNa<a6
zK%qDV%}c*~9E*O~A3%qz7EY&J;mkCaKCCN?bRO-jU&}sd8W10L`+O2E)?+K}J*t#}
zH(Q|J{W70;=!bZsFLC-|c|aJCp3KTrjALlE7j#Qc_8c39g&h`gnZ}Xn3z28!!8z<c
z%q1Y-UlCXMUquYKXtcYrF=Hxhc<U=nzv#k~uzZD+Trc$(%43SnN6x?JLeMz?+{8>S
z3=8>Rwr#It8we%l?Vp*>S$b65U9J2Ouwsl1{nhr3V2CzXoXxwF$$?c1v-~3{A}b1k
zOWAOU$+*+wbJAD@$$5y%bqM0;M)ea^KQr-TS5RuBJqi$B(Fi4OTIszJHe=l%`T_H@
z8TxFG89Of1E*0tSBr78`oh2E3m;Zs#y}Q98#X@O4{;i_C#`<kLjZJ3Q_yLDcSlNlO
zy^@&zx%&A5Kla0~lk!T+WBqPZ#WzXu^cxv#NS}ru{@FL`veI2->^?Q#*?%FZJNs`0
z(b=EWPiIs{EE7L=l%kr#GuLRJ24thNweqvto<`}5b^j|ntvVSU`0-)Hz28GD0y?mN
zX6n1>9(qzO^*Z5d3c#_#maBvo;2d%vL@tW2M#Hxvh#!`BURduM0u9cY;HL~wtHJ4m
zFy0RYo%+(UkENf=pMpKX9+<J8AQmtJyR<YKy(&qE)%q9ZQ>A>ry)<*-cxyGAE5E+b
zT)C%{nObkY^HZ=p0HkXGzLrL%C*AsNzH;FEsGCxLTA3Ldco9m1C|?G<VwhY2o6x>(
zCG4zmph<oDUr{b~H0s9NXMN{ZWNLhrJQiC&U{q#$mhN!J>|56o%&GAHGZqcbsVMcP
zc*7VI6{U8x67vy;1BXb$qf>@RX25FnU%lEkSF?GgV^#Y_#-jBKhpZ$VG6jeJXr({f
z?q6yL?h^3X`vLE)qI4#~@1TG7*LYHRKaidzq_+CZIi(^MlY{V;_!BiBx=;sY<8!_O
z%tRfb_YNd^$~)&SOp0^+q|CH_G-vPJ2vPA+AS~{qed~&?FKCOefM$-S<$P;Qg(k!Q
zft*}3IwGeK?J9a3-M|AbD<-m6nbde3V33c&*my!xf1}%dIaQJp7MBV$rcHYshSV?}
zyI!RU^;xhgIMcM|s4O&v)yEv|V>BYi<mTNNYIJ}vgaHKes}ya`QO-$d+yulFdz6@*
zu%Dp{*bN_kx$(6QnF%G@+nl|I-B)_l<sTG0S~c=it4&q*p^$D86^6O94S=hY7?k3_
zoO`AI5#V^)aGc*JZ%9FF_pJ-<JI1)D6Gu&~$7OF~UbsH(5i@n;1stnL8=6@^G;6x~
z>d^5wjh{X?I2CiEX<Z6h4)wvRO{pJxG<Ydelb8XF$coN@7g{qBB7T+hz<-VZ+SL=V
z-gDtyRzCKo;H2Bfw*#-~rMSGZe8bSpVZ+8xU_msRCXIGaX}XW|sm>oXcG}IzuPvVp
zO=b=4UO)86@hl81fZ<Uo&ebx+-SSzJ+3>4nH%+=}QgG5(jY9p<<HoU4hE(dA;+&Gu
z3v7pZ6dx9^Exqm1>wYu-`k-k^hf<uq*kD)vTXKDnt#ZB68Mw3D1nctdtbQ=6I~!5b
z<tO33Iu<80iSomP(^3A?MEROl<(FSS-dld~_**7Tux?G1pVq28aPZ$cb`r3uAKE=g
zG{9^$*dxU$lA@T0H64eP!<)|Pyka_9nBceW$mw3pF+;TdvIKLEkQ!3nUpwh~!Z{|W
zrDM|dgGn1m$ArPw&@8resI+u!in9d=5n7)7uTH{2c~ZSVqTZ#+^`1%8Lq9H>JoP#j
z1!PwaoY^RmBrpCYH{IYR5C}R62zfa0YgHxD?-AwGCIc?$BVrS$Pj6{GQSPFtQ&}#l
zeZC`_-2Q7*;Zy^LvM7)qe%q9b(Ehc;p}0i8j%ixS3!%m33WbMEo;o%mS5LEC+#V*M
zUb`ttzEGohf`%^gH4Qbq^0llO2Z!}T4+qT76RfEs$DM>PN2EA|q_F0H*3|OrCY!Gg
znmT3L<na|QGchiND)Dh)NS#W6P;#(ra<V`I3i>-WA(5Z#6B21U`j?MY{NdOqeS%Y`
zqr8&fIDP)YI-)Q9E2iRTJuyH=jKAq-l)p0pe^wHFfc*8;n?$jA<>U$!5_rfjx~Y6h
zYd#hr-*0X9ue5tbwis2n!zc>vyWY5)pD~@iZwDx5+`R!|zqQpbt3z}7C0E?t|AVzt
z&JI{#&E996l9F<3zO$6ZRGeR-y@2orOwHMQ6@)p2wwGm%yB|Q<v^JX7JN{68nz?(s
zx$>(+&Dq~Si7zMJx;DTax<RqC^Oy7ip$B75=}JRC8z;>E(ATM^H5nUt$1QJZ;qFFP
zzp-eY*$F2^*nWN++Pujp>pdV_Jn8*jk*3S6&4Awf#=Yb6_E_7jcL!VFq}C0N6{Z@C
z(heTN{{~m3b@RtY68J#~&Tf$oEuY;Yh~F~|od~VYuR7Xa%R$|DQ{V5fRDNtN_a|EH
zIhcyg*wwswRSHjntFg+6`pklNj4;1Oz@sZ9n8*pBNYwPz7iLN-JXs-+g%tl3vljZ6
ze;1ksoSMUTvMQ*MzXSn<`n~oO7FiZn6veR-IXJKAp2JqyB3sHvsbZ@u(oEzDDNO7n
zfg;@b(~U4gwT@#8&r#MOG|qwmwaW<i`JOQ@fw;wY3i6y|kfivo852A>RX76@yL?eh
z62^p<wq^{4>7Hw#KEa_r2f?E5h4GNOB{m2wIPQ3Ipvs@>Jf`&P?k%BxU5tBXK(|P9
ze&Eqo4wMXJwQ0&3E>Hj;R#Bl^+c^=WVLd$6%=8=GKplr3f~rRp=}`Saj)mqB=;ZL&
z*q%?4#0IWi4Wh4k&0WT=apqxtNRtCcS56pVkHsPDZGRaTLwz@Cs{>}-X0WdvGfoaW
zmDFIC$wnbCIhcb@9hguQA<jJ~AXS}OPD4Lw-+_JE(@FAD*jO-#((ic}Rtv^QzSv$c
z0F7=~x=c;Klj*3CZ*+sB2iOT|fMC_*1B-n(+1=13B^t?+P_U?_rIt^enK4iar>lK8
zVW|y6Z31k<Rlvr+p$;{%Ib%@hMP}fOfm332b<p0kkjbs>xx--3kq*inT5S(o>>6&X
zom(2*@LFe%Cp-yZ4|OqrlJ5S?HGRdhqoq82e>5J)1Leu+dxFLor|Ps&2%Dfaw(nBx
zE<zWOnIsHs(xk->z^hclJew{xEf5)7iki;nqEJI&KJ_vw&3b1eX@X^*jd-@AJo(@y
z|AVi#PyQtf;N(a0i`>iM_IDFtLzD6W>w7X|m;L2(n$ElJ7?w`$aNrieP&88AOmsai
zR@@FzVs-Qq!a#pTgk+~<H@k4zJLvkW=+o5cxnuLB%euceFb8|hZf6<qxTDW&&4R;+
zB#raVD_YK^kLZ}%QI1Pls_x_SSar42#|Kfpt&|;}BxPcB+cv1-E)hy0vTpcBPq@$%
zp5h5tdBUT0*d5ncA)C>jr~a7c9orma(SuEkN6DLm>CK@+AEv4=F?=wH><y1_fvNI3
zH-{$rQiDB3JQC}k3$bkA%9!J1bu|k|3%iE)q;ewxzV-B^L$hf)sNFTSe3c|(_`NDQ
zzNVM#??D(>J$=3=X#lDHz-0V?^TPq)Kk(20d;AML_@DLj`2S_!0pK6QH{kx)^uO)p
z1pPx8pWYQ|wYwVq7iRM@IED}hpL*&qdd4Nq5qt#_`$+eq6qX*pQXUQ1ky>d+HW}gT
z#di8}1$Qt_>s`OKf%~I@*aCG0VjQ&#^fxAO_|o{5Phf1|<6rUAdsL<}D6IZGT5+up
zP7Mg7Rgg%+@<BeJ(79Al7z07(X=W^a`w-Z8##C;#Vo^`+f)cEjJ_p>;N)?N1hOnPH
zA6s@6$hY5;8I|&TpYn%<aT3$J3AA9YVV5lop!;%a2+xAWgz#Jm&(SWFrBI`|euR2;
zs*IGM&oDgFS(bme1XaUdv5bxvxMnti)iIF=Wzeatq&6;h8;j?tt6C^hi)8BgeDraS
zx^#qoPDJdnoJLl$N~f{BOp=zFi;QRG4wTWYv=Euojc%h#jc#Mc5FrBN9}ueJuaT!N
zAMm$P$D8qMuUrO;i6?R2KnI(#a84ysIirKT6$m0cIA<EbR5D^ejaimc!0V!3!#QMJ
zLW@j+9;8IMn-A`OAy9o$2xD}cnCUzT3kXX$*hxrVAov;Grsd1643!>;rGLKy+CG+X
zOkJmxC47k#lDiyXSVl)-i`TyRC74Q)|4XRVCt)49D)v!N_$h?dc|_m!+}(=xn2Hq#
zpfv-U{W;cD=xpEhJmdMpUdb4PqjX2siShLW0sBXw(n*=_-~lW(3TE%*Ers^R&=3@s
zF%*bBD^eue!yHR;sBkVU9y5l);@a7_?7%RI4g4MPv+}(|$f|&D-th~@%)5bBW9H>S
z9gUeWd)khf1%GwN%p<g>XyQXKtHF30^xiRZC0X)67&FHR-2IK29c{-9Y%NIA|9@j9
z2HU0YdO2*vfw>+sv3`SUA)oCJn;W5>x#{?OZKNg-oAWSiC`6ol-ErX=FVA!^UcNxO
zJ^RIk+;6~GcE`Z0o^S)g9)niT$B8{k5o8<!#d;3HZUd!sHNwW15Xad&`?r?vT(T&Z
zJ`qJB-NO)87tWHU`wALLv5aB33qnbT;=}3!iy%EGQ0xz)T*c>sEGn8&o~Zy~cPiL9
zlqI2usDoH;3=p^hn-*ZZnhvnloc_tb(x~q$Q6`mhfEd0N3>B&=;r+k3;{6)<LGk9y
zRfqDjUR9boQ1Fcr{yaUNrm{H_=7?4`V@e&vO;_zfd$cX42ri7(RDtpv0ov8TX2IWa
z`&!^)9zkZKw2Q(BZ$_9?f{1amWb8utF_|2=jwnU!S>?x$^-z_nn#-ZU^|zh@WO_JJ
z;-gB{jDXny*AIvXX9YIHKj75JU(N<>jlU_ZhhOS%`vokMDB(EL+(8MiK_RHW$qOmr
ziRq540N=u*A^?=da6cVt8H>q4Y-x_o_!A(uh!{$|EG5o|_J(l0{~27qEGRn=Eo1X`
zq{XFyn-*#oi%WzXb%o-dVX9N?9RFCMj=!<=$wFqe7KC8*P=e8jOc)c24EE;xSXA|!
zQ-kVizXukeix!1SUob{@_n;#GfIq1O|JT0(S1gFupdZOSyAD<dI&+e@XV)BP&jw!H
zVb1_pI=T*Mpl7T9DI(v9SfXcV+|RtU9O)Ifw&@k_7VU7cZ?9+e>(Kn{(Koc3vH(to
zw*7ms5)khAlm;9IFlMEGZ^Mr`!Ki6qREOjCJuF?@jN5Ca3$Sc}{~(2brNSQ;hyCdT
z@HVbM+&)U$#=zrVl4opQ3%#M!>QD}2E<QH@qcM4k!0UKSqT?8RFUmkcV1Y8&ARZir
zNLZ?eWvOR4PQ+%&G!8+}kgOQPA^9H-$K$Y;G@}@6GA6-;;h1W~BJhsIR>N@j3o;C$
z$-mr|mSSgtoXN7f2t1eJmxVI7!8<;|)lok`{OpnBt+FaX2$9M<{>IY(Ary@dzx#K!
zBcxqg3F)hge_lv?qem3d5)MCX^>8Li7@tARqMWQ;;}a)e?tZ+mtr5tHof(Cl7q>rO
z6rCY<=`r5)r8>PL(|O`YPx+<FeC}3}?|g`KJM~ZQoV(hY3X9I2e{a%ZZr26l>8qWJ
zczCUIt0z3u6P~TZtb4B~{eDmSgP!z1c*0L2EcWi|#CEvEF`%2F7Q~BlL{76OTiF9Y
zmtSn{F|D^u>qCFbn9Kxhu~(FRTx@;ABW~-hfVFC{wZ6Txe1FR@3|8z`B-lo`757Yj
z!81HVQT|Rm&(6liscbpppp{JrEX+x(2Fp>mkvAg8{>I1*<k*2|Ag38x$X6V_C=<R4
zYCjaGuUONpLQP<Wn!u;O$4rNP(6>&Gug4JHD5G1~oKk=e<cy*lVyU1ST7#(?<1QK9
zZ0$mjF6hdBFV_cc+sg-hwWv&LI2RXX&|1p^SvTq*W04xmrOC<ri(v@Ng;19UV5Bn;
zKteTsHR@ybgZjl!L#C6~Hy(j#SAPCq^Otj0afa2PU4gIS7qyE2s#W~NR`J7I#Scov
zGYS8jCtdim-(=qfRZ$Gf(T7YNQ(&(`ygw3MPm<ps>|qC3{=@?;{|$8Rf#7o&`g0)V
zAB2`akn-0ZVEI*0!3P5W!2>Km{m%ykAB&dgwJ;>6jxh_Gf(5R=H$yWsQ-a4~u4pOC
z!acIggijs)_vj06hSp}IPWgMqQM!*r{t?V9#!<pFWSV(j+kbf+XEK$zbDI)8wHSvD
zzTz0v08is{vANR_?7_lWoNCNaeX279#c@s`yQl7X0lh!$&%i$@ffVELbyy$GT1P~n
z0ejqQr=?V$mlEv3om$GI(Jd_p%Re<Q$^NY;)zz;**bjC9@4N$s2h!2yJ37_J&L4w4
z<?pcnP&ptacv-Z_7b|VC%6!}aJ{0=dcNN{Y3Nd{Y(GJ93Fv}H0i@HiVZWNbq)|t58
zo29DIl_kt^mx9frmX>B?Ws$GxMZ$^4yEd(H^}WU?bf+&R2&FsKCpcZFaVkl|$%x~m
z;3wnNHwmxz+QBQ{zOQM1D}3H`@$p{{OXjur(^9y3T5SCY?<hsF(4F}pLg!#EkjxLB
z2*NGfE`&3}`~-~`Q6s#GY9~56t2jCZXPiyJ!RqKw`(Z^(e4+1CgIxz(&F?lo|K5*V
zI|q*pM8~HD2J|leG7S$5;$}3?kWz!^KwMeBe1iZ2!$uTG%d<%a@Z%~-Y~0<)+4Trz
zv66K6WA5r&g0o!N$}cF!q5xJS*ax_0rk4LDp5SYGg8f!xRo>%t9CG<r68XXi-v%Bw
zZ4iB#Oz(Uzy^Tl)%Eb$0IX_{SB0+CwyEocZ!xQ-?y)Sw1m(aTew1EH`VSe%`ncl1Z
zQ+j{&xtHD-w4t{l^j69t_DA#(q^Ea&e>9k|IGyRpnoylg_Y>k|)^vY5p5SZx1L+R>
z24hW5oZfNr$vA68-=4|zJq%F~eZT97zDjzdmx{z4CfhKs%pGAy&ol@0`2dT{AkK%T
zK;Sa;2vhn9491;C5u%&~Zq1?7Qp@qIs^|m^d!MS)(ukMl{t;Z>^kuw1<S*K{_9>ME
zr><z9iD=C%u-PCFq+lQog6o1EJXGRFn1+lQ?c^!a#l@)_>H2sIs1I;Yr#h*_sgIb1
zv#EEG?!}6JP|=~8y5iAzj|^z2bFD#{Qq_piCl!~@F<-Rk=iOMs=Uj_)sXxe<s6PYR
zQ`h(D`j@EsP^qYYfkdSKSqLU<4sD;h6agAT3qg^QTxEMGfKxAEJgC1nZIepFsb+#w
zKdH1_Z`!F`g4KeA{Hv3D^msPrzIPEzO3uP$c87^ppj{FWC$n}deq>3S@WW+Mgh{a8
z^|V)*@FnSGp3q>1>hM5fgPGS5e6l$-6)!=|jSj_25I2Y%Wh?IRC5X4sZzEC#VD3W@
zlNIHJdjrykNPZ>q)l(dY{Q-_4#iXdKpW;3&$?Y29Oq7zgjYv9z?YIAHfxjD6bld+3
zgUFSh0qOOP#P56LtM>1I2<<OIkwp7oU_vhLER4%Vp_U6i!UiVbP9EeUS3##-^+Zs~
zRsTf&8U5q+`|J8$RQ>)^{|Br)rG!u3KvVVeJ@xbB_49T8tx{!1zSNgT43u22MY^h=
z>#2{KP0MwzuK$v%pDXpBO|<{Vc>Vp4i`A;W8W)Qa^<RkB_l!#nKev5%TuxQ>)wsMa
zQU7wczN-)LU|%bJU_`Ev^3(}NxJ1<z$<mVB(rCiv(#Knq_;L2Igg4;^B;aw8wKyum
z&pY_h<5#?U|3~)cslPuU`*Yue!p9{0Q}6GK-k(TE3LiI;3+XMD?BC8CFYum%aI81N
z{Hh?&$*i~i*1t^ajS}l^Ih3^5%^_W2{}x+yW@wLL+=Cq)41bt@6~>|?s>C<tE0*Yw
z<#<+!wP&!kolZZ4u{y0mQ5-SCR<To_qV&Qyj4w8LT*3OI3$isp0~5T!l032Mz7c9>
zVq%(g&lO=!myg9RbyJm^r+Bc8!-ibCs|<#p-hzj*!v#pPN11Qph7lsU%qx_gflut|
zFqTvzd{Oe9f#9)y2566SxKp0Q0pYKZk7>SXBd0LaTHDlx0c^}PL4mdH#LW+&Dmr;1
zR1|;Dn#<}t?5hxOrT?+}xpqrhikcPq4iF}*J5Aio9mVwp{F)G;8(cO)M$>=06_Y~@
zsv*R(ovw1`Vdd-UKS3^<UJiKT!Y^esydoDK0))4*THu`^@MdawMevUYJfOUMB3L!r
z`3p)V5GH)rr(NU;8tPJCAbJW13g`*vzCgjJMmT_dcU}~(D8Zs?n(!b{U>o6IVavBT
zT7=|sNzDorG~<byPD~wAQ5?NVQn4OG+3qsWjIIboXHLUIFiIk|JzE^RbDBLBJSrxs
zA}!@znobbAm7MJ?I|`<S`&v7H$39{cAM){eF_bgP_owG%_qFI<ihT!Y%B14h(22zj
zLra;6UsHagw-%BEUCfo+JK0YI5&%H))CEsY=zF%8l4$RaR_$RU>OxO*8PeQ^Xf3}u
zdaDmDaW8}|a?b*DJ|1-QgG6kxcy<FVA3Ki?7i=>kK6IC~D1k|HxEWnoiCpa32J(Kg
z)1}$bpv0Eo(TWb+%;>R#mu=q<8br2~H^D%|gfug${~`lmeynVw*-#|?$FKBXiWhtc
z33M+lKS4;*(V)SW;4#Ynw-W7CdfI_OG&9B1@$RY}2yPkN3X7!I_*K1z;cgauXoRnZ
z7`Sbjv)4-RPb1I)aJw}kd{Rdw1UBX2#nDCVKjf5)^mLRej?NNEFq(QL<P{h1&1f%0
zhsKzu14IpSFw0Aapm28<<X{NoV2IEGzgiA%@yNj=fP%`LfXcfz9jE2d+fIe29q|7G
z4;eC~nG4Vg_<sxdKScAN9s-*GQz-Vr|9YW79>4_(u*YCTHe<@+$SH|@M+#^O*d56G
z&h8{=DGBHy*l6y5bzi*KMd)=B_h$*&W}??c=(XZLes!<$-e+9;p9K`GUWM~X;G-V!
z=xh?xdnxdE<+~^!V0Bj&)3JyG(;@B;ahj3tDu$m2fPsR^xON$N7yb{TKT5#yBZ{L7
zaw<V29w>t4hYLnm!~F-sn1s@vXuQWz@7%NTa{68&6w<hlgI;UjAqI56gYYqmcl-qa
zD~?``iKBHh4|tnF*U!)KFgizCIX{PupzoMyfjP!GW*`Y*&q#IuWkgl?*9f-<0BWFM
zT#nC(EX65=%t(P~wye7!XIai$Jn)I)ye+h*|5c*{dE4D}y7S|`oC*e`C8a#5HX|DW
z$5~BZ5;dkwP#44LDJH)Ni7K8gf~I9G$mEAP)KqEBi=jvsUN}=>ffmmeY&D`su*xuK
zCxwuU4d!O;FPpq^f_=E5V*=3|MsZrebkS&ppM%URy!9TO5&kFTBI?op|3L4nMepDG
zJ#o3!<S;lkKHrx_&r#!ikA>{WcjZm(===MGoxeoizf366TQk|n@o{|*yCU9BwWpm{
zdcL<bvwuCG$D?VFp7#Ts@GGlhJ>L^$K$Q0Nd<ru=(DQR37*zegOwTW>l4d&4^K+T2
z+i9)mPgm{iPtSYilV1bUa*91IBtIg!MFDFsTxUd{p`j4MFii>Oabvc>?5~Y}SMT?1
z1M!mgd+w*jfeP5Cl?u61;dHmclWi)H{v5R&qFD1n72P0a2x7&dHPggI`s+RM(a5z%
zNp$=u?$B@x2v_TNVCdQRq6jP+6M2ORf&Q}ybRim0d&>Kj=jXoLuRQ0Xg^uKTDTwyV
z?3d5({yBNBV+lQOTFG;nG_!wsmS@~O^88zX6T4+U@?3&49m;b)GdqyyHK55am*?#f
zX{G~tUdP-H<oRmV&M%bbS%4Im=iefdkmpjUv$paa+o<Ka4@56no<~B0+RF3QZUvsK
zCs&}tOqS<6F|ufcC**l+xV1bNy7G+693sz4Pz3XMHYaUPgBcKRID25dP@{4<ovZP3
z4^fPka~{G02#-Hp)$asRmUbvdlAb00iK(V3=GyOx^e&t|c*{=8d*-c1q&eI#z&zre
z&-Ot5J6ll8kL6`4Phv);aP#~Kai4tmv)P|<|J(@2I8kBznb^@xdkZ@^;TcGgT}>mx
zOPbL#wVAm$m5=VZ2O8YWH0IP{=h6sYj8f1>;fwGaFX3~}LR8H=KZC+~m~~wgW^W)0
zxk%9{^hEIIQP_=+5`|s(^`Ov*sO;CcXv;SE8IXbGJ$UsfzAls{*FfV?xTkD{n;<30
zbt0g7yik^Ni#Mw3%xhEUWiNVJ&NJSq>|IYzZsRg<fh^}DL_K@gS0zV}O^#*&Ay45A
zkP{_>uK2Bv=XPy+AQ2WD#S=9!(R!k9f~#ELa^b9x#Cq<0QwC^oXmqHYgk3HtVcz{`
z5B-D5=S_dtG{hED{2a_}Zm6$m5#`;D7j<n)oHyzF;@Be3PVyGWg15k;^yfcLO9|JN
zAB7i8o<NXO=PfWd@JW}*1h5|^ZjcheoADgO%?QGbN*{;c;4mJ0BXTVw#k9S+mc{c9
zJq~6s&x0(?1OT})0<Dp7buc2YARQ#m#z~|cFU9?72$5E;4+4R=x2u7agS`hL^(jmN
zerM(uP#FiG>8WVdd-%zQV{hn&b9BRa&F59Azg?~q0L;arfd_U1itx<diDNk2DYBpC
z+<@H4Tz+8I_W}6~TAJ|ldPm{5Jr0C<1pOb{i^{q7`v8TrIJrZaW1*V+j7VsEi5
zdEgr3&OYKUHirF49ClbedJE3!C-X<2>%RQDbS*eg(R1LY_0oaU&;ht@{p;^=#M;J*
z2c38h>~jWFr&P;%uQQWjv4<tz1aHt@VNy6n<$n;*kIRowZp+7`U7ZU-0((lO5@~Kz
z#rLan6v6mH2MC9!>KCIF@i=ofW(Vw1Ee=6be=Y6He$W2p@uQF8^7s~l1hgIIf=3=7
zmJ%Y5&mc&7oP*!EJpK-mBze3AwEC~f<I#Xlq%KoP#pSUt(p$^pkL%jXqqCXy<MPP)
z5l@@ktk`)q>`TgHcc!9|k7<K<D34D7{QpQELw6D_S02j*=k$^M(eikr>RMbLFKR$u
ztjD99_|q>3CGAVWcXr|^bmD&_k6UN{5_vpjmcrvn{8AoIfm}9u<T2r|jxEU9JF74*
zlJsPS=*gEph|A@l5oE`H4Xr`Bq-_N~IaNvoXFyNRK#+1d8Nb0RwVu2IktDhNq+W|h
zKLBv`WJP>)+K4=gbT}UpBdE%xZE7`sll5ht=u2t@Aomiq0rllg7`VV>FEyR`%g1nC
zwC^Ul^3H8-xk5=r?y6n;Vc|VC8n~^WU7vC`uT$b~6*f_LDHhh-Md`4Ta&AM7?6K3Q
zk7ZRn%_F|Oy}5I5+yaa46L@$Gpo{$7oJZ6Hm5*W25Q{P5BX9BUslHc!7t3J37YTIH
z4xk9`l?P%2^F+g;q<ECViH<J17TzCIx6FC--0nSteH?fO=m{A-3Px?f`pJG<>!F8b
zRd5F&6zo<9185!wXs_E@3wOlh2L{=a-dU%^UQN2;x$8ER*MpJYuF#W1h6H<dTMK5=
z09pq^Bu@=ZZ7m+|(<ECk_1;2R&SFF-RNF78iOICm3*f<EzX_%~4<Jk1y-I)cuua>K
zwPXWc(c^u7UZ;8k71||u<aa<D^QejI<RG?lKce)X)A!5A$fQvnp8$N$uOLV9Xjl0V
zgFPCQqDtp{aDp4aXHeSG`jv|IY5k~9Z%ykE(}mWFdF5L21(N+CemRb>oyl?RHfy<o
zb&2g-b)NRi#`jauT-px$1C<yWx(SoAXMA5PC1iZxh#<%J)%cB%@5>QM8s9&y`X7w%
zkKR$py#p=g8Q)(a`4^7w#WGs{C*yl5=B;+cE!^SwKAQc3O4#4{E`s|6#`mv5PmJ$#
zVY_jB@47?D!(y?W+REI8FcsSy->2yG)-q=>y~FX{cjf<Le7`~5c>y9l4_eFK38o&<
z_->+&)t*BAuZ-`RpolxH_CLPg4Yp6~*K~SoT0g_|R^$6l(jUWaH-0(3?}WbnCF6Vl
z_GI4M@o{}Tf^6Uqv(&gAf;k8C_6{uO!*ya$ev2T-^%wY!+mqW6NgCHDuh8SVw_av&
z7u~zQ>Bq_J7p`w60XmUNRY-|F2@GyWGFC5FN$&Re<c!k_Ifx9&95wKKV1g0R8`dMu
z#;u&80P9bj!L=lxEc3rCV~#~I5bblra&0ySE0X7_eV7B{3w5=G364#kr8awuWI5{*
z<+{_pS=oA!<CWN#uycTb^OR&(VnWjEjQ5eC=BfBZ;%kb;!(E#mvk!$_xDwa#3ZsnD
zgMMznFD0c6`k@IHojM=bzdRHJKdt94K#*YWh9aS!*YeO=N{Bohfgt4}4Zm@DXo3Do
zl7}Og{TJoI;>r|A{S#W6`W=G-Oi6!)N-x%DhUB)Dhnr;{9e6r0fjsnpF;02dK)nfA
z4`HV4SRO84`rns_>!Fv>GUVY?ESl}^K;S=;hkd{OCGt=-UG;Mrekl({<sNxxeZK$)
z`dq|l_rP@VKZdtA9y;5nU6tUod?6Fm_k^lmOG_y~2hLu2pW+nl0@zDs^J_w>%<d)*
zTvs`7PF3UKs?}G?dwcXZ8PJsQccp0aHN@68Msl$q6xx%HZ;Dc*>^azoyh;@U90sJ%
zeU%%1aycJp!y%AQa1dQs-+WLun{Vf2@sd3_y#@55Hp*rfU3a<<Fanp{%e6$T-{0@x
zHAH+5o5t}Hj!pd4ahin3-iLP_Pg_41OOWopbCF?0PFJd|Kd%tu{jMdDM*hJc?nCvV
z;t$Qd4}<HKAKt3y<=6t9+BJkN&|}p%S>cD*W5^QpAo2>}VXHDFwfs?)1?u%!&CW$?
zktGxq-F6R_EOfGPrlBFFFVJ7-I>e|4vr#z3If!}H(pAm1Y<2?wxqRCQ(B<%R1K*6u
zkD|AjPiL)N5@SJf@In^st6cC7MceYCpYqzuZdxISFs5~WPRk~?h7+pubDXo$I()XC
zBOI)qm<4?65VeaT0(fTNQqnn=@!AYLGjN|G{j~_g{pTX&*q@_s6o$Cx4+FTZw8Ivs
zGB9X1WLbY93zLifg5%6Dy&P!usc0Zekd&^%qy=n$k*-cgcq=Fu+SA#H@b#}iv~yrU
z`gw~fU0)!h1QJ8PaNZjr5!0vQQz!vQ8P10rWQj+@i`W%MQ(8)MxsAMqN$>;DUUH1;
z<n6bkcMGSGgY<|3%&BnogOBD!NzlAHj=aiE@G7SkESRpw*o7(&UMc-h;2S>r38D+9
z9wM@lyV0iN*67JCEx4(L0!>|He+dy_4o|W8BD#D$CgQ@;`u+pn=5bC0>)ib>1zqtv
zS3#1Gn!5Sm;2+VUB}Yj&`}I~F+=m}~CHmO1QMVqohan2KJ5nMefS)DDDT++IP1wGT
zm3BhB+OYjN5-`^uvkyc=D9awn5JX@Wghb^H)+v*bl33oT`%WG4yNeOt0JKAUaQ0si
z=l8`6$?x}&k<4$f6l?lyFuXx{9%nedbVo+NE)*c6pH|&^paT7Q>_##g8Uv5G1jjdr
zegj5dBMH%Aq6J(=^RvrT1p^j=(UTOT!Cndh7>zF!A5}Ft$3P(}g}(vE@)?T7xBmv*
z!YQuNbTH<3V-eVk&4{>yD#tky*{j#X<S!ShF8>IWgA?s2AQ7!kVi{MEuXv1xO7TmC
zF)}$=Z3kUy*`!MU-kyON*!sL;YX;f+tnUA<Qs;5j=>g_<#8wN(ONy;`G6c3hNBt%&
zpQ2M9LP~spl%rxm`qQt#;?Q$cm-zm8@;n6}bqp5eQLBB_kJJ+s{2x1C#Zkj)s4(R!
zE&oK4_t%=0iz!Y7aLeReG?k<;;8BsvNd@~XN6WRQt1$wh2r~WhSd*M@Zw_Y3xDH~u
z5987-*y?)tpTq&^wT5FPa!7XjT|ZJ5@o8OvhoA@_9{JWb-XICxh>7+!@(c$GEy0(h
zhrXug*$>up9eOdr@O?ksf23=dV(0|Oq+}5>>icUXNgU>fBWV9=QrxN_L++*;2Sa@J
z9SBhgr-7ktPnZgRK#e8Y3uskiPeSX4ccGt2&WvzmJQKb~MtH4=S$h3bFdRcO^I<&}
zr37<v<3?Tc`QT-}wCs*ynq-o<x#m;kUh@e9Bno$Y&u8pQ5TN)ypKJDe@1(~KAV8?y
zdnbD*cr^|SdA1>|-Eo(EFXh@|%!VLi{65fK@dTf9v+$ybMhNn;V-_5cyu`@x7HIdH
zk9aSghGI?6c@__=wEYk1Hu=;B-<$AXUX158M;2Qj$Sc%5hRcGh5H8p>nt2-o1)mvV
z3uG*|))ue)F%4{<h4-NUjR&Y)KRk%_a82aot1Tz$>r{LH4`lA4(C`~8A=YqQHp1VM
zIk<NcJ;Ex&*eSz}b{xs$6CHzm{4qnTam>9IC+Yu4Eq|m$VHC(~E{UC<Qd00`a1g%w
zqd2YfRrHcJ_qj0mPs2&{?Q9<s$h$Rp1RE&$ynJJch1++bHLwcnJnVwKWHpPuYsPT#
z#==`{Wz^C}_(kFrEtBaU=<br+)M+_l?csV|CVicKC|UkE*M|Yr6!t;D({h@u#Eis;
zJOQ|S4iB=s@)Z3P#d=g%0%1nG=eSHeNPz{t=F|yKU&rv{WU~sts-Bi3X~=m2$y$GN
zR2>Eha5&l%?RPX@ll5k@eLoKrEI(OIN#f-XrGuF$p*{!WB9_*l%AS80NT~5A?Y#;V
z_}4jU@aG0_)exS{0m}~LHR1vMG_znGoPr>gkTaV15IbzYe`^{=>)`4K*Z9<RcoDJC
zc(6W@_o*5A&WO-Q4TX)!Q7A6+VNu`j`nE(b0^ozp)b9gUs^#~Y4VaN2S{I3EmEfCC
z7hro#UJ=>@mm)Lur#QaRi&2_|A%SrB4z0?x%spoG;$}RdgHh95{t4VJGE3k<e`<+k
zLt0Am_D~N-heMQcVUto@muZ#a#=hAcz_r2cfz*1w&(pyF-r^HfbO6BwQch!54qf5Y
zv=FvCo=PAy%QKB`82q6%xn99^=~=wIS<&)NIhq*3M_{_<*yF{_;8U=Y(ol>~3B;yl
zI^RMd%;l;^OK0G38bj;xt=KetnXzdZX7meh!R-7YRG1#mGXLQ~eviI%+24%bh9?7}
zYPG|=3~=!e-3f^llPPqkjfwEqW+O6yY_m3)p<UU~T7hWqgI>f+I|}qLcVWl{4`~h!
z;<`+xvNDb#Gd8`^z7d8SziS2ZmHT*1LcX&SjG^+(;JFP$_+k_2QIE;?8n6LwtQc!t
zfu-pcn0#wiTr=s9fZ+>y<jD*w?7|fVo8KJ8Xv@NAGa_FR7$#(>fiHs#m1js;?o)6C
z!#Ofoqs;L?dZ%FASrfdyQ?L<Hni|uh%%S5!=48~$XJC|wWgf@Q{hkBplxeR3*Ihjd
zy4nAih-*I;xh%-VVh8>*L(wMn!G+Sco-=$-KBD%gV{slq+Ud)NMkEc%&T@!^j7n=Q
z_R8!-F_EjO{i*K|LP{Pw5+4sD59-}j&JC@WO=i^Z3sq1U2GiyP`=b~o)?QK{Z0h;b
z^W2jehh;dp-B8#JWbkn69{$L3oLt69RyO&ar5~us<~y1v2oH?#kH9~```*PkE$fQ4
z*G)bs<A+h7l+I*PcMZQv5?#S+whyQ2NS;!#L<%8Ye76L6!j7kO8Q~s?CrQ|lxP-xC
zO920qZZWZJG9u5RE{18LJ51HQfflp0<#UX}JbEyOs+#F<;!67>;T5+NXnAp=3@PE2
zv^)tW>^G^8yYa2+AEoQVyAJicB<l0NuUBZr{zLbb{R>KHGwdu4K#6Vd1b|}@)bgmF
zShEN2m7$X>Lx;W^96I38RwK-fPIsOvk~uxH7D*7?Ky17$Z7>0AiRA5DerH;Hc|DGD
zl-h>`VO_@OC@Zs~*?#WYI3Dazwx*}_Cp*!fY~3F|wNHCCTj(uR*&s<m)l~@E?_bk;
zERYCJcOan0f^yg3CIbyKd(O9jDQnGpbtu!WWC#PqhhfSx8(}1K2u;-K%@}Zm2KD(R
z?W(uk$3K7I(3xmjB~_kP1U|8g(PPFcAaVF~Z)hU=yl69AsD05=+bWwUW(U`&7*ma?
zTY~}Ah$se-%^y4p8u=Q(nDB5(rHT~+Y@+}>2f+L=*#y|S&|d6p%z(6p_F`w_g>-_f
zs51u)y!Ovr>i&~M7gKtd=Fm_oXk@`(gllpO-%4#LszD<N7tI&S%cjP`gb|xL-`&&r
z^CZk7pI%Ki6xDJq;10^sykQc>M_sYda(*P7Xel~V*19uTL2kT}%f*bd@Z)X_Tnez5
zqw0}lzY7k+ewDt^Qbb?M*TW;VCJ^oS@SmV&3U(Xex7eJgD>G}*mGxMM>#odA?g~gm
zF}yr2xi_09qP-(@Z?FLpso)Ut^y)b@6D^r1gMvPgpnpvXj?Sr|aNE`hZwE=0b?S_P
zhyibGgA~?9OG(f9M8Ybo1u5o(6!RMjxx1GVMM^`VWRkVqH>xk1mlCR&2c7Y6a<5`8
z=GxL4Bf<-_p^7TRrd1n}N#q|Mlo(u9UW-qCf*>WeXf8&q@v>k=B|c5F7TqYB?{C4A
zK9w`)fgGzxU1>z9M+4Dv$j(R@pSZv`z0%$SVQwgrQ*z#zG4bK(;sLinRepoQW&=OC
zK#tDmgxoM>4+H_wpf$;4YZ|mo(K^r6TtJ4z#&5pL^%QjXnJC?np~O5|HXr@O8){`W
z4jlxw{TeN*!6JVd+|o$}BRm3GYEp<4lO4#{Y5{XI`K+{D5od2|*&Hc^3@S;bGDRJ9
zMLPJ=x5YUUiXnM^JR7`HemL>>d7#T6_cN=Y5AFPaL%o&M3frSV9i*Ed7|`!%;E20&
zfUDFY7CA$dKIAT{KRU82Xj>e+Msy;)dk^MZpnP88{cVJKL_{%MiC<UQ`<z!01D@bt
zjs5VI3|9*2V-?l*J)SrQoTdY|6);}K;NacvZ-`dpO;pl5yulwxHWVR6ZjcB!4~r_r
zLK%>Lg;baX*-{%zCybK4!=g&(JQPl}UukzvwC~5o?vmK8a<LpYq&mX~MNM^Qr236H
zYjJfBl7s3+VXQ|S1l%${XA5H05Y)07HB^AhLzG#k4@^SW-UXiuP%dtd2*O-s%%qG1
zw_b{$Sg&C)H9)ZKNmqEvLryxN&!(qIA8wUu`kab(;`He$5oNp$NWTDO%O@$a9CA5O
z{7F!Royq;r@z7ZA{^tbISo`o}uLS*&eY+?7V3qwbv!6z`Y>%Q7A7PqRopP`_&W-cO
z(SY3xMW7iMNv({d@#AX7e&eK_7YXQ9^fy_KuZLti)6oMpx2<;Pw$)9?Kx^09M{Or&
zGi&U_7{I4;&$&v?Xg^_QWBg>q+tfWt>TdZ`>dv({sX&drURU}jq(X|+BB$w#r26!K
z=_Z+vvKpFL&C>Xd>dEvK#N8_MbbzYL2y?rU*isMJKQy6h=&&n~EUBhh%(@%&Ll=c+
z^tW8Js09Mez5adcqC@t`0TLfSl>?-kq07uDRM&v?kxz06L+x`h$%#--kZ23S`k?m!
z{*xmlxD4&G=RyL+F7eI@*dmM{Aw7H@&johbyT-F1EIoAJ^Y#OugE$+*jD~4+h%H#1
z?P9f6rspAzz^d*EVuhX8WUMfMwujY0?O_%1V3n$|x*~y9eJiZw-E6^Xb7&8YhQD#5
z;c>_m!D`k<7!9WiRwJmoJu@w`;OG2X&a~XJ=0Z-?m6+Pp&0;sql9|?X7&2un1b7~F
zJL#G0rDqcx04mOzz#*}J(cya&{|9}EfsENV@+^Y#-UQCs@QhM+LMMJg+#}}$3(V9w
zXfq-eD68^$&^1!>1ISln{rKUKwlWln>z@wWr|qlTU(#Xw*9iO#;HcX^y~+~pfABc*
z!&f|X`%er*`!{;qhr9%$6PhVAkvB2SqeC_aqPH}XTajTX2&Fh=V`$bobnP+_PQ=y>
zeT}yVK9z^YceM=9#FUG9oy+e!dnd*dX^#zwo~4iB>)!4fNiUkPe%FhoV6cOQ5{^X^
z3#FalI^lU2N+T)3$qOY|5&%mHaUQ37kI%Qz_phK^aeB+(lXd(z_WXaKzxd{zOuz>d
zXnZjeghr^vKlRiVHy9%@#(&XbqyXhO{r1H#W(x=~wq1St5X^&Tlz75_bXJ|+8Dini
zhoqf%eF8!%O9biK6xk^}b{Blg4PI=Kcah<NldIui7l&ggK9lEyFGhr~3+tKc3z?kM
z(U1}TKw`<~fU!qeh3*<kkjqNlb3o_0BzlV;^S0|5E%OD)zriHk?s1ruWT7P{#9<P1
zr^sWaFyF9HEq;C}M7Ig^nrqL2-yO7LO4Zqkb}mk8=lrC0UU`H)dm4pRJ2TkM(I}$Y
zfqj65F6UB^2ZI3-r(Vb$Miq!mEW^Q~#GS9puaSB5l53%cx?Khdkc!!|2k(YkP~APp
zbS4oQsJQr}C9kwjI1%*41m66;{M-o&Al6O&)Zhmb`UqeEMCx$=-aYyrhcK2pLy=?K
zKm?^$2yPiC;>Vr#FHofxBFSEbQudGNee&FIqj_gM{!?@4+NQ(ZfcU3zfqNqCQTvFi
zIOXrMU-*I@!h0EzX{PVnO3#ob7+o@s<LByI_Is|r)@?*wlGunovwacZ)H?cUhS1RK
z&}mG`wX1bXEmBzDxfZmNKnVhhpNzV@#>Li~;(uPtsR2B|5n6nT59iF234@!xiI;Js
zrF-zzht24VtfoK2yX9(!>OTE2chZ&=mtAk82#$F(Q}A?()We=(_*^6=Q@J-%N_eN-
zADU%D+4R89GJ=~CJO;sHYKfO%y~zWHIAhI@4y~0L^*vOS8TFkWTtE!P0wR6$A0X1<
z+X)iZ$03*+#KU0dd`zibe?wD3J)A9G2KuWKWDk-z;y2-Iu=sGK#+FK|yEH%aO32C(
z5R}jMvPUvPOz%`X7cC2-*pwV&p9<@WO24oes*PI;)%H<H;+=fV8gPmY&8R{*2eBis
zRe({543P*n9T<64BIrCLOA(1iO+>8w<b$7PG*pNfk*{MX8YGHPN<)an7@~p?PZ#q{
zl|o&j?)KQr3w}>W(E;=K{SV=+rz`u010N7^LN%X4Cy}r&u`40s;DwLc{%qhCh`y-u
z2J|UIMX%sMhS(tgjPP|}6>ex^C$0$RMe50YGY>obYfYYsh4vI0k@xUzP`SZ41)uA-
z7mbvbO5Iv49omy=+}#hkW~d?4o})^PNiLCX-2FXH+)*Oio{kdCH5S=ySb@kl?q046
z<lC2_0H(t+E5FR_yXoE$Ii>ihcE){k(o$A7W>vjqEZ&5}hV@;(>axmupVkH}qwrx}
z!Hu|J2a^x%dV1(l1TeClA=REdOscH|HNcwos?{MYFrd$2N!9jbkZvPzF1Dn<l7>Gc
zCA1`0MV^;PmWsS65g#LF?A}cL8j*RDwg>7rH4u9+Mbg4Rg{}vg(Cs5~rqMOOG!VNh
z8<}S8<z|!(#O~$xHy9qUHdTd1=<xnWV{z!~e8|4~iVWWNWpDIPsedG47M9`-ph|pf
zcq0CdK_oN{fcQIx;Ab~+bEb578RyIjv=5y_7^M&lbP_v)FmlFdEbQNZMiqun^g;-r
zCm>LBQ5e%BG?0?l5`f7>I)N}9!CJQnMn|<SN;yOp(UscBs?k}E_%-shE<-x;BXnD*
zGv6I(fr5JMOW{j~t*tn~u0uKg>1G%##?=#?hM>-L)B#MQ=5GIB>&F_hhb>8q-2OpT
zhWUpe3-eE)6!XQQoree%_O&gP8m4L}4D%0ASeSnaC^so6bCK8<iV#dgVVHk_!ovI$
zC=h-UvaJ-u4M)94juT_^0SG^KFh%xnR<h3?bx=!aFrd%j-dg{L=%Ow5po3a9$U<8m
zS9{vJRn;QK9n_LO4(Rh|Z>{e!s9ml@2i~jm;8JFU%XD6&w3dF8H_riMS#Pt-8|KZM
zsq;3fymP#H-E<x&T+deD-3GwTUc<@RJrTh}rX;HZ(&wMQ<1|;lr;Vll3X*>QUUm$n
zsJSEf_1NjRAj*BRXen0f`a?9WPMSPZzy{b$oZeGq7M$iBDYXZ47roA53m{-I@N{Py
z$PdZGfoFv0!=#gGMXOLJhy-oFLmgZ@A3*Uwj<ZmJDti-_K%@QPka#)7H`)z}I35<O
zv=@8huTkmP&q`8ZoI%=jKE;eKRSb)-4h{ita2N|lYb(7kyWtk&{#&*rxQ6<4vLUho
z<&^<_-e4GvKkg=mpo1BEjmpl>aodl*R?2vH9gi+h&4Z@RAc}oo7^*hf@A3P2*xU0a
z`~+h7fDQ=8PcP;3*V+a3i-{B8CF2=*9h9sDnql7b0P@h1jreC{x`Kx;m!dd9H}b~e
z(?TeYtjRl#Je)e&V=#ZCj)M;H_Lh9L1uW<Dl&lATj=D@Tj;*7d#g2Bv$0C>?ortYz
zd>It+xr~Fc75L>J_WW!U(>jqA!zYls18>6Fe#i<iF9dFAJrG-vGYuv5=fm7~j67A*
z$P4EbBAdUZ`gaU|?MFdA_84T~_iOqijdj)UCuR(a9!W$8Kbb@aUCCyVN)|;%6@K}L
zuZXZH_Dg;Z$Jb*Jf@9tvvAE`0)O*+z=eIicFd0lDbC|xA>1ZSyH35J>sP>rtaK>P~
z)qs{VkxETKgxO*iOR*r`>pw$)&)c(Cuut(Cu@4Cab;do@5a3s;Mj=N&k85o!&Rb{S
z2J|>Z^1GP)5G_Pex)Q1@H1d{eGHWgkz+Mo~wpG_~o{?(*#jmPEVs^V6CAI9nSny!r
z+q_~~NOD>(&laTRqX64Xk^%ckU{Z|l0LL;mlwlFoh4{w$jRncD)Vdu>b`0dva=$=f
zi7m+r*rCpOkb3A9$o9&p$o7Ny`Ngum8S+;U6WMMC8TLSkzxBvAE^|?~ku~`zBM&sN
z!xWQfdWO{5Rp*t@xD2<J@xFEZcBY7bH2u;0l(dU<M|luam}y35L9!QUl7nhEDx_?~
zh^K&$3B&~)D0C>`ie()N_!6!y{<{L+a0vvwCj~Ip?~~z5z`un8QlF)@?Te6|tFpfs
zq2(Xh%hM>T5c&ob_<s}n_p$g$68Z&9pwPb{%aS%i9|p><(2wICU9Zq9MaYp>Lpg0n
z=oem~s<#pP^YvgVl#9h8^<l-&Xd!vqf#m;*=vW^u8H;5Oh<Fo-2$7#Z3?lzJL_V?K
z#-TSKWYhBtj_g%jZ(Y6pE!De9lEJ?m33x5MX9XQKE9j_+XZlN`6Z3=EwMsw3f}du8
zOqH8KCBJgi^sCqGjqq5c6-P_6N^pNUQ1E*2O89+QKbhA1CDsmWRk8JPz<Q%v`|%~O
zOyH4t8dewc@9{25MsiD<{#FHB%hwA`>vO{T0ZwBjk@dlx;^?&O0NzlAxR=Tm-ay`4
zc3_}3%g4~Qu2dco$GTe95%!sGW~uYRHyr+jpxTRKi3hBD8u->%RGxK|Gt37hoTvo+
z+J}Dxt??L+2(|x_`iz{a2QQ@7^<2RA-#2re2Wwl_15S5=kN}TH-4^jt70#__EMEEX
z<jN2H%iT9gRNCQUYFdA9Q-5@F{X;$Vm*jKaKMsPa=&*!qEW2wWp2_$RAeqijVSJ~K
z7spEA=E&!QVA*kTMdwI96jp>O5{p(yG8RrI)~}_tsK<TVR47X1aPyYljyNw|@lR#)
zEyk}~6SrWwKPzQRoyU<Q^!~53m!;dw?6AG>nss|-YzY^e_7CUl{$r4A1ZVU(I)Ms+
zad$UZ<?2zj9Au6cpoR8VNsv9^rx6r;R#~22cw(sQ-DQ=l;SJO9qyXXh6Y$PSz-w%U
zAL9x97}M~!bC9D0Qh@N@mjuD@bp$njHfnhBV>|(mF%9o_4NnRX-lPP)Yc#xA#${$=
zUv?|PP*|@NLt))EK!jS`(^j2RCB~|X!!z1_AK&Y4B**JC2Pj6{FqZK_5k6V-9)h05
z!&*euKBHcL#3i)@f4gf?fs2rl$#8}qD4a$uX`Auu!Dl_92ZYac8qSr;a4w0%!IRNW
zi7ZnTy-cVrvBqHMR9^^<H58fL=~l}Mrxxt+jHQ`+?6#x^FXITLHBkk|Vi3~umJl{W
zWJq@~o~dUmV41QoEmKzajPU|&iIbvz@$0t0qHJMAP6f617}3rc^sLT2Y`5T%Vq|D7
z@NemZ(;sJ91R)w9^%Gd7KXez2H!6IkV=zYrP0UaHhzjN>9updoF@Vxe^(<9IYCGqn
zjM7)|@+k5-Cm|kM*V4>b*QQ$R+IJaB$<3HNB(ZB>HCTkX1xa>S*p+m3UM?H#v4JB3
z$g@`Ut+O|sqjR_{f$X0b&#85DU_3A<JD&3da>VkE4LlTyfN~#F>>cO2+yAQ_UToqf
zI*y>7jh=8fG{bb$TJ7*QwhFIx&WeXuJM;x{%besX9|In4`g5M}{hsjq@w#i9&UFiO
zb>cQeDRk3!A#7qVKMVQBp}IuVWbqHc@|1>Ep&7lJW~<lwc}!V979Lv0SO>@5^#aW-
z`zf&km&z<{kJunvg7$Gbu?{WS1|t{<=aMN{(Ncffd)F{=mwh4=F)U<p1e48F>y&Jw
z%5};?{RnY~eGCw6dN^MFHC6{KY2a@f)&~XqhqdTz1?#Ors$4S=p#hu1#^jA^3{skA
zlEBvKH|m_mRNy}pOD3M(TqwW0>~pz<LKn{DFL0sWwO75aDDWGN_eH>oyW)rF!oMPp
z0CtSvfG2QdtrcHmDOv}t&)cPSKLBw+&18xh;WIVfajLy}778BSw%~9~rtDz6;4>^Z
z5UB}Um39VYL-|+6-?-(IL?#Jk#nz#`=TJzyST+<@f&eKz3@WRVgP>U~F0RTYm96}e
zc?fh1vWxnD5iP00Hk=VA`D&4raHQsa;Xrg5o6~-mjrOVw7{S}@mZZa%(~RA=5lGEP
zDf?OE`uTQwgq84^Fg!oj$%)XZfGj6MhaO3!X4S#RqW%sn=E8;CX!H75)WO8wcPSOd
z%qp`XG7V$|B8bw53$wHMSX|r#DY2QA?g7vtg-|<1peFcPWqQ0APsn6@aeSOV15`C!
z87l(=1{Mhd<WpSR@%}ZmWIEFPxW!x?yH*_9u=(veeYrkunBcdelho=T;@2(U(~luk
z3eK`$SJs;PaJQ@Epy~~?YC^FwtKPqqgm-=mIAZ5vpKS<SdFV$ea4h<k4w+99Gz_UF
zWcIRQ$b1HQF{BAApnCD(ObN}JhYlQqeag@*x>EkS#)x#47+onx!j*Cs_b}l~8L$@0
zmVc<8&XaEsuKLqNbUP4RNRkuBmW50xv6_Kwi<BMiZz+R4+w;&F*ewXH$*x>hffDjb
z=Qps-UvK1zAQ+L`eI&cch#9>c2W)V<oPe<j_Eci3d7R>sxOd(Vp`SHrU|Nxl%tHSf
zB6cFNQ6*M4MC7DIm85Nol5<UKbTuS}ra1}HAaB|;WaN#mGz&Hwu}io@T!MW5;nMm>
zb!mMgZhzs@7v9=8qt!Or$F0-C+qrYNR~EpPdn7G=jik+zgU9l@P(rE+lu5A*FC-Eo
zra+!JB>Dy9YJeEm5JaaTlBGj})Gch|mm&4SZA7Yf5>jUhQY*W0b@ta%wi_$2QB4pr
zOcXL6ftzGnf(9^}XGS05@N9_Wq7sP?hCD=uG(`IA6f-&<cZ)8{M31tO)DXdDk>U@W
zVjCi(+?W-~M-V`3jc#T0P`Xe>@%>3S`dZSmoUeR^QQqcEMzR%Q2$3Pvn(mXMlsr_3
zO0%ucKvWw%0@1^|K~fYP;;G<FQ*bWhVT$yt4IDu|2@X>f91<c4&TTmQj0(ltz#)Z{
zcd$}34MH;oha^sd^ArX~ROFxyoXj?Gn4;ihdEf{^*F&PCBIs@4hzut6izy0@&jUy3
z>_AqcGN{_Xk>QmDhbamUM`2RGENoOnWyH3DBV#uS4pS5y4)G*79|;_)gEnwBGBF7b
zQxqIlaN$H_l%}?4OWByrN|A%-ql4XR5fF=3Y5Z_so=U|iKJ8XqNWtdyoJ2+C{v0lp
zij<djD=wo<wcFxQsYn57Tk)hobRorxEgl7Z6USCJ+6f06(O6cyb<xd*6r6UkJzN^4
z6tr!W6)7q0Ry<lNlI?9P5?hK!N7z2Zu^c0-+t$Ud*FuT|>&CA>$vJ9_JZ)QnSd#DU
zVtF%0JG4=q*I93yOhve{;c?)Ls<ipuwB2-WpnCL_dxmK;;X=6TaE4MtL-oGnI4yXS
z=Sni~F91Td7s3e&YLz!_gK8a9Q^KW?=MAXqy;iV=$C}C43iLUg*eK^R8`G(p_i?pA
z<{;+>>>EJ$<GBM%9L@C<t+S2rtEi@C35+z(x0Ds<i|@TQ=>F<Yllqh(U7sT3_>$^(
z=s(&3#%cf2r3jj_p<QuD6K<^NQLq=xJK4}5kbZ2C^W9>|4V*g8fIXA!K|1GVX8#r0
z>U8gFj3V(Mjr$o2Ewn`hG8^HkAPY8Xa9kW2hoGLZ^~MfbUN#NgaQAhjw%*n`M@p+~
z?&_Q@aoHt07D08cBI6SPT7Mtz>mTF&{T#s#`+J&n=w(Q+?(agD7IM&I_u1*}?;2#Q
z{!Tsx{e2KcbbsMb+JGG}Y^u2XA2KDvl;hCC?yPHyS&?cXyG+}3mDQGm6`8BjsdNId
zWgN{AjT)6m2|;Y0ijm)l)u|YVB4YFLWB+><!U<bDfWsNiF_F{vx^#+Z13?T!;uU*H
z1E^Nu@}+%c&ip)5x`VB86JtiBXpaF>jsx-Jex7Pycd%o+;A&^PC<r+L_yFm$b!8j6
zI1PK+^2_B?>>*9#{|-?Ug&z`zX--|z4o3eFX|9x6v>iTaAvj=^*p&pIlBDJ#ktmH)
zZCzU;9WE#px_s00QF4>5#BhK35aon0d&IXRe1xwWpWJrvIqy4)QTzB%av>TTA7Ymf
zB9FWr5I!;}H9m7!w588=a*QJQNb_pJrRY<r@JSliu2BD{_(<n8KE8JFxmUK~<MfFW
z;KKxAh$jw;5@bk1@IM@2cPwv9n?=~;P3Xf}HTJnVOwjT@tJ*%r6UTB>PX)YRhQ+o!
z5j{ACnirS+UB8q0yVw+pa4vX8B6La@1&GOu+WNc6tb<pcwYFY<g1Dti5oa2z(7(RY
ze(t0MK8gEVyysWz&O=4DdJ-y?Nk`vm8zd2#f%?k;p&3x8@n`+Ic>OwE{|5-JWDx6J
z5|R2}h(L=O;i+F4uV1O_U#HrylwOx5+P^s7zDqy0Px{F|c~O<Ff0(MTsFs?j|6RQO
zYES#s@%nhVUhX+AQTSI&`+rT~|8%_mJWqYBaCQ6hbp8IS{yeFFTB3e-y#9Po{rU0w
z^L70%FwoGk`BMKwi3t7PlmUoCV7dNS5}P2qMfi}9Tz~ZI>)>($ay*K1vuHGa6Bi&a
zMU3Bb9)kOYsEoa*@b9n(?G|valt;eC$Z`{XPNN=YY<ItnZpI*BqZ{#?sJ=>u#$X;^
z|4jWMXo&SMNNPS`nwR>Q<JxCS3BS3EgHQCEbcYF}uEEQ_$mjH7(v$d2Y?3{|v!wWa
zm|sG_xf)Kjz)`OS_?){42lV@SJlG<;Vo80qo9A<8==wj;zj|GNxzu;tpZ|;Nuh8`u
zC)Iym>eJrFCThF(datixzg!{|=bny~n+PRZxs3nQfFFZj_;3FM<*rxdT!q9VubEZY
zO;oz40Jk`ly&A(_s=7ydLNC!r8Lys}IF~Fs9lNiwfyd(WLH4J|BVPpgMxg5Ol@Yln
zq98u~6~=|y3T9hk);R@`={AXH12bF^Ar<jM#{>IhfW}$UGHER%8$a$1kn>L!1YSXs
zT?YhtVC78NsRQ35pwCnicRr_qHg--Phv_m-Q+AdLb9%A6sxYTLJ4J<Q4cPy}x&h_u
z7`74SNkOCh?zT6py!p&~Q_^<WUoZ`)2Rkvno3WWy_A4)_ge^!&LJ0>^O+BSZHH*Y#
z{a}Z6@fTSiaDp1{W1i}{8V+pLjc7YDQ4$kvxH!W>MZK`o-w{_W=oOFs+DqWoFceAv
zOF=9Fj9rem;Lf^i!QnkFU5k6v?_jAxRKEus;_D0M`$xd#=H{8;iS+}7#kswurHt-g
z^?o@;K<0NdPz5P(A$%{wOE4qCd3GlcpgiXn%In*VrQZTzd~dGQv_56MjYzvo!@w~^
z`XV%eP3Bej_XhJDk+unGd|rsBpxI1(oBzRQXctPoY2?8{k6Gem2|VbXX-wY0q(b}T
zV_RBQQ~(*fE2d(8ia<OyOng`%HUb|WD0oM|w`%#AF$RwvNr_Q>AP5fgrTL~+nr+7P
zsjp>9v1z>YHwsXB!)8p1KzS1})Xt_KVWkS^6J!NH+(2JT!FTEc<qi+%(~QN+g&(zY
zu|2yqrv}xGZqo`8nIngN6Rc7xwMsH7N)exKbSoXh=GiGaLRRvZK5f<dw?_Z+;J`jv
zBwUd{?&4hD5AMz6koqW(dS?iYUX26lZlf|CJ`P=t__H~cC}eaSlP@nV+E*N-#0G-T
zL%G)>HWZ&#J*l7L@-hhQRPr+E;kNQ}TS8uzAiM-yB9s?CXOko^S=?udr4Iox{XSt_
zUOpoX$_S)@GV|ac70gDL3P?Lcm3f0nHE2Rf4CLz>%93&eV_E~m`sF{V`gCGkgSx+}
zV43^GX@q6#bbYL3`)$8fm8Bo#S{C9tCJSxUg>Z47Vp-3$=0rusc4WXo3m+H$9k%t9
zSe8>wXtBgryohDC2vieln@TLRU9rrfSfY$8mLqVusbvx$$A!l+>Zr#(1w^fYpiE*c
zpdf@9U#MoRe?lT5=Zg0fRLW!}Y9VDDzrJkdLf#5pIlK!wkPt10V?+-7BaKZKf(+uH
zCT<RcbEesU>5Z=8B$TX7$lf`SJ#t&iUR>XxMK~uL-6m!_heM4~{@~|{;gIjZBeKQ*
z_^2e|D*<o{Ul9hl%y)75TMhO>{$4#1c)8k2v{I~}4_5mU9Hvm>codW*ahs6<?!SlX
zl(<6o?E72uPv9x^6+KSsP?>VG4y;_Ml#-~Q!9X3|&$kyI>2mhb;}e_(@rARc!j|ue
zf#Phb%UP4GL7NHAlKNUHfwLy+kj$k@+q@+p6lYmbm^v|8)wntfdX8+RJ|IIu*vFKz
zAZIzsYmnoz_+^A!vv?xX*!DD$^h%b1bXV)&8vNSJsa?Tq5;4K+^T2D;xs{%Q`WHT@
zfv;Zv2cZ+mZ>}HhojsEHJsJ&=-(^Q9@f(cx*JAq>oa^!WEhKox@5}kqZ|8$*q>7q&
z2iaSYPS$+_=JRl#;ny+z&q8{VAZXR|hJfoa=P817b^3PJYJ`|f2`Xqd!j};;bO-aA
z5l?+c;+TS<`H{8<X^*qKG5G_Tt`OKJfQ&quWzu^+si7Hp@C{+sHguM`V;JEDxQ|6u
zBJe&zK-%Egzyi2ArY3>bJYk$Xr<SVIzZ>N@99M14I!-NS?>$2CFnPRYfQKX(c!+nw
z?XvD3`kQ>!@GU15%NeczsrW-XGd=W#KMTBh4!hwXtUwV?2*)8fT*hSrf!czmCnVMi
zu78BHW7I+Tx^E@06n_c6XWKsfzg@py_)oSEzhiiBUA|v<mv#itShT()6yLaczaW2r
z*7K0yXXtr62(ur0{$Db%I6a?3x7wkn_x>@|89suw7!Mzm(t&^>=zq)%aDBZ9mrf5-
zdG^&;0t2`Q><5cHV&U%3DE+L6lO3=E6ovSuXUtq~qY6A*JK9);tI>mvMS+}S@Y18t
z?6RkL<);vt9mQQ%+HayOqpoD@!2Yx}c|oz9G6mowQ!jMmERtA%;#(1c*bSMuc#W4~
zliG%W2CP%~3}^UpNE*HlAcFmGI}05!(Rca};8Rz(UV{Hq%ljFN%2KNE`h+#e=dVxW
zor)yEz8py4tETdVW1@W%4l5g)U<tbW_ty5lIKSCu5AVT7I?<?#rs1(w-muw0KWlu|
z;V9S%ObtX&YO1zB%0w(SaMN*EGHu0lSnZXU#NF~A%7){qQM}WZ)X7nO>q7gEG46Tj
zAW@bXXJGktecB^t>c$J?`vp*$U+|sp_h9_V{wF#$Do$A=as!Tn(6Nc2uIE;qr+=}2
zd$^XHgZfRYCR797=ZuFLe_7&hFdk(5X^H=p@o|hlDDgRrk6`>RiQmomV8rDuq4>UD
zzZtWUemc`H$4rq(Uw{{8?CwmzFe&{U!ubKKTKUppqWn=z|AOg8Qu%MOPPX|biL)EP
z66N1!`U0lYD=3lvGSeSpI?uim>Az=snCX1eC6Rs}u6o#$nf`E6dR{N2k6`+pN$H0%
z{d}h1kd%H+Po(EC{nDiLd-9RqmFedurB7%2zwn}h(<>=G?M$S9%=FGl=|>a(8m8|8
zumt|A*e`zn-T5#n{ZOXQVfu=s^t-Xtx9?#3b4lr&m_Cl_zeBp;+U#Fx_lj&WswyC(
zp?%jIcc&vTd*2SYm>PG#fw14&s-D;~mtS(l-TgmUJLPPQl-c`?Q&LiHJ;ULPNbdLw
z?M3}JFmcV^tLo1oti3F0-2DK;rnS+u-tmX()6Cu5&6QsrYR>-t$-7fiZe1IQ-I0?y
zD0X)Kl0HxegRQ}M=s3okb9i7n^fgXl@#w03H3s8gjFGPRQu8{qQvhapw4X6DEv3mP
z_9##-p7egNNYiDwx3%)C-Z$<Ym$%2-X1zPu`X;q*aI7%ZSd@0~5dJr~Dy^G8Hj=;(
zLU4AAbZ7bO7D4<Te!(HMI=|{@e=SZ!)djU!#>YqTzEF$(9u$EYqn9IB5qN$SrXXJY
z#Jcu&)Z@(sB&hG~6(Zo{SM-?LpC+mPvR?cw_d>G(ir)y|$*Q12zI+%NLwR2*ShXJ7
zh|fM}7F*qO*a}-@OSBJ)Ek4?7B2P#G(-rWe0O2;m4Apw_xAsDZP{)DdX2C98irvQ;
z3><Myy#aa7bR;RRYsLf*P8H68#4cYHlY}v=4{OaBiqbvTK(xW3DI>6P&-<aNTVjL2
zg5!=S2dez3&SN6`DZ96X_H{AtnF)!M=KOF{Z7T;#21d1M%AxzGUXM)0Elrqeg-!%%
zn9uP6KEKfo)N!^VLG_3N9jZUbu{Z!i^_`qb1ng##BsMVpNDw^@ZE)_S+w<8y6X8sR
zz51VJls{<+-|H-~>iuQK)&`n5@9EdXu&GgAg{P>z<{2KOuB)8o!`p<lDdjhse0&bG
zJ_=RAC$O}{!o$;_;Uj~XsOXt!pFHlMl**a4n0bSEK)VPJ9@fIY4N}R!I=H;uaXFH`
z9u8k2gB}j1mGpLEwHZCweL^xBzz{S<xT(%f(8`JVatUJm>xC`1c}ebls4&m&Iv4Xs
z$U7g=X%NeCU|q>E5-qPp%^Ld|5;uV|IzhBy#}w6IC$*BT+bbT1++iW;*}wL9@wMCj
zeW=p3#O~Kcv54O2ha8s(8wh*Y<$lit@2ifWL6XE0o3yct8cmoC>2gmS!c&9=<mg5g
zHLXvKZrDUPr0gJ!I&8&k8+D}}8xAr@i|qJ}T@IZ!zL~SN|LW1J?Z<ry(Rl=GW^G^c
ziz2OzA!!_jNW5{=`jWQDjA1bNa0S|0V`ocaNk$J|4Cfp4{K)faJKxjTKv|$Ge`o(l
zZac{P{3aej?_k>p))+SNd{}Vi!tHUK`Z#GR9|X@9Kh^Erz{tg#>b7sWCd7Npuvsa8
zCNy;xW873qVIbd4?k^+J?u@|%>&jk%MO`2|4p-Iign199{D^hU@(L7Zq#3@!RDYTF
zA5ak5s1ytc<V?i>xUj+7$<-)nsr41L)hMHzeiR0Yvg?9G*&Z3CF!=w-d-M3Hs{4;S
z0Rpi?Cn#vFThv&CYf?c;5lw`^9i3=YP^=qup}3<aqE$ANNXBuLT5Yvri(0F;+R|1L
zQ8et}3a+^0euf}K6j@aAyg%pMJ98%x(f<77d0tOmFPMAJJ?nQr-}61^I4_cB$W0{Z
z@Q&sDY7>CbZ_*El=Ur1Wy-(AxHcj)ZKd*eQe_k~ERct1@F-;$M*ZfW0x}Ep3C16Q=
z)~D-ZvfpL<VChk@x~#k#_|>7v)UE9L!;LsizouJv2cJVQh~#YFkKZj_IQ#o99BblM
zo|*O^PPMf}`b@R`vA3(3eY*xcupo6bb9ad;FTNl(g6A#x;YX&AQ`tQ9!|3DF?cMnQ
z@)v%y=;Ixx0%`hK+Bu+)E6E6||KCC%H-kjs_`B#MO&W4)IkMmg^MC{b3JH^k8Od~4
zd{NlL0@M`nyCqp1(2gt+-RN2CZW48+>Ea6Hs_B9AC!mFM5zoQ%Z+Z6hXZXCVZ=Nos
z3g&Lf-S4er#^p9jgeExdAcC@H#xa+i2#$v=rJG(Br3|%@vSXo$z#j49BM_y|HI5ww
z2f;$YYj^vK|9L8@aLNkR$)5@=)o(Iyd(KnQ>vH${&_KGytN58t2m`rh(Yea%&SNQC
z0ti)?ZORs!>YrY~p4xt|V&qd!n{=qhv5*;yAcdRFUl@diKrlQr(3%K6o~oeLWbZ>-
zEn+f5!}^jn7!0R_(apS#wzfXH=X+$0?yDoVGP*lPIccNr?@#wNKELPTaR#F@4|a2-
zSw`QpM{_A!6W3j#8%^2z|7kRDUhhUDgy=7XtF~j@>k1_&^J=y>n$t#SMsvX)+aAqZ
zNH*c7^A@AITUbl>xcn%KoBGxN(`eqluEWv%TT!9l)*sE^2W$m5cb%UZ&Hc>dwuPG>
z{%B;sWaH*#VJ+F?xg+gp&SMGM%G8Q{U`aQ=AEiAtzQfTd&N(!iE?XPTdlzIz^FHp6
zZH?wADR6M}8!(YQnmjj}pS#fvWtsVZ7|p~p-DrdW{e>t^anO3TQX!G|^C-uD`_e$k
zk`5;KBJw1A|7@fHn|BUBRyN4f`;7iG+MVw+5)R+zHUHWbB_5I;0Z&ExaA(R1$Wp8G
znQfU`2Iz)7xcF@DD-&LmJ<gPLO7`y7VsnBuw}Q(hG!hh=H&cCp(!s@}(*n^x2h#9?
z5z6MTIoffvD8LX}iCUHMg=v)cq-PHc=~YY@WpXfGc1DbsWG>_p4=O87o@?vA=0F3M
zq|A73ZFCD_=Yl%og{|qn+Ux{##n`HQyKN1#hV#tK2}nahY^}%uQ)#+BCi{KxGY6vw
zc-5a8DpKpyc%e7ShQ9>etu2Nz026}KXM!PcLNeI198eg{Z+l#yQgZLJkcR%6Hw(nV
z(Ofq(-Zsnz%U0eB%(??JbYovUehzRK4gp12wuJ%UB7JI^J*T0U_jedYO>6CuWkjH9
zhJF)(?I247t1Z|toXtCuL0Bdw5Qgdy08OEk28l%$dVp-fE{>PzUj}OJ0QN<$1K6%0
zCIql2tqs&;xY#w>`#3|*YUzI?tea2SI*<-$X$K(f766I+Qy{VPt1MGl>ayz~0g&_p
zLE!^22eF9l2x>=lG@gOXJV}7|klE~mD-D^a3%3oKH;A_oDdhtD5M=J@L<7kl=ls$_
z=GL_VWJnu}bO`9E#?sLo%X{L(ZIjZ29Y879vrsAypp;iKSfm87GNg2_crReYyr)Hc
zA|-b^l}}GKX8pzhYVUycFw~NgBkiHK*au}AYE!LK%33)c($l?dn=RCy1xyZVzR70J
zSDVS+CzcCvJ>F_>(_qiYkF^b(rdQmN(h6}qe=4w4zsbB^hPYQNo0EAr^rIcV8omOk
zR(^8L@X;Af^XuN*)-<njSB(fix%<5S+<yEOLkN+=A1#G!bn;c~AtUsqkMQ-(baoyl
zXma<xhN{a>`U)%aBoaCh!FhGc>xX|RpX6_8S2wVUZk%uQwIW!eC5`df$B;!Np$1Gq
z7f2GK7_`ogIDvBzIOYqS-i~wL08>s5<IOM$D`6PJR3}WHm5vPylmGhQ`YvJeH{`rn
z<^=#FYr8*S$ha(aU<Tol5?Wv|AJLs)a!`lh51!+pXPL)JmyfJIls+~6JTEmZ{k$L*
z<kv*>zI6J$)Pw2g1*r$p&+}6MOg}G3Jsf)eZ|M24^z*#b2guo=pa1gH)wh}HJPT4!
zhSC>=(wBtNUkatamVTa>dNcjJAoWHl|9heI4@2pzLg}rc^o=|R{&Mx#`Uw*gio8XG
ziXbJsXS;LTN%&L-73iJ2@zU#Q%HP5IDIO%YaEmrr;|nj%1<Sn!UpbswCJ5bETGp(W
z1~ujO$ICT_Gm?nYk&qxaBX{wV2kz)`8(q(qpS$^`e{z!CkppbbiiYOem3X7Ly%Nua
z)~s0H<2lpy^!&1Ydu7dKFHvS<(0-_OYGJ`3REfqPdiDKj(fC>|amHU59LGSJD|cOJ
zk_tfFGH1ysI5$s=ocpt++Rb0{w2Q7v+(|2+z3s~9&RY2#tPG`D>z|ceGUk>)JAm;+
zUv3V0FN63j`loKiUvLYt*{(l-;n%bNH60UDnqC4X<i{-mpBO9wdo}6;&@K87MDIr@
zLhy{n@%_Bu#b0;OscbvDAPe83r%eB1@g-Y>pE||{L0o{g107S@-@gb$AfQ(tKR|(n
zo2@pmjcQM#TIedM?{kOm4v<~hb^@y);C<_i(9PDBrZ<gW_w@=$TcMobPTmeGo3*z6
zc%$(R4&Ie*BMY`d{}$fq{&xWHZGdAitp@Q0?fK#V>+fD-g6^iP-ClxaWQ~;@VfoNr
zevlt-AnORnZ(ed}k)0>ZbMH?TB3V+maGO&2{knGVm$!RAyWRVV?cR@W_x_7^?|Zj<
z-=W=mr*`jaS7-O{gUox8`u|eq-HvB&yZ8U3->rTyFMYPTEgE>`6{6+>G&-QiZ_GVE
zF#pk52tTC$9lm#d8!LkT;fCGD&uY0#cK4mIO?8p+3E{h&p%2u9RW}tFSyfn-IulO$
z%EK2P*Dd!ydWj0ohE)-TUO0U>6R)Vd^rPh=@l*P}d&~TQzbc1m{RD>9vQ7Ui#Zpq%
zY!_}mR?t~!MP#L(g{GV~D~O=y=Gq?aRs>CowAldpRFaWT8chFSC1(`4l%&%-6WO*>
znmM>>{@l0GlE-79w{mZ$7w2Bx!@ML0d-)5To<jRx{BTZ-;rm!R!m*y3UmufR9oC@`
zZfW`$SZGWUMN4hbM{Lo{l~_uqQhu4!D)P~*z-TVf&`2*?SW(LFF%_hHq*u7~V^C&Z
z4ey@+Uc>tc-fR5#5xkG&eT4r$lK0WPkF@WLDwqza{$oZ*9`lIBuDE<u{FpJL;zwVe
zkpo)i?K!j{^>YxIA!@DnmPUFWQ<^$P>CQeN2FZ%s^{mvsiW7UfHGk7#P|U(fu;$Z9
zul&_3uJPu4+(k=Id3evYypAFf{su88)(U*@$%l2xVNJPL@x))|)K<C6o?Inqd`242
zQk|3Lt1X9Cu9h7&CCxEjLIF-(?hKjXuea-sEi<`qfR(ps{8iz1%F4}cvx)?WWWT{b
zv6CWmAab7xqCW2hk<3zP0-yirf9}EOnETv`&$_;csZMP%l(ojJAcc8+nI>QyOf}IK
zt*PM9D?2@D=G$k?VYC)8P2cL9?Od5)Z|75D+A>cw7ARzqf9S5Qg5#w6(N&6H4`8%{
zH^v>LMNorzg1IeUj<8=`u)UYiQEVJ98@FN6CmJSHy`ZqX<CTe4HGxtO%pS-LmceIz
zV%zEWo9%ElB?tZ@p9T6s>o&KUBU^k9;KGpf%^~}`u!f}EP;`|0<MpINuWn8Wcx%jL
zfd&ZvWrKSk%X`RPaxq=e_&R%s@uXF<m%aEA#ahBqw-o&VQ*f0rm4w7WKu38m9x5<+
za+=fvY|Z^x9b_9t0Mey^@)>QFC@{~e4~e{C%7>IG`zq3t*`rzn5PH7dw(bDppV+Gu
z&aoD*nDt%7tNy&i2nW{0H9!<^jOMBNUM_RTZsrk8Ldx``XHYQ>E6QSwS|pmqdbQ-_
z>rbOhN)^1~qrjh`vaD))GJT+mXMQ1H9T%B>s`;=cDC(&{wFOQfJs=ppSY%FAF1p&p
zul~%wbf-FTF$Y3(zTyT_M4v?v5yY!`eSEHweiiBvJ8K-pl$wKW8kmTGWp-dY4c$41
z?r?F1P#n-SV+guWhM$okfcx?c33DP_K|3NI^Qo<I4ln{JDq}}uHrRm=GBVIP+KJT`
z3*1;2(d?bVb(m%^u#j#i4(bl|5-13cw+C6FdG-{IkZP_n;3+Y-sQ!gq)8~`TZ7IAo
zZRRYKRs)%9cy^IuU<%!GfJpgm*g&NG2%Ml9{#4%ma3pou-nnwR?dV#IH}04S@z+xr
z!m8y;O1pWTwXczvXu8oM(q^sQYC0=WTy%hxt-=BJ=O!&}JC;+J_yK_|-#kiqSuFnk
ztZH1`h^OPo1LOXYA9I7Rwa{%X#c3sn58ZjH1CFh*9Q)44LW!c3w!+TMeAL)yLN|dv
zxBWlGo$VIz?_$eA9N?K=gN1=15u&jyHjzT1p%O{p_-nh;G#CG~EX2hn0X2oVI8R_t
z<lUrvX)d;$3%JF_R>Qzk)1h_8#Zz1>`%uKP2Cw>S&&BjpwC%Wfb*pXW(nA6+whWwf
zvfO86t~06SSazS0Ehnf^G4e?LH^z9<A&I<PYVjG_ab}2@r`fL1vL{~Fo4R3nd5T`(
z<=d5oKXWRbO7rp&3`4v;3?k0(vgpfFdMr_5S$?3O?Vzk|$NSm(2icDFvvtkNR${XW
zpNN6&;SmON`oX>!D3KVjee}hE22X|^$q*neEe0xdmbNbjrYWDa#Q7iE8fRJXq{V<|
zKDS_lt6={M5f0o{#7mb7s=t-o0c)v6bB;xoBc_m5APFc)<Rc42mmyhT?ErnFzOX#l
z<%4YCHO?cBZ7cacf8XT`pw~Ee>-m<DFQqeLOdBKhIj|%7axL5{9vg;ilzcgYcMAY7
z;p9si0=EEQLB*FZ76@lbUPBE#Q92}F;Lm7$vkgeHVc}llEHu|4IG7?0Is-KNB>(BJ
zIr=nmP=9+yXS<}FqL%K~zqEjG%a+=+WZTEeSBzU7+iOFt%*6Vb+7H=Uo}XLycETeV
ztERgSI@(EblZhzz+V=R_LEOc9R%`wKxplL=M1hw$k&^{_Pd$*-v+Wr$_+HI<z^LU;
z+KDC()+D5eyNu;^DP%OKd3{#<F*ucHr3R4G`Mdn!^U5=IZ3)%^qL4VaF2!xj6huYy
z7<)~mwY`)&k`aaAak4<+&o8`?C_<IuxSEmoIt%m-&2>9_$%CHLT0Ic$3lHdc!ULfp
za@=Nf)L!P*54@X45&SLu*4D_x<R!^V6b`m_Y%fgM`3Eqg`eja}VS6fY7A&oW2=!aQ
z&49hYcC`UV6;y)R8q6wNo|!_IcA;){>2AcZ?srfCA5HF0vP2Q0yWf9uv_?qt!XacN
zu$cE>L`NhKTB-GVMLXKNQJ-8PA|AMa?y)BEPpl%Oh#3X{tBHfH&hJ1S{QEsJ)w^?=
z(ZN*1GnWR7(i=tdd~Y)c$Ox8&kr$_7^EPlyJ;4Xx{<C&?3bJi8@6(km?vTc^+E)(;
zsXzauT5-o=AF0<J^lO)NziOYgeX5;r)=_uM?7*M3E&Silvbm4{saio7{ux#X|L55p
z7XO#=DEyzzZyNt+@RE)HK0g}%Q#J$#{3D4&_}@fw8vmOKV%!4%&s+S@vc=Q*=Xy)w
zpY93&))WK(g0Jvzrx_i<`QOC%c8Gs);NxE;WbuD04V$-tV@LR>E8D_9)mr>p5#2$*
zc1iaugn#PJ;6KCtMOoah-8nPzu4ZxE$NnWYf@Y0540$T-Z-$-7ak$M<dj;G-h(}?+
zFTb@X`rLmAFIm`MxSx;xcc>uXe!SBDssYy+k`DJ51>DcHyMZLdn}_*-vH1UStwn>J
zyJ1q{pACGVFz*?XVefy~ub+gl5AVVK`&cbI@NIemq8DHViP6UikknNrt?u)(Cip%t
z6V%#f9tWN&b{WC46**?tx6}BfCDWaPDY1MOKNGawW_nUCz^9!oS_{;*!li$b7PWci
zG+~P#yv(mqdzv_Pb+*6!yYY1gE6_f^Zqi~jm0gde8p78pHiyNRmaB!YVf?1?bv!Rw
z_}b~`KO(-~&_XYr`W&VT;cF$y;OklYbxV9*fvAHq52PpHt2ef>@b$dJHFe!40o?(<
zwgW>MeEmb1wfuKAEt%6P*a5zN{nq!v*FVUm9_&La!q-2<Sw9G0kF$d8<LjS1s<PK0
z&Je!F+8h>N^*jn+wfv^>btx}d_!?FGBjRi0;jUAau2TVj{e(urSBw3+CBEiK>m&wV
zOHaVpdDb{vZFW~G1N>M5wF7*e`eqnk16&F8<sGzS9-?3e`1<!7-v?h`lS@6gf>wmD
zuff|7!`B;liF|y$Acql^{T#6o!q*>d4vVjQc@(~G<u{G5n|aB?*NnY?M11W+?dsI;
z;Fu79?N2iJ+TMQM5??zy#rGUN0bh?|j*7o7_N7rjNy`rK_2BX_zK+$5+49%p7*FOK
zAkzW9aw)fMd3Y?jG@5z*3SY;<Z$AuQ8}U>5_*!k<i})K58zFo>W^-74&Erw{dW7FJ
zzW&8a7QVXf^&{e|oZ2<Q_o4I<zK$Uod>vrFZi%mRq<0bnccUlZD-U~B_?qoYBiWlB
z;cMe-VSKf~2M%9vVC<L}1v|jk36$G5zV0NKMziDV4qtb|Z$AuQyU-6GUq6*|i^>*4
zoFRO@jw^}fe%Tuz@+f@0!fzU1FY=Owuao!u5%F~@wX0KQu2X?Lyntlzb*%lmCBFVE
zI41^PL{GriaO_LrtA~TWc~w%b1OAG>8phWEC%=$~8?jl;R0?*0ue)FQKKN=Omqv3A
zt%$!`z}pYQS2_Lg@pZ7ANmRBIVk3mF^=NA!U!BPze0@n$8eeO8$->v)cK;Fabv3oC
zQ<LDB5P#i3GWfdCe%%sZodxH_z$fVm___}>QTRH;mqz<aT6Tc1J6;as>sjTr@^BYg
zGOK`02l)E@rSF5U0o1P1{Fh(hYXJQA!|-(~t|A{_$H{9%Wup)qA$;v@b6Eb`hezS7
z7r$wIMR>`=*H^p#i1@lgo-#VM4BiOg>+dolk_RR1*DdjN5_Ai`y6<N3wQgUBuZMkU
zR3~ZK0lrep!uZ<NX*~F=oR-WH6zl+Bm6Y4IJWP;Fqv;HGg|7ts_QUXXH4Y;mUzhNx
z%4R^EA$%Qfb69*0;!*hO%WoQAhwzexuOUDC5%Kjnwt_mfm+Mp@51)|<!Tig9-4b8-
zB5%Og>GTBtsw#H)`qr053nUjiz*qUwFuqQ67zSUb(vrE6f*s)NcQ1Y)e7!|3jpig;
z5x(96Z$AuQcc6=XeBC5B5|wpDY=rQ2ip^p1bq<fh*D!w5_&T1KEPVZ;@JGbg8~eLX
zT?WU5_-iG};OklYbxVAGA-R(n_z*n-U$<c<N*-4G(x|tjWe50*zYxaP0nS1JUsuzT
zc^k-dfUozT|33ISfZ7G(d-)Z<4uIc&7{0DX7YF<$w-1$_3{i#fb-2yp_=`u&U;L)=
zbqFt6`1)k09}!=Vm$**Nfn!4WdWK~2m;JgWz6L?J;OiTD0=_;ja`?K-mqzDFT6Tc1
zH=YaQYgdP1@fR(b11Z=6zRD=KZFzVNxdh@b_!Yjcf!}@@zV1L52mB@P4VC>3u@S=8
zDK>}WFCHy_@teli@w{Z=>#*KGBEH^`i-Jz|be#(1;YyOlU-s*k`1(Ea27JZn3Ha)V
znJE7H%$G)wN?LY+uRhO)@%5}Q>--zEWUi%P2l$%w%=f|9v*glf2GNS}^(<Vu6?_f1
zF}wBS^Sjpe!f}_+^;4$}w)?^fJJI??;*vspnc+Wh;t@rq({_@>^F_~hkdpLrTO<Ge
zNgXTgc@OT$*b}1)bjjVA`l@`k60I?V>4tV~A63o%T;f14WJN?FWuFppg=yK}U3~Wm
zOMkj&FTSRd{ptnkI(?xZZkdbt7uB_42{1XuI`CLuQ7wvRp<K&CP9>(nmSjN32S2SP
z1M`Vxrru}m0CE!jM(c;Q74D1e#gkod>Sp%#3HwjfE<oMGyNmxBuC*l=aduVX38$~@
zrs6YF;}&W4W^hgpu{riv4HHYKaG!2=8L2i8@KxlTq{R5w{!zwU6?v2*4%zIP=xh%%
zu49F1fPxkYH|cg7(JlFkF{{Xm-45c(z!lKv1OP(ZMr1jBStQC^snNMHsna`799NW6
zchyrRz(M_gb|FAbUgMdy8gF7yY3NV|4pi9BOI%Z=e%Ju=1WPVyuWsG1Kn9Ccc08eT
z6<D*|YiZ`zChMK;cQVH+JYwl)9Q;sKu&}C#z3_`74cbQRC8}7}VX5>;I+OafoU4ym
zK>Ftmq@%62IOTxTkdEv|M-t;%OFn|r4%%wBYpov&jU{+XbmL&BVnhS3SZ@(e{I%5-
zL0=Uf7@am;O=k^hsD|W6%~|}JW8vfU0##)bYi`d8?(A$?@@l7?Svv}h$$rCMV?iQ0
z@POAlv3^b0&6kYC;y%70N@PsFIaH0&=mGW@djw+k*J~68SdoP{mgy4R8k_4*;6UJ^
z1Xmy2{M+$7_BeTCC*lDPSv;0Vn?v$ycpE!vtrt14Iqgr@(x8^zdfAh-B8__}V4{A0
zML>_b*=_PqZ4^cC(?6lp=XANw9UkTA)i%g~#}}&ANa9+p2ioxt9P%-PTLn1mdtzvT
zJ@1KMi2LfvE$>)(5!F#Yp&%!6TXX6%@Q6R36L&mat4zGuJOa#abO7s4WlB3@fmU*9
z>4r|Uq%mH;!HqF8xs|LM=V($QlRx0ij#U9+B`z%F-III_rr6cu`bB#zs@nF77H8U<
z`jKmIl?>9ysY-v%f>xnd{!An}9~Zy(IF;ycPc=cWAq{V@XT173xn9{bUY}>Q3R`5(
zPd9pVW?djW&O#(cFwSDX-C`Pu%xDhveeZNNo~@?Ab_pVS_e0>^fT9qbflZGo>mf9t
zlN0$<bEIl+0}C}5Qv_i3uHq9QS{j*r8Yu$S08%28tL@vtOCmQ6a_P$=lgr&xk6mnz
z19(d{>zJA-U4xwG(rnP{OL(Ryr>iG7@Y$#277j>ou|i$-(6Y(}pu$3hZ0b0Y16aoW
zJR}G#5VJ)Ap(m!#iQ<Gc<^=e}!o%$BryNB}CiF5bKQqymTjl?*b;oe!9{kVBbojIr
zgtiVm`K^XWbN@NPr<IHV>zl>x6NLcttg@%im9Yz63`>`7n`wM%dD83x8Eg0~_o&`q
zQLnyU2ge8cp(B2-4A`s3)OA8!2_ZkjdOu*F4_-F*Y@zf9`*z6ED{H~Puw`K2DpYi^
zUS|1s2Tsz$sCe>&E&11T{1|EaFFdYj9$04oW)a5z)I0Ve$-D5fJR%@VK9F<Hm2b8*
z1ntoyD++pSPw~hN=ffS5#&h`Jj)XwD-YdWCB);AK8Gaej3P$t+RQuuB-ALW{Nt=4e
zipb=HDMan_B9n{l)39aadO0%r0#{k@TU<^ERlH2jXCsq0iAbaNbx+41zI@ThkTGi+
zTdGg^dzcT$Je}dgsVf}+Ki9ljUGK4*>#c=<iG4npmjCQzQ17m;FH6V%MDU1Q&v|0v
zhlMYKjg=Tq86s7CfxMX>l5Yq4?Olt!v}mBs>0baTA03M$lRt;ts6Ov^Bu6HH$XkZ)
zgeC-&?n49w?Rbha<E>=MQnKQ#7mu@e;bW>7#hm<r1Zcaj-?p^LAv8V>&QE>JfBw2=
z7;Yn=^FR`u`<{c_xxf<n6%6I$wV|Pm{iTNSjFN26#_sRfx+2q^M%&V03&n{q0L8yr
z^;aRNj?w$A!Fzme{R(bgU(@U@Tj^~#SMZv<HZr0))t@SKo*=^CHDgv(r-!>QACY03
z-_6Ldzw=D(yQubs^xpiM{{s2-qX@E*v@Z$JG&c}r$N4E&l9I9iwjsar>dx-Se#yJ*
zO&tyuGGE*ZJeDGNtJWSZgW}ED7IH1R6}ff=;wf#%tR>nUOJ=d)+2eeU<eRm#$S>J%
z9nS#m+2;V-@16o^2lAf}G-t<LsOD()9Q%v?J15KzHa@;%d!|f?h3$tHg!EZ^ljTw|
zn>AUmkF2~q+p=HpRqxQbcJ@kOZL~b@;2oypDReEM<Ed{uI-dHZk1IC7Ma-X%g#p;9
z7fFpphy?s%JNP(lI)yt&?}d>+&5cwo+iqzJ6u0C5z`n_OBGcZ1etW0W9_90%TjT3+
zOTbQ%+b-;y?!uR_thJW{c``C2zutb!_5VO?rORq*Y|hv*h(c?{{9D2z+lo;mTQOq(
z%60}f@^O?gRTeBgo}!fiVi%~z<iJZr3fJ>FV4X<tFjNmM>;5MjU|*US>r^vDCoFdK
zQwETd5wB@_s>!mCJ>Ndg=+oHhj6NM<p96jROZ(>Q(<Kbbl)wssezWw&j%SA2`xTA2
zJ$(iG^yT*Y^v%ztPa)_f64r#-rGs5CZAHK}+Mgl)(O#bxpf>aVOK}8q@89!_+I$1G
zNf_zDZ(6mvd!tP~<c-MWp;o&c@=Rp%K%OXZ7M6L*`($T^+3Q1!%#HLZF1E$;j&sG(
zu^ekD9r|WuGCTgk-S{rVMvo`nJcUpW0r>mq)6&46o|&QV*c*->uYKI<)8|%Y;J@5H
zOP`K`i8RO=Hsz4CJ{@5hdf))0oB0#kCWCPCFUGyK-$z1q0X4q8+^2tR>p%&Oq}Gr=
zP5Jt?lH|zbw<OR&3<sryp|nBQfle(Q6?u&_J3MEt@`5c{raOl<2`DvJf`tqW3sXOV
zXPmqV`tnMP>r0?tr=XBhukv3=zXovJ5E@q(e_UOFZAia1`qd#U4_SQW-tb`rfSXIO
zFakL@@LP(S(@Al1>ySK;LStXC<3&sMXA~F_{Lk4kj}EZ|=q@54F%`8vlRoZH-xiVu
z?eLG+Ga}&>p6SXRYWNB|Y<|jrfjBtMi39&NEfq>Sm>HD%unvPcLN6s14o8qmD)cws
zR=JdYNeRsd*mw1<93HX6Z?<%JY|KqNJmg{8rE(&ZxtvHNCrad4w}-6tR~}Up<2>`V
z+DcTlb9?mX3`yVZF&wu_mXF|4X;-@W)JQ5Ma5vRNU4c9anQB|}gB-+oTJA-!Me$EW
z*@I9}$pMK)&J}VNoEfF90_zL;1+}J7vTf#p8JYS8UPjM}klOyOy}q11d9t-I<gxSA
z?{j1RR<u~WYIzlhsa4G+s_(4)hPNV(vZ5DOErvDNFO}r}G2JeX`z4mBE<A-GYGBj0
zbmF+hIvR9W;6h<CXR*(XQ>dfit&UC(t)|rSkTgt?n=>p8UKP%;tUS~Ci(>x0ZHv0`
zS;v;<(%I(VC{Y^CO`XBmr)1!%FyHLSWL2|yXAtnbNq(hmoGbbW8B>y_=2yT~P-Ob1
zan=&`zuwGY=E;z@10>U}?@2T=-KTE5ZEK5-+;ZS$yn6V_!bHLhT_Uc2et|rdp$*h3
zm3I3bDMz9<nhF^M@|rexhD_xLvDCMysz|VL&NrxVQB{F`4?KiPFL8i8dX0V+y3p>O
zZ&d}S-BCqeEA_<^=Z$dQzbZCIeN=n6<A&P3w<N&YpJs^f(f@32OY_PDyjWg&5rEKG
zJ6<{SJ9y;+fgs=&fg*L9DpUM=+DTPc<!93OwiUj7)0Wnhv{(qUTjQv=DnTq)3$>X^
z|K04)L*>9+&ck}$vWPvH)t$J?wx}|8VOE7>Vyg_tlqRYQqlrq6Q3mn=E?&37OH|YA
zcLDiwCCOOUZC>v|GaS-ttYjRw+`OlU?>Mb>4P*65jlyPnP)_G(i@=VDu`{YKB;V|g
z|JP+lmt@&Jh@xC+$aG(W7-)OOkEDrQ$nJF4qit()J!<A8@+?eqWu0~hm-v28<AuAq
z#%z}^*k*TKeQNHv0&b!}Va0nQ@71q6=g6O5#D&1gnzr~U_FCXh^QJ)GZ6gaag9`RP
z5A7}oVQ_wIus=Z}YfRP%N81qPPkLxcsP=5omA8`T$PP={^f-R07!DVC?~~O$du7WM
zYi#E-8sF&j_#%x-XM>HA)Pbxj&+zvfxP7tDL1dDIY#vA+bTtiVi&5DuEjraE%!bD}
zI;WJ6QvMkC^Av0>e|e5ciUU};DdF1!wOJHtSSE(m7CzYm%GzI9wKb}U?Q93E?u>oM
z2TJwNSyYwJAwOiVvb#8m-^aX&^sFj~G<={!J*$gUu+-!H^oEa>Gm*D(W9J-u7?Br0
z=$-E9wncew@}s|iv&}lXtd1t~PLk^`F%AWFA(SvkD*(+@;Kg@;P-X2xKxbr1q@j#{
zCx+(ZW<Ju3UyK(Q^{|)5f)l-jVxLnC97h`WB{74?dhSlxPH3(vl-!M<0Z|S-XcJ9v
zepnZpNcKCNmP{vzMqqsgu%-^hV6yb>#jA>7D+^7q8EI4q_R|t<<^)X1rv)9j%zQar
z1A=B1@?YU4v0MS>iA!aGF#EB4J`lL}o-jT{^?~dK=GevJEo~<?BmX5)bFGSVEK+*!
zZjDv1FF&l0y?d$SiGIBqtBM>1EBO5s798p=K2xQ#`KDM%LH`>Z{qI^osC%x3<`;ZU
zGee3+JMOU7H&BuKJ8$NLe+IVFBJ*?GID%ixTQ%l4*dUg%;`4%S$Maf_GX*kDEU~$8
z@)E+yyE7(1wLxP;rgXRBw1Y9OdB`nY-ES)zSCXS6Fvh8x1#|`!?;*+%u~@R8KcGI(
zjL=6*uC;9chiqN)??`^ivBes5D^;h?0(XAD?F2e_J}s1ft^eHeNk+b<_osBk{|mRv
z@c%PBZ^8fTe?<O2l?E*TBaJQpkAPx3;{QhilzeagU-^IL|8*JupPg^{e-=Ob3-D&v
zq5FLPm&2|d|5E|{A6MBQ!v7Te-u(ZS<NqAX|2xx?>4Y=F@;_kxe*BNX$nZb(wH5w<
zt|R`}2gm>OGyMM#>gbUFAJoU*z0`5|pRrp0hsM7b|9|<z`5!6Z7zF;ejc52D>Af}n
zmr!rd|BT7<zs4s1w~X8o|3BdKzpW^X|LKhQUzD*W|Lfyc_@Cqq|5Nq<jsIocjP??j
zVF)JNYNeLfMwGo&Kc2gF8*xN}(dWJR2D?4nt6$BrV=U^U{)a0dz)N1<YM#RQlLnR6
zwZ<{Og<_2Q4P7G*)6iS>8}cFzhf|jvJq_C5q!5L!HlcH*q0T;Zi8TC`FVwH{c|4l=
zf`JH1t%nX>;{E~b>^k)IJ!GS8>~iTWIBQ`+e8>hVboB0!)-Ae(UCC7AQ?oxJM18I|
zdG#AQMXvw53X{M@Htj(=hHhb@jSeiAj6LP|kpnez%yc*m_Z|?TW_8nctnoKU+>Ct_
z{{Uc_Jk66Pp|7>!zKDauHBf~@h?YiMUgNSN;=kZQFM;zzZgy9v<*K^Fu(CPqkB{2w
zu2U8~-dgJ~&IeZmdE|tgSn@n;1`@j9lXh^yu+{SE)cjO;*TuoIziPGb+V)3q)MRWl
zu=o3E_i$={ZEY>rq0D4dUDF44?y^$yYqV@;vcSOFO&77p%)|W{M}IP=?LUWe9Dq5d
z2Ayh`797%31>_6EOH)<BtMESh1AyrFYV)i#g6%^)kR7e2p2wDU@+3LF)tq&|q{~FA
zp&sU!SOHHL@iHnJ89!hDy<iT6pLx1Oua~^Wja4}L=3?MH`5^*IIF4dEQn)hGr>dY5
za}pZQ;n$w+YhJK2m-&IvD;1H7(E;<DzXkGTLF#v;I$5)zB^Er?>7iwX|C;vQ<OjZ+
zeKK&x_ume#Y`#0WFQ8}q1tvxhGSoemt-;!V&Zi=AQ;}KqHaUUeFeIR0up9YXxb78l
zGX>PrGBZQ#>G=Zt<0LIgw~!VkfwXv@QPqz(OpZ=&^>-VlWk)`;qlmV$WXB3~9aQTO
zm+1Y;XPt8dG6B<axZ7?m@`BfEf91uyioZn+1&fAku-V9;C2|~puQA79xoTt|l)L(w
z!TEe$Hwd0CwLDGCOapC+{xzH9rib`SGx8|L|0JvnUwERILm;%Y|8V>YvnTK7Pk&7#
zQ^fd|H3&a7SExDOzlO##AhiB-PTz=cumN2_@ak-ND+o@IynUI@)K3ru#{$7ud6V?5
zUsD9wVu^j{uac%<R&hFHU;bh3Pq1zkWy5PmAg?e79U<*|t`h6=(-5VQ1LUb*d=Cdm
zDEYeq`Kig3i3?%mn?Ls70+ba3rSq*=c=@slIDTLXj|7gheK^*yDKMYjn}H)EGOzP)
zPUJtI;DxT=OOe{v`lvM^`ER(!!S7^_!b+bmFvf>r7aL(Cs%Flb7`vf<b0;@dVhO&L
zGsbRd0b=pzxyNJC&{m<^D__27=mz@~89c|rI?ic~jZw%cBWeZmAXW*vC^aHEox>@O
zbW`M#&$-&>+|LB)HRg1Fs}qOC;)u~``Mb5-bEW%`{M;C4#G)lXHAnD0jG!BTsli#S
zWe}v&4nZRpQCNKywa2gQSc{#D>{9#RpcuqOqtjF#X`Ey$m_H^o@kV4>Jv0vbXnES|
zp>q)lNq6#v>H0fr(cXTW?LFjO)7-$In>9lxiF0UFkK0L`^|UHa_B-%VS%L$vA<2xu
z776%C;4|y*2EUi{Um)N51`h**hf9KoA^wACAhJr`5XI%`iIi_!^t;B~2h(+EN;dt)
zePlecML2QhIf10q&NDCMW6P7Oz0A(W)|e$I2Gw~2b^7!b_(Px<O^E_Qmm#qdNNmW4
z`VHIFj<4U4A8EWBB(Z4R>J1pUe+rjhL$MM<(QkZ8bi=i*Tf(@0F+)ZQe~UEeyc9c9
zyQALxf#8f;_sHl-_PgMrPGD=jxm=?H7=;`fd?nw-(JfsfGtO+4ppDFUHqxl`;OfUO
zm(*PmX?%k>OP|QwQfQ;HjXTCNw71k0_-u@D=v`~RQ{s*e)1f`|p<|P3y?I>*0}#Db
zY6!@OMDCS=oUk6CQB%s=QXit$Gxm|I9J#U9tU!de{3|VgSD1eRf{grKYW@;_@3G`z
z$T5ij`==$x1O5d$o=PtB<)7(NGH-7Zpxo>EJnOFxzCZbg&AOAa-ZU*kqdNo?xo_~$
z*Mi?{JdjaYs$8Qd#<}x78t;1Zo>ii2%-BO+dK)C~$}I46env`08^Cv}+ujHN@)Ke!
z-3Q-i-WK<J&v+Vou#T!pt0b{PF1iML&^8=;%WQS)qT(_B9q@}uJ9v&jd(j-7P_~r!
z@l3|#&$8#fEmSIe+^=w<%_XyD_9B;|NZ%IezZLZ8lqEL9JTiRqjLT4}Z?*<Q4S#sf
zX3#a?172_$D$+HyQbXfr#?7hNWO#;0^`RzR!v-sA_{{a*9Aab(0#*{-wcazQVwYXT
zu+5*v_WrfrjI+Q?-da475B9pep5h^nKg`?|54CesyrwZ@C{-HVIX_J$>@CnYx|4*E
z>8f7Gl~2^Kt?o@d_&3i_bI;V^?x?a3#P}NB|GYw<X!3S`6%4}ZYd6|cpn|KPKX7Gr
zLG-P~&Adc<dKJ7(b(fpFyXR}T9C~E<)Frwyjl8%m-I-GQZT8=n@@ww3Q&xxlw5+ew
z)zBgk-<7QO?o0n#JdqUcn=eSMa;%0uL;6Fu1+I)PO(_D*EW6VYKkhye_0hgX_)FL)
zL$Yal-G{ISN3T~mZ$YnJp~Y>{>j$h5wWrtdPi52V3(#+xULn=I-4}%D(~=+b+3yrM
z25J!jpN_jxB`kef0){>Z{=reW2>94<9Qj%Lv^7AV{XYKM(&zj%VREvM2#{mkeTw9o
zdi0?qQY`G^KLs@T6PUk4n(Y1K(&W1!@_(brCp)6a8-1F*N;G+Eh$cN~lJmhsCxLB~
zCL`FoUZOos9@&m2t$k=2z|rN1f|Lo<<zIce99fzw5?z+xkwuq(#|)JEz6Yt`pZ_@w
z4DPducq%JD&wq9cdfe$#2n2!k!Yoa~I3aX7KUqR_vC@-y2T_ZNvc_IaFN9NSDIZ>0
z>1q%W&UDglF2wMxzOS$(s$&&dEZ}TKrM>9r=Q($oeT+***8VY|nB{O(Kru9g6yrj2
zD|6%zu_7!+g05vievOd)stL=li8hyqq>CZg5MCol&`t6qo#e7(@g<j^x712PC!bjR
z5N*T%Q*1@L>>NSXY+vj}_KyUTK{Sy~IdjRyZSovkUUPY*=T*>0JJ~5grF!){&HtUw
zuUTtNwuWN$E+eqYAPynJ7g&;400YV^NVP!}dY@W6kqijP(iFyRo9P1}C5C)CrTLfW
z;TAYDL=QV|2<TxWGGn-Kve3UCBH6F&#{oUa(EvTTPWj99I$Uz5(vdgRB6_H3PY?4=
zh8{eWro_)y=wWN5P-;HCEg*&G@5+z@ZCFxZWrWLLzg7pN&0-=(M+%-up&~;H(rZ?a
ziCjR0M;XyUjiUqU2`4Q?Cvndb!RYoxFwy1`&>;e0ikwnbk2o~;9PNl;8eamH_8*?x
z6TxWG)b<#Q`H9NWjuK3(BqicOu+31w5jrW46rTco`_9sVBv64pDl=^#l72e$Dq_F}
zZZo?9$`COmttIxUX}n#QB6=>ZuR4Ea>B0I^95o;`Hh&P@86rfz@EwHEi{e>?AReNO
zWW&r)d|C*Ipq)IB3@~5(KEU}K=-Gh%vvIy2F#I2J{+zACD{``7>o|7?foQ2c&YeiW
zR1nUc6v)Q;XrJ-NSbF;&IR6HVdP|)5Z;$gF+PjvkQhHuhkm?Fv!}#VLThdEYzYx9;
z1dJhk-$nIe{@ec09^X3>zKK4_#`g<rH~V+QYzyDT6z>q<)S~jl2Rf$r@r^&t%72UR
z&bI{kPNBd4e}nJt(rsJt`2UXYBYu~K?}s&w1K;=E+8*EgfD~W<-yO(%A?fv5T0el-
zFaKvYuP6JBSwmkZtulPivIV4%VC%i8Q6w-@vfpvLwEzCo9k;LG`Gsj#^3PL4*qM39
zQ6ofQwLZVED!tXG4z?=k^|io`d`+}cVee_vQ;k`6b3ieVq2U6Gp>?Yj##pV;Bd{l(
zM7IXQXv@hNt>C68PBGkAJW+!|H)MF#R`iOJeH%gQ+NuX;Hxqdd&2Fp;VRq9*vOl|V
zXM6`@;Uzx!3S+9(9<Zu}$6u{I;jDxa@byRpM&^9z;56y>MuOg(snIOJxDy!(6&Y30
z7qKC1rBy|R<OWk-CNTx6R^MI-6xW#2)cz6~8*j=Yp$n*5By=_m>e~wh+P$9ks7G|T
zupeTkfJVac4yVCbpH^?qhR#~qXs{t+jm_?^y7;%8B<@A3mf~5=UFyZxMsArC>$BEt
zToP&g1D&-|6_4^z_Es6;gx#w9tm16$o%2Nu%rd&{d~CAIetKC9#mtfH@ETv5uphx;
z6OOKK{Hkt$hRCV(@iPh{J%?)*Dtr#<6djSf{6@QXcLKXs>+smEkM;S=G?3P^U=-gu
zd6Dh6ZDwyYCH2)#Z&|K5kta3)8~!G+YU#y$6`P~^?j_XHA}@DdiY;Jh(42Jidc-SV
z>_vvo@shhOU@$o?i<npKH4Uf9<5NWRzW-dZIa9`+wC)45B$k)}F!zhGgxF_=1LopL
z!{0>Gjm<W`A4n4O2rw6Wxr;3vKhbS60%d2HEz)=z50B>yo}ZPy<s~|Ki#n-4K=RP)
z@{eoBa-MuNIT){apZC4w2^+lfd2XH2tKU=*x&BLKowKsD;;iSg^Z7Mz*=ldjCtcG0
zuZ&#Uy-TEL_s(`{zI<WbXN0bK^`CC`l3l*F4JG?_j^(cK$~V<^^%At(m1vml^Q+6h
zt=+l04~HMVXaT;-{2Nu}#plFwHwwE}MzF=|qPYtDBf#xSQ1z+M9qaRndG!Xt@KRkF
zaEY$7XL)0ON#L*LyxBB-z1bPO(UvwI#QPSTMzxjbTdYVvQs#NNxLUmY)Z6fcwS&Cm
z5!+j5E=tR;XC!{X09lHei(7gOP})$pDuvrBvt1#B@B<yfZvx?^e+J=;VW>2`<iE1#
zai@3Hu>1K7#Oeb8D@&~2PC{9;SqYuV<$1XfieRb`P@TwmPolEaEcSD(&^CZXXTm67
zds#l!N0H=EN^QOk8skeWF`8f}-8alykq*2s^=7L!{+i`TEsAlaRo|30zz6-+Wo`Yv
z0mLxXR?0P5gl8QHuuzQ8o#XN66-3M5sl7l;`IXodc9G00H{%w`zjp{sm8qcaX?s^@
zo7o@*Z|frClWFSTQ?5S_6Q5V$?X}o#T3zh*S*$8c%p--kPs-n^`-2KfU~)*g1%f=h
zX|P1eD{|;n>>%?`CdA_9!)%N0jPA5}j5M;sp(`l_c_dbteA4%ed5IkpgIJvgR#YSc
zVzFM6pqxd6#8f%7*x!uuF<^Hivcg>I4LIn;mxP0#GJ%dIE(Yvp;0$3Z2?k4TaX-#3
z7B^m8d+_=!w}F6BG88BSk?r1`O<g1on}({RjW0zS7VATDH!W|G;XN}OSrTQ+ldo`C
zu}!mGa=ob8E8iGt7zC$Vo=y(U_xfy#mM@4j?5H%$+S}<V#M;ZeWhrmY`>?j12VB~{
z8y$+}k((3p%jeXs68G1CBF0{)_M=IC%!L<T6K6wJJ)=-dffdPpHk-1^0gomJ?+#&}
z!8+W<g>I0-Q}ij-!e6tjh7!?UNvfH5t(#`iS`=D?^(?JL8tx)X;)?wG&0QmnB40=H
zA1)QR^(~9!E%k`7inN#Z;J8SX^QCrznzY0eg|Z9kZjnc!NJ{PuSrQ`6Sb6KXWQa6b
zjTd1CYxhMW(ewt364P51mcQe+mO2;qFZOV>w+X$On!qx)lOv$Fgihw9Y?#B8`ptQf
z#vR4lFfLG0!;$1j2^MwQtZJe(J9H^MQN^@CQN!r9TrlNI<zjk=CH49PTJ=~dW2wYy
zC^5}ZQ{{8S2<wsepq8Ep0Kl;O_n)i-|7l8?capwcs5q7QO5y_(r5QPQIk*+x6>WJq
zk}<U#|2cUmuByS-L^3u0gu%IfArjQF^>vw!9YD)lbnI_;1|93Gjy>x-R_Hobs*Vk&
zW42D!cYmn9|8A|mlhgG*qWbK-FxhYM9d@AK-cNmBB8tqO*Mr^Uz_*ofD5PMOFE>Z*
z96S(TjXI7mu>9Uqr4=)P+fXdfYhA3TY%Ket(;jLXL@Z3hJoM+tbbRJWyKZ(U9|N&r
zW?+7c?)DJ<1rk8wfam_yos;Hz%N`z%l|oV@+Xo%9n^WjmOLvztFV)2Nbf7~~5a`fu
z`+7;cPzMpk1v@(0Dl}7e1*K<*)V_P1qwBw8oX7G}#IPfhND^rP1nc*)B^o>o+eQmr
zzpI7Rb<P02OgdZQ-n{-Z7P@W_K=0atrw{yQwgkuGw3WK2N^T7tr>TZ%0aTg71ieow
zKqyH6Hk8h}2if*ppwHRY!t$p*%ESw4a!`!*l&G{ZPLeXvM=ST63hsy<c^*n^&_<tS
z-kgn{?doKs&aDvr6WXfak2P&4F?Tr~DT&lVMtl)zxL;(W)nxrOheTCcMzw6u{Yt27
zve%A{Cc4JTH;f~w$QN9-AGvD*-bq}=wE$zf*SF>BZZU347+Me;kk|Qn<f%PSyS4t{
z+`7jk7!p?!JJxT~bNV7refoNUA(KE23mo=u&y198nP~efGn($#N{3o&dne^j$HA>I
zGeD@sE^4y(&z`j_t0mNrP$XK-8ErB>6w7k*qRa|^5T{lo5CnoXh*P^x5(E<Y+0L%+
z{%OTu_gkz*FQo3Zo*%dC`*-aIHO!G6pCn%8QYcS$4t>zfqqtJHl<76mr8qOl%)oVW
z9|ab*y|&e_WB6+BWay5nLRT~q5B{lHhKN#!&qtStwm+8yA^mF0S)9kpyIXG$-524n
zxD&pRcE<X5(}>>CAd1YWkm(|#8@~=ojeP>qiIyNmjy{U@Y2o7vZ_d}9%^746RaS^9
zy)3F+PLR#P)}WBZ#S(dS+(hGVc=02E%gL5Hy%>U|B3Ax&q(MiNq=C>Jof*(ZTIm<H
z^pu{D)-QrSDNs)VW_!#R$F;=vk{MH7wywI*y4rV$2q8+Z?$ydfUH53Rs(bzB+}d5E
zi8`o*J17S9JM|gS<5cLSwZ0^`?qMCh#Ev9wCwgs(K0O$as-S!4cyJo9NT_3}%jqKX
zKaHT8Km<##H2^M#DKm;f`U3j_yz25bwZBLYq}h8y<i}qTbYstbLWSR`#Ay8j<~eKX
zR=Q&Bbqcs(O{~vqQ=qP-xnQZ8?o+C`px0u{1>P21z!fcu-n^CSx55b(B#RR~{RW(H
z6_X{MCuAbM94mxFHEsB2{ru5c>`;-DA!?{Z<FImLIaaEylnPVagSegYHIc^c0b2^L
zN!taMkB098AJscPT59&2AU@I@6G8Tx7(G8j5gBBpj(|awW$9MGF2l?)rvtNqebNyJ
zJ%bd>M^Vf8ZKjf6@zH~DocQQ6DPMshUmB1PQ+!t3w)~d#mfkKBSQ9lH5J@HVn{p!!
zpMf7Q;kCNk%obI&D#N0R4f?@eZ~U0=Rb`)v2_^Z>XJ|*ub5;wiFrR%cj3Cm*bZCKb
z*y(FA`#aZ95Wl@f@v<d@nIhD$DIMJOdD8-`2WWElFBT{~OCqy;d2ODZ;Yrp86uLUn
z@K5@%4F&NN^n*xrb?pR4xEsc;&M1iVy)j!=5FacQZz&MRDu@{w1u@co8OE2ig4p*O
z#=R8<k-8SjksR1Mhphgsz!k(xFEuy$1Pq5rF`QmpD5+0F78F)NL>d$!x)s(aB6$lX
zaRze~XH$H^UOiD}MomN-|3!&*s=`aufNZJlY-Fb~Jb)T2;|nWit5>OfDB9ZnzJ3{q
zY;U<<d;vmD<XT$)A^L@>1O+<MFQw8i1AuhE4rvBS-;rjCaJ0-|(Z8e`L|G{L!23l1
z-Lq!046Sz~j#DK6oZYXgv2#8Sc;+NL7ov7Ao@D8-Eu@C<Nx?OVF&pBKjt0pB9FjZI
zfRRt`+cd?AqL#A3*!WeXF-3-jjm-DkZ~?}W4_ELMOWvmf>h0ef>20D>tsyI{pqKWF
zH@0f{Q&*FmcvNM`BomG@fiE)NXvpIihHdk`<Yi0gxh;7%CDWg5_q;^X^_*)$1KVsd
zT=@XA@r8{g)D0R}S7=-EIIz92vA>d{*iQ>>XD}jWm#QP|QLbjob0VqcW69Y$N{r85
z!P_Zl7&a`=wnf>qycb<i8NafOaFiqU#?63c;%s#{$L1`uIbBcEI?d)(u$G{81Oi8D
zuP?uB5LWdNI@MeFwyXXqp(b&cK<gz=g2CQnmqN2C>}ermd!6Y0+c{{aMghk19dl8z
z=|<Ezl2mWN73in8`AAzzXwQodcitlV5z_H%(-W>Wbi}7-p+BiO6D@<dT%0!DB8`33
z$K-j-wHKRBk(Fop=l3OMyKxB^xm2z<RVj1;Yr&M%B0trEH@E37!7??B|AII<N15%&
zlAQq))vrC6w0p@jrZM3@;7-kJKk$-{*5nj!te|%D%#Fd^b}3nAt<kWO{f<c3x#(p~
zNzG`uDe$i)hkVV3#Ib+&;!AO8)Qw@LFrfF&%?g&cgdYNv7a6=1SE)}_4h0$^;I|E*
zaFs)KJWoKgR79TyhbM$7hw|LizrR6OXZ&07+XR}Rwam$K<<E@6cWvQjenZWEjV+uQ
z;{2hr9S443&WAiwLo5g@Li^Z(SG(D#MYvhX@mQ%pr2Qd8HQB#Cy@X|m_GQ#NGecz#
zqD*K%J)O(vTl@S<GuJEs%M5f1c6_tB|4JtRd-I>97tzNpS33zgFLfJ$2@;PE5~qa{
zPYDubp9asr4?W)<%KQtTk0XtmoCS=^>}s~?&g^D+@l*4FHSE`zY5c;I`YtS)Z%`d_
zn{Kd(<UZqxdT%o$e!?)BTCeaC{6J?>;p~wrGnG90Hs7LX9F$o#U7m;@u+4PL`dst(
zD}=mp6ZtYlU)c7`gT74ROAC72!rhFgbvx(H5C?inLk?s;%Y{VYJ5RNj*eN9Ioo6*M
z1W};5gWFLA-vb3XNIf}7D(oZ*)6W!Q4$7lF(ux}eiwroZcsAw?%Dh5F0t1%4$r59U
zx{Ui2S7}k&SrUu4I)90*(u}usvg(!frIVOq$$=v#!m8z8)Rm`xN`j-`7!{RdT!4`V
z?H`E654K*u`i1${<w!t!q~T3I-~zTAvHr%hwq`YMq$1~cyzoiPpawZax9~gmmtV8L
zX7Q!=iem0Zs};7^m``$v_q#!3(}@+_PaywCn)jId$Py`K1u2r0w;|-kK;s4Z@Ify1
z$`<=B^8V&`RB9s=ojr9xN+MpKO-=O7AX#A6h59bH<nhJlW9wzSemin9zsWZ-WRsj+
zw=<xS)ICJw-I}LdIO|W|oD`-4^QW+Lx(Ms7I?=Z{ifwC`?-o1f^b(f)w1mr+?UyDT
ztTnqK)}dbz&3#A6t_C+ul+JLT{M0WIEp)a)tpKpEpd-`!GWosn-LiHkmRT?cQl1s0
zw5-bH$MUWrqbC@g^=kpdIZXfddtZKQ8AH;WahRV@4rKR}e?WFW;rx;l8?EfVi4x`^
z@nACVgYio0aU4IhZWOsC`>h>kbG=HExf0GY|E7<48!u6@1t8v+9xeOYbpYSRm!@S)
z$brLeUQ5L`JZ7(4Dm-=Idwk6Qm~X~e3-dj5U#R`XK}XW=IsdBYc`c`>U1+6&M-fK{
zh}GYo?L;jA6D|LYxHEeghtT2m+3Yr3_f;1xZSCI;vb==yEz5j)lCd9AJCb5<DJ(f#
zY77V3v&fj`;5sB=sunvJn)PDbehD~8*xX*?SWPNXP99~w_+5_XFT);8N~Xs8Y_JQb
z@kXl`&ZH!Hl?fwJO?!#kM%y0`Kge1y{<4?6S|*_iTaU<v;v0mEE5%Ku`H8WM?OqwV
z71%4&%sTdfymMztehqCFjV>CxSU>1%cFD|S(bU?Ck;bJIzz9@Ns^c4&0pMQv9CNV0
z+FIlcO3GE9fK;gDrb>jGH7WTfEvP!(o`m>3)B*>2><Ma61o~)p)N^82Z@}(vJf@l9
zU8=RVyT8rJzLB*=Sq&30q`n<Ko(}fNKRSkVaA(F{EC4ys`k!k2PiON9pb2)-#K}N?
zJ;vd5^EKf|?8H6SHL5x@#vzNgXaqZk=zuE=YB?v+!amC)Jd;g29h!n`%oA5TC3upQ
zUbZQy1CiR9f>Y?c#cq71Sq7k6o=mTsI1BD{{N26<_wh}E1!oapEjT@y{Q5u&d_tCB
z$IfpjS^i!^lKC6#<jYU6Xof;oW-jmvIP2%Z!*7CztE9*;ovFqhzRq#a79pqT*+)nk
z7mklsU6?^g;!z(L?$a`TLiH~}_MzaXd0VLdf7!wr)qiJN_20&qmK%f4r56=+ilFq-
z5Nm0)nRihsEqD6aQnh>yo)6+#dPeUVevZb!tsEv3L8}vR(N*Ni;$k}`#D6mUb~vfk
zi;tVq&eFDCm9tqimHxgL$y!zR71sCkSaRGHf$*WR8f>E~*3>UiW8%YS+CrVQMw;;8
z{Oajcx`H0yi81<7T{qfE+ZB{UhkR<z4b`)#YPxc{`j!XvaU7`XLm1flmMY7l3dto~
z-%36&Y!pareZ4xWZ*fo`XT_<$X1~5>RkWzWN(%MwC0idWdal0tSgZax8MeIyJ9)))
zbJzJ|LZFs7<8~R?4id3GU)y;w9$XlIv|y7QGY!rYJDT$;6L~DDs&!b<@abh=-OzZM
zG9`x3EVHxI$YXbDBo#M23VXCQJy*8mhNMJ7tj|kwMpfJ}Ti(%FqEU5M#^*G>Ly0qa
zj>cb(uWfpFaQur{vaTuqT6`07q3PYojJpKs6Jzl=HTfMC@5`a~uQi=s+zHvXv9fJA
z>V4dFTq{K)W$v1+YnJqB8*M3hN{~2Bi7Rd5T@K^Hq;#68=SFzQxd~sX`42NvPT&Hd
zo}G>kZ{nDi>24xA&r0d4=~*&|?d?7d`M9UJ@Da>Ehtx^vtT*p&R)@0YpL0}S_WZMc
z({`K#PkQ|(wzl1;3<Cz9M5iJR_7p9K7#!*@%ynA`y+QgjIY&l(;*NBWG|1QMCC4?1
z+|B&;oE_-zXuF+V{-W7wf<$uMOg6OL$LFTIgp|ssD3+6Yo5_y&5jV>UFFw3I2Vkq*
zAX#>wm?YV62sL7w^tc&Wb^+v(G5IME#oO{>idXZ$M(8y)iYy%6=cYQ_S=>a8fpX1$
zxn{lll&x}oY&j9!!bTBcZfaMRn^85rsZr#{)4!V<MNRdadPW*guo*=@xv3L1R}JQy
zq5x*xW+RwSo$SvyBQwsJB7mh<+k_nU#}U^Kn0l<dlJWZ6+pZjI#~Ztf@%|Pln%?|}
z88?Uovfj_<V0+sEwKnVF%K9%r6UfMef`=o5hx7c0)E^nIo?F%ipAOQvb1?!%M0b1x
z+K|(GIcYrIr*<dAyN~ioK2axXSK`ZN{dq{|=WLaK5yH)kbr$ne%)^(vx?a5?-7}lv
zHkYBv+{i=AD?vY=RX?uw%iK*FyMC0B*AmwCtFFR7V%aNR-L!?YhDfb+l-Jvrv;?t9
zk|DmWWNdZ#-k={zvfCxes;+0|h1jP3jk1F+r~2~x0V}U>akW6L)zh{jum5Z1p{R|D
zE}kZ>G(D@Lw4m<k1b;_@x{1$HBJ||apn@!kzc?6IsjCP&w&QAc<4Q~D5<V}iy3VrV
zzFVznW>C`%H@4}1O%{NQDy)bRg#KwE)Ihh|2H!CIVoTSb4vueVdb{kc8_t+m8DA86
z%raWjBJ+gR?p00n-}iM+L$C`jX39BI2Joun_>#zDRSh>JFXcm7TiI8LY}woh@QNGi
zCJk<?`;!ED(*kl`H)?Qv-0hOwn>aP`_NIlTT_1lvzE&c7p)VLK6%VpT0{kR<?!dU|
zBYqW#Wh<*sqH(kk@;4)s*ND;+Rnxpg?_m$Om&tTHkjXn+@N7jUBhI%jkRN3_Arh$G
zXj=$71oKr{wm_DD-TLXR91GKDK$EvoUsKg2$TT-K0%??<FZ%McJw+lbcD90{$2C`E
zWJTMhR#t4!7gG<<((d5MjEkqC5Na<(8Pwqd$1WQ%aL7Ya2K^=A!^!|ag_XhWA~<`$
z2Vq)H8QgB3_&lQsZl*oa@gL6y3dn`Z_yTh$-v>8cJaM6=iQLqSRuO3YTa)7myX%@V
zdf-SmfV?;ZI2(j2{-*}2wQvhlty=vxR&G<H2y9_hLvHFQflc}07n?uKFUntL^NR}M
zn%vX^bqt}8#75kU=&iWFfCpcPok|V>2C08wdJ%Dv>SSg5M1cjl{-TrXJ6x8L>wo4M
zJyv@;{l5pGn1TEk2)`?W2Yb*nL7u536<Mj87S!$@h!3qb;*1a{mg`0WVPy8jU^S>l
zhilF>2jDU0VVc=nQzfz-6UwrK#=V^;O3>=vU&$w73w^c57P{Sh#zRZ9YD`_oz?J>Y
zi(EaOsK>=uH@;L`fQYQam^-|znWHv{$MceT@87L?T{lujW%RYG>LCA|ADI8{ADDkq
zI{%dH{>*FlE)$UWGA}WqP_m?ce*U~Efj>w6|DpMxDguwGm(C2}<=*GEd;d?n_j}Us
z(t2@SIF!#31pMpw*x!L{`TE+P+!8hMsQqhum^zTc!Gc|4$#ZOn6&|W{P+j~5=aF!<
z%{SCg5Ym^pHVM}~?Of2Q74Ee6tnMcaxqup|z)U@hUT45%O<fMP<Iu@`i)Ubw`HDP2
zS1YFI3&mc>xypq;mp|)8hP;gLtadw74bjlgBCk)&Oz3HXH|cBrOo;FRkS0pyOpH{|
z6VSQoXw;NSDr{u~eV+~T=31}mhCDC6?j)%PTYY15<fexy=FM4=$2xQ3eyv3u*i9}T
zulzmQ+zvsZ`_7_r7%FL3^A&8zd6zpxX+}+@qiN?U6VapZ)$U=IxlFy4fs6&Gkt;Y%
zhGiN>mDqd_X6Gx+&c~dAu=PmeMU2ynuZ;Ei!1a7nr`j>ZwRWyOi2rkIap119H~27w
z#eeHe{jU4Q9Am4GG!CPJmN#sAV)y&VqSkwu1N_hZ_^jas1$&sD{uiR=?3b2*Q1?X?
zY<Vk#GC=yR#aD6=gd++&Om}0IFRVS!OL8cyv}O5=wRp}Hzd}Ns)$IhqU2*0_=!QeQ
z_y8|{Y=P&87ccCl6=;h)CkIq6Tqcf`Sk$sK;3to0>;bkF=wn9<irvg9mL_RA!R1vH
zUf5-48XwT|r%_WPGX@njb+ZNd*uuu0^)2-w5)^;x{GATz{lZ^=C%r#(dN#ew)%OZO
zg5WF*Zoxp7u07U)8zw@8Q8s_Xi;JW6-$cd^rpr9!)*Xr($!Y2qOB5HA6pNpefAqpG
z`!S-yVXmtIZtnDA&5N{&;MJDhrXaDs_$OsAnRBd!7x2797dCMWf<p;iS#9>WR+X)~
z<zAG_%HN<|)`5Go;xv}_+3FHY{-(elM^xCJ`(yF_Y}fA+|1ds$Wbt)o1TgU8+N-d@
ztKSeAdo2Iw*7b|b7@pH4%n!=vnlBC>%<lgw^xxmQ5<Y*Kc>d>GI5oldhugtO4gDU$
z;>9~_lW%%F9ErA4Lpp`tD>atbr)7%Sg<&wR9W<`p`4W~mzI^kM1HoAN_(!IV+46Dt
zZ#s~V!CH~y1ACzNQCUz@EF`NzMRK^K6^H}(xK53QsYt`^vUw8lL0mFJK^C|=$WxW0
zV5*7HCt<t^{KHvVl}`$`5JF=i$4a-Hl!ix=^5w5b8s-TFiRxBW7>i#_UPX++a0+Ig
zp^x!#D`19JDl@9gtN*M>+$MpgSTFW6)tQ)V<cn8Smv4?nhP)=)?5;!8UIW;OT`Fd4
zPAd^O<Hp9ySB&q%MPipQ92y;$qiB0;<|z7c_o$71_%rR;O%SWiJm^m?D69Gg2_H`5
z<5pP1AcNn;@YYQC1)C~*y(l#VFxdG%&Tk#GVCoV4w(s5m#Imm46Ogsc_uwhQKHmC_
zMyi6g!U2#O`2}mWinUzFQfJ#{+<pMC1X%L5v!W;k-L#n(M`SU#`Xl_aCCMEK^rd6;
zIUnbFx!<O}g2Px3tY3*+oOn>hO;u#k$5qfre5p6b9Hrf1%QlEa`XfIh$v9p?+<=@Q
z$JFHOmJ^*kEPpT3a6E()#m5w7lm*og1%x}Ce8V6kp}SIq6_@N|3_k6E>KMWQ{UgZ}
z_!^71RL8%pW~nas_~fvIa-wrS?pmF^ysbKSL5w|CiE8u!<rztg6j3Gdhq3aF&<DzR
znFymiMo&v@AXto+3pGB_MZDPqX9xkh&ySQwTyXkId?n;fiMoBf_+kZbOw?k94g!2Y
z)TmXL3H)m#Y5We9ibOYhk?8tpB>Gu(B)T>_=c7C(#s$^6^QEGq<?AAi7?<oqSQTmb
zIa5^yG435DsyEW`B2y$$a-4nD$6^L)04l^e`^6Sq5Z_TEzMRxM*5t0R;PfF`1tfv1
zqrx?*C&-bZmuUP&PivT;RmWdT$vYVM+v8tU$C*qHF7_VCt^1i*Z#r3HV`YHW^5v0+
zo9)z1vGZy`|JA5Og+j^do8jYyk%j~8ToEb}Jo$VA`E>jP@FR(%n+7qQU*|8Z<fN!Y
zl~ydUiOidmI!fobjqq~kc*{0MISOt7CtOEvIvHeF$3JbkG>G>|B7r3@G~tS&Owq;C
zC{8z{hebbK)AR#XC5NZw4}>a-k*3vHIZM5f<XYhmT-Qj9jDWNd{6I*wVhLy+huY_J
zYI{y~`8SA_S42I%VsW-ifuC!T&cX}gWgiSAE%rLXU@iD&Rs0k{p1?l{qjh{f)>`~3
zb#Kmxc@m_t+%Kdi%bPh1+va&E(vYt)qyKi3_KY+<NG<5P7^kouiDpt~vehVLK$ses
zgzo`~1BXP9lvNd6v?#^Y#(lT##@~tMu464wlBmXP{K`q98cCu8Cy7Q{NwnulP7+;}
zmPC=p#TwJ3`Az(ULQvE44|#-r;`h;BlFBFg6{I^tP8}wXqKivms~X3y4?$SLZQ6?@
zfi*Jqtj)oA7NyE<=No8|rsL~VDMV*Rf1JK6qG?Su{<b)*I`>Nsv5Un&Hv2HD)bY+$
zK>L{FsXchp_@l`K_eg1nfO(1mhKwid_q0Bgqiuwh30dOKL)?|Gk2JWjrlCa&i4)lu
z`>atKOO7oSXArBVKqdk{TlSz{W{%a#OA5@58)S?wEE4GvcBOdKXyVKb)rm{<tCOQz
zqlruScQ|{iWD5AxhVELn-SJeb!uIsy%Y7O%#2bve-$rRp0R8l1vDIj|iYKkArj>P(
zh9x4W_=jvL(8jeQb)&iuwhj^H)$zC0MMA@`Y>m$OCNIhviGp(F!md!@TalZd<|Udu
zb4AOs8U8DR|7s`50^|BtW{D>i@r;1PHXCHu2d?zu*R*N^8Q&4J*jDJ(w-ha;s`^#A
zwZL;wE7!cj$II*fMfnY4C`66|bi0T*;_sSa4A<t-=lDs5&IBIR+VZfYr*!k7rsJp4
z>hhL^^Q$vd8#P~@*wY)pDTa}T9MuhM>;X1*zpCAm97EC+MjH0Es5k}Tc}O&V7W0vz
z1p-0C4y4Bb7NfMA2~hzKMA%r!iz2JPBMtjOPquPl*K*JM$HS}^&5gV8(eF3ydm(m8
zQ5VX6M2MGse(5YTA0PpM)TVTK5cvgLcy|)U+F>$`JR#F=z1I+KRzwD1uyAR7tCk}%
zzs{{aUJM7Bycxq9{n*+ZqKL$v8a)Q?t~oI_9TcSqX3EkZhY<O|_QW)eMRC-8LIb+p
ztF*ESiM-Wn=8?r5SMweGHffa{YBDC##eh1*cW!4rGhs1g36o)OU=9oyx#?cHfH^w4
z6n7m#c$9`9iTj^TO%u^cAWh~7%7;2R*?H^=uO6A7OAt;haRR~D=aYj-B&?S?oAPj|
z;s}y8z+MbCJTCi2G9{Y(T(vYd<6@%BcC$CkPOVy9BOBtp=}w=lN?dX4kxY0-V*tE_
z&6rQuh&J>}N3bX&O^>Kd{9Ub(KEp>PJ!-3w9UK3|^saW2mmaGBk|!DZA>am0jlFwS
ze#>3~Us`v9#6Yxs{`hWOS&eO5CR(gcj8eC&<Lg4|D-um`3B$W)_Ao0|Uas!*4Ti|I
zk1NayGY59DXLMG_H%BqETZ(Ahm%j=MpP+UEannyZs;z}mhn!i3?M-uO^ILPBQ(P-A
z5&vavCUfmf#lEAs&n;Fo`xU#`E#>RY9x)=2l0%A8y=lPL4`Mnp1Ksu2tj<U;P>*+j
zLg{@fxJ#h(so-gzHP0Y_TdFfeXX(MhT2bJMQ&xD1<Bis6g&K<_Gt`C>XYeD`7Dbxy
zhtiZt!t6_q#^$RyURdErP*|b4{8&7m7}T}_koJ`ssT2iP$}?dN`Wh<;mXjWH*N#&V
z85la>TP846&AFu2DC1UUC6R{1q)R|~MKsZcnJ!kAa&4#vTFfng?s`Qb`GT5Trj^Ar
zzpzAR<Mwo>hwQa?ER~^6#jQ2UHl~_2D_%SRz(E4cZOVwxqIHqKGK^M$m?r^e&sj2W
zf8hp5g=^fH#@I1+VN8NU1!Ijg{Myp4I$FQV3BQkyA<C$Jb5Z2xpNUq8P?N<F5`4DE
zr1@g`kn9tG7ZCx3Crd1geNS_zW0@r@HZxULY~IM|GUgM_)i?t1c(DyWw;**NwL}x#
zCv-B(0uV?#3codBnJDe&;BpVym2af-jIrnxA&j;V!O-G1IDsS^5-+XYIwUqleMn%y
zM;dZLf3$v0ZaWfql}u4~21XM@z%d6N9|u&dAXlHIZiAlMUP^S=5M#&4BYlW}ON}7A
zVCz7-2p&j1Kucs{kc!P4liHhi)#wbUn$%)4nRh@#$iNBwp&dCxkAqRIWKSb1%Ncre
zc5Tup@;A-w0z5<a&$st?ryhe?d^zR(QysOxng+h3{Rp4K-VfPpoYYN(EA*G8F65e!
z<3dIN?Az62QRYZtJ=t&8wX8vqb`R=#R=G?2#8114v}pVzyDMA1{!5TYULx=5Tb!<k
z7HcZdDus6EJ|D*GgMVhGW<W|`>?PfvV|Uz%X7yX_L|!=5*1Pkq6es9zq{@mQOH9lv
z;=Jd7uP;y9Bg#-HD2@s;SOIn-nq_b=Bw5}HWwIq3Y)11h&=-rpKWmfNG1;$1qe>3E
zn7fJ0D<`^z5tt}-3{y(s!HX|Q9Vah)W_oD-Ni`#|eYfzHncqQR-}Rj(*0w2u{VZ?w
zNw~r5SB(xK#%^`2|N3W<$+ua1x%cw~IdBE$pWK!3;FE6OI-ULwUL^%kkt1ptk*3B-
zW7j-}#`d?g$68wo!v3~ozX#5xd>6{M%vF4Azu)WqOAc0U`Hy|SM(-c#{T};%hTdP%
z`z`i;u-@nE{W|-8nBE`ZJ*Bm@K;I_&9q>5mQ<Z*7R{FXBA-zHAM`fjdat`SiDSf}J
z^hHV^ru0Zw`n^g&Lg@sLgxZ~;^kSvImz93D(z`2tQC9jOrGL4S^nYcguS}5sw$lHY
zmHx)bq|aCSgsk+Zl>U&?aVCcP^V8v^->P)DCzO8pXwt7$`r%pWdn^51rSF-Q{>UiO
zV@mIqmA+&c>3x;H29cJ5|K%FjZc2YWEB%SHN$;Zcr?b)zQ~IhD>ADXsQ_poue?{px
zlU^BLUOC6?iP_v#0*%&xeNN>1>G0E}uRq1Uj9h;KiIwqp+!Cnw)Ul^t-~Y7*7x!hl
zHtFlgemOZ~4^Ev*mcU-B-$eN<ReqBze<&X(ZBow2^`$(^c@|$*S--H0xBe53_t@T>
zwDB+8|2@_oYEUpJ*|&7oK}e~=hzE8a$UKX;Ro1U&(#s6ebVsrchDW+rMrJJbI?JR<
z^m{tVB8OX_qN~yKU){5@<pjc0=d9fK!cFJ)*${s({_^1X+T7;B$qG)^?$ULL{u|uX
zrDtXGWL4gY$4PB=I8RM#vmKwTL!s*Dl{W2CIdgmZQ}z;;L$b$>$7_kZ%`|{}d%QX#
z5=n*c#!Ikr!a<MwOBkVNN$~h0HxN6y7FOW@!Qv@J%qkHZw}1=1o?)?@;F5eA$T1VK
zrKFro)5^5M+8#|uyL2XyYl!#98z3oBWWQa+x6~q%9D=l^8W`4g&9y)T8UY9$JS_|t
z(@r)Fh4sn*g$e=Y<wPQ81Z3`8$w7c@w><>0rpjD<<S=q;{af{4Zx@-Y`!{Txm3VTu
z#GAv}h%MSn-D@cxKLxIk3cMiLFsWBbbE6Qw)i3OnS}FRhU)VYIJ`W~$P)4Nx14C3I
zUE=*Qd3gi+v-*}zYcJ^3FWhQnODcywinmZ}iLajIJ%PmGF9&P*%}F2Tv~|sy^nw0I
zZet=c>(DVn_%y6Ny*^8lvoK(eB$SG9YJVd6sHKGcT^d$f(JWPkY5yn$@Sf@Ef4~N|
zA6V&lG{=0+8DjRP;8J^4Y>^gx5_t=5=$sSTiGvm*J7MQD<Lq9{f4<YP)T&DT$9XHS
zN#(VfkL>Sy^ESVthLH1o2RYBP^@UkbnNflFqjCdvcyH+WFFd>T6+jnDUfN%_M*Lf*
z11FjDutf0qc?4J^&6p|QH-?|;@=t2_h{eB<9%C5w^G2H+E3hca->zd;`KYoiY(%@|
z<g!M(#=mSQ^&qw(TO`hga((-3%l6ih=X<rc+qT;)ckQhO1b%y*5BH*@k99V*pT#l)
z>-~+QiGxFEZ?LDa>(0d6v|YJ*WvBWNI#0xLGx;1T=0sgZG`<?ci}k>p?#3+ibY9Nn
zoxAF3fyZS&c{w)sdE_SAyke#}olLorn>ty|uI!8Tv3{Y)^rx#Vs#l|lL-gB<MfF-_
zMx$De^yyXHY&zAi=zIi?U+FhKv(Q!7IdaoU>DJ@6;sxNUHjhRx#uwSX^FIyB7r64B
zA~zkHE`POO9<s^<uMnX+vz*&db!X0II`v+<QxJWoSGTJQKr%B^E7zs^FxjzvtxO@;
z#FCE`H>(TC`BKx0^@)XlU@>><GQYIOKKwBU>y_vjd(?BJ&qKCAq|c4^XlwHg6DqCO
z^-q^MkYxjepxY_J?46J${=B_734%j1BsV85$;lC)K547)BC}G52KAbUsL%F6V_oDb
z)hw*SGNwCj8rMTmDH|Qu1Z8oQL^$tosm9})uvME=snQmdZ1JeF?5k0jwquyrmc7Jo
zsW!ltlb>VC$S28DW9Jt(U*>BW&V;s6bbpN5PDAGYlj}Q6kjO0n|J&owKa?UGsh)C{
z*F_)cZG?tAMmLye_Tejkjnc2mPV>BYL-9m)KfPr?($h_3Z9^liph>qu)eXfDD7)HG
zPC6DUtK5h#ciw%bO3jTYk`XA^G8xTm3yx0>(s4e~J2bm%97)L987YO4K05G7JB@W8
zlfF-(rc52m#Poyj@~fg-+&TU<)t4)`p0vFsHR$$K*L_6y7}`MkTAABF((@5J6f<F&
zl>C#$(`dw8?!Rb|<~;vpqFzSWmuri!GRNr&(Lcmb8DWm_pGKNr@FdB-zx`cr_O`$5
zPf`vjw#XFdiJ4lyO>v?pG!<B>A*Zhc2ssfnZ-_!Th33qGSOXuUN7MRH=|yIU(mx}8
z3T6!VRUr|p?ExiqI6_u*#+Ld+M!cmC$D9cCY1vCp!@}636cKMLd+Ju3L1V`(dK&qj
zV|s<ZKeFZbFR|Ow<yU5XXIR*ye3yW<^?$r6W3Rfa&dR=Ot0}czjxWgWt;Fl~cT>Rm
z3-S8ffOD9O5~+u6O@l=%RXI<orvR7_qW;3vZ!`Va%6BLBQeDWDv4?CvLB!R=X5G{s
zuJHyEQHhBwz0{%M{$K2+E^qh#(Z<mD_-@*Lzm9ja172*Wrst)ez>gd}KNfoaum3DF
z=#fzRM_8vp{xv*j`tMe}ZD<Q}W2ca+!}vjwCGviGo#yx)nAgpV@2uO%;;j1By&%!N
zW(mwwmCr4@Owh04{8FyJWfeEiyvCAM@VSuBRDDemp9}SQZz|2JcXwSjJPC}_*SHd9
z!g&j%0M!y!I7+ztIzFhBwX?oWgZh%#pVjdG0xR`lF%PmgrK)ROjvKUBL3<T^_h_%e
zb!TX(JDgQpUBhPt)Sw#oxrEQOF|ujUNXe*BBpl1SifIYYuWehY(@-JSKP$I$TWe&_
z(C(=tc`_IEXXF|1I{WDqEc=tL?wk`kd6wbr97{arNgpQuSfd}Dw)tMXS8=R-ZKOeK
zvZ2FJo=`LV?U_CRQ!Z)fxN_oneE&8=yE?(b7S3+M&FXrlCQt!(ShRk=5^68Sn4o5R
zUdnQ>%y@lFZTh^FF-)vHdrlDtYB5i-#}xf%5+-P)th?6wKDl-5y9UWoPATe9YQNmd
zm*Bh<wPHT+hZLdh+a!N%M;brkoO{?AMn=1HEYHIJIt+C&8Dr&rxr$35vmlQprVA7b
zJ=w!05Q>zTIKskZp-4`L-rh_>`_)TK>CY!h*%<f$XrE;RZMFrOfR;Y-Xo?T7<LK4*
zhL`cmbdb<C;C0<Z!AtA)?ctSUzx;|XS@4?EcWdyvQ)9@6*X?B75?&g4dw5l3!|N;x
z`tbUH*n1Q3s;YDUKY;{-lAfSoW1V7+N*qC?5)ny+$Ub_gSSMOV@%Bn-olsAN7Ki9b
zAg9OO(kj+orKN*)tm1@-0%1x(E221{R*fV37?G$IPz(8gzVF`qoD&k%{%-H@xxeT6
z=TXkuYaZV9PU~IoT2Q1UcHbUMt8NB3oS`tSy0J9OvT?fOxR0-=m)@A;DVJCQjJ}a2
zE9p}KZTV*#W__vG1uTCy>mP)YR5zbE2&!|<K=1ocLR@}Nl+mF~{{G5j&|3dJ$#I$d
z70NH`Y{|i;0asQczf1aLLzbj}_NA#eP&36ko=wy5AXGpJwJ)q!t^IfMmH&_JzoV|%
z+5Ve%wNkq5zsrv54xYN%e-0VD!qeB-e-R4)uibw=Y*xD4e|1-BzPjwc>cQR3S2z2w
zl#E^H>uc=4HwI<q>&y1v0?U@O|5_mRUH9L!Y}2o^|6XqX*Y@9C%KtyT{|cYFJhT7I
z<T&vG=mkTWfCSF;Ng&i$$3yk`prI`J%W<!k!Q_GdGQj7LLX0}&^O>FLAO0i*`WM^3
z>1*Nh2Y)80ncf}!vy_!x=-&~rti}(BP>T|#Ce8XXI@m;kSY<rdxm={-Pzs|}33uae
zPU;a+LHJiuK$jv;s=+6IkNCtdoU?76s9&}xS?UKr5X{%f%<sQ>zOIMCi8Sb1X&3ru
zDc6zGnf@LAW%DJUH9OG3SrpjWe1*GF^Cc=6%+~~pIH^S;@ssNSBYsf56O`mGGJ<y|
z8hS}um$1|Qjkjs>L~6;Qe!G+1_$kpp)-3+>fS5fZx|Shk=Pb`1RQeVYjZgFGCgmda
z`>KFX2&0_{VOi(0hkY5*%Pwzv!q?2l)eE0E=9De&CT=y?nmXNg5&bN@@sj_?_V33K
z{O<Sf6iex{e+M6$yL4Y;|K?F(XYja_{rmJGg2(@r`*&IG&h~GWrF7Z9`HUqCN;}S1
zH~V*O#m?tzC;N8}MRu}(uUAsAe{UW_nEiX7_OvmsUH0#3Wd9ob_qHXugZkI@@5Ih!
z|5xna-1BIMTP*0cp}euw)RNSgQuyt@vEft8BK3!ZoE>*sel*!KHu-V2Ibe$}0tM@@
zj^~(bl3NoSWF&@}d0$L|K9=84_vp2ps#P?j?!IP}FOSwVG+`%Aq3VQ}&u!e%sB71~
zCCd;+Vi*!ju1|jEtZR)Puv;|M2P?cjcv;HFFJXS|@&)uF4;Q`bT4j{BYfYZB<vHho
zGxF-LD(@@zG|L>!d8-p0Bje+%6D|2QNX&Tl?evg_vt=y~L4z9O_o+MPJ0hBRzlbx+
z>IEZdg6;&7X(GKPiu12A%&NxdrIb%8;ETqZNqpx9!gJGwO}=z_ZS2rvPI3#{rDd#_
z`&1`<IEE)<IM$C`Z>|M1*~!N<E6ZZ;<dM;(8+&26kHPXLZdQALI1<oHNwbM$hJa3$
zA7XNpTm~&;UA$)xlBdpBPIiwzek`}L>o1_OSn{Q4(i{9-wLP!Ww#O{@BxZQkX!i#_
zm_=Q{+ms(4oqTpI=5onZnpmua_FwMwS*nRTpcm$myvBk?y;tJ%k+sXB2|XFoTx1JD
ztp*3GqxK05kI53Lf19~MQRMVl=GPFcP0g}ujK+}OdfBB0)C)kzvRDOoP&QvP$$yxy
zZNQ{iiRJm#i7!URPpl@SRgXtUCtt>%DaMM^V;27{Cl%4Ehn?(=d92^b-ju9e#7Rog
z;8#^k?LsHfQdGk;F#?}Q?H-N{jq^VpjS9NdyLO<k2Lhx#bd`-w`dZ@>VoJ5ncFI~U
z&EF0{1fIYTpDIq{D-g`G#U{j`j`ViXOy0)1kYjV<r)c<OOC*hal&k$wJ?xa!RJ0~j
zOX5RPC*YNrvmj<L(OjZONqFAI9$JQ%#0SL*)*~l5wJ7smefsex+5BYuAEK`de3|l=
zj_<oheRGnFQ1^BAUBmh0cTKX$(UO%d5nB@reEJ*-x_s;+S~d?~E#?nW9RUJ4;;Jw&
zP5|F0&OP2c03Rv?EE|dN&MXI$qr3iYlBV6lO_O@&Eorray}VZr+uqTfC-q;`k}~3&
ztA=$nS54_?uG857b<bE&hZl^bACsz+Gdw<`?hM%)khInRlBd0B%ZygO^jF<7;{!i9
zlGyejmY|Ga^+mAArqF^VL6Bml7OYdTvTO!5rau}SM3BT``~($Mg^4XQoFGU=<1A<u
zjn8J76=rCjAukg7)y*^JDT?2<!EYYpwEpRrKaLb;Xn6Xa68zQ$zjO85GNUBOO210O
zQuIF?D}f-aOYE%bl1QLmF&><?)(J>k8XEagPH!~4irx)M75))3mT2l4j8Cs%yP~mW
z{k|_1FP6)NuU)Lr92GMQb0O}*^eQz*roQ#x0YDwMVIIj^U(>A93Lt8&nC_;5<9D9u
zwM<?yz<`HV>y4WzqHn@(sf2p%R9Zx&er|6h1I@lb>*Q~%#3KvTgW->;=1Bb#nl&%!
z!wc`d(6rHV6joJ~A7%t)IY}$yeF300HpoIpPkn6D$*bG2md|F33_-n>7)^|mY~Z9p
z{Z$^a+Ry_ZXURE(Z2N<c8>&e5uF)<CT$QF~53|ICRxQePXW`_HYA;oI&_#iI`vQjM
zq#nPMEQu5PLSDH%!+@?N+B=_-#*z;yO}Aa3L>s(0HhO=)QnS@bwiz=Vc9)Z@+3F~I
z_fZqMM=*uE0lbcO+QeYD+D4|c^eh+>{)g3tlP8mVA~wssm!y3dp6BQt9Bv=%(R{){
zX7hVk^Pg6BIW$0TXK}FsTR)yuiT|{2`oO&SI=o86a3_J;OJN+R279q)*q8l*F!*p2
zKi=U7h|5ARjRh*|;=+AcuDW4)k@^@DlNi-EUn<DGauxoyc>a|7KA@}7<hdp3-<tBS
zPV19Ol&F<dN|%CS?tvGbrR&{CS4Ee;hhh4fSmj6Y<D8^BnK^0O9em4lRd@7X5vzPe
zQSq{9WbuiuSe{Odx>pt**HQaMkW%}e&#wd!6muUE2ijt-sa@yLM+D_Sq_cE;PwxU~
zGYfA~soM=WlMQMCYxCNnZQrLd|GbC#6wQU*w^SsN(7qhzBrG#C5*CUd1Xw7|8UA<`
zKeg<N*(zzb)3$j3VB@r{!u~E*_}FBDe8V&ny?sEugMn}5^z!(yYktE=%(gt{X*s@o
zdt!{XMgT<?8JR!R4)ih>12^gTJ#|WzllTmagWu`PJ<GwrVF{pv_v&^Uu$x4iyb3YZ
zsluaZ0>y--%X-{A^)ev!3xuN|BDG<>vmO>?Z}LvD-woavek1c2S*K41LL9g7;TN^r
z2JExg$0TM^pnskuZP>Doj=DwK!tB<}6U;Srz?$+$D#p%I)-~)_ISeDu-y;3Yp<<+>
zW-h_;7R%#Ret+0M+N#1Z1hqG`!YvKsEL1m=3g<-{K4ulzGjnM+Hhk#lA7S+bij&z2
z*+`g0LcAA`JMf;z5?M4fn^!maZY~X~^j<$uQ#D&9u*TxChXMbDG&W<;MOR9QjmjP9
zBV|fYa{Wn`?(K;ait_8sp^<X3@62amq@kQdR#C-d@Lz%b+SUT<8AeB#_d0ddohM>S
zq~;c2rMm+Oc>@mC3UHg-$6IG=(qSas+PRKfQ-`a@>ND}|sG4MdmcqJRbv`nlAEbSz
zjlekOSF<tBdr5*WMi+?{W!~8a4U6tn0Xyq^e;*}sK51jGnFs$HFE(onY&Pga$Jzku
zseh|V9r}m?mun|J?5+Ybc?CY9u$0tal~2!8)xX9Z1fYP+IW1tyT$MA)<5U;#^9?V&
z<pOPx9^tD3pwanO8*YQ?3vVDjqT%=kAl4su(KI5wc@Jo{7C8#n*QxGBngDNsVfjUk
zg#97*W{6H_w`#SvI1Sc`&dz)SyrK)@&#*S$LZ4<#R&9y8aXvZ-o(5Z^#q)7O-E@E?
zwLs~?!B_loyJh}4qsA_T%PO;2hPH?O`g0%!Ay@$CgJYie$~@G$D-45zhuN_U)N*PO
z(X7*E?{2Q8M|EZIovp<Cv@uwEpMKkXJdf~~-bjuBFK6iUtstHBcS#8Grx7=!YNI&D
z_ELK>S1k!qHT|WJu(NHS9$-}3uz_#Oy&H$xpp`HF)29%Yuz+y{r>SrH<v@paMQe-q
z79gEo#vqKo;3pJv3LDe*VDJrj4Ja+vXv3&Stu)+BPt*D*l-rUws%z<AT2vKt{fNQ7
zyXY28Q*zEMe!QE}(Lri$&!;!&z(0fE?f!|kQgqt96Z3|eu9s!fQ;P<whNH<~*Du3%
zeb_<-^(vU5^yynvrYZy6^#`zGZA-!Ty#wsqF>V@$vFzIPB%7(Tob&|Q0qpXv>{)&p
z`pWQJ|I{F?v;R9Y@h$1ca^lxz;+Ln}2W88y%!zN#39rw|GnepJ-JdwuZp$E9Wz1AM
z-Jdvxdw@IKU-;MLnQMf{QahDr&MW)c@=SkbF)Po^_@<A;uDOcc=N8@pKb(<gcA?V$
zU!M8jCeOs#p}suxT(pZkvmqveAbF<ve1&!@&%7@7{9lo0M(ig5{0e!-$(3iO4^WBD
z^2{k&dFJous)N0DF3%hapl0Qn+yA}t%zbP~=+7Ww%WmbF*}gn;=6L}dU);Ssv#;ct
zUop}h<(a2#^#4|Q=9OupkpC`u<|-~-AkPfNxlAa}JWCjPX1~eEGZXgcEYFN(xlEq<
z2ORT%N1k~okY{c?DOaA^9IM<cdB$Xj|4Dgfw#k|Ql037UaPR-IJW~!94dj{h*#dcA
zo*4_`9jy0<ydi|frY^wA$$_iLfW3Nr0L(VDBrD2T87EbIhh>2KHEBicDgYcy-Y%{V
zVNe{ix9$$(S-q$D#X@kq_Z&+JZr|%kU*cfVs~#YTAhFt{4<gPZB1iBb_o4?O)4pt>
zOs%2pu^OzAkv%GE>i#=|$49w>$AM4%AQU`q7grYw9=(IP*qP+<pSF_&$%7Y@hYz@(
zigF|muwX{)I0g==tJq<_=pzjW`kZYYa|t7*xeTO?SOxaPCTQSjUQ9bx!V`3OcY>mP
zVPlv+)0$|sx%Tc%O&xNEa-D{hq2#g2Rv6ZHLI@hZ#DVQ`gP11fbS^)9E~?_!n+UQE
z#g2D?ADGxXmCeMCEBYys_X$gtkSlh;Dr2~`Q?cVtM=Z999h*GY$LHz}gWbvxmlY!Q
z&CEeY?9fXa-H9Fho7OB8JLsHsdbo8oD|%>PT}6+fx7%<W61_fg)$fZQBgB@Zazu|C
zN^E$mG7MFuLApx%Gmf&|_z|t#9;t7U+;M_c<*!#Kxr6bEo9|BUm?zDm<PIvy${nRl
zQfIj%c($OE-0>fOhOJ$`bGhTFd4b$<oFssq$sOlB0CQerynmhKhm72D0PCFo3tW3w
zx#L!{d%H)Ns2sUthA%&u9!he@NHNjUz5T?OI}!|4nA{Q;0Of()QAx7t?ePn-$-7v*
z`buUx<nO<(-0>kjP3xbK{}1Gj4Hi|!>~|-3?7Lfz+~Fa2%u*-DYeSi^@%KnWjdjLU
zrE4Ls>Hn}94*^K>#p-Vmy<$iC;#kQStG&M!`&_@Ig!hryed(bk{Y56cJe`lk8cG%A
zL0I{JmlL0t6YiT6uFMHf55oR)8QnRb-2UbA!hUo%$Cx&2&zCWz-RU#*i6vWso*ww*
zhft&aI!K#7x$U?kc<Pgw<9HM2O)0%nGuz3>(dRU8f98#0YO7n+0-XqWyDB!N8wf33
zlxWXb4d<am`vN7}%Y4P$OS!f)GBW=+S^Igr@Z;;-&LZpQWL>O+i5ADHd?kJ$glA1P
z?8l$2)uEsBsu}FZ7vo?*uIVdyFh4;{-|87@SX`hcbO(~FBja>YpJS|nbQ5^shLwB=
z5HW~vG+kVRQGnknu-VAWb$$iLDaAXoFrR|B3)<wB6VwU%-yp+!>*xky2)+*ki`X9I
zgURv!2|kj`1l_>#HVHW;m!Z9m{NfHOblj5*^t?}EYf)sz`}8%+O&maXYmdl`SJn08
zn~{OV-{;^A%krh^9{%g?b?Qr`q@ZWy^rex01wHIax8CuF2XGRt|E%7;qG$Z%Psowr
zpW;I(j=6F^xj!NM_0h_|U4ygpl7bWkdvNOLT77=%&*cpyLh9%t1%83iM-|9J%OD3w
zFvm#0I=#rd!?@JFh^pO`N&J>S@4Ak+m)R|OC4Cq!dD?WQIMD8B=niQAm1@xY<1VD?
z-iCSW$V|TiwjePaw)SzBzSq-RRivE$^_U#IeibtmyjhgBJK`X)PSqPwnGW~PzLAEv
zfaW{VAO6lr!I(1iT#@?E1mY=7>~Y$&k#K6%Rd@D?)R(GS_wjjDvhZYOafe1u2DW-I
z8m+Wk$nb7qvg1+UP3#y-J607V|8f1Lwp~lpjv+{{Np?5~U^Be*t$|J-j<FYd$0%c$
zeenzHNeJxr<xWMp%rz`0#ZAWKdQt9Ef2hs`S9VC77{K#n!E;~@Ph*_VPShk$(KILk
z{|{8xm)Ggn0NcM7q%Vi~SqkvyjpNF$V0bd43Sn4aJB%%@mEyu}Tu0U`&F7AnujPer
zVg7#cC(7ghGz-mRG}HKQ^MSciUtI4RSbPIgLKc|uPg=qyxlVw*3V6Xv0zlrpyk~r@
zV6g$S<dWj^DVht&ILf`|2qnAVhOUt8R4(@+d1w}rbqfrCC>y*QmP(L2h2F=9iGB&L
zpMjj{jhzs#Eww9L59>v?5Ux2VVkvUq`pMp^B#vhkC1e&LyClPpD-QsBQ=!k3opl=&
z=w{5w<gaz+Uzy89Yrfqshn{`tIiPg`>`@T7^!9+>y2T#=l?!jPi68puv4#XU3s8Sb
z2QvCMj-d7*uC(G4_*r<Y(3Cs$LyH>moU-4|?#iB*lX3s!$T-KZcaC50@04+W>Xp8&
z1D_UnS(`#fgb(yB&!@?C>7%S~breO9da0BCcJ>?R&`x?p_Iq%Xmh=}$lRm!Ys~{1g
zD3&u{2Sx9v<?h(e`a`m^PFn8<0gT}?Wn`Ue)8@^DzUZD)Y5EMNl7ZA|#O4ar9r`>V
z2$gu@BnC2G>*p5dbFjN`r6~x89uB;ceBoInS1LIeW^?*V)s>p+q%T&0yJ_DNyYMRg
z^5(I8KHXoLejfcw7=Aw|ye237dJwKYYhmvnM_T+Fv^=Q8G2Q%FkGwN<w2=IC@kpjm
z-sGH4my2|L`bYyUJoTlxA*93UtIHf08wuW#o7q!1B)RuDp-*xVf^*`Uk|OW$yVQHz
zwVS-EOUMGy<NlPGX#VSZIKD=%G;`KpkMX%D6pW56KEUe7x_D-9w(sa<`8Saps24jF
zcN_;R_)4sNy>^XxW^y20WoXHDdO%3hja-v{qbJOO`-p$$vdP<T*L>`R7L`yPjO3gA
z@a6blAM1<9k@Rz@U6K~jT(I)HAFR-SI#qNc_qTh~FP0^)U7wDo=717#)A@EBw^sE6
zJEq|Eg+4yy_=Wq_SEZ?s`_vnvoB*lr%7#XMA`O4wUvoywTq5tzkv&2%p@0+<x*HLV
zd;4P!^2}2ncN}&&pc_rToH{G-12jqA>eOsm9jN2XKI-;{Qa+=HHWVoq`yOu*)+05k
zMao6*6+M<GxA>`PK<d7!NzWJ-*5G~n49zu{@1M|M?3@-epW4zGgxp%j>gRynBeR!9
zD?f_N)B`1r{c-=M<zYwQ1~u{7NX}D;Pb|%^u6(9;iE3WRTIkqTNPZM=C2Fm*7vCcd
z_X;~CRybrcp)a{9OIXvpQGorP4>MC+>XqZT$e%voC0WB%v<PhE+c2SryxU$3_^|xV
zvYJzw#?#Ap9o;y(d>3y+o(k8N@|Xgj6phR$jq5Aqqbi1>amFgwL=p!gXtXWNfzQP9
zB7t2WCdq&2aPaEReoc<<V;xLb8{u9FJIG#H8c96QR&INN1@W_v?=~w?Kvr0CvgWiO
z7CPWd3a!)xyx{XgZt<z84;CJzsda~LJlgQgq!Yn2&uR+Y;*XVlu%EofPadP>2mRz5
z5nmUYPYieH{eJR5CErN0cLe`*V)%pmJ6-V09_+FZzZ?ftbdO)2=@BmK*TpYS|2YT0
zwEmbe{$Ir}_if*ni(i)D$o0RAU-n`U{{nv5``!Ol_+`T6|1J3C?LPk^etD1rS^To%
zpWFV=@ykpr{6C9d4#;#+d*w^<%hoT#vE*j`3jDJCOA2+yFCQJ1#V_t4!!Nx~07G21
zjb`2AU*kb2#4nfl$s4Lk{+^#a1e_A$ms9-YRZ1R0vbTmocE&H&XUWPVn%p8c;A`X}
z1_lQIAJ=^_UMMVeNs)8t5%N4%;w;^UrLO!5BtMKn0|w@>Gxg;Txsd~48>*V?%NKx>
zpi1Un0+Y``-Zr;3-sc4%rG}01^CbH;DPIdd4E3AZXU*uIJhFtN25eHJsop2K{aa-s
zDqn{87(K^fgo79Gagq8VQXz0HJ0a7zZ^>_<!!2A$c>z!Oa`&hSXCvi~*@J<?s&WM*
zd8MeML7tq5R-g?uYQ6x)l>V-TQH{)e2cc1NMKKHy$E4DIH%IgTfvHVXEcs^>vBFw-
z!68y#tKL++#!)?^C}I>&{--{@kcDfn9B&WmZ(<pA>%-c&oz%nD5hqo6+}Ze1a{pXj
zW`yG=e`1pNn%6{Rx+c1-KVvC<pBmnqT5#qT6p@!F6Tv+1TtoQ#%p64r+{sLg`F^~~
zuN50$%;jKkUXi5;MmJX3Mdg#dJAS#n19`lODiU6ysVnXkTj4%YUPqjF2TN9+I+G>w
z-84^(9@;DB9)~&l`!a)1zT_<J!22&;YOJy&^2=2=26L~dJ7+{k8r~x=+JftU{(QS4
z<hD&1Y}Vv?9=#sCQSK9eIaEc_HhD)2N)Wh;yh8}pJ>i2?yw*wiUMsMj+J6i-NCB)x
z9(}l65<YxBuJX*~MZ>ciLh4$K&S3hm?5kWBY1k(dm(6!qCN7)rQ-d~@MI0)VvO)l!
zqpbLcv<la)&?doxKFd~x`U>;(TK8C!#v7?01p{G>w1xyfepC5e;vJ0Lot}fdX`cz7
zWI@|lx?nwg_{LHiZy-%7BS5!WypLQ2mm+GGUQo49G^ef~I>~h6O|B|z?}cv0TR;!<
zWcYqS!IntF;Tl)vw;~Od8gybMVi_*9elDhHk-_WW&FKhLX5`K9x&_;v{1=S}$4Y&c
zQ&}3hX(?Q3(f&{3&>VdcTYjRGEGVz%p|tIq4)@{mR<-F4oiIYq!M1o)Z}Yi|fx3qb
zPSVag4N~u<ZZhqi_JIIjd2h9ae<Iij_4w+gD&d2S7e$MaG*Y)&2=TUZ=_$===Nn$J
zrqUhoAtEo5gW4JXdH3xM$!!P6=cN*Lg635D0jeBeQ{|3(Z@HOa;se!k*@tp%`&7Ab
zWh+T?)>do0SGMuRwz6FZIbUdn%5ArI)A(Pk+Msc*P^oD$>iu4*;d*kUdk`Dw$70mB
zmRUC^q@&=6IWN1k2zL_utN)<F$?3z1MZVXbnvbNLej4drs>|f_l=D6z3b~}?x%4-3
z*FT!7Es74WMc4L})-;37cpwv7J*5glqdbU_{ww8XML_^F8_60;6w_X`(!-}W0bnQ4
zC*n3=zHERP@5q#<0UQDFO40!<dbN0H1EQ2^vNENc5zVvwddJ%IB~+JzPXz4rA_gM&
z%3PM$;9h2B(=U_B8wrLKU{qj3)#ByD;mb2{NqRC>O6hx~reQNVtFg$9x|jCF0Y|()
zga`E;FJHAMF3!tG>CEm3<BHPr_{stCpP^6@*xvNpEQrr#mkIQy7|HLt`U@lpgsbm<
zDTvScL?GpZxZ~C$3clf07}yW^8Lb2`e%2O2Vb_LE$!(|`FdM1bGcX=Nmkjq#lwzCR
zkb(07<V+7D)=T|kTjp?NV{n*)?joI!8YMh5*%XA+dvYihYEkydgqNpx&tzVi-j)-N
zWWvkSr)Bc5OmEGEm#4qT2}kHr*l0`8ne>S{8TQG^Fqm+TKetQp=Y}6V-~byeT;Z9s
z(p;zG=(y3*OowH{^IB~bJ&Z8Q7T!zHnxstFPkv<F&<4l7QA&}Rn~<RG{l%VC-~|yt
z8G_s5drn5*%FccABLU?m??HnO8p%+V+HWGP56baWw;ZTFnfdniVEV&({CQ@0E7Si)
zxHCVeTv9~E+^;#lTj@sU@$LE5<7@s@(BA#{-f#EE_naZ#uN>cn49)w_$Jz0{n1O+n
z>GueCGd|v|(StSqy7BhBh#GWpqLadOpB+BVN$tjZc4AOhkqHL1VqK)(fmd=Cv<Lhi
z`o_r2X~nAI4R0Sq*dVpk8F^y*njE(;o^>v2a*{WcPh^F~^^Frsm6Mv^3ME#&=-=*g
z@|)X^2frTcB!`VoE?@U{^5yH^e16+&z2keybxgx*p7$DF`~ieFzDs+e8r;0!Xo@GA
zz<kR9l6LM#odmN?om>KRiV~mY*FNB+cA?Ot6mpV9@#m}@-ZR^N7wX4wMCNx$z4e@w
z-%EPDQfNaxRgc@L(pX_ma~O!xG`LC)NPl9Ei^8xuFAGNLU3@C3peR+q{Fu`e`CO|3
zDn+$b$|}y`QPrUFV(y9goc9zsJs%Fr`>m)xQHVf3nCwk6^c6&=9ME_b^uc_B8r@$N
zsjuln{uE!SDMg9xyF?m>_f^C`bA~Z97(plZmOr;h{nb+c$^bjK9;5O~4o+E8<IqG+
z<A~_F+6eCRNgG!dtKgZJ&kH!1<V@7<M|t{Zh|iZ_26<Zn3MfQTVrwDCMYNFEO4dgP
zNaxqv8O{Mv{RQ9f+bs&nBEF)0E?qpm=(1Rdx%^6<%jQ4BHpvFhy&5OfKNtMEL(j)m
ziP9buXO!ax4!v-_l{H6h4<Tt(C$25+*La)tvO4nU82IbHbbJzR_G3Zl>p<&k2YIcp
zcl1|ZIVZEut}5|68`5y%zr441;CNRA?TxFly?y9&zqh-7X>Ui+0|@INI%_YaGQ9Vh
zO`W@4PcNdWO9zGE)5`4VPvD~&;~g*7XF>_=HMakpz4E3-fe!i_k7P7rT<Xt*3Fn0d
z#L^Ak4`GP>9jh{wjpnZi#kJO)YB8&?Ju)rtj)8m@xao88;bo8I+)%!r?4faUeNPsL
zN6Hi1S@Qc$Qt0W=;`j6csQF9o==qainS!Nu+$VewfE@W;@F8o;;O9&iVZGOyh^78)
zj55+{#2%)Pv7roB81Z5sJ9R|#UKza5KrTP@1W82df5;ozwCiid!#lP|?PigYh9LrA
zgu*h$koTB`LTu-OpIZotJsPOJt`Ge=$%12fA|`$XU5mOW7R4%1^Pj?_R?8yw>omN?
zXC;vts})+B?g{op`RpXuIM05leQon6kMn(d;h!})wr;N77)`E<CR(?j<QA;7M%<%&
z#PZvtm8;@-Dx*!rlGw3DTB~LH=UG;XBlOMj1MDast15I0`1<g*1-Iez7E5&03X-U}
zxc#3xU=GHU*(4d5K{jvI!1OHFc1^{SU>^6=f~x8=XR&(4UzxN2R;|%o!p>kWm@v`)
z3VNG=s%0XxE@3plMe^lCrGc3Kc~cBmDZ5P!)Ife2#)=qATSizq0>wqcwwla&yRN@2
zQq%mE2|v#zAV-m^$H&jX3bm4;zs$fY-;Z-}S;dZ#NZlT}p^emN!knAczOTnG%JSkk
z|A+?%^YUvoF>QE@BU|3Pm)~jGB{bC9zPq-{;oK5hXb3ZW@IIO%o;QyDv9eZA_?rQk
znXB&b7rb|Th!9>dd0xvjri%{&@4`R&*zCs5`FY;&=z$Wnqd0i}J^4UR<G}`l7_s`*
z@Gt?+Yt9>&K7xX?1fFxrt-thxu;!ut(8A0@Cg0Gwo&DGPZrn19R!!}AVTd%Whec1P
z`3HGM{%0HB6tTzvOR!aKM+i<Xq7`*g5#C{KFuFg;wu?4uaFap77K8?zucl#Za^#W0
zP3h+`HOk5(^naH1UhGEt=VpZ^I{#SMe~?wCFM!}GXYg95Vy#A@>GP&fMJCw`i6%I2
z`MTvssobwJ<)W$m(l5|>G<9BC`e#Ikaw0dWlFw%4DgWs<gt(=kzsuPAD{DVUpPe1Q
zmZoZlHLH&?w>~TZ1;7k1hY6N?f>dRk2wmX<p2dGHSlWg^@eC|Yzp^VVjrj+#^t}z8
zVCgjFV^eCffv3TMk+AWu?TN_}dvDr>@6>IkHgbmhx|2x#i$Hs1zE9q@-?&Elux-A^
zo{>j=;yEcP|K9aRt|I-eC~6$lQ_W~Mb0fG^*#x_SQ^n~ovxAeP%9=+F%0pXKrKCv1
zW<#ROhe><0fEb~C;Rlfb?RnnZi4V{CnP?J95D;Y0Mj8BM4F{qg`(4{Av692G57_TP
zdGx4}`aWs|(&2Y;lT-1bH|bsG3ftk*5@VX2O3no%4UY#8y-oH`gzeC%iqsb!fG%x?
zP4n42*h@WWVM#fg5w`J8x&~K@PC<R~*{Zq3Ezn@swccgSF768hGRSK5<96ETNA%1K
zh$Q<l<by;E+duF*fxTHPVNTVBDs(jka?t>tP|Ny*VDWO|N(5BS^s$ykBPj;qUV{Ra
z5#Odif|_6b$%=|kO}qZ!CHM-LP7E!dT2V{HEY?_zV+<riK5(?mr8Y+zK82pf+`|`q
z#IY1-&?ls2f8_5|_k?_Qu&PZyYx5zM8^$!d_A(zHd(_l#cI3drs8BC6vOS-|sZns1
zU}qT}+m5=$;C8sbp}tLf+EB0<cB7Stg1g1d!d&_baPy@q79zAmuh?Nf$pWR<JK4P?
z<MUq16e!_BJ;LyZZCa3W7?fOJ*fr1Gxr1X6+S1IXdx73~^L+^@haYKt1R%o;|9O~1
z7at-b_3(-v#vvo8Xid{gMOra9O3v2}ic66NLwp}HK>JDT0Pdar+hc=yKFWx+Vaii4
zN1tQ(p4QT7!&z*R9iYB`7o*<92BJg#N{lYWmdkuVoMZG>eFJ`cWAgskKxuE@a!_Lm
znh@31e+9kYZw2aL&fMY=dvnS%*5-L=zb$aW?ug$~&Km){lHn_dQHBL)_m66qnoIR4
zu6Y!GUO_QJ5+P;~NBp`d4C`ij{ABDep=#+Br1)^g5&MwzsYcuCwB$~T_YpwSYrNf{
z6h7CAmF%AxwBHZv6N>as6HCLU+e@72&bI!P?AA|}B)0DwY0%9Up{046oYV;zECdq;
zMzaY^UCsvF<m9t1%e*xWT6ffUJ7ek{7D3W*@>8ecY2Osp$(+1Ed?GXVQiJsKMzIdC
zU=y9xWfbXSjF*$YQartryw}*gkUMg)EU14)oJ_?UQ%mGGhw`zL>St#jp9g0irezed
zLwEhQkkLFQCnKcM-k`C`w3LbCHM8TK<ip0i;A95w)LZk86o(tUo8c0~_t^~a*q14V
zZHlDf@!n7>>*8&Hix7j^jJT(m@w`9cho*SDouC=5FPCB>qXLRF%+mOo!f~7~NU_oB
zXramwqo7}Z>EC~pq3fziLveWFV&S6~zdB$4ITe_3iFF2MB|b#%U*;@~{<>WMajLd9
zQooO}e>(^h|16{)1g!?U79Uz1E*Pfj`5L6QBlB;vJ>m4ci1!qIQu>79+>rV;*9^*Q
zJR>ji=uIY+M*3Y{G%0xofPYg!TYVJWcn0(}P+jICm*g4H+M5goQLHS!w{qEzLhnS~
z2qI?;6ocx=@vMm4+z-m|*G*BmR@Pq`2b<UyVj#+ge6tVa8EBr8!xP7(S!|u&l3~u{
zeZ5Uh)s4L*Pv_&9<-~lF#^4IP-?>HAjVJa*eAYV@W6*V<NZ?VP{%jdjs`{0=th(;n
zLAX7^2W+HXRDkRQ$vIH_KAIdeP)YfgiStI%tDDEjUF@|3y+h(Mus#ph#h2pyu`PbE
zgZ@EFtfeb6w;!Va=1&YfNS507?YU6Ven}J2)KtA45KXiU^F0hh|6fC3=iqH%q+u@*
zGqT+<^H@JfKxnS<x3Z5~L|gV0-fyBBJ-sTa;SkCqaq$5ZPSY0i;mSWB4DnX_WKfdm
z`I3;YjT(~Psus#(gIi-2t-^O2h<C?D!qjb%h9k&quOflW>PeyVVXyt(UJPbSk0n=_
z4z9iIo8a2V#9O*UixFeMwL=d8*Pj3SHqJAE8MUFm^oUK_{oocy5#1N^61vJ2;w+wW
ze<RAPpj@VX?apDI@gWVDP&_0I)v2k&w9^lp-HA99=Vge4S`CPU4f6VrL*g(^9e&vm
z=g@z&Tf?bu({P!nF8g#AB~w*)*l@6TRDDY*k=ko`;jstzgmdh{)^AT-*q8H%p{8sN
z-U8DNj&h6OV6rH^Cqd}#JBT1*&(-TIZ|c<RBeT7po9(rN-rqh7dwmF*7H%Pq={nx;
z^%b1wc|UnuIIP9{BRr+pEwNqc&9IV)H$OtK@LpC1xA>5@{Je$eCRBbtVW#^AycT&2
z|8B{<<|O~dPrg{mxBAH+us~tyX@2r|m3$S+;m80~F%25v>>E1`@Wu~i$loo^Zfg$*
zxSC80`}j0_!Y1qBl|*?-A?bwy7)L;h@=qqe`?di!^xifF%d{cN92X^R{#r{*|B+`Q
zLy8D2c-w!ST?bf+KbHww-WMz3Lksqx!In7Py4A|OeLjGB|DN=l?$BoX!@M<6&ij~0
ze&BsL@Iu#0s5w0r<_eDgr-j~$uo>R@*6$p2?)z0MH?2#w0)3gU8H<t+Wte(A2=WJ5
z|7ZEM^l+9ghx7{9Fx0f}yG5S}Z6Vs0(3`zCGD!~$qqv33Y#$XLq}BQ}AWEO_=4hW>
z^NI3~u*D46W_QE^|0GGxa;j&=c}AYR#T(Mu;ureGr>gj-S1s4j4VLQ<q%6F_Z=}Z0
zHINi<wV&%4zph(z>e|oGb*Sq4lX6L0lpf4eaCGoy)T`b`1E<1uma%wbP780pYr|+J
z%KH&zDRF=R@Nr6bH@_U7Pb^R0$ni*c=Cn8`ygnz~2aR@^=YX8>wK?J624QTi(@$i>
zAy1N?my`F&obcM5@QWxt!*>3X6aRQlxHqTLVV-4~_?7AQoU-@i^nG?VoaxIU**I=i
zW$UZRsXLyNXM9fGN9W`@HYa>~PI%XxI`7X3@0Jt(O-}yZa@ty-)7D<(@8-Nn=X@Ao
zm*gA>jn?Z9-hJ-^8tm$#3<K85U&HTTg2HsIFYB{GCy0NuqeC7`GH%{3K|0B29Q@`$
z%beuwVf^GYT(e4MQsx8BPE+h2qgS(my_9&o1ZW5OZ;^5>)u0OWFaVM0M<Weo5k=oJ
zMOz|OH(5VCOtYWy`CQMUI-$n*R|rLJWv5S;DgApvMNX=*(MmMd@e^t2smic9ck5(s
zLP5VbOmc^}WfN)sQD!D#5S{T}S3+veIE~CThd##WtL=t~icG%Fg<atgyaMUEEyICl
zY#0~>J@*CGY<@+^YbQVr9Lslj|9C-L|F46{C`5XDe|DpPF~6k&sjGwYIpay|?FvuQ
zcY@>kKi2$Y)v^W(D@m%xpTKr?X?WqZeW9rJ-xeo;P9pT27WmP5<Q8Q*^pRQmai^6q
zlC=uTK0WFC$#IUMWXe3F#8T!bC06-G?Kj)b691F(vnM#l^Y3(vk`FH|pv}M`5BFJ-
z;TYD3IewV@JKNrC_t|)n+lsvFTEWwmZ`9t)1X&ksT;nO~-F%_sbGsE!ZJKHCPn0BV
z!yZG)%raXtjWO+-O|3v=ex1zW{pO)Gd5ylGs`LETz20A4Ucyb6^7{qlTTI|Fc+F{*
z;II$`QgwF<__E%o2s&Y6=!uF9!4x8V)E{g6X({TdqEr<U<1>mpkVs6)+anL`g6WaJ
zK7nt(JMBc!??mdBxoS=~!G2}YqbBo?`b1~RJ$L(mdPS?^S!d~I5{lTYuA?<?SRJX~
zRn4N0MPB-Gld?<OKBEDB(8+GWm9lz^CPwDPr+Ujk4pie`tOBln<Rqq*=EWzDl?mFk
zB3(^ta<*)7QW*Fbtc~KzhD=omzp>P8!Gq-5nEVhuW-6aPIw3}myo{0#O4==3qLt6a
zF*vZy8Gco_(>8erJqM^HnlY(=L>-TRALn*9!gMv0YW84m@t*x)ETS5YJG7>k@y)|Z
z#s99$<$qI!-`}feUivOmpHxX_?Genh7XYtUKq2;kKu@s6QTgQ@=;2*S00?LfvV<+(
zIopIxxKr?kmw&JFZt?Dd_J`1kF2|iG%L#7?#Q}6V5uo@K)nf2IkdSvX*nn6+arR{L
z;JRy;<6d3IninZ={>Qlb(%56k+B)ydTGdih=eQH*Imrog&4>Q4#8V|}=8`ar-+BBt
zG8}EF=9*amV@;zr2T6C*J=H~!Dky<gFxgN_jCav9)_L2HsjKJu$OvE%DnX%Fxc9fD
z-|}$@iU6GX(U6n?Z(Bh+Jlq^?SqdW{WwoVr+RQKBt^WI)`Dfyx*yNfRw_%EYk_|{G
zyei%jN1y9QsiHc+Z>3~0H0=9IV`nwda}5n#JdYrPF61Fv@j8Bv#r|eYALIsW=31sn
z%k=H_fCUFzP0@Vl$Y;{CFvY4WtVS<=U#p>4yzE20+n;^;Qk(AEB|l#G)^%^01{g_Q
z{ug@e2?i{Yb47IWSepV;qbVyyDJ3X_8In3V$KJt7?T)5KRu2zRdIgtgWbZ&h**nb5
z$b(R}WWxe_1vE~!AAidWvZhS*{$?DgIfu&-m=dyi4;ksQg#v1~sw8v54X1OcZyKMV
z_VJ2!_bKWHy$J6g`40Z-rcKO?f3vx2A~1^vE3&w%>NvdN$xf{Qa=aU=0yd!iQfcif
zR<K>QAz{nuex8;Q;reMF2=yjEZ_vF!>3*X=6>TU}jPH1nIX9SV$+*6RA{BW1_<PAO
zI!B8ebA!QH+cJV^kUBX1k=9@I$)Jk4@U9bAn|fru8X5jbD?d1FmZ~RkDVJEZl{ogr
zi2w=Fc;(rjG(`eamS)jteOqiEEmxu$!sk~SR3Kv~t14dTJ-voDZMo-6qn`M=iRqlq
zG)!VTylTLgNncNVZ1^I-KNnknWGIC@&@Ctjd|*k=)?|7e+(eQGP<^0_IjcREwBk;s
z)n5BYx@&Wonxo0ntj1EqWTb&;&5WuXeb-|6BlPDiWhaL05^ZEQn(Y*OpRw~!nTN7W
zJ7p{D=c^-AuI0~f4t9%b4%?}GIX;=4;^Z%dc4?2)sZCCtc7+u2W*8<x&&B)W&E>N_
z@WfCFhvJ~)5Wr{m$W|T;W5!sb-D^?hF&V+`X{p+3BdFY33m2s=Vawze_BGwW9K(_B
zfW>=i9c$(~Zs8~Bs?_OaP=nI`T+<{>xi>(Acd9DsgYe=q&wCP-6GZ`v5rw%G{RuU^
z{FD3`m*1E^nNz0B#ajUF9cv*zSJ$1}70<o-q~Hy72;GCQEmHBo6UcUXC-KqUbVyma
zY_2<xJk~fbl7<c%Q@(Q)J<R2E*Qs6|kCc${R7A;VAaYz01+K>tGah#J2i~xD1L`S6
zIEe)TPkoORNQnhvrr~~{(3j6j!(oe+j=HBy0yejbTu6)5KSW00YZ|qnS{R%B2R?2m
z&BEm#>cduOGz2<zVp*29N9v7{#yeZeRUy=J(D(P^HDgmFvC572Q$#Lpe@kQJyy0Mx
z=HhkyEbOapxkC>|{zv|NfGF?Pzv)B)lb=hcq4KnvIVqrT(FCucSH2x-_%T4oK2n?X
zlDV{jr;N{cDvj&MGF#j)cn|QFkT9F~)%YA_gWKTEuuq@n3|}N-)0xujv097dyJ&fG
zt%XY{>a7vi&Q<mk57k_4^1eZ5$(N_J%MHu<%<4g;0}87$Vit}GtwZLXvx^s((Nb!j
z1+6Pwv>W8rhZMA^C9-~qCi`=eVtd*9@LcQzBb#282G`WH=W!y4g)hhfp-#xR&^rk*
zB+omRM=x6bXrS_xSsqNmI^u`YP<(ga9ZLPHFic<OpEosTSQM4UK+eRgs-@eIns{_I
z_aKi(=j)&pc32$xe^t3<%yHJIC;B#8H^$NUrQ?*xv2IlPUHv`NOrFh_XCf(6Co0c)
z%QMc;Go3tYathNj&X#Yx8lUJbOMvw`BL`ZTKck%K2Q%kJxHt2urN4-`Lx<8QO9d2A
zJ$c#R?i~S2PREGO@PF`w8*=R%_|Eq$t=BN~>V8_n${UYrJVB)49H}HXJBgo`=2b;z
zH>YEKXYCC@-&fCyR9rnTeK9J;NwWgLO@9*bpX`LB^tRve`Bqmpaqft^Vs^r1tEBIB
zi|^xOq5S=`Qx4qAc8a3BQ-kzA1*9M2r;qi^S1En4(igB<0#7%1^|AVXtNfLI{(b!X
zCCZ;qei+KSkXv}e+wWQ?%q{FmsyXF^5B7b>8PA9iyc~U{oT#j)o1d%N0pho~pZ|&t
zV?$r6{gz0gTRgpx<bXTBL$-A8i2qV@s>Sv@Ir8Dz7hdJ}tHIAPO*wu{4wC<<<g5MU
zNq+M8m3$J(s0)62gje>WAFA;7AViTPx%j>CTEEB;zsNx<@=c0(yU{PNSpVGFoYn9g
zu^}6tkrV!Q5Qb`||C|&5R8F`p8-|me$8^Jnrc1K1>34F%c{$<Na>8X^wg+!y<8x~K
z%X^s|IkC58W7B`m2@lAr?}coA>9=#jJ#z9ya>8Ha<as(L{8>)8DyOa7u-fwdhyD-f
zj|!z%$>@(Bl{Ir${ZXq$f0h2|?gxd-yU`!b^7TiveErcwW-;Bp{^-%y1odVL-&ue3
zJ0<*g^haZttLAR>M-MOWq(8bqu224T{n0OAMx;Ob)Ym1n1G4CHhV}jL>5qyIv{BCz
zw*I>MqZPX%|8Pyuocsq&NH_YUIk$<P%QZrgSIyqtiT>!!?|0E3{g9FYW%YUsC53#$
zzo0+b&u`spftq|p`EK<`Z`^9V_h#ml&uXpU)c<elkH#a_XEx0O>5uF#RW;EN_Q^0L
z{|)_74LTm_j~sVzY=DMKe{?;!e!J5j#dH$*W%?sFka#_*ilQ<IPD;;_+KK)MABD=C
zrRSzUdT+@N`lE1!x%#7*IsFSMR3R34P^hgCEic_9E+r!$7w(Ax+E?n2R#1qJPt+dp
z^+!kX#$i|e(cW#sKBhlf`dj7Q;{EukEOce{N1x=WM?29Uoe4I8y)_*QjE@-PFV!FI
z@k7mLO&#Put3Ntlv^!ZdmxTXw{n25Itn+Ja!Mf2O{k`{2^hbZPg5Bwll9tj*e{}OL
z+0A^ciCteue^ll7?O)L!ZIiu7XQ{Z3Y0@&i1w3c<N1u1AKf=<y;uT+al+W$qm)eHz
zs5dg1c=&*os9<)YH`*FM^h@+cH>hBa-pCh@7ufW7r!Trqv*mW9FG?7x`da#;^`G?k
zpVAl2c^rOh*fN7=e-OsMBP2Azdz!A}OZ7z?4H!awnOLx|uP-{)guc%DqD2x={|EI&
zbDm(RJJA>Q16+617mc<4>`Y(u3~28EmcHl{$+Mkz#uSuU+8L`@zpt+^3Y4nH`1;ar
z^hK)+0)3IZ4kbG2i(0-?U(~XLzUban!W^N#=oe;u2g-8`m$TDDebJryzP@OEXMNFf
z(--~Mxg8xpfkdJ)LQ#Yi8R(4yh0#wq{`h+Oq6ZcT%)d%sbg~f=AM`uvi*Q`!Kj`tL
z`l8?S49brBq9W8OA^Vci7hS0J%;<}FK^bCIr=!<cYJsmWI)^9KbXnF{6d68_`Kn~}
zHBsHffQ^=-XtSP#E0d;3t`g9O9TrOs3QmEdsqaF?pNWqdoBEC}!JsAju4#!@QO>D+
zDgG8JqMI=hL42DhOA)DvLbB-&-6m1g9Wj#>?}3HLx7CSDrs3GZ1oBW%)PkPKSjb4j
z1tNvvp-s1nqLmCWM^SW9t9=d<=0I;W!9I7cDB3t*Yv}73{;=4>B^32;7kkcD*awBA
z6h-eoW@)A<iW++J6-C?7QUU|HilPbDq3#t$RjBQ*#Qv1`CazHjjc<7Daugx5N7gx*
zuLpSr?Sj9bcF=<e-=wcVp^@chtn(PNN*}T>Lr7ngtM5Sn@lDKdGW-wU`}>t=^&tVz
z73f25kf~ITKIAItJu>gvfu1fvA9BIG^c<;Kq~Az>4Br*%Kg`HwL>-1S-er#l`VT%b
z`j1_F{fE*|4buC3LVBqG@XJ>zeQ=O|m(l}$$Kif@nbP}_4g-{<?>O=jkb_&eJE=R<
zciiw)sPEVX7BSR!Y+o(rY{V9{Dc&LCGTh>nk^_CmqiIXNTgkTqF*>Jki~D^_j;nm_
z#B4uDqjLOIFy$6MsN_K3aiyR9V<rE9WMAJg2o6M{3hyBC)^71g?jVKwj!}M*@2JSZ
z6!E@Izr20)&w@uX`h)awoPvecGB@Rf2L)m2J08r5e>o@oSvCwyA!4iC`(|U)ALoR>
zkrQsq2_KPTYqT*NpHt(*FJ=0e6MJ_yHvM2u_<)@H-p<yS{wOD0l9OkjoN%9<@T)ob
zd*{S|H>a)Ku-fwV9S&E%Lcb_Bx9-fjs1Av$GXG}Fv)jS$=MQdLp4nf@|5e`(4q8#;
z_LZ}iiP-T<|C!sKtaRFC0>mSAdJ?p&Ay9P(%I2M#0p0SDK-Ayg>W^-oCq5{0Qior&
z9nNolO<sLde2-Y_z*GLoZcMBwn!pJ`^*J^^*_S#X8L<BA<TzY8QX^i}k2~O(F#j?u
zAF}{(C1XSwlkyaw5R_MY@WdsF&-3GZB|h&F-<R32{Zg35#UfA02j9tKG)=RdR4?M%
zUdipRIl*xUU@Q)9&icJcvId>~<J4Z}e}{Q$f4oEbsXL@ccSs++Lwdyy=^O+3K6NIR
z1?OQMZ~5!h(dz%&{M!9~d*2gJREZBb-Pt$AsiS+?atABo`kUx@#=5&Hx`c7*Z@=ud
zskY*9{1k})5Q>66Ya_=otyg~-+kdNnktF`M-4(0-d&=pHl}TTYp5uGyazZM~`E~Jw
zU-ZbUJ+D1+NIuW3a-yC39<P`VI9jp)5x(Q@ZM%P4N87DN?=R*=W+e`gHgexTv7(gq
z%$n|}-(`h^^Y0a<+4$wU7Rnhqs^fz4j%Ri6faVLz+Zu_(70dAqE4w+u{h~D<Uiin)
zdgR4>+YsV?00|#HNefJiNBA>$ygVaNDdxhA!GgUg)HlDDv&ZAr)AYbobLbxt24AV;
z(<Q{)df*j|w^n(scO<)rB{5eSMa~x9&UIeL`FEeljNb@vC%*V$WX638UH8QXIHt&q
zCH_a`V14{aAIU)eTXOREXAFCB<;6$09cuO}*L^nfgb0~l6;xjL*|j&{!pA{Y{IibM
zPYWY6Dy%p>)U(c(4axRIbAhw;{awlX`x*IpQy06Fd&=E*$>{tK7amR}uq#n!l^4If
z2hnz<`j*G?x!EyP^X4pB{^`JTe!1+^f%4e3^nHAEt)<c^|0dr0XLPB@jb#o>3w~|M
ztv#Fyq}BDY<nrpI#|@m=nfa0V{m(qHv2Xrpx27YpvBwFG{qvm>8s^l$IwLgpsj<ZJ
zL2Ts4J+-6#C1-yonhKMj>a4Nk1ovndMB40MZvFMDbGoUSoOueXPlrF+DkL-dT-x0G
z+{S9SyUvvwT)9(vxswaipF?zl@80g@Ug`Jwc3ck5@Ev2VfE>K_4`k*eeHLvjfWyjL
z#O&s!|AWs(^gd6X*7MT$P<)Y67b`A}TjNgP2%Z&ufPZ&zfjyNi>$|_6x-ZQ;S$ysz
zPWc$tv@8c-DK*0m8U9B<zAf1IBO{N_ypyIUanjZoo9vRhzjes(_K><I_Yn;^Esr$H
zwz_V~z50&Sqd(%#$#3rFhb5cD{^~o4Wu?qfWZpdUeiE=icv$JF-^}A@%d)(@ru_JU
z`mM{W9iiX6d^ytX-7!mzxAo0-kavOSH`)GP+FIuHS%3BfRef}3|1Hoow!NC6UkfyG
zs<A4sb|tL*J7(csJOKOih5cph;10dcH~PMUA8+G*A)e5Noy^I49uJ@mVO7F=af+Xk
zo@z17KU_{R1s8tAdQsO<zphf%RsC1^#c==P)sNff%<QMEKo>7JPU2@JdGP^ytU0*>
zid5Z!Hzi)SF3YPu0_4eFddvLWZ1U;kz|1)=er*$SS>r*kswV*7Q5T=JDg2rVmBHyp
z<x?U5WA9;r*(!`L&Nz-~Mk>ODwIzO!h_8hBeDB`7g;2c%?+sa;m3}EKQ37`}nNpaU
zgr%;elnCf0pyTXXh4`}ibm>ckx)?kDaZ?2c4rEmv9phq*n|#-<zHqatgjk~A*DWUm
z5v<f)QD$of0`9-+$VAYT3>YY$orqnT2u$mZaC?*P@w<=Lik*~}FxA9j+xnai`k{M*
zd1_4c54y)1AU;MGwhm1`8JYj|6}u`tC~|$N5Fx>1BG(^IaPX^*>l&XKyjoG;xuPVR
zT&BZO9AmscxH|DxKCgJ!G_D(+e5G-XpXaFR<hm>N)Q;+J8JW;M^7YwE|8Nods`8$V
zvGPLx?=dNPT=`(_xYnap^HSk+YJRgcx->m(@ap8}N*Xe{adqPhm&a<TAHHaC)9B=r
zjPJD3jZNm+t0wt!<MYX7UKMQj_#bFON;C5oY4{^GGI!v@*0YW2sUrf16J7}&+bzG^
z`lz0ja1>sGcc1>%$*q(4Et!<)IO0mzyGx6np?~=2<-Wo~bmH0%i!k9w5c!6#FboDr
zX4S8*1<d^!!jC^*E#DuOUT9ddxQxsTIo49adcWWcx&pBEE`)>|`3E|$^$PJ5RP{Uz
z++K~3%)*@coj>bgK~3m)euCxM%ja*TK?&beXu5-JzTf#8>bg2lV&cj{nHLqt(InA8
zTJ3twbUd4vmYP=k#TDJ|Ok0s&R5*J=I!;|iM~&EI@vRSE2K#by<<jBQnQ$EY#ZtYX
zaWO9c?iqtduzAi`b6i=rBXaP|$mpS?;T$TLO~o(|m!BKF4#uTr5_8I|-u4iab87(V
zPI9H532`dFh<`)M4{Vj1^S#&pqB=KuFObXVQno)lh;$yKK2jVSHwZKQ%hgOjI7Ie=
zpdPAOS!S5w*Q$!kZ?8pDhn35LXt_0WHq~?Goi3%PD2K9>>(RFL)}Hk>yF39tjJX9{
z#Km9t!CU~EE!Q{LayOLUq8+t5TDcW_9)i)wG~4`vCq8TY#w<BXz?Dg)9`jWQNJXsj
z>8smfF853{LBG3wTgO7MvLNUrZZRqDSkQ1Ve@o^;c$4osuIYy0_l34ng%jH)6kJt4
z1uhh_*i;%Q3k&yC0(bFesYXn{6u-)G>&xqiK*7FLK2jKNi+=%Cu{Kpdq8rq^;3JB=
zdbGf-(?M6>MuW`4ODrY0QkELWGE||wlL1ufdmebPKY_yQl*vlg`&Wp^!Jx#F>4oF0
zouLo-x`WU8@oE{F&9^*IX$slk{+!nC&P7(uvT5Q!_UVLI!%_(pfSR^GVE{$W2Jb0^
zEkVqm4EO__uOa53`<YyCR-S9glU>^^e4XheSII#bVDIRet{2Z3om_TWWd4hwr?Q;Y
zj)7neZ4MosT$(|z=1RgxuivHLHG?-+H$FW&+1B`ykBz@moqSEmaQQ+~L56Lh*|+lt
zZyeqDS~k;B<neK=<?4cCW8m1+!Lg^8_wZ{ydQ!4)x#8HL+5pGGV;hEjs_~`Ee=xYI
zX7J{k#urB?KWbcAom}G`j=c{fO|B>Aj2PFKSw_1Bays~+E3UQ26A~+cRoleX6ROV@
zst_L$;#<SCqruT7#>$Ka@%FDtE}Nu4$)xJU)*~V_#C?qBUfhwuZ^)Op#V>O?E5y4s
zc7<!i&APAk7^{`p-}TQ4|1PyL^85|>H$5!<TOa?<<?dC8e+&Ey&QJx@sKDEue**k#
zp5NdQk!6#gbj-SiRnVdRb^K0BEE30dM5I2!4_7f_sfi`kun@yB`kPBsWPXDf$7t%D
z@{*?DHSnoFL;I^l>1L0JJldctCTXPhb_as`iFka7%;#yI$o*zr>*T}ufdnVbI(u-F
z1v2;xu)KwRlKFOvtI<3yyjTZk?$Fuq7(H4{l=p`}NuLLI1t@on8<)O@S)HZ6j>xP1
zPyHU3S9_LzkI$>cg+Bz(`z~yehWZ_{YO{REwo6y=k(pm_Iy)<hvrT^#$S_S9%{8S@
zAd~2WcMnWQ+XLPTrc*WarLWm76ufVXy!S$|dtQRred?V^S38jg=TImXlmShLxFPRd
zYBj0Gtn-keU8GI!FARzT)lSM`y9hSTw@8W8QdPtZ=P9nJxk?6PVw;SA<h-NGH{UC1
zTMLaD_E%==81Pxfeir=;{qwz4n-(TRMeq)F;0^C1EHO^wS}R*a0eO(Qkbo}mZp6i`
z^Tzm@_%4OGKtK`HeF9QFA$|B<wWR&Px&(ss0FmZUfUZHKEaCZni$v@67fZn-U$Bpi
z{@P%&_~>+Tc9Tb=!|24_jUW6R`>D8gl#|*6)xxuBVf(G|!(<qY%MYFGkbu=My!3bw
zs?QhJa?xbPWS>>yzQ%KHuW7s@K_#OxVRp|chacbIT`^vgAM(EO?r)&RohR9VU0me7
z^hl_*T}j-+p{xe58GYDb^A5=lwV)4gE|VeL3>WJoX$AHXrGiJ1TY(TOR~gm$=y#$z
zfT<FVDjCWyv&82~RJZ;HBz1u|T#s)zhmJWsLG3r%u9Th&or*XF27Pr=Uc5b)x}Yfj
zek^tPa%QW#?rNUvyQms$RpgH28BJu4#Mb=UCuJu}Yp6W@EI(G3zCjjUyso`NSyZ`Y
z!?|!<?HtWxG<Afa0~66JOR}tm5M{a_LkR2#t0PNkvh7CpzX2|2G=NWIp4u(FZ`;kG
z{!=<RAs5t319qX~b;n}dBio+k?e=eWMvOlS^k3b<_=(katdQC7sj+et8Sgtf*&ce}
zK@)j_o)Nun0}7%(_yN8MXFl=o1T$Itbhraf?`{WFpId!i`rtX4{bmz47UW~s)dy!M
zmU@rOuf01HVLmC@cNzFsP0MHDr!U^n_Hw4bHa>7lJ4jr+0{+z-9%*=9*f;qFnuJ(t
zR7tdAt1eG1-D>ZT%$Ti2Jmo}_|9}{Ct+@cU!Kv7WPoDUm0xF;18E!>MQPv7vS+y`|
zev_X^-FP8L#si*qbg?N1n|RiAjL}If1J;^rUzIkQebemidmB)jzj0DDa>>fE`R|JV
zdODW8u*7(;em%2;&B*`RdfIDmY?}`T+ZUP1s}1prjbDrjqtEcawY#-l7}C)I-p{HN
zxPN5iVR+&EcYEYD*AUU_t1MssD)h7<L{H$U-dcH_1qjCP3Yq6`?#Mh3xm9a!?nZXh
z#~3Nd?Qi_R%zB3H9O<`H6tv@o?Hs&gJ5T+WwL_n*Lra2pRg9d5^gW9Ycy>xMLbP%t
z_}~&BAC&SHKTn82!1W<sKp$m#>M*LhybC7a@wN~XD8u<(@WA~69w_zkfEv0z2M<(r
zg9rLBg3S1H_IpvAJ1G}G^o1sL!4I9*|57BwjkS@sqq6(o!GBBYYEb?Js1!GheCo1T
z+6d8E_qX6!M8@?xQZACfHFz@@0l>$>D|~yijItSL;PLYHQ4|>-MRC19y1Z3^SmlQJ
z?q^GA1aYO0r#glLy~~|{k$jZAU)c6EJACQ$t_6{wz#|v!Q}@!5M`Horibs`jSw-5-
zEd1*|p({$g=Wh`)IE{f^fhv4dnRMKK@hg|My^<ZTj}MHa3)?ImojM3QS&&iEpizK-
z?zNqNdT*<~?w|i;9FdtnlT6XIcafRm-$ys<b*oX3qH+1gVVzcjljCno_W%t()UO+d
zUOlKI5o+Wa1T_wE9eREuj$+GKX@UbyW2Es{WX`A=Z{q_^@cq&=@^cSAHdKu<H67~L
z9JyXn&*;X({ElcIln_2Qm1J(P-kW(3nX1U-!yywIdGt>4pWcJuo^S?@p#!n0ycoKR
znpkl?6h<rmrFlrI0)!=2q{u4;eTOB54pmAkN9H*}YqoUhjqu$e-U-%i@84w0c3L;y
zfx3h2;X6<)gDz1`Jw`l6Z;$iVpvjZ<+!)S<R!xnw0A<Jx;`?kRdZaI-1|Qy3te5wT
z&c#6YOD~7r8da84svtd_Quer$Mk3&vdUM!GS`%=&ms9kkD^|B*c+VqHVZKwk=A&mH
zMH0Gy*7m&diNUzbGg)_lSH?XoXWaH)7G3zq?63=6^B>+#{uSNkw-rnr-%b9Ty3KEk
zn9zKL>tOg{vR1n~xiYf&gY=bl10)Z<@TV`q+XZ^zebyUqm&cD+!4i8b7_Y{!{g<=f
zQOtGOc!eF=-RE4lUAu4Uw5~g^`SKxxV_WNl9xY?dci3(}5~$jC>w+)-eACY6^NPE2
z@dX_a+(l+A*N)?!^ir>^_9!2ksxZ{>;og7{kXSYKQbAMlFdqaHO$F)yG(m{@iI*38
z-{@R|<ybY<;ExRz7AZ;(rbs8a+Hzyq*$n(qv@85Q(>0&qG?KV53xBqKX4p*}Vass)
z1p8o8wjVvxcQN!%{kXJi{eo}p1Mh7nG@z%fubrnrNqv3gep&pRtB)L{>b^uD*<@?<
z_4JYN{Mv`FfIMXAWd`2|1g12Wx<mf3I>Q~>r_){wwNJfMGv$4c<KHT0>AQuo{CDLx
za|DhU&wH{*UR!@(UV}yL&9TKw^QpNfH*{-SIr5$8xV1&cb=0<++f2jn_oz9?O_b`V
za%EfspjQLgjHUjx)zJ7o?fj^X19hH`>?fy7Z!5d;QsQw`-8)Xoa!Ovn^X@}=%3;b#
z@4a6MK<-(k>Y(t&ZG%F6Kn(RC^j$1^cl7<Q6ojj(UGJn4P1c#m7x0tw9O-adbBkpV
zBJRBUy_O&S5mcM{qeD&lfeZFu-TfX_GeprAx;B>|4e^i=|A*uB^_t=tbH}Od;W+0h
z?2mK%EUR;tVp8`s@}oKrvUv{XO@7O|&YQA^^+|sd)OR2CIjP6%EH%UZIPUa`-rIJu
zbvhhVM;srch09-s=i~kI`ahC6_rhNMq8-uBFI8PGm6v?a_~>h-;WV~l`X~Go(4{b*
z7g{>fJpR06{5r+!ST)Gs3B}bOtf8i=O0&&%v>JT+d}3tjn7(NQ^A2w`tC53$w|VQd
zxssa}gQ4>#q;LPFlIf?<FCx7US;vci{zkrbJoD8%&OPF5$8*8)#{#y*)eZ*JfR;nd
zS&Cr??C<off$QJ^B47_W$A+z0rx!Md`Teh(UXnL$4`z6vmoJ9HonGo~WivF7Dv=@X
zF!?!s6=*HVsqc(oNICX;lza3_<~Fbst*~3rVl5;6dT?GbP)Xja(0IBvNb9!C;$ezf
zMU-Ug#S{4w8~P{_0UT<)tnRz7Al=K%rl|xe@!D|=_=|AN;}~JB0p1TwBmuznEH53r
zQJ9PejbuISeMej{TeK};YpdAs)6MQ8eQP^Q*ZI7;`Yg$#{*w}tNlSg1)JFygHXNVx
zB6Nj(D1yG5BQu_%-;zwC$<6qOj^Yd5oYYHm&VdP1>29)hd=?TbM}1M`m*ffwEf>U+
zZ|f+sO_N_wCBR{-hP&X>0gl26vt+&qvsD!D<NX>CFgtHagbYfSC7gniRoR?m&)Vfq
z%cvq%(uP;epgtPYzKU?!7*()21OC3NO^OmgV(bdU*s(lhM)2ve$sd^{%k@(Bd&mWw
z+>YEjX&$`7A|NR{TLC?(J{@@c)7qWE<I7s{PVm^Gn12}_FO#ezc$`e&-v*B--;jmJ
z-Oe_E6bK$owDzL`Om@V<c~PiwHhp8EvpyhAK4AJgQBe7Y&Gf}U<;M7TV(tZ{dRKe{
zi<gg#R^rV?Cc>3C?_noQ?YMY6Yo`WLJU*QA+jrcnOT8@{Vel~F3idzyVcQGZ4|a@a
zTLN_wn?H)8ZL(?MBo0nCMcai`gn)qmQ2awS%%KIlW~}X3=p9c7rTQME90R3SJ2Dsl
z2j^~Q17(0*aPIakDwOJgb2pV?XeE*@hXv5}#m`kG?=Gn`=+QW+QD=|>IL&{Z#Iu<K
z&G1`<p{K$5+%Mba+4v-wjuQ;qp`VBlHOPw!IExDAP-t<Pk|ENw_(&fiRNZ<1GC!|9
z|8?i*WGczd&tIuJXMXkt3ch-NF6=ZvpC`X`eqJScr}MLzM5f1U=GW$FaUC6EvTi5B
z<|ml1NW*l--FDloub8ikDVLeAiF~B@?K)r5pUa-hx8+*LDQF`^Cx-*8FP}z)nMP>d
zS2YTeUCe#Bjgk6K=%6gPhX5A7^55ZT{|qDCLzMd@Z7{f66lfc~!vQ#!=NgEYRKw#C
zf2%<1nG?<tN<eKpMVLM~-Zb~YI@?4+D{vAyhmQjqeo05Dt(C>rxu$9Y$$@jd0j6#!
zyg5jj;$#jraTZ`_ngy7<72c=xpwJYr&SWfquo<O5?I6x8H6NKXGGrC#ht$QXX`Yk6
zDK)h$u_oU;VJ5R%Bw9X=c^2IQ%WDGqAYW<hZ!!@+{s`YI5^-^I<{7W-4z@{4z2hPc
z<7v{+ean~<rkKMK&Mw!1nPcwJT<&EDG&~onR|wj|`gxZ#<#tf~UH3S&0n^}P%{lVe
zq*U-ND#F7G{iNmo0-x<NkK%E*_^wg+%&mhrd!J}g$qLo^qJZ-y5tf!3@vtmNO=&c8
zKdb`}Pb<JAbKJu+U_6u2o#@&Fh|I~%lTSDjUldwxIEkqdr>)w`R7qL4)#kO`;lp_g
zS5L?sjD9d@Sn6AHZ{E=2sLn~^+_P}gi;`jiEzx>g_rsBP{p<E$khbt5RpAbuiR23f
zTf>ick7$oO;wqB8P3(Q|N!HD{jNGGVJJkT?bUnD(YlimkalWyEUXn9JT1c$cdK)!n
zu~A4L!}S)sX10=u%Og9j`{cn*8B?^oq=ZV%6kC0lS)v`vPaWr#Lzq5-n5)(ALr|i=
z*`sj+*ziq@xEkHlOr2-m**DT~KBi`O@)VHlBsbg1WkICAKapnNSC22WcQifV*v(dk
zA}LZ~WU8WxZH1AUv-rYVZ;H>xFuL<ZOE^;kMw9D&Fx{vUZh00Wv8M+jH+XMrr~6O`
zAOe$RKHRmcqd+GwgT{Gd@APs3-<6ua#1}my4aaHl0o+ApyKE45qCp&xB{yt({-FLQ
z@(pgnG>uuu-9?d?s0Tyx+;~x2KkZNEsS060R*c{+4$LNJ4$PQH4J@%@nonyDPD6eP
z?yiiJ86K~ofmo^xhB8vWg<7O3zKZFrKZ%5>`$Tywleh2(Hbq0%Gy_db>4BDF4#I$D
zHD@Ufpp44itigvZ{DF_Kh1r=F$|ChY4q9ke3lsbnE>a8U`7P8R0+umDLM`Hcd~{kw
z-W#Wp^QxFo7LoS|S)d=Wh?~6MgP4RU*?P_1<)-Vnh!19mqf-2KI~DI6{?W@qJf8JO
z^;S4u_E6D^jltROo(T}0bbme`H2cG!nwAOvq-HQrx5}w@nf7UwV_tOJ!l5ttI~5dS
zJGInHaBIfjsnw({JXN6R4t;H%?bMfu^74VUS(r#JU8%U@kNF8rB+s>|5##vro@~h7
za>8CdaSZVdUU|R|hQUq@F_^(qUiu#EbsbG>u|)fuyn;k)e)>0zE|BNk;>YOI!j=Ac
z{ktoye+Q_4A2AQ;19~rJhV|4!x`XO$ZyC@`<jV}K?Z9=D_a96?fOS(-ep7lhX@33M
zDtC%<+-2E&n_;ag>u*rR<T|$fIDt?96csDb<DBFccov!d5iu-`4u}W(d%^nxJWewW
zr%*mvu*uhL^TEZus<5K9?-}%@-=VlVFcsxY8q7uQYS8G`y7ZG4F~(tguTOI?ro;RF
z^@5)fRnIc|hnV(&$n9_#%%aO(eMqXI@sR3w>O=&2dh<hZj4a3|?^%o5;H}mIhWk%d
zR)a6IC&0*G@}(YWc$R67rp|@iyaeqUcK<ghBJ}E>Yxlp11pSq@^?_efW?q+upk?j-
zy;z0!754tip2D*K@*VB{M=N$(#@@e}F$DJhbc&Jf#{rQ^`;0JSMj#B6e(PF>3BTJ(
zu9_vc?~aSx_aS@muERds%R05z@kp}M6@c{RmZn<(0}1UoU%_#C;sZ#_REi&0z9W6{
z?f&r_TQ6VYwni~`k|RI@b~fm@%1ISI4uV$L@qXn4Yy+=XgHOP$G4~>68DGYSBRj_-
zlV)LJ6oZN5kmEw;`N_!4#|`_OPR8V6%PeDRFSG-yQM$xu1aB?Iw}Bok9RfZ`okK(@
z-~t3@MS6m<{v@!E^le!E@fxp}5PymP=0Uc(@D&jBDK(j6KXBCTv|Ekv9Cgnq68@ge
zch)ngx<8hNu*ltl3pnqQV3(XjE=eyir(De(n<^;R^-Nd(IoHK7TcJ0Q0NjAq0Z!3W
z#aQkOam210ng<H$iu$8%KZPrr(Im)Iw=|=b>uRy!-9{#kj#Fdv##TNXsXxt->1rMM
zSL;Hj<%<`MP4-`_JXD5OVHcm9xC2nh9P!<KHAI&)`%*M4Tf7^<6=JvR$)bsoJ1O2V
zqIce9io)@SQwf6<X;@|m#9n*QlrqSH9589?wRn?6Z#CG|wDEqEM_YXs!#YVe{B|}b
z*Jo8w3?kXV<>FM~$*Tgr8!p2->D?mrYl#W6o0_iBTXT)tMKsOf8(S@x&Z=?=a{7SX
z?fv1L0exinC>&5)L=gDU;>@%vveO!=|7$S#GJk%}n;T%Y7P}FeFAEtHCO&;VS+gvg
z&6B;8r&)d-pc9Oy520<JFBOI?O&@4m5PFxR#}n^AE0$c__G|b)vu|qqHFN!6jM=+@
zMI`(S!v4$b@7M|qn9wXiXy_#Uu;o*QhcTVLMAF+U1Gxwvw)L`qfzeIg4OA4adwehN
zYChZgcg}?U0~l4Ac+-tHA6UyPVH~_aWaJND)T>)E5SRq94%61*nGPwq`0QhQ!hgLj
zT;&VsplMfQB<5;YBlU+UyX(Kg?zs#rn!3&2PK&03cbGuG`-k`nv<OO`?A1@Fi)axx
zWpaT$caNqXE}u&$l#6Gp?*Ex;I|oe$$cE?cm~Zh9p}q7hlI?s#3WkXis>kOj#=m`)
z3KOUOJu&L~?}<uYo}zIoFj)Z+DqkaB6*Px#^3`}Z_St2*?G04jWjXCV$cQXXijQ%s
zUdl~tXOIQ_9T+yY4wfkzSv<PDU$iI4DVmaG;~(1Vc9DD7I!#Z%w(kXeh~5pVd@X)7
zW((6kMh!O5JKsQPI<|GAOwWRrgn)?cRO)W-DP*8BwSSZGMNV75SHASE6V%C4^^H>`
zQ`}ac<5V<DW1BvjD#YJa^Y)*?MU*9OC98Kgu$2CLxF4El8W1_DB@z_@mq`5sq-i0`
z6j8C=OHZQ!y5uasSekw+ET6M~?jz3R<ILN5J;e!RC>uQLwKz*Y7*wstM3!zX<ajPp
ze;Duw$L`;8(2eg;oCr?U32bxnSH+SaRYiW?9IJTKUhHzMldwmBC$ON5@nAtzgCK!d
zPG0JDFGJ86%<(a}VoRj{H_Dx?bW%Sf`lsN~pMgD(a#~I>Eh|w)%_o$A-rtL#7E89r
zl5fViCSK7R%Wul?Ggg;NS<0mbJC)lb^)J(^ebQKB8Di1WGADU?JN|n-em%@6Yq>Yk
zgfjs$A4V6(R&0spw*+t?LrV8#>Z3cmLviC5Sdg<qim>9;HIBMYqpopY41b_cOSjvk
zf8M}MX}L7yoB`Cz&`m0pSOxb@r-}o+RF*LpmzefBv0`8+;fQmGOLZK18tMVRnMgmH
zp@}Fi56pjS+iwIf<m0xao@fe&7U_2q`82Gxk%kY!wdpb;?hqeD7q#-q-?d;ju|BZq
zeapSujrhn^lo+fXqWQdETve7noeB&O(9VC*)$>czlRDvVd6e6g>M{?e>XB3)!T_pU
z3wc_h?G_l=AN(8!_G5N)VBdO{6fHia#oQlFJi~GC*J)sQB-x3R!uSL<y|M|j(tcxx
z@xK#jX~wgxKI%@LR6_T#BkPU&!UvTU2Oz#4$4O#5(=w(^xtqrfl!n9515j0>IGvTa
zzwrn;$y25{$=@un$WtcL)U-&ynT_^2^A`W-PUUNv*~AYWhLoHFH!@(0^dc=YR}rH>
zf@}RyUdzn&7OERbWNQ%VB=47IhPHIBN*ijOpRN3`#p39Hq2JHN_h>$0AdKGk%ADb+
zmGQE6Vi~9UrvY?L;Gd<Srs)Io;_Hyuj#)+fZgD$53-?mZ?$8@nD{90m{CK4xgmC?t
z%QSNdZK*xX`@L2@Ickt`n;8~?D`%aVW|KQ{DBdnwD5{C{#fEspI|ZcG_L#S|HWW>o
zN2l)5m}05NZG`TC<L>Q|H}=Af6UXLn8_SiMZ-W+VTB95cb0d<2;W&@8aB$qw9}GO8
zguHE-*7;>Cw?Rk`r^#6A9-A5ev{&aizC|z$QDB}Wp`(q579MZc3<f8`p{nSWE9kXr
z2e#hvqA2d;bIIm?9JjF6*LzRlF6W+Noj+JFDa}9jEcA|Gq~Z!^Frz+x5G$ZFM~q8@
zyTFBkzOr{<zKs4<DSAl=!wH?LVR@)SqDcJ)zEP{KpD!NdYMm<9_=ku(R@{xYIv?ZY
zDdsDVaVmrJgq(B5yR^zW$C*pu-*%3ZxLi}#>0EINQJu~e&m$t}kDcqlr{&l-26$!P
z&u!oPJ_NYHenpvxSh=ciK@#HG1OJeX!#3NnEu%^m%V(K>&+b&Co!7=VF-)CR8XG>Q
zEIx*p6XS<qew7?k+UaG;=&pIZ3>iJ3q{@F8GMbdn0ywLxq89&MF$agYZBGULui}Y6
zEK<{lMRU2=(7w3q`m@H|j`$*gfp5u0f}Kq3g{k#VAk!Yyp12e{cI<VAi|OOku{Zm^
zswg<GW>_{qy5~5&ZCwU`3*M?Zht9R%JvP$t88Z=617p~j+hP@H>Ucs)6r1DHKat3B
z700E4{ca&Wi&ZSwN$DN}@;lhaF)5%q`2=T+s=}m=j8(L7t{ThV#Jo7J1094;Cs9Q2
zf{}~pAi<LYItX1&I_-Qz2OWqHbZ{(tf?<jBwZtl#y;G;qN`9LtV`$8~P6QAtxhx=p
znS0uq111nVO~`84=-{%lNI#mfCNgvoJXd)mcy1QE*yiV~D1LqxhHp*eurye^;1pUc
zO}o$11aI=4;r}sRyva8~7&Ls~8>Yq>oV8q3%3tdwKaTu*X{_Qci9rp9ZKV4V%3C6+
zgt-xiTCj&a2!FF0>qw(?oFXL0K|MJ`ew%4Utx7$<KXM15SNJ@!3_{H0r8K&e+EdQf
z<Vjo63gRxx9fkG;{>$qLu52C@tNbK>G)C3ju4Y}BpCYbzgmpmuJqq&vLhZY}wCW^0
z9j2>bi=Fv2>%?NGe98TnbmdFz)L!=YSd(jiEZS=9L%=`uFZX@|GvlP>6S~>BhH0gC
z3*LDwJE^N+2$sgMABrx0w`Uu7Bt|&bvtF(2al@b4rOM?$ZLvU=ES7)x-`!$4Ig1t5
z!OTD5Mp>s%SgpvfmqjZ!Mdm-Jqnn1ev<tLapA*Qg7RtTjwMZ@8gW)gI4-(uKKUPmR
zCO?)w<m;@D_3z)dJ}+rUNV&DAM5z)a2Rn;HK>NnU!2*p<USwXjQ6O}t)t%PITO*2*
zU6&FuY74XLVlS;_)@2RrGOvzxaW2d93${N0A9rs8A7yp@|0j?@wBQ5<jV+e3O*JWM
zLzS8s&_oD4QztfcrFBcKQd&1!5;sJmnF!-BEw$R()>gE(wXL<bDj+HZ0t9WXxFNPK
zxblpH23K(9_x_ywJTntO+wb@L^7Z<^{&^|$Ecdy~x#ynko^$S9ZpybsK3+dS^g}=R
zwVRPZKX?nnqVxk)16w&SMHb#?i)WxKh64~6W&mLAV7?guft)Ocm@GLsNWjXW;%(Cj
ze3YVBCe0%Kb(RJuD?2T(^WE~u6|ZP=m%ls^VbP?db>2g-qF0s7*Fqdp5*apxU;#EO
zi7voe{<zn<@k5sB2bytov4Qd84PM>55+`VPke7~ss;n9LW3T3FWMoWeMBl#aH%drI
zE{_+#qeaxCe;(3InS?Jt^#Te1%(H@tiY3ogIMpDgsZhet(Q_}io}`02dHj${e}oAn
zE{q?-(8Iy2P~fr**{z*CZbal?!p|B8CfTqIsedbnGjo1;z2-SLY0Qe&A1bctoY|i;
z&ucDao?FlBM=yg(9@U>-eg}GaD6Q$_-{o7~sih?MEu@g_hHBCEqmZxLB>z<K)tbAL
zi!uLZS1~oXxq4GPeaT)Wf?sd4WG~XP?yu`Lz|w6xcRf723ZTxOKr*TJ(I`qHE~h|0
zk4H@EXU-*mdGc+)_Iu{EH+v7qY_`O!-}YblE~4p5y?@T8Z<~KP)P@S%{-Vg_BJXZ&
z@%o!>J0#Gt8zpfV9qaXqdosuHPUS){j>>^?WS7t;i$DGN%Vq(_y^U%H{y+bL|9CoN
z6zLDPYW_?+BWXn3<)CZ3&{DAbL-%EaZbP_)TDfRk>=@A7qzg!w;Eldl80Lq*3UM1j
zpLM)xiw;CgZq4fhyXMQ4!OoMO&VVtFoAk;e3G=KMcQUQxU+qu{@gtnE{DJZG#NHZ_
z%La)X#*aiVb;fY`1w0L2CVd;Wq)4ety*j`B39tGo98Ro~=JZ78kI*a0z_396JC63D
zq&lVfRe$$sIobzN40hK0VT0)@VZ(KgM7l;XhZ&;l3+;`ZaonY(E18S*e12?co}Uq^
z3?BRfF(tqQoy3!FsIW9FJK<-Cz|UTC{OrP+AVT@s|NmG|{?r}U)73odzn;Df_4v5;
zbPVtRr|W49K_&iuEax13ELcy>m;MI#r%#gSBqF%=gnu9s+>r$<2vHyu3QQu1SHBuc
z1nXH6MxC~DahKRXENGjFZZYQ$5g9i8P%WDMEVCvO!MpM53?i3waAd9((4^Bh>y*DN
zJR^d5_eR@c^o8vPWUut}&9I~8ua+niL%81fSgdG8F)Ya)G*h?kf|oh*CSLrSQAGVi
zxSkA0J6Qxn&&=_aj}QPtx$<88EcFL{gRj6+Rv<fAW*9Vh<0j>qVZ6_sygC@r=>$^@
zH;V5w`gQ|;NPZopwBLN>GE`tMBpX!0d{o6#0l#=IUj4ea{9PpK6_Etd+W`9E#r(iu
z-*OW53L>X)>y%LavP_2B`zhB~I%+YH!6RK)^NoTI)0d1uJwZMEI`&s7Ic#8H?<po>
z4mMT)F*XC@b#FxyC#$JspAaJXa=dy=c0amqcI;h|t@%#cgeRGfIh4|Z%w6ZVRdK8=
z`pt9-P_7L9ru_k?%434RTAqC%zh6Np<9N8x>L@4Qw|Q`UR57o*F9#p?=C-pJo|Cf|
zVk=a${p^L$w?E(&e`q?9X8vh6^KZ9tj>-k)m>L}Qd)puW?O@{L9QZBjH@G=@9w^vP
zs&M*1Xg74t<?V(XOMfa8p8b*0$;F`^(BLk!4$<I<8c@J%5ngQNve}3+a8>3~lR7$>
zJzPy*&3wSEJ}_G{H6r)*dXY)JGWmtIWp9H$o_;>)^DdObq{w8i|Jl<;Gt6Bd8m|47
zaXj%)G?J^$Fzn}SKDNCA4`Fa5Fd51F?9=Z1RsO4@DSS2PJBE_XtGx+%^x*B~)BeH>
z!~yw~3;<g7CZFbz5&JbU?Fgw}GUM#muNfYWcZD4uE*6Qv;lfLg+f5p2YWL*Vkh=DQ
ztwj<?QPQ$16rRJj7h&ZW|A{bQv*-0YhfO&88Y%`j3gWhn4c0B2m0m9v_M)QXk&30(
z@rB6&F6SPWYa=>dU8enUHk1#C;JnD{qHwHw6Gsd)|FRgT$b}^G)Ki{pf%STumAvq1
zHsjl$6^^YIw*5oWmdH1VCJjA@Xn$%A5Un{mOL&G=pg)qi^%SdHNqp^X^YeXrAkuXN
z$dDnGdjY?VEeq)oSz^7My?HD&g{&{m-fS8Q;#5O<=7vzd1#pdj#O9ljzrQ<%?rUY4
zUvd+)pIn*w5-sKI&XBLr?p%UVIy)cydu|5!ckvkvsKB1Ell(!W^6v}xyB3^ACqE`0
zr{-H4#D5o$a~yR!`NYCCbM(UTw?Yz1WS2y4&-vdCuZ-8d?y{=ImI*ER>-O$u<oX-s
z<y<cU{AujCRK6S1ahp@;a#;9OG?^FMhh=8-1pL4eS`XUJi?JQecEzk$y?xhvV|+B&
zynn`Y@-cctIzKFK!*EsOq1t%$V?U?q=<1=G&7IZ7?F&uPi;<IY=mI?GB3<%53;i_I
zivO)sS?siZjmB*qVJqZo^4EZDa(ZU1wnD~HC|-9+Wu!|PGRzQ7a%LfBv%5)dakM~$
z4#8f|+UDLdkPmG<)Wi2}E8WNwN<7@SdQuIRyi2S6<~P|3dqec9KN1JT4x$Is*|yFT
zrBi$5nNjKn$sZreJj7>#|9O9V8{VQ(V>9n<A1@Cj#NMWT03ylo-#FkcE|DrYr8rjo
zzU*zp%lG+*LJ~3SDPCUX;I<Yo|4t;aT%St4O;0ZKs^29(it{D(%(Pt1VWF1$koQoA
zBrYY6?hhFp85S?4sPx?B(sRkT^OE#j>je@DFN@Kf*9D3$4i2mRyH93cw0M&U!>S<E
z{!!0`*khIJA7XEO6pHe4!FuVhf2!e4vS;3sy^#fY<7^h-J4R9Z=|^t=r_krfBgd$R
zqfx)TpIWm0v1O)oaAurA)rchS2lV8h`26n%u<5@sb1g7o{FVZWQx-ujzl;XW6U?iJ
z%R~~>G+9h;G<URO=W00JBwr7!RC^VbAT~_e%!3*|i0VOd5@VIG3T`k`ND%{5rGKYg
z+nLuA+dkeslDLVd&WGYDM8UezkuIflEbPt8lwkZ1_fB$Sk_QR>i3~dd`I6SOgS`5U
z$fYHf{6m5kL(5oll7)&uM4`!p1iu_q1R{T8w_81h9>N?XB;t*fvbkV$F`%DX{+6MG
zF>S$_hcUICzI}Lp#E#c8QlYXHnSB|I$y3JrH@iF>nF02+gXWMA(6_hw?JvU#R<eL}
zQ^5i7nT^~qPX*r%$^L5U{4s3Z_T8uPXfj7ok^hI2LV;mbcF%zG%e$gVcvb4Z%WKto
z%rY(1KL1?^K^`Bi{NmVEmHKZ$KY73tNtFpGGRM~*Hnwu3v+sFJ7M=6&h5FA+YJ*mW
z4yTxFH6QKDvYN-abrf#m$Er~nU3sWT6hor3Cvihp;jEAmKaTto$ES*i5SR(9b&*}@
zUE+_z%DV+K{<<~G6|s96zy3z$Xni!K>yd?KR6d&g%6;8);mnIpD4v>ZDdsG?6A1Y7
z%I3BVjHOPf06pY?*}}!56QZ%+NiqwNzq2K31afr)>>=?(+NN+7<*L}feEdeOr4y=n
zSsakjvni1yqw0N`U)(2$i=&OFsJplnKifVs^E*|a<FBIpvATam5-A?XWKOC3*Q}T0
z`(kr=EIx*vi4WSI+}wFiS#kSgsdm`R@y2F8gXM+^F+Q{8H{vEmv~RwEw|XheJ}&w1
zq4sJhuL4GcX2XJ2n<D6C{;MZy<p+zVM*Zlz5O9Kb0so046b~8jARAW5h;Fw}85Q)?
zI7*~zuJ{H61x<`2(M#QkBQ>BXps{H7-}W!GPmx>oJ_?ByG41sRg0i|=Yo8%+VIsK@
zg~HZk!Gg5)q;6Qt_2PxQ(Q-DS{?(fL7MI7~ag}A_RC>k`Vaq(otXCfegrHp6wue=O
zw?H9tuHL4{e@1Hc4_G(BepM*ABQJZ&*8<_gl1`AHQ-O8yR76&@>VN9cTdkPHa;<`{
zw*wXK5f~aqsnznr`%uS$U$vsS`X6Rymp{5G)j!Y|;aUkrXLPDPj!GZ0f211BU`>~q
z2MgzHLK4rbtQ2m;bRxs{izolyoP1yR@w)rkkI!V7Hj|%NLzIXKBkgF`HtLyKW+ACH
zHw2_vaqgT-o}Rufd4YuJwlNZhWj<#4{~bRQ;eb+u!rnODHO4%)|4z<1WxTM62rzE2
zP^v1<Kd5ZuDlL8UxJu=Ci>h0Y&F%%5skyH*ejxuzd1|RqOL3G6>WHfGcyb6;fFXPy
zNAOh@xej%m2a*PKnNFfxX7KZ(zF74GR<L+gXg^>_SjW*e7qy)UEb(K6+dCnB6~EqQ
zob}uPEdk%GX-_$I@;*uVvQIzg)X9JPtBwn#d?AZy#vpMRm_=<r#B6{mYC-U3mC}aE
zP(l<7CbTwsoc&ptZ-uzpvdm9Fw)x(YP~MlBy#Rvfl?|t8bt2IU2*~ILJH!*#l87H_
zecsQ>@d=C9RU-eKyli@DZ+JrTo|#7jKPRcFrS$0|%o!^(=jdg6{Da_z|D7qheQA@w
zwIy%qR$BUrAKsX5|4FSH=4x3jBhN_&x@ZtAJWxkBhB(Rw$uY;QKL2>6Dv2a4OEUK&
zX0oaJlK&utqSr;`^fnD)ewjYjK#@}C^9ys^=Xf!<jeh%`TGpfG98vC1(+%@P=u!~<
z(~h6@$`;#G_b+BD9hdCM?iR31YdQ;D`z*kcq&EH|#{m58TKfy1s<pvtz?%Bt4z=6o
z|CXK_U2^ad1adGUDcbg6B410Hjechk8eVBI5m89YGME7`FwQ{gj}|-M0aCv`;Xgs@
zjbmNkkFp*IkUCs9{UN3F-wsmF4irvSw!?3rivI#~KmYoUkZTWGf8h2Iay_~Na%a)=
zJUi?ce})|rAf`_}!dspl<Sl<5>az(J{EWZ<H^o`8>olhhscd9*$m$YH_Th*o+W>p?
z@7mcay{xApevJN5W=Qh>qhyR?+XY{tmPRAZ+44YM=v<C<@#W$+$aynD8RY74?~x6$
zeV2Q?^m<#?dY>PJn>8dH)Ew0L&Jpbon&W=w>N2Y1I}T7s7A>c}3h81NaT<m5iQOez
zw2VRZ<|J;8#ZPH3lI6?;9#%Al0B+Qx2KfE9y|#nLo4})U(-cH87&z@oWntqY$)c)W
z>#PNN*-Szi5?mEYAXj)xLJ`jlzWbN5l!7gLWq8mQaPf-BRo&j`vnwl<G2FK4OH_)j
z;74@9TrHELK!0nrl<)4p!ZjAJP76Ar6hU!zVgbMzm9;w)((H8V)#6WxV}oQtc5+(d
z+>%P<ut~*J1r5n+lLkU;Va+--7<p}ylh*`232c+{0-HW?B3mQrUnI2E;SIwVRVs|s
zR=V%cCACeeviG6frg*cQ+~yzsA}Tn!lnCP}^KzMV;~1=mXMW9?**`@6;NHn=Vu*EX
z$fvunFN)>FwWpl8wk~x%;#$)X*^&^1OeVuDFyd3ckhr!F&2H|jvb>B<@TUgs0;O~5
zV5IAdyb8s&I9+d0lm5h`S-!xyp<lNmUD}m#_JT>o89*^6AR9P&tyeM<vL1GW)qaV-
zuakP$m4rVuCts45RvSy7igbi+7&9U>XL9aRs?=YBP@P|(Oi76`V<q~mDOx?DrbyF=
zXvI81r{*$Wi<r8|uw%=nCMReLnYg(Vm^-FSGK0z2c+qM8m@*1QBZ+n~Z%R~AVKK&Q
z&a1Xrq@1R1>d!QFy!LN-UQ0i&rGGXO%@ScEe@H!<_KM7JxD4ciPUXvZAe5_taot+U
z)tmDZ)laTwqA=eS%GIY|tJ#RtF`Fpl>fA&<Lw6!w*YhfzC~@v`&eJIKe;L<m=Putj
z^&>A=(`^d~xtzYrUjge*pJs_^W~N^Czexr0x=N(l`Aif#tz={YPqHzI`p?D>saw$<
z0Lird)NJU$hfZ08bu!s|QHA-->O0`aOi7GJnufkiDUev$&kI|wSBsye#q2!cQx12`
zB+zv{yT6%UWC!o=tqwb8+deR%G5*itic;8X*v3hZgFw?^q!s_i;_1bt!BG5u`n1~2
zdPVdBC?}&sDx4t+1mrpouFO1oA<XxnIjXH;+rvVAPvr{mKcg*aw?dTPe!JC?Ifl13
zEz}~fuzpiwf|Hlid)akiD`zdIojxXP@~Vj*eSTnOZ-$Bt^!HPdfxi8<koon78K8T!
zyZe`XmBB+ACn7HR%aD`&@97C3k78Z%)7)e?22zH98t+y30_qC*j=jp!Z0Q$U3yhmk
zdME=#9R1`7fdV5JHWpPLs@(kv`#}YS9R&psw*Is|8q)TW+tc<QBq`B$QT3umCkAor
zEIj}|iUm88AjgK0cI5msK=@@27a$e)>NEZbXnB(m+?@RR3DWmQ2xTFCZ~Vzl==(?t
z<>~vm&-^dxJ3xoNzY?#0rR_h^_t8ZDM!NQ<*&XQn&2%u*wI8oSvXQ6nXL$c5eLrgF
z^u1Ew??B%nNtk=<&p>`m;PHnyT5iHZ<2cHh_WSa5UAEwmq#KQQguQX@AV=A`J)sQ#
zJSm{<jdLe$N8DkuA#tY|)E%p)qwboCLh>$l40)eJszOKJA9zaS{WwsIu$lrEj8&w4
z@61ggSg>xy6g_+v%x?1Wd{}0a9m#v=Rho<#J=<ZPitN>Oyb9+@q7}3gK77w4t!~}r
z!~=CB$A?|NyNG+flFk}^hfs>jn#H2}Y5!tsh}UUKy5@ug9w3MnqK4XrrU?B0j%-(p
zH0CKh#K0*0SSo?S+e@S!mBrV8XCyv=SJ(rG3|&J-=o8p^wxjdSqVvn3^Zooh{Dnvl
zA#I1i_xRT!Q^$(29xG%qJ)<`Bgwb{jt2CNzv|YChtN>P-AN`lKec8v-_7jY@YZe@B
z?>8Tr4dPQDL)*zv@swzlS!bc|*(uQX&e%@r`*nmui)5bxOZA@-k-t(;#C+vS^=EGK
z^nJE;MhgFox(eyLVs(^$Lgv-j(r^iq)+9$Y$!?Mh6vkWmtw6u7$mu5z-=9*Pa6LIu
zXZ6MR-4Y+u7q9MXdjZdA?1N8XOo^vwTi#>5r15RW=vH6)%OBx9eVEIByu>}0tT|GT
z#~6P5{sab*&j()U!8)ISS2S($PkyqCx2?4}-@H8N^xZKJyG|3|D#L&|4G$!Qv^hNl
zwdC<wY8Y|*PM=H?$b>Mqs{Jj?DMx@%Tw4V=V8Pui?o_9UmXU`Yj(#Sp*uGwRR=NKs
zeFU^iI{>49C^vHG>pZKX*p&Kwe^l22{p-3%b;Z~hPP5Zbh~@6FUY54_*qKGy>%HWf
z?2GZ_x68AnTXy*+E#%-M>>DS8`2P$0M*C6pcn7V$k5%0JvG$Ge;Fl~8*W4swCeYgF
z>IQJ_+`jP(fb(By>2EXw(CzFS4;oxI`unQ^vv2HU?e_Vr=xNTrkr&l0_6?1+0Uz;x
zHV$^8IN2>H?Q>z9H&7u>!K~Riv(qTK6zscJ9zNcfwP>+u@<Nc*e4j}|Uz(W$Q3-MD
z*lPEs1Luiu&tm@6L(uJ2wL<oTBEKV0w0TH-+5I)22-jG6A>UQ-GNPGy6K(spK0#8l
zTij31mX&%VRrw!$MsUWS)2u~~G?X_XRU<n$Sn@0Rb*o-byI;Lz#$at{Z}j&%Qe3R5
zDAUBf=0e6(YW*6uYBn!u>PH{ku1k1E0NIVyvz=`3T?kN}pH&#PrTsnatL0}TrU=9J
z*%5jBC(f?g6@5g->dHA!5Hv(H-{_BFo5_RCM%lBEBz_5awb%M52wg0Bimn^MO&u@f
zZ{%45-IA;lur#8|>&HKuWEv6&A&h93X_w8Nlgo<r3e~eLHu@`0M(yk^#~?n?Uv#4)
zF3%;1^s|Z)-3#~yTaSMhsWw*rRQo1%>g2~A;4jPpKGL;8Gf$Eux>BgS7(V7kGeInY
z8H;Le)oYnB5_fWe_pBHND4}hz_mIdnmPLAl)>0m?A3N9v^L?6;@#a$y%SmO$_R<c*
zVUrOyS?=L<#$JX7^wJG9hM(Jk^ASC;RPvl=t3s5D`gfu_2&E)4e1o;GP0B|NVf~BO
z2?_fABY8}JD@>YbOp+NOd3;f(2#gWlA{Bjq?tr4q5xM()zNPzbh<_?0fzuqf`1M;p
zWUZcD*+O8{m%pVWd|tOq8_yW3`_~bX=+c;+&dVjs1_PM%*xhg<hw+W%k$;%v5&H8S
zObftteI{(7G1Xqi!2BoP&bM&yjx8LPYhm$KwD7QN;g(~pg_W(eu-?!;0l?YY-<ElT
zInU__oDXB!O#J5$56RZ5?AIOrT9s`LE;0-FG<;YeKGe2E`0&{9;``ynsbR4+#e`bL
zkGX-~$3V3jUlRp{tpPbD><aUNwBpuQ>_0lry{ys8hTO{rd%1|0!ll{Nl)XiDx2sOk
zBmb9J*t5@lWWAScUoYG3OtCQ6)qN5LGRO1C$>kbE!LICOn=aI1fG3w1(Az|%w(Q4r
zcU<}nXyfu-k&+iqn!-^+c$#BT^9N6MOv~#{RCWUm{o&qjx~`EXPDW2u-o{fzqbhH4
zPQ{0C%=rz~S@hi)T>!T~m`KDiDnj03cJ<t+GDt?f<Q>*RbKRSf#A)^crwDRXR9?Ih
zZEC=fkNmE)_Bv&!dAjWzlHq%stZPbx0{XJrw)wY6n@+ovpLD|YKgMfbnu;hBnfEi{
zS}I|uJyF=|q(181>OX%!bgb^__TQTY1$|^;(;!(;9z(`R-cq@kmbArqb3<Esa>d%0
zIe2dFag|>*kF+uOT^X<Z;+So7AG^7{Ey9=WC7g*s!lt=xm7{M)l{%4jn<9&v1`vGy
zFfYPG-zEZzwcMw!wx5aEEDwN1#Q9&WVxl{aXwDJ4Fj)`ZT+;S=G^*o^Fmz2Q=AVXQ
zo&MOUe-76__12llqPEI<x;CCD@VQr4UZs8m<oUs`n&Yv0k?6jNZSIBbrOvdi-X1vS
z?VF=*Lr589r*P1pgE*9v4W;8`c$-(bna|=tcRjz6MXvK47g{-5sZ?4U=H65J25%bX
z-chM6+E59mU@C)DoWk6Bm9OhFiXWh7Z`;kp$^&`vrmh#oD-W8Rs8pyON9@h*o_n1^
zW$r)X91-U8ROW^jen~n%uTs7O4CGq(vy~rJD&|J1DL|}fE<`ReEMcAX6HqNuLa1AG
zAyN4rzd~>AgX@pWvfwh(wLjtkg;kjqQv8|zC{z(#5+~}q8tHl#GM>ve0Z=qBRUOTL
zRh6sy5?5Hgw?Z}fLbZie8$`Nfo}p?1)k-(yOM8W-4X|AsTxt0$+88xk>PSoet0}=H
zDWxbZfL^H5%Ub!zFM_~vyqVwt*=p5-ovsDi!AlPu6|9I^Fn7#5^m($pzegtCY%*E<
zm*J7Qn|4j$^GBB9o83@Ey#k%85j2xtg;mpR)wJwb4_?0YBd%9PLsgMGE(ke*dIU_>
zBlsr0Dy!$WuAb-){b?b%d&G%5tw-=yJ(_}~S8Mf*clFflSWgZ0M0Q$_W<vF7a+02B
z^*knXS+b>J$9ftxoMBY9qJzN|j!Sc+dNf^04|3enVs(9&x^}3ip#`OEwyFR=-KvMv
zc7TuhG)?zXx7e(SfMFqEdvywPsBOaO5V60Dpe@j(f*c+E>Mv3~gnh($MkxHZVXlgb
z3J$VT4G0)Dmk)pjo`+Xy@~o<O`lf&b+)>%8o@*C72qBgvlfK39g*?8IVMkSnU(B~w
z0^(203)C!s>P5F*frC%}Oq`770va9}HbGf{X@EFo+tT&#qKJ6sbpJ10<?I!&BCc)m
zM@p2gd$qmKM(ic8vrf{&VrqcD6#1VH#s*DmRZHq@Fiyr8XNq&qLEXW3>YQ@~nBXxW
zv5?#Jh}Zhn9~%!9@Z@PE5WqNdTxAPk|J+XzNHE|PB_}ixP^}JV(Y5j`IyFtdd{Ubu
zZ6aRwvj3QySeg>L#!Cia>@EJmCXFDZYOdGwNt_fV8>~8ZH($wlK1CUungu9r!?E8R
zFWh>`iQ0T<22hs2jL5D#9Ert(&Yr@cL&P<mm{1^ClU)=HRhyTsypSm=kgjw3L(W_Q
zT-Aoh7Rs3Qir0WV3tIj~J@Q+9HPSU5sU_Rt<d&SD@hHvwT8$$#m_@CMHD2`^RiRh@
zSEwTMWyFs8oXMKmz(uZoifBPIHCo>+=c3FKP9Mm<FPJki4D~3SPhN?(4AUcCiN0i7
z?Ar!=Ai_nNim=|nI@O?{KA6{b)d_FKid0>Zs{7qL>dHi(eC&531>tuh2bp868~To#
zL}>V2bjQY5Rye-;5#PN;(VfIYCm!U5^X>L3GC${r{mA+eF?%n+&HU@K%$KbaY0%ZY
z$Nq$YF8FpzW)?TWJk^ib#ns;sNlea{@iN0yM)hZATKhGT#54H@YBPTb+rQ7fk46#~
z<lk3i8r}N>xR7{}yKdm8KP{1Hcu}E?%-7$}t$)&U>^%o{;x9l1g=H1jC@g&!pib<4
z@CSC9<F{&s)a+Mh_U1d@dYAF`?y=bDe){E5ndr%W*69Hp0PCbkKlYQ5-W<V|=$_HL
zfWXB%H^~^_{{}%*DcP}RKBblV6A7NSQj=UsG;P$1l{`#~q0*5%8+4SHx+lOMClLp!
zSpVr?T!Vsu_r81Wzt`Yu`F24qKMSkd-Cgu%_YDS-9jd+!rU#ktu-ep(cQMz>DdP1n
z@tcxY9d?Xsz5*0#@+y`8g1WzBm(0y4Z#@h1&1ee>-N6iDmu=D_il?SRnb;NILG{q`
z8Pk2`q<ycP8kOc*{j}C$2lJ^gsz~Aon!y5<0(a7}$YW?H{%Oo$Gr9U>sZ*%fbYZ5`
zOR>WYqg&+_`*62zEs<-fE`sbQ8-S7mToPN#mHxV=UPQAMh5G_Iz2eC>@=6rw6`ON<
z#c|>qeAytS;({>(j+Bbck$G2&)k_`QUZeQ&QeniC?SEEI7HxAk6KWEq5CeJCHQun%
z&k>W%?gs7)@f~WifJYL$Z{QbM_c#2oAQtljjQH+q!x$c|;Z|fzg}eFXkywH&70}Gf
z>oFoB`Ci5gwX>OD{h92i2O?1VqhY;M*$#ervy~r_BQ8UVMj@5gqO9Ugk>3LbA)Kjm
zq3{@$idQ3%Mr1LR9Pfu6?jSibiRF-vTQ5MM86pXZlrBh)<YSy?gHt8lR*91v&$LQD
zN^bm)s@Xwq6hu_dTC2y&jrY2Gb}Bapa@wa=S)rWf2lZ^QdYs&NysPIU<i^L(-&ubI
zRrN=}4&}yI#9=>PZoE>J74}C{qIxtRg>oa3%ty$LlY{<jFE>VI-mB4CTGvl#jIylm
z*S>s<=DUXe2I6A8vcbO}SwTjD{<31pg@LRX<1?cFPF9NZTTJ#hNs&D+sw!7iah~7C
zZ!9Ti9-McO7FkaEr!^-R>0-J{`%BWHgve^Ny++S`qJ+J=a6;nMl+t>Q`d6xa&VQOS
zbFl68`EMQyQRXnbJ}1v0L)m%^cR9NyP0gpqcoN@)S8~>fLTPGG=`|lCPhHFBG$naz
z;TJ_Poaj&>Pn~Tu=j167&5xC*Y+KOesrZ-;N>~B`Z1oRV5(rZ>BusVAFNmjS^sQb<
z_7I}hcw-2q-0Z%4^DbCI6D&KVgFvo=4i{x^79Y;TtA51ZZJ|7|2%Bk6p3nhl#_!*m
zq4&rWE&11FnUUPcZd*U%3EpEb#)_AdCy27m%M*9m_btd1aHOz|mw6|w|L5*K@<eaG
zf!fTE!}k}t_sA2c<=<Cjs=2|V`l3>QYaPsO4_Lf^EAt@E2)<?s|Iqo|z1g$ECzC^_
zd35+>xRHE+aw<fo@pg5c;c<w!UZ*@kpYd;iBNNOl382qEjyA=5?>W#h|HNW2fI^p4
z2@?XYJBk7OOCX%C;Z^>*Tx*l1Li%&+kV-4*U#M@Wb2@dY<j3td&0{s%+N~A#F{x2a
zsxBJrf;Fa_GTmr}1^%?9aI_uwn`R&IpZ1#`sCL8r!j(WM+;6JT4IFXj`%PculMv7!
z<4@bUzXL^~_wH#yYm1U@KFa++>k7~uPfty*Hu!KfI{S;936^|=%_bbIKO}(ken#vi
zH^-8Xl3IZfGnv4`2r>z51CX<bX;z}eUhu>J8&Hp@kB5J8j&g64kt$_YOg9s3`ZlK+
zCtt%fnEHzH-+Ei#h^0#&ReUYG8};U6In0us;DG-&mV9l3$UM$grOVBE(|(%MQ!tpH
ziE62sMLSb@J|#h>IxpFbR7ub4;D<*ghpdc|hjlLNNxvQZ#?#-a2~+JgC!fol!16ac
z>GJIOwo<<o^5#cDF9+#fzl8Q;DJDPJROL?wN!*K7{(DHrg5Vyb1wpfq7ub(&R})97
zgNO(lHhn<hAsTFUvA>j_vq5mIdnJ-sWz=kTLu2wG@^2_~;Z1J~u^ba>k<i$uTPs6F
zHQ9;@y=ISzVDcaFMRV$iW?(i<4`piI%r`N1U7v1BkO}u!l$aRiog=zu^uWgSiQCMX
ziNN<RvaA#}rY9FKi_(dSh?>!4=VtW>m)R=6<{)M;HK8HMj*FX-8kkl#5y!T8avFlK
zAKS9oOP82o+x+QS8DWRj+b`sM+t$8APxUbqit&CzE2$*e7nnJPkiQ`E8G7n}C_Os4
zPTM4V7<7)pUl^-@toT7Pwb+hRr9Ed}2k|7Htl{L!mg45*mh5K?pNX=p@Hax_muiTz
zaD1<lI-%`V{#qWB&yDT*8BpIFdGBU;5J3tuy<riu*PW}8mMNAxwx;7^))QWfB!R1+
zQPoDb2M`=rlAOe095aZ8BC2yTa{F<%kvaSe673>gE=9>ouYf(}_G8ey?z$Mg%Y>rz
z*k-QLtj<=tjvKz9j*=H|E_5sVJEuRo)4Kgq&@posIP0aAA~QX+)<078T6p7anbDA+
z6lTN%-2k!|PtNcFlx7C<*kIfLGmyIvd5EMbycwKX%4U?m1_%Y{o!uw6FgRWh)62o}
zbDt04IG4v-0_CiD6^C}bXs#FT)b3+BKxQkmRIq{4v<8QKjy8`ZHUXi|t-HDS^v<o&
znb#4!5;#F$&YN`0d!6qNIKQ-W>%jJQ?PezL+zgM4r*P$EHtRcOs)!^GMS6@SiL>g)
z<F!Euj3m>rbp*fhq!sm2KaowqrjM<U+dXc)`rBn2h$Jc`d=^Zkz0H2v*%`o?xxj;H
zG}I<ixhH#KsY|1|(^c6WhboTTE7m!w91Lk%Qy`vX^e%nodd^~ZLBXf=aDKsWZ3M8k
z8%(I_a<_^iuJA^aRe*>l|B89cWy-xuvCvXe%9ZA0Z!=@sPR$noDyE7Zm~x#MwNb3{
z$4rA|jS(G-r}pEB?_ZxK(cwJ$AwP{>*=PkF*lFEQj#4<(Y>6kWJN^h7loto2W(u@r
zecPB&u1U|R81t~!&F9o!dPbG?K@W!MLF%NkSpCVBwe9Pu99LViNV0tyc0Nci$mxeR
zmLSdKWBCHm6kElXoIE}B+%ph`IxkeF$ugZ*ccBtZ{^4_gzw=U$)RTZ-=hyvAPa;bv
z?Mt{Hkwd4JD=|VoS#o%U-2tq}f;^LrEF)V_wOsVeGwG_CR4d87TPiTFT|xbqmJ_0=
zWz0-YgdPNmk%P3z{ytHm#-%l4TfymdyYgr|MA%@TpSd2WEp6pGj}VzBM~Q7TfPaJC
z<fd^NPImcVxqlnRE`{;+$Eu&?D=Vny(JKP}RZOCY^DF!|>J4!gufM4L*n$&*JMk{a
zpAVb9MhHmK(I{&{0oVT`*e3H2h>64h!p7S_Ep(=A17!;u-^jxEuVUPeSoCDcQBvZF
z->1Cvy=uUp@h1)D89xizaK#AiSeqRZe3GfuV(Ik3pX0gmul}s0hd;LnRp`2<bVbSl
z9Da<r-NwcBfMLwTW8T)o^w1_{@`%EMy>vRNZ<OXFnJ(utmVD4Mraa^_nY*kh)vgqo
zR#rEZgKg;wYY5VPuQeGOw9{R!dZC=gn;7%54qEkVO^oW3kn_Mkn(S-r>(rfUPNL9v
zpxp8Foi+M2ediQ=y2Wn<Yq5}DH4Pk3-MheE_CM|FYkHNurdzMr4iGx7_0nCpaSv9m
zh*Wo3=hT}^=na*<^<BP5-#b6}0?bis{(eIBRO#>CTdOa3;MFx&hn;zq7~!r<7pkB(
zAd+1w>|Y_$lrCLHehR$qATEITv&0OjN9%jLiYW6o9nkIa7P{86?M^+TgPQAYcW$+R
zZQe)Qawxs!J2vVXN+GUA)tK~EM%SQ6I0FECcP)2-RU<Wk5tkaQ#%`H9fPQTBN9_w-
z`ThMJyaku8=RTaV+)P<Tt*TTNT|!0vi5d>&&!c>Zq6GXJdbq4Rh7OrcbUS?id%BaW
z<tx-;$GL=W$?$8_Vr^9D?Rm7QxgQM@s*gQJ-R~{e<2tgSUm#|k>WYRPd;T+gq}cng
z%KO8Mz^2)g?PhESuso1{TL`2DRFVBjUe^>IQ6<hx?0+i19|@N$l;2-xRG*yu{=%m<
zC7+o5-oz~Ut$9-8Po%w^{5~fTRJ`OWzn*IeFUJ}o{y)HP0Ie0m?{Cb`CkMZq_WnfS
zR|@v}Q3#epO7tJv%fW9lx}|%8{$q+5`VR!VQ2$A;6B?q!I8D+HqB$S;zJlY!IphrO
zj(rr9zXsWHGAtoZJHmIc6~XyyM&$5~OP^KVlpyWat*{1Z=TajPV5ES<+2oLoKwk@o
zuB@>@MB?6gP~pF7<e7+Gxr}Y|x9tVc&>u|8EcIK!i9nk9P)oPr^=;cM@hVb@f3Rpx
zs&g&`Kb~yGS!@-ObuV|ipN{^sVW+xJX9oqOuSv+Rwj4Cy$KeL=oX?5DrFJ1RBHPy_
z#5ZLvP=TKIQk^&JLzny8G$44f8(QiovV{4sOQCnV&}jy++g(#}RIcLJl!r+Uc$e5K
zq~RtSBY{el*V(Dhi!*-_!A3tmg%`JX2(tCJ&*dkQz|Re+o`;D8T?VrJRL<N%u7=HC
z(v=>4`-S>Q0r%-s!G$|-<2F`*=VGg4WLQUw>S)F!kCmi_hS^4#gwep29)0J{ls0YC
zRdg=r?XmF0NP6_o1A?^<-npW|Dm;66PWfCF8P*VhJ(jvcXOtM2iQ*2Uh_;9LboKY6
zAWj{v)DcTBox)E)h{sYtZQ+^UVW@4|XP3sB`k}R2{CDkf2-rWj1_EH?msl0+yh2|V
zXTOu%FO}6oxQgtjC}crL(;D@Ufj-E}6U`h3+X>6tF+W1ao)ID^+x*eD=oh0(pTGGM
zu@E#@Nt>nqkHKbJDD7WWJ<|#_kH4y8orUM;;6p2P0UoEw0jf2(rV_Ux>X`#vAn&zR
zwpalfY2eB1r-P{&W^~X^RB>l`V7%Ry)*G+jP3A&)+0^-gdf4X=r2m;y^vu<Zp4t*i
z->doedqpbAs1YDh<^=<Q>tFhAp&5kKzrjkKLzfK!GZ#~8zVuz9+m>o3{L@Sx*np{I
z%R=ExhzXgm!)2LZk7rs5&B<Hsk}W4KOO~H>-{7No@_>h8X)Eg`2WIN{6guXmh9jKZ
ztv>lTvp>#yXRupt7d_~&T+aZ0c^=F{o=*|A8c+VoF8j9{)IqE3ytI||lEoPXSoke~
zEFi1lF6ydmWey|$x`)Gvz5nQ_Sbzps@|B@M8(!Q0-Q?M{7EYe9>zIms>$z>!%X9O2
zFs)~E8>NBRzRKAk0+Jd@oXLFq?+j<=PvYwTCXX2ZEU+h@PPLyLKRs@DQP)pQ9=Q`%
z>aT_0g#7f5VFmnj3nVb)r+?yF)6uhMo}d1&?6E(fwL*F1b3&g_%pN<O@qB{tYiFhV
z^N~S9__dW6z;6+<M+m>OxEB06Sr(x@a>AI0`wJs0J2;G>pDco}zgT+eHlrgP#+K^H
zu^33k0bUBz&*RF5X4!*e@#Neo9Sj9EISOh*Tnhm>vS&UWOD%R{gOQ9aRMla{WyF1G
z2f1L0aqR8n0yE-1WL0(ZZ9sig)#8t->b#Gu3Ki}<Bo|pC|4=+Uo<GwpjRSb(AIjtf
z{K3xxH%5dQ@M1i56sD&dg6ZMAZ1j<SyrBl(P{BSt(Y*=)u27`XESQ@cDy9lpgfmuu
z6jxRrnjN@uXtZ_;_98eNGMHomFFmWl8HAneB^gxFk_Z<rXtF{3pi>wVG<On&t@MxZ
zbG9`nwMtNwjw!pAqUBF$QJ|#mu<7t9_$I0+7AapIJmDxW*s{u01W#)8q)Jbs!IN60
zBC8?Xk}Xss?hhd<bD|2>+Ky^1EqTG-s$RDO!`jB)Nn&jbzP*a#(*c$zQJ~!4%W&2N
zjW=xyo1<P{%GkvmBp4p>N65g!-Beqg-PQ5c&c+Li%~{$%2Ks2BHDm6R!47}%zGRO-
zQtJ|Xv>-G8L>UV#h(|C0lCdOQjxLn>D}SRU1XuwO&hEhOKHEyRK`09n0!K6wlyPUB
zg=MO&Oohtm0Kl&T|BhP3%Gcn&{f~xwP&`^dQX9Ysm~zL42cQ&~CMxU1AtqT!eDoZw
zWtF6EMz5pxOi*o61)g=FI$qgg9Y8EmCui=a4Lk<Ysxni;GObpo#g#dcGHP!+FjAAP
ze(6v>dCwneJhsoj^G1OU@7@BT>MixYb5loX#z&-TU^pT6r@m2c+cmirz~-3S7};Jf
zHRJR4a%uk+a3_-)pt)9nE;$~PH4%uzfxL=d!Yneh5Zi49+>8aSU{&N-z^VVAT>&E`
z3_IoKK3gKq(OB^VcIdbSZJGJ80oo>y1gl^T44OJ$_@5f6GR+8XZY6l&yrSAluRjii
zuOlH>;?t5T)5M*TJ|;Z_axD?N2sFS{!&>o2Jp3V-oY%eUj`RM3I>fv;L>7J!&VA+u
z_)lQBRo+1~1#a;#T#VeaZL{Tt+J$ElSIe2fsDE%?h-}CC*P%vc4-=F4E?;oh!h$)8
z);CuJ;ad|bZhOu4+c?x|+c-@%5@Ibn_&=7Eked0M8Ki~|KONRkE8>$LyXG`w4NG|B
zkNgyjcM(tt_-l5*@Z!MWf}Npqe-(qZSW#^ZlyM_qPQPB0+UIX&aapPB?Zp-==s#EH
z-u{cXdHGqD{_pHpmow{y^3Ip(^d~3pR9f#pF?q2IVccH;jyWLr`ia2>@=kAFQSfJq
zUjyWy3=u;9AM960(pn+>UT1`#9Q<Awq;7v=@M}VX@LP>vnUjh?k@iF@J?-Dm5p@O8
z%>E~~3H*))(f?2I8$fG?@cSDx_Q}ETrqWLoex+z1eiZr`;$I2v<={6NE*Zk_JyV>p
z|FPT(;OBCQk*<-%Ui}2^DvW;h<4eD$jmc$3>IZwtH@3}}k&1nOcb-RTG<1%wkiPtL
zrPG+&dGE~>JBZVqMbA^`ls9(Fs>e{cklV<@u3LGDZ)<tCKcI8Nz;VTqId;Bhs%rr+
z%wp4|f{VIT%hb-T2Sw)G!vl`?(<wh>S{gv^t}gXNhweyJDn|@4vCuzupw3ZmkKkQY
zKA^2CwAV;Ts45T=HmbhPFim04e0c<#NM$O*dU1JSVJIW#B?J--v46}tnFQzk@uTG9
z=a#86{?AzJ%GyE2ij}3G(Y%>VJ*qJI0Q!!6K>wVGC;LIbV_%_`Sc*WD;s*@S;-ALm
z#A9VS6qCi8MNPOs$ddX3q&pKaZq>oZMTC*+te^Q>I%}vn2aoj7?<3(aIY%zX_>lu}
z{25;5k^fN12b2%ywWTF<mcC~K$1{OTsFKTZT>jBr9$wTwA*M_O7sDTf7$YezuJUOR
zE)(I#_^tmZ5yoV9FsYbkx&d%^=3pz(P~`ucWtm+l_FATGnR}Q<0=8xS0d!$dW~Ptg
z0u`1^gam^DQd=Z66j9Wfi6{XDDat%cd4B<Q`M+dgs9Vp|tzb=X>k>U|D|WPH3&bbT
zuT;e8qRO?0f7NB1d{CxfiQsz715!H{v;3e?68clqstj;0#l(iA2+=><gd?5&guxul
z*(1^2_DyrXhKWlV5|+8T3eb}cJ%x6wNp!ZrY0Bgn#n&gg+yCI^UfM%gZTlZs*YYVI
zojOw4O4~oLX6TZC8dD?9S6kG!T3afZsjl-u6Mq8l><!q|P(`rS#q>JT^%8xPmSWnO
zRv4HYc&3uKTL%4BJ-AR~@V~^|37w})X&x#?nO3Z8Yt;p|B~#M@%%E*#309p##`+?@
zNslf0CWy!F`z#gdKR^g~Kpy-ZeBu4O-*E4r4Bk_Xh12%=e{=<23JTm!0eSW{cvx|#
z356~Kk5yBn5&UPDG}@5H{!1UE$1mr{zXE1rgCl>34r=n-!30fxf#YOP_ArMpJ=rPY
zMb3lr^p6;}+W&*vXHn84v|w$WqNVFf`<?T)y}vDsyHcNjqtGWssyRxL1fo|iKe|$>
zsgzO`QQI=fy{xg9F}-BBv{CET@U`GPT#)v%O=ZHO)Sik>z$)@cKjhsCG>D8o%)BF~
z%P+ozC;HR%0=t3+SG#z&i^6=6c(T;$DC;c*hE5vh4V+X)_9wtT-AfNyWnWh>*RIbY
z_j_p<WG5R0DU_eae@ER>&V?QdhIa0B6DltWNXSx2!bKx$xNsG}?tk?+gx7bjih8W{
z3l+Cz^uGQM>l=Qy)R8(0-#`c${@_Q0IHbf=5=wlN6o<Q*)`}k4@P|h~cxrIlp0bL7
zoV$VGW3Vv?C&J~bHEX|XBCvyp<5Unfvuu5EFwzjZCW0*_PGJUF*&FG)e=(?&InF4r
z^aN&&X*#w_ude?kjvRk>_VxPGzfPw8w7IRNjxMuL*zeK0kmfU8IPu}5+CgBAvBak2
zk~aKWVt;Dr_xV0Z;088PVw#mmYDqH0&wBNsPupIu0oFq2heIP>GwfT6KnL_*`);0A
za4S$M@*)#CIM+pes6u-N*3^n4d43<R(%R#A{A#zNOg~^9GA%56+w~Cls)oL3b87jA
zr5^4v{`%7dqj^EN4<!^+)UD<~2~c`o<%Zw}F!sbq58d@o9%QP7CrKC1A==n&r4p_*
z_ltF(zQY0>=vF*=lWPLJ)hqq!^Y#1^h~BMx|Lpg)k{FDJ_bHxusUZBWtDDhl!5IMS
zF}uiA7__EE_^f^E!jeca<ld@QTZKJ(YvkQuzYdJW0N2?Q^)Epx7wVkmk+hw|RC1j(
z2-%V@em$O^R_>p^fUkH84Dv(1jp`N3-}NR83^R!fcTwEx2Hs@kx)tClRgr1x3?e1#
ze&Xm2i4$M)<Mai@Y`j;iz<*UTZt5ntgi_a;$G<{$zLrsXb@7#nNk3dGBh?#2rQIDW
zao!9rmihLeC9nEfyu;jJ%l#jTx7R({_GKMH5#khcJk6*-m9Z$>f8{D$>hO}6*_8Md
zZB187kqZTXs5{HXUv{!Xf7Y|mqODT<q4i#xV=kbNkTEU*vcZ4CyYJ(;@doFcBJouf
zwVkNB+sdd4xamK*)l%=UUv11?wo<h5q*I7K3!VNl@8F5gAQKd#IQ1({!W>tv`0aMp
zwqKO1C9I+cTgD?K8r4wBGpDduq$C11m#m&Ypu&4;7wnJIPR5hd&bwk)9%S6Dv{L*~
zTw0P&B;6%r%7kTOFWAjk)?<9({{}MS|C2$7yeQ{}vR@9R(*gDZtp2FHCqq;zb;+Yt
zw|?t-+c^ie&%aNq3Phkqk$s{>{vtiG899)BB1unYsser=_&^l}6=;#%vlV`JSJ<c9
z+{LQw^6*J+r)qaT59r)e{s}aY{S|oX+MBG~A8z+aPnJZ!`0E6HsrA1;A3}Sq=x?}$
zcd}>?pg-YAwP;uN>+Npp?O(%&ZVoO);@<obmFu|sys5+Jwu*%Q;&PS<5|Kdx&#x%p
zs}L6?!t?MKvJ`h%HM<2;6P{`SHmCtFHh&2J=ged*Me5_o(utMBVgp42^3IVRvt$Xo
z<gucQb~CxJb5*S#UHG<ksT%~ZmKX=2pC9s3_OIOr0KTGULxka=W!H(au&1r_O9(5O
zHbAqE=}{JH2sQ50E-!BIfQjcNZ#;ItqulrkY(s?-Kg+dGtL;BlDHf~9coG4Ywa(I9
z9!cC`q??@6tCt*MqkYh;##aDe#|QCw#-F{^i`?7ob#6YQdWHAxhrQy5{T08bmzRA|
zMCT$u8l^4<0U_o7p*Lv&2>M$8j1OUF__sjBls22&%bL?|V~d(eZ0(66g!E|!Zq{B&
zQ)Hn+5oSg?f-N%V*L15fvh;BRwTqL(2Rj1NUE(Dtlq)v;e$}Rp?j?i28|js4>=f`k
zCZX)E5tn>*_FWep`fD`K<y4`8GKFQ{BYweTai+)PjU7i6MdrRiOZ3mBvhC%DL_XO)
zh)3`2t!?BWvZmTYBlG^r^XB^V%G)y*B0$gt1_}S2_urT1`q<9#K@bN&*t5Z{FforB
zV(AO2B-KCTf9YWsgG8me14ve@^M<6(BC<6Af{<=T5~7OzXy@R!SR{2LD>+9~>N1*s
zpA`X%3#ca<i}=hU94X}76$<I*@;Be#HXj~^fj=_r=<>`G+y94bz=&S=dqOEVUe`z7
zDcM>XNsH0$IeeMHg0J|hVq7O?RQPA`2)BJ$%mh%Y*SgDedUgMdbRNdQvg=~$zX(@Y
ztYSmMZ-ihHWS@AHYQ#^(e^*8QO}_wj07v_)L4U+A7;2>JWr4>Hx62MLq&xZHwtr{m
z{unx6M(ZHfIa_d&_ok+_v%9$Qu)&vRoW9@tA)QZ;op!Wz77MQVt<lL5>kfx0q{e=U
zA87{Lw!FX1-xc6H!m}z&oVzNEh9D`7g>71Z15@i8f{C>ImvjX&JR15Hd~5YVNwQBB
zQY^DCdW*yJXc|xCh{Rm}T#FWB&74o1$-zi#I-_Q@zvL864Y4N}b>Lr|pa*q$UU;Sh
zZqj4FNlwsw6S35Zth4FGk%gyFU^cK_7n!p^^D=kjV0{2$lP~=g9I;z;JmmnDi6u{j
zN^FkId5~J?;)@@Yv+u)ll;Jzh2d3O$t1nlM=NNl-QeWKF$0}w6lzEw4UhIvAUMMBm
z^Cn9)u?u<*Tf!(Ge>S>=3w)e-N6x3*J*02Ee#M=V)tuUY$z<vb{doq6P+w=ka52OP
z(_@k=yt+3yPmf~kEG8-|M7ePDC(Fgwxf3?W$jDKyces2Iin2i8RZ%2{0#m~;c~Eti
zf_M^_0?L*Vn-U`ljTNFJUCt4eemXFnC~!s{sHPVixaG^$AKc`G+zEK@ci>SyBI8U)
zaV*Jo-Pt5Z?fSB<hkj8nEXG*!f(rlf&7y)Q+GO&DEy(ndG`>30sbthy!e>KyDfr~3
zkNISm@DuTq^dY-j8#CMo<nplkt&?ewHtvsf-o~2FKIY~psDI+e)c=r*t4q}1jBp|m
zFP8jQE>v`IbM=Zi;aSVKLQmG<Pse*SCfkf69rRXC+=*PyKD!=9y0(EQ!ErP9joAi2
zxz6S|R*Z4rn6jq1pC;WsVP4f_0^O=(i;LeAy=8B9SM$*B)S<54W@0or)HQGt@0IJ^
zTDgdmlD;h(uV(8Ub^{lcL{RQu&?gD>JgH}5t<6Na3)k2;bc4y<$_T^BIgG*|H92<f
z4}MDlj3ch-PGNkv!FYZweNSZ+d?Ieik0M=WvPiGFWF`F%MO@ASFv{J<2A8A&u2^>T
z*Z)0q&3f*3vKbq+wMtpFYGTF9b4y1DG~OZtdRKVHOZLxLWG)$icsm>l=in%LOY#$`
zVHKa0PrGof@u+@cmrH=4Fg?T?!LmWTgWeI*2(#gXBL9ofOpMM77->S<YU(F!)gx<-
zd*q_s^Y|R<|I)2mrJYZY)jcC|E1}w;<`!tiz&FZ3(JFcC+N+$r<w@RZi6qKOgy%VV
ztJP6dDdJ85n;yzr(+cG+FQ7+(orPB0uM*3IU`wuJP%d$0i&fM^M)^S0A_6;0)&{Jm
zJC(D(`D@8p?>RZ^m--l<j!f0(uNAKhA~#I}@=vu@h5)s__(M!+5~--4@WbBc<CugZ
zbN4Kj>~kiv&kLM1$>2HTG`p8ukB~;>pWfn$ht)@Z^a}6d_1{MRabT4E^PK+<Qy<7b
zJ4}$5f>0v-a7J<@aVwu=`iZB%?UEaGFc=q<g9!*ktPPJ&6|idBhGX*?{}pU5pOSfG
zkm;ZEW2vQUnZ}$$Ob{HY=*^@y;;&d^lT3?z)6YM639ICz?EX(LtKu{NdI>RsAB<!e
z;eUFmZ|${~F_*P>v8<s@BCS8&z-41{GMfYo)uqPX$qMm-H(arqA0coJze=N2t6vSO
z%AaRu2XjSv9B}Zh>#+6;UJIR!H3zmGkHm|QF{!q%*gy@+pCzrN#$U&RO--l?!uNI|
zd@s`VTeY$5Pk(fOG0calzo|oB>hdc4xw2M2xzp`>CHQv^S5xUmJdQ56*XfcN^R+r&
zAB}YFrTOxpX&086b?z+YYcX|%*11RmfeC~d|Fc6dJtrEuAyhV!P({fhl?1F!CKwL6
zX@7Nh2wvhdtCE=hOyY$>M!F6cT?DQTS|eV#M%rqyALR6-yc{gP&`%EDmSMgK&-<6;
z!ZNxb+AoE1F24nm@aACEv`88&L;V*i>Tg82gQ`ToOdg(Y^6;mjr06fm&cgEBO$ArV
z0?ET8L}lstbd!fK;N#2yJxc#d>y*XG;bp`T`Un0@V(v4M#BsbexgLuFBCTICUoW0)
z@5>y<o4kAmxB#917D6Tv%iIUKHM-K-7%jF4nGh}}7e5v0`Zpzl;dd3Aa=pXw<tUUJ
zbk64C)^{v5NX{BBnc?sH71Dn%ZG|@J;RgBJY`?%JO<_&<TV3(Gmm-N0o12|^*%2yj
z4(Wd5Yj7F4oF!o?Crrm<;lgmSrOt2F<ReAP7Tkz3TRP#(vd2uo73zu(W~;Zcf)<<~
zxr)~KN6J#*Tw0_c5j;xI`YoeftWO{WBDf9<*HOx>rAgC^vnPlAAk+^y2u}JzEp8wp
zd5Btv4ZQSarykt=IxVHg#*WOX2Ujo@!lxFgkEyZc{0PlAz<WFCXJcxxHvHK52AY!z
ze=a?Lj24J_=Be*S%8b0Y0h5wV%i4|W7qhTrtOqGrScmf+lrLOJ!i;}IZMBeu9e**D
zGb(lI6#unHWb46EFt!D7=Ay<+Dl;3zv{RF&g^-9PFP$d(7z8O>`Q$h>G>YIX{e(~|
zJyhdeVx%KIb_54u!|HFD0IUD5FK`TfPOrZx5U0n!J<eXI_4=#!`Z(PA(o03#{QI8H
z^~(rlzplN9iJB>1Pr>HoQvJl3Df)&1EN~)s^j_VwV(MX*ujs-?;v|=h(&A2!o#DE0
z<`HzEcO!b*>HHJwY56GY+2#Jvna-tEv?ROHJuhl7qN$f_mbKdb+LyS3zb<$=U&D)G
zRY3}?RoOk<lb-DUn)^dw9SSHr8B|NpEcYLs8H936&p(kyvcFN~q8?5~bd+PfnomBJ
zy&}X%RT|o>!pI)x?b5P-#wsv<P_R2BSxJ@!1uW+B@6!<zMp<ljR3);99d{xUtsMj$
z1j(98w(WWE+Zwr71o0zyDiLw>dAt3px0suQ?CXzR^+&zJ2t%IAM#D8XtFrXHw^@to
z;GJsR7^|0@Qr_EO3!PRMt02pz!9P)a*)8>xjg;T%dem>V|G?gWJ{Za1I<OgYL&({`
zn`w&e|CFf9FvkW_Cssr{cM*A%dbIqx60dk2_=OLySJZJS9L3^5W1TC>jB$n-mX5Ub
z*tRdPH3;i~u!>065A_s?PbuIzSE)OlE2o7XZH=dyKcnb_0H3zx0_M#I(H`aP+avYP
z8`I-eBPkutv(24nDcXAim#GPygfPQSLKr2}yxXS!G1<x^T@l)WKpaHT3M2n|Et<2d
zy)Nc;aO%f1{)5lKr(UPCh5W`Mw?-I=a~ny1t&o!8FvgQ%&7x&bK+A|OSBL%?@a)<k
z_`FhhO;PBp9nkRZf$G-T0F`qbhVa3!>;sjNaDbhb%2imMJkts*BH8WeAE+LlO_6Xt
zNQ`O#5x_sCjksjCJPjHQEi_BTvARbiiIW(T+mNTQO`c4o>4}<_WdKEgqw*Oiy?s&q
zwuM<A8K-~6_o2BXC_W#m^2Fg#l|h=m^w@onWuSX=L{+~3JoYk+HB!2$8krlPM&34K
z!!!QC7aY*cIy>((wlE;|3QIB^{ya@+&%pYJ(+54ZCDcscthe=Ch*(@nok0qMztAF?
zr`ryby+MWMDV>A9t<rFLGHaAR>78yNDxSS+h;+TfNHUXvBKA2IR>8*$AhGXO50Ty*
zvz61A%pNBgTuvp~zdG;|@k~ShgO3N4+uLG`v>)X@UvdlecP*f3Jb9;n0<Xz{b_ZYK
zIEhNzrT3!UXNYMnW3c)I@psmjx6flf?&&{$J1zKt3T7$jPV}hF%oTbipTREx^3T)E
zC~<Uq@D5jg{MkHB&+Tm>;BE0Qrz%nXKQfVvnSDa>HXM%LGHbM#Tc%E1fq=4E`rLSL
z-}UZ{2RFlwK?UEXf{<H8f(y;kzP!h(M4p8woU4U!z_69dXld=N<psw`h)R3=uFW4L
zp+;)`C(mV;ejZl8S8yRtwt_e6|H~ka)epxf+l8xnzy38h>`-%qe>vYTjWTWl<F`+<
z4!;Bd)qvwicAkUVLh0Nzjl@FzIntQ!7{@i0MG~j8m^&_Qg$6W;u;$h;s(1r89xbq>
zw6}s8_?%5b_CCK9(MhNpl_=|uwP=VW_R=b1&6s%P`5lbR;!bVf7XL#fyP`Z0wF6X@
zy5h-aWGxt3*_?u2Yq8?O@MNS*+ohQeYMDkVa^2vfhKDplRtVmh2nKa|l^3=4A}rzG
zz8Gbjo}PqIfRRUO_1bn@Ob0`4sBK>n!Wx|K=3-B6+ryAXp33s+i6s86W;%POX}|TG
z9W#MXLiuHbmHy%as61^-x34hCklFS;Tz~a(SF4qT@MUymdPF#ejO^lhdT!%0lKnRt
zG(j^w_5vi4<CTA38;D|Hc3`UlY?^O<TFAk674+d?@uc66iFHsh5Ib>*mmFT1`q~gd
z;`1Wu+F!9_l80A@gC8K67yz#BHu5jhlz^eUrp2v*gR;y^8bFTTr%Ge{aW2H<aGXLL
z!M*2tXxd8AG}DLtgP)|wysR;zDl~JL#Xf@3+X>3NwH<OSd&-TnMgry+`T>U~3^=#U
zfS`y)hI&@2^sX-qD9S$W@6N~Jy0hGD@-;ftJ?Y6F7rq)BUYxjn>y2uik;FE*-(d*K
z5*_Pw?|O<`Cg|K&>5rgnb{|UX=w2UE=48~Txzbu6L@$FW{Ifo(YQG_Slfgx;=l$P0
zH-Sv2A@qEy-(oGp;zqigx_J7)>vo4*qxW{qYE|MTY-|myn_8LP_OfxLTv4-w8@C;|
z3QSlvjgQ*bY3q~o6WR_Gz95!{M-n4bw%Illx>nywWEHfGVVRTuCB@p-T|wbbfmp%0
zOg;YMf2&ABXyyE?b*z-jZNz@9jbYF2`;P+@o>3QZ4)XSXP0i_B*BgT7{F=yDbHAuS
z0@tz7f2nh34SO6jt0IZ-@h~vqU=dbq^bgnQd&Q`gJQSn_tMXF(hX`5sU|ZbNJ|++#
zD|K@&GEF(bo}G$(P&(~U$qYj;=V!?fGp~$Y_aze}T6yGO3F7A0Yv(yp*0&sM<2YlI
z$)c^s--`T|YGUS&FHcT@Bpr(q>@=rdm*y?#VsBFg-vQbt^+Kmp4Of3-Dfa3)ovOoL
z=Fg`xQz7R0z<$0>&Q+7$m+Cwr_(LADGRZpO*bnH1sY$}wYfwmIowshFN~};5X*7hp
zk=I*oH5Z~or0WJpJ_=9Y0uNBrTMf(x-$_%vz*AR3X1CRngY*6YoFfT#njsebU>)hg
zXp8aZ!S*LQ@}CQkz~Ob2D80>F7u0L3wlCo(FaV<$N}WE@=1`d~n7P8~gEd%!vR
zf{{Y`@pP_N4sM-{UI9)Y3pWAAUw2MU`|I<+@wi}j?(||aw)W8NOfUZg?NHyh3g-b$
zj+dH?SZV-*Tbmc_g^y~QXBA~yHE2E=SNKU=S5Q2bK7lp5w(awUpD1Qnhl1T6bNY4m
zFS>hFHUOyFI3_lajfMG=EsG-#=JC|w{T81b%oS)4l|?Js>Ns0o^TaW|IduW{(bI68
zb`FUGJgz>G#8nI<z)Ni^T&q#ZS6G`qrqS7&TdP1&2sC~%0WLftPXV{)5H0g~plHx%
zS#fV4Y%8w(ko<}};{aQ6ukyyPcp^_<ge+}GGD#@JPe;1i0gm>A<D3_adUvE?1zD@g
zk+n$IyAF?Q`|&^)IEW>q_P6yK7h*~0wg+Vi5;@bZFEVd71I!JyV3n>VPSvf_`2~0u
zSLz6@Pm!-?B4786v8`(>J688lc!Zozvf7*4FKIGiO^b8uqe8G-Dy3N$XQGTDKciu0
z$gmrnSM&oF2<dHJw)OVybt)(aGWNFJ=nmogWePT*o@}Q+e^zNDQ^zO|@}6utMY@*K
z-e%IjMY_Ika0`{X`FMjwy6)z!Q|hMBzl>B`*1yJee&M(xi34~80z<b3c=aWfZLI<J
z3Lb$=e1ovRJc9_9<I79<N`IrAEV;d;9QFV9Z6)J&h?SYDbwdO(^|vo*W{-CPz03Yo
zNDRBe=)<9E%sVuf>&*VDcx~pFeB^)TvBCw?@BH%6!F-ybz*kfch?7gelBk@*3$fZ6
zh{ZoC15a2G-83A-qdVB=@=y8=J%h`(phx<rqMb<mej6!P{b>=CSGG3y9dQ|W%}U^z
z;-&Ag#MAy+EMRS)u|mXtUA49T#15+MbY+U<Xzwl&^_DOjkVO1u4%YCaN)n#<Ilapz
z5$<MV{DkP3?naWJYbUQcbuR1q)G}>n8f(O1Q#h)>xsNJ4dN)^yiFS%Y2|3cwS2vez
z6&SZ|n+-{d%&F4&lf8aD%s#BpfgjU9aNjXUx|#@^RCd1eCHuE#No(x*4<x8bVhJnx
zsgFEwX&ew0tU9cRANnWT=n22D^Oo57(sqG+3A{MNiTbE_t3Tsg@{N3|{r5^#r|jrQ
zTXyud)Z?XZF*xvMPt|R^EXv4|D9lm5fs)F1{w%*FM}M9l3T)+vRsn@etj!~BYlO+s
zi@4^IOkl25!n_guGm+2bt*~65Qa}Nl`pYFp#}aCPbKN_Y?dAbRv0#MB(Y@f3LMktz
zd78fRe+Pvmles#FCqk>{g@2A?djm4PnWtfL^gjz?$r?L->5-Q%d&!)%$=8W3XXuYV
zjoK4Y{+(SncQ#@w(mYYC#;1BTw(wk?CaY(9dH;$ypT~o|44tf+$d7)X@`zha`SW@1
zFo1;T>O3X+Dmgv6V?9xq2&jJOyL(evVLeqrf1=hOmmj^|)wA8e)E^YOp^H?-4*d~e
zH7?CX64yI9un%?hH0;=)2I@I)r}b!7RF9^_@}p08=jz;m5n_jW8d}5}IiIKWbHTW9
z@rAD1xjL;f*iD~`v%<q#x4A=g2nJDpbmu>;S)Z<14^PzT;M%`QLJGk!y$!5W5LXV6
zsnVyJ2zJDV5XgQCO89k^9lR6zx2Vr}N5le&RJxO>+XB5$_Tu@_3|)nwMyeB)9X!<U
z0)B<OV)gpd^=<Bw^J;6vm1s5j3|}{qGu<5|%I^>wEAmgLfCy!)KgF&bVy2q0AH7n(
z6Ky;$_q_V?@9o9h4R`uzV@?}?QT?xbtDQkQnifw6M-JT^bSN56-C~~Vbn<4N>5LjL
zsgV9&+>zQw=H!^cMoNZLA%X-))_fqpk#UtYs!0_Q^HSp~alG?e9uVa{IUv>4yC$rc
zPAuR$Xo@R?jUMSw8ww!{@JHXQO@FwQPCE7<VBzYn!tdcq2WCejUE{SoqBAgWMsVER
z$w@upCCE~kl$c)^bx`3iKf@`JQv&mea)JKppQ_Jst^ye-K}D{P%T`Wfr2S-}4(nqu
z99?J5wo_!>^E{u{vC+Mpx3kt81E=scA{Fs>Iaw*<`y!U`FC!%Yd@i(C-DAMn&Sd2v
z=Ktd)^z~W>9O^q@cLlW;Wp1Jq_-_VwipIcDL1tDA6_|xkadxy-@LcwmtGjI<``=g_
zWUF|DU;FG3wy^oCOrEZpdu5B5@70g}`3S_F#7xYMn|01+%A1mDl5N!Ssrs=O@g5r0
z5J^PxWju=^x9`8<-s9-|dn}Ye9krS7b7Sx8-1}%GaYX)oRc4p){od|<c_i^Ta0tGy
z$lSn<egDpIwNLfa^RLS?bq2E{KYPCz<Q>K>v>=6(`U?@RU{%|K_Fu-qZRe6jD_Kp2
zJPY(WRO<6)VUab<aId2G$N={S_)VuXZNxoLzVsXPFZ+a-+NXt!%-8j>tDCg=&tf_7
z-+x$jvSu=0<b|L9duW8|$xaMyFMEjzchIPRII^Up5r^27qY(;U5y8S0DLf75nsu_Q
z+i5s`zDGm0IQ&~-`cC9>rZFvvD``^Z+pAa_VT+-!yS^&-Ju1^xg?~6#**jdR-fXZ{
z*duMa@*C7lEhFy>%F6n=>_JBOO(<_z@ATlpj<IpqIVaPe?B{k4_Kao_HQ+Uomr&%3
zBo0CJ3dO!iVn6QuWt7Z}<SMMX(!Wl9N64%Ig||vb4xG3v{2SL$_(gl9L4F%KID2hQ
z{w=i6FuRixC%+U3pA|WsVlc;fd&{@6&Bg|$?KF06Tg*g&TyJ4X#&@rG3?7RZxgNoX
zuOV{P>SjJ5WnmY&P=!K~u3L1$nl9%n=ho--mQ?BnkG+BMf#WJ8yRU`gUAj(L(;&Iy
zzqkkaQE=%_?8~QGkgyYI^eb)C;?UuEGAk~Vtqth~l5k|$K)e_c{bu6viKslYtZf(n
zCrp&_Bc*^=(s3lv&q4gP|E7X$Q@5{(^-eZ+nPqP;p-w43c~lr*;nh7BNxZ}puli-H
zCw%~p-S7w$O_ahRm2GDz>j+zzuu-}QXX#uSxE}iL>*424B7REgUz@Qu;b(L{8x?(#
zYP9L<SN#FK>fm`Ll_JclVUQ*0>0DV=K%LPq85ew-y5a2-@MP48bhUv6*<VbZZ{)&u
z`~!NmWHWF0n{D~#k7MSOV5x=#52CT+6{^pi2QwELt?sZ&ldCcwpY7E3&`Yk!-o`fC
zG5=zXz6=uP^;5&dyR1cMcy8xgW<MMdXM<L3uW!|HBFnc9H1E&fIn)R|vB0Ab!J{=~
zsE%B<JYM}$WZ`{Cevz($AqJhq#a)`EF|UmIx6+U&ONi_$eLf<pIH0-i#kRJ1G80S7
zoKgL(m;U-z)9c?8q&nX%k6iJZu9m+%5KT^}dMDR;554Lwf2qXiyUk%qWY`d}0<hGr
zZhu99>)iOEmoE9hAVa8oym*6b73@sY<{&Q}|C9uUw!MVz^2sC?@m>E}t>5JGc=0=u
z;Cl4WL(b>)g>dpWy2*dZFI^Q(ek^&e%{}(i!g<f@_xI+sFs(BoX7Z3q|4tJ{h}bn$
zP4;-jtBAfRvzT+ox6<Q2Wx4Rz2Y8YCG6MvcR-hhD2Si)jH)Rgigy$xY+3_NimlN~7
zO5@J{kQr}9{_(d$Ny7DE^R1+$J%Xx<WmJ-zc%?OfI0VzrF$}3u1S5FW{}ND_ZyU&l
zrI$pSk_!=79}m3Q>Q-NHsXIku9<$9^5T&~@yXg}?WX>LXd1_3DZ!vd3vpZb(sNmXg
zXCAfqzJe#f1zDzScX<KCi(kj*vyEg>UyP+oUX3BhGV*=^4#$)8QzKmEa`|9*b$D`a
z^&%5in_KcJAsX(ci;$4%!U%^qP@&VEAD}us3qa`kQA5@9cI)}8LC;qQJ)cMwP$28~
zMv$)LFc9v7O?3M!8VJ6utZnNy=C-yk*-;MRcvE}CIse4G&p7?N7i;f{Tm6IJN(#bz
zwi&Xov%=By+vM-IC0_luzhDI-rL6tzcX1@dE5gX-Urm=}bg{<B4iH@~@HhB@Cp
zqKk&eJ?iz0jKti2kK4>Qc7{1*?J>-O$Qh>k5m;|C9m17<WMvE#wTTeQDpso8PE&~G
z5L}1QjlUPI&Y!bd2-g8XZ9gH8qW=wj`L&y||C+uK@7JHc)R)frlxR!I!9ldLXw3AT
z(U>iC(9xJzbhQ&2Gyi`;U*7!>^yM;%9{(kM`EB?AjlLXjV$R3V7kL*Z|Al-?{<|Dq
z`N39HE9gr7&afj#SANPgmpuwyS+LScP9JyP(8=Jiv0qkP13*R*1zYt`4FZI%$f!Nu
zeGOQB=^Dwc>==xAVOOMx!;0EwD};7gt$oP$!7NCbJj-jm<Z~wOtWwbKmY(;w^*VJm
z&{ic<Axzk|bQwZq({dU1Hu}HX7mCR`M2{`^=j#SC=H)ak4gly}*&_ZO(iIb-Gcy5*
z|6}GP5ZtT_*<kmdK!(1h>Y?@uH!XGYTw&vD_)v{U)t<Gyr*)&xUrNXGyjc5yYT)DG
zc@Ilc@HCK!j2ZNF(LfJ`^*cSksR?0aN4PCxay}kzLxZlLiB&rXw<)^m4>$G!1GxF$
zTNZ+;C;KtyH}%}RBcQJ5Qvqtrs1Q)E=t}!v^wjbDoIiw};RL6;Y}3s)aAfFy#@}ms
z3j0B>Xb1%3avxWI>$w~uYE2K<zSnwD97r2m+4{$saLfF!!{eoXX7m~r2usvH2NGHp
zNh~rF^>5kPOu6S>>UeCZO~iYhg=}^*3fE|*nBnM#Lss}@z>EDvT>K%&v&y@=nWV_o
z6DtQRN8jM)y7$}5Xs7GP>hLR#W^&5z6VOirFBOef+Dt6&4E2I!0B6A<HYXJWP~knY
zA@=!_N`~X@A}EA-5V=OfuII>}#Fh3OgRn-Sr=6?IdnawR;z^~yo>9^H!4d60MMOm?
z+SooAADO{L?Ssa=lKm0%0er3U_dE{mNsPd$S^MKvt*bKs0oJ2bP60c&iXvSS{qcC#
z#1sg_<~JchRuR$9pU8QTainj|c3sG&fCf*4c>RI*mkPw+WyRrEcaA*XPT{&S)iflQ
zKE-_Sj<Q4bl)$BYbLVIZjH5vASy|>lZbSJ1N%JBq%qY8#+8^_8OP-vQ&yeg<jd+<|
zYWSr^(!xk<g@<2iW1MMBdd{|=E*<?K{Y*l6o)Nw@_&+3c%7vZ3us3^OLy;+PS}R(s
zpYeNB*gt?3ZYo!hB_Pr@oZe-wq(#5so^YkC${r9<mTZFnJKak=<Ba0=*>VeR6fmAz
zB8V&69As_Pdy9B%HAnb!gO_FgAuM<;oRf2_aaH!WY9srLu%%(ag=#DJ2kHZx?CN81
zak{I3e>WaK<~|D}LOQ&&2>I|av;&Cr;gN-Nv~wP&qq=}R-6w)y@0MP`{>TNM+CPBy
z!*reg^{irv;Ps_14kU#m%OK1rf_el;a%Y@5`8PE<8Lu3&t)&4Z!P8}R_OHS73p~z>
z%AG%`F=Q4&r3(9dTx0SdcTJ*a^5ErQSm_Mu5#PWmkdBx6WzEbF*gi4&EM25-+Rb!A
zxXq3m`di_gu|HoZ$N&3;n@KE=C;v(O`rq$5nyRHBNlGFd^5p`$gY}C*_TR-Rpap_^
zk(G`mAEzAqPJW74&B@Kp$#<|oc*%#rGt@I*XBTRDLc>Cbj@>2B0hNLk<PT+nr{+iR
z_KT07m|PQCxYC_Uh4B~XjkhBU-;x{3vB|&FQE%eh=Bwk$=4+=;Og8<rptntH*k!ux
z*`lItMGbS?uZbk~Vc{~~Sn@Id@<jk-ArBWeU){jr1QXYHyfZw~rCo%`qNblFy4!Z0
zy4+{emiSgZDR+0$`z9veSv*`YO3%7&?mrmxOJCE?_;WvtaQMf?!p!A-QK8wO%*~C-
zCmI(Ql?R8hyxiiSz{<ciVb)@3>doI|?#2#c`7Ubj7S1^D)u?v0x+Jf9sHtvilxH0J
z_C)TPHbh;yU*(=@3)PkTYwnrL{NA)C2m46ZFZGhcfCT?AWaE^csq_7_7HXXgJx=Rg
zyGB~?Tr;|>M>4v9FgoYYqi}{d#Bb6=f5@99I<1Ssk5pLU@2K$gR=9%~MQnFo3}JxY
z4*j{R-rbe{tt(xp(%+}_{A~}rgi9sdxb@qUxdM-tklyJTsLJ_dqz-V$E<XkuOmuj_
zToMj@{N7%&{o2UFE!$y2$E;5q8q96}>D=Rg7D;G>24rYv0^9G*A;V9p;AaI0p_reJ
zS>>EanMZ<24ke-vC4@40e0Y@V)@cQ%$DgWU{Q`g<O9=9{F<d@ChG`BF-eHj_y8llQ
z(0-`!#gd()!u^8ScBt@l?pYocZp=N)qr$@6vz?&A0=?V;6#{zWpLB=e!^FLqv}!9k
zcJrP##YaB^{lACNIedtM4|a4|dg!H}rtpETaE~jzuL}Rj3U}~A_#od1;X_hYf6<k`
z#+819!zul5QCj#Q4`%;OCMd**dDv46@j-;(c-C69V7{R=7Q=urE|I&`nTpeBbux^8
zh4qJfod}Y>=Coye&z1hTkU~YeEd5~T%4m*uqfhWj;F;c}s51CO6Z2xO>3eM>3t_N6
zXGN9RqPHok(aNQ!Y@0+VHQ4LCtn4(k_Ia)n3~-kRHJRV~hpMSdpQcJLq*}t%r=ZBt
zdyYe(@WEu;ix#zwim?k7!&jtnH7$u-4n<HXH)h#*doB*8&p(74u+Q^Px{C`^%X&)L
zr@w9p%_WCIjM&lK);5q{E^XzpnA^6$VWRc5#+6+z^K^J+!^lV~DcD9~-v!BXv8v=R
z>Wwi!kKa2#j|C)>j;GHv@rFD?!d5n7OAlh0?KP5vaQo`IP!-xcf#8oH@a}WY!{dB)
zL^{2{N#no~qRcsDmt|Mbu_JgR<X}oOUF-XCkQ}`bvk*@!z>%i)h9XUC3o)c=EqWUe
zlmw7m&6cH1r~dIU3u<SfHm&`piU$}|#e?;QWEE1tTJl4(dP=Bg>BBtQw6C?6-w43)
z%=?<{X8{|LIN;4@uXj>ZP}GbJ-gW|bLnnJDn339{%^Z7_?4d8=*;K+p0j}C=lerk&
z83z>Osj~vy8TenK&Y-^*pw2tD33X<AeiR_!*|8n!aM~-`R19^F4N>PAe<dGfo-u@x
zv%HQBbn;Ddi)OBAZN~NwmN0Yr!%u&w=<^$gKG*FWefr)jL?4-)^XPMy-bEey9IDP7
z&ujnB49rW;TBAP9T9aK#*T(ZaM4l<@`XLYUk_j8`_yM`&yk~fkI>_U+)&fkz+J-8|
zdglZIA#lmsFhjI(1w$E=`glxcF0bx^NMa6i)10crBKAVt?)F&uTEg=Oo^(-xKY#<0
zA&j{vqC5?V!yu=X#8ln|p253l*Lfd;qE{h_S`btFNnYwx`Rv<mXIrSRf|sSTMLA%#
zqS!?CY{}I5W4gyYJa>f_G~)>+v#CgCl2=2?tS2v#nO));f2?pKIII*mc$Wq>J4g1G
z34Hl&Qg9;3F!mq>kTXpYBrA1JDX&PtX*niGQHvw$t1_?PIMYu)4JC5G-nof|cX&T|
zds2Pnx?Mm1qOGta6hlM_P+o>R-9@>v$5p!EK@DvD2)&Uhqd=2P+^1X4o|WYypRe|I
zo;KSVH-j)&=AU0eD^nXgmrD0LD3Z`dl1{Mfn5jIrzvd3-rYod1aI|JYBHQm<xhodv
zg_f)$+i?21Le;oTw%73lN$|ZrTqfIC<S$o9>$FU^$&JCLly%ad!<j!s#P5L{zyOzp
zH>dBJt$(B6B<lm(Gz;_JZR|zSo0(<)^+x-3>V?jI58i${SeMUm9)?>kWq;;l%4CHW
z;2&_|ROlcDmH&%dRW;9~G*$?p2H`x+i9mE`Ja^MgRpI3@E7J9hhSa%oN&$8ex#p$O
zy|s;*W(S+YV-sU-ElUF*4Z+W7cae;>N_r>vwY@aD^w2eL5Z`|enm=Oa$UgmyZ+a62
z)=W^Jr<*JkyU_%au(p%WJ7$hAYCpSU=D4EvlRIXTq_#=k5AKt-2plqqD4!52uzPsc
z-LzlD6TcYRL8jVp*073q$5JKTnPb!|Yx7IMH~Sag1Agwo;eVl{VNI9i)#1!=ww*)F
zui!d2e|jh@vh9Dyk`G8ddsMoa3_;2P(|;i(Ecb}kg0%Sh@#nnd)`EO~SPKfTV}e?^
zF^r$03m&wFYhjumsK|6(L=vy)77CV*^JQ{KWghR5-(=kZ$iXW-7UZpvOZ@!L5?dI{
zf`#$_*!vdnsH&^&2__O-aDsxO#X4%#q=F<N2?-EPfZ!ROU=&a)(h$f%A|Z*%4B(YO
z5@9-yQZLndX=AO{d(jFa6-WdE;tlYIceE<!7`ccR@B;bYckOd#&P;+J?bq*l{^!fX
zaL(D+wbx#I?X}lld+m+*q|=uAIYR*m#2#nUX9yC}L8d*1VXQ>0u$>EzOJKGI&4h#F
z<SOI=#ur4!Nq3~cIC&k*DSCXyI9VX$1k{LoWZS4V$mc>o4XwhMM`ExdecU55WRjpa
zO~aVk4%;W8`wK(8zXUC6&oL$bT0FU*_)j!G1E$rupz;{2uEy7i``pCqcVJSz<r_|_
zd6W6T-ajA#M9RL0%UxV!%pHstZtL=83$pas120L=>h<{1(F<0sEzlBs;KB?Ewj$&W
zXKL?ceUgmOgwN59)Pkn(p@Aa!8^#RM)7=ZffMpBo#MyGZ?P6>&;achW#$+9z{NqKy
z@H>oAdaX{=tY(JA?eqi&>KbHe<aqIG0SUM3g`P>?ifGn|q2f$cW>csVK2vU2In62=
z*Mg*cZp2f+7h{I)M7!zJkljS^_@vK~kTO|>-G`2lB#FhufE2s<j_`eW@(&(4xsJm6
z84Hf(qwt1NeIw2YQ4oC#<H$T4mvyok_5#9I{+a2Jpb%VQ`B0Uqmzm&x4n6>|2*D_w
z4|Lb_$@O@WBB9=BR3FekZDb(8S%@e9g!p*no(GXF2!LlE_>W~kZj0rY{3fwxQMcH3
zxKr2bzu{sjm{7f2Fe{6L=T%Dum!=6AsuhwMIxJZ!zu0gQ*>C=<<o^gggN|UITgcRp
zSHf4gFNyHL<MGXeHJGpjIjrO?TW<s;p1`-uQy3r0`V0p<!5+6JVnO-{gy}H9fwm7j
zM_3b)uT~?5%MsWBZubpU^dG!FQJ+qsTN%jvLH}j~OdR<{K4a9x@!CU3*R<XL1xLBY
zNh04iag;vHQh2rzLsfjt3OkB0DEQG)6UPFU1H99BmTm4BLNv`=LoNsfu_c_y!Dq-q
zE#i<|11*!ZIGHR0A2w!!J9GY1YypZ-(SL&@oW4g*HQ&L`iD2dEoUe>_d}69u!cgWx
zLBc>D0HK?O^t=dc(wcZ40AT~OFC$3OJ%%o<pF^MMe*_rybLbNNMtQ?zGlPi1>I|m=
z1)c!lxEw+dXPr-AFs0_zaMVt8qOk$tv<@TR6=1-XF`0dcV7=;5Yf2IZsaaFfHkK7G
z<}P$=dz1bYu^JA1!0ZhhBwNF4?6t-n5kn|rcldp+@r>==W-MhqX$ybN_Wl{(Z3;cH
z?@{`jBI<9Td!Y*_HO)c3rayJX&9MXvtSA>TU^uQdo`5%-07%6ij!3j%%DZL<r@X?u
zKpg0SrebCZ6fQzgGeQ{kFrVuR7pqCHaG4q0!YA(aBvlc=MT@jRh&CpXIqhhPgNK-K
z98(5XOpZ$REG842Q)b|_hGnb`g5h%#m~YO1P5Fq#*>;tZA8HPia5JxSX+7n}g-wLP
z(m6X9?2VMJ1?!uULU_~ANU-zSdRD5$$RH=#P@iYuoJ1}}2!;rk7F;5gI{!9L(93xe
zI0{`~zpk(zd5JkztM~u2IZs|+k7@E1v>&Vj+!!#j3)%`7f~M)XAV}1RezUM%>3fT{
zpmdnUQa;pIfyU~uU+9={xwDD#G}e910N!_=Z9M_!Kk&w*$CFvZ^a*0^my%4Dw!S{x
zim>F;8o*}CrG<B4{kF))T;l)2{C)|xx;#t*LsyV0Oy=?{mJ*CohMFiAYJ(!orVzz?
zrm&ukj#8I6Lm9;Ba{R|XwRml{>pohw)K+NHosGvJepf>Uw|NeL69o&P1}POah>QMW
zlp5sh?OgvKgC2z{q<=EfZbn+=iGeHnKYhvgM8pf91y1y``@tIMEX~x@aofS_$|%ha
z29LBL&KScorPuung$b%2Klv-g2TztNuV1J?2M%WRkfQTJN=-Fr0_ca`ud<P#^pRWv
z2A2qNg;r2?qfdSSL6ORVIqI-)8Y_cHX?Fk%C`n4>d|HNVrrQUx*iccXo&@K!R;aoy
zbx}t<@qqs<*AK~r$xKD(58Qj=2vv%Za!I}c$<_3f){KVnU@+zbu=ww=)<9;}0t^Oo
z1T1I61Wi4sZ;SBxe|!S^4ds1`AS-1H`arSzKcNoA>YL<kIKVs;Z)mFwJ7M^dDyD;N
zljVOUQ_l*OBoe(3OBdr!6{n!HvmY{lnX?W_elqf%jrC};NrHaqV+jvljDuGNa_|aQ
zmzYcfTXAy!>i1s|(f4D>?vnFYd6G{Z69=Iw^d2LLT4Y=c78RB^{4q>E;2Zi&APP**
zzyzCOva0YzHA$w~NYl4JhRkiOE$|w2GuzQcRMiL_7~K($z=5>j4N$A}ICHDHf$4F?
zN|=-!?v^ezyYyLf6bgyO^Df|7zvoX9N8nH*_JPfH*=l1b_Q%y4E&4s*7QTUYauIH3
zoo}>zRh{?c1fhiC-0!nZVU4hLlpXw`dCzVGx*O*?NiYeI;=_#%y<G6$n;8TV7E>3f
z?q8OGihTH_BoUTiyEASP!uiK{vU&dDwOL-M{7`~9Kl#p5`4^e*Ud0!;@v*LjCF{0G
z#QRCIxKjc?X15*qE~56^yuha)wu;2vDQB?gD}3@1<qj$+Waj!c@?uG+U{RCS2hz?y
z9o>KjwWd=I5akEfay8ZpRln&TnV(Xyj;rbBKaGlmIhrZd>lb|~tgLARiZl6QQSdC!
z&@(UnoCsbou-RIMNDf2dp;L?FV_N)+m=^z4T7hFwom>18qM}-SC*X-`@iD5!JrCXD
zyYH43|6E$^Z2bAL#Qa$W_0LPqaIYBXiPG`;IGTZ0KFUveC4Cg1a8LS|<j3%W>C#cZ
zim)QbY$T~)Wv0IyAI_zBGBsA?Ogk!Pt3NU4b@<2VuE@bw*~^GW>9<R5jhlVFpc_Tq
z%#FHRB=*^RW&Jk|>W3Pw^`A-guXSk7<?Vx@*lC(TR78ONn_y<?O;UiAO90Ya7Ov;P
zztz(~<TcEjQLv{m`EQ8A+;}(gxP!L~`R;uIOLiDH?ne~g>%q280*3gGw2eh*T)1l~
zB$2v}O?WZs!-^~}2PHK970HQQ=fRp_!^CV<V{6IrN6?Z3C?d8cl<TaPoCk2MmW=EX
z(-OdGBq6bq&(d3*OIzSk#-$yznr$M|GN=8+(Luzq+j}^hcB48D+I)Az{siZORN@iq
zff^b+%4sSWKQ&n!TX+epcb-6I=sm+6pr`Z#IDoprYp!a(stYSZo9c77q_i=BW;7*c
zIDC&ZkXjEMAQ6gE$7BGw>xQZnuABNgPmo3XyR>1cu{05|YIG^Z1?&v0i!u_t|HPsI
zOTiYj9Bco5cz|OzCcA_cLpx0V$Az*Lu|*QVL#m#J^cliokwkRLjKX`0nH=%MtoVF0
z3`_Cktw@JB9S>M>L-hu&OD5u{;V5Pfo(mh3xq>C&T=44`%gpn<l&dy+Za3jG^#hQU
zk0WnmYV-1>105TkOUEVmfK?zD?B_zFA!?xdOQ`)g&f$Z>5S?f>zQ#dKxg9D{#9fLw
z94N51L4)Q=J?Mr<h2=-+hv!m&lUtFgAC4qCH8R85<?*e*M{=bnR*DlXsI)6KxxDf&
z7Ws_mRft4uTH^a6V2XcMHVyYRB$-lrBg@9D!8y{V0_YFB0jK~tqrz-IN(Im;WgX5q
z_~Xx`#=$&}ENdL3k@|mZ9Q+!Y?c*R>#wF%rOVJ5`Z7UEa6ss0_Al>lX7+Kc^B%^Iu
z)Wc*gW>~bbpirM}I4lr7EO04=(lJ6{xOyulx5F43E+CCixvYUfqTNXRV_*pVBa+k~
z9xYoXQU*oTcscmMkb%?H0I?4Ysw01FVC3A(zCW^oA)2ghdY}7Pgc=yX{*aUZ*AOMm
zrG_qoKO)VA2F@2ln!vh24(uUZwpg7_u)0DajKbQY=~d{R4QQ0Gdce3k+*#stkc(}W
z=!FgiE&QbUZVifvoh9~S*Tb46KK)r_miPqkVb;z04safNV)%3PfaqX_)_`!S>A;#8
z(!!;4BvX2tgw*x{!NTP5gl#~4h_FKrh({%(Z9q&$V`2tGl^PIlfkY2FAVA?94j9Qe
z)p12Kwwe@g=G`deoGwW3LifRhybq{p#hE9i<mE)dSZk=H*hhm55jxS<)9j<cWb0Z;
z6#=6GWctYyh&4W9h6a%@XPh|)=%@bShz15_p2HGpt|7uB!WoI_rxNR(LV(f2*|5Y2
zvCcQ&DVzoH9wsLeD*-(;wSEacNdn6kR;@McgDm4?wa}2(2(|Lw8F<z<Wf?e*vdR1;
z**K)Olk7wRIxNfl1D|1r9WBfJPEy)v^f9y{hDIMp8ZhmQJ+U-ulVws!p-@FDS%wPn
zzNAd<9)V<wZ4`RXyWn02A<rmgE#wKbbP{d00<+@${m4X{gHUFyG}9mx_K`7dk!E)4
zlkbc$?Z0e+G=mGqM4EXQNGAIzhbR+7gtPI?y<u*CGF~Op&dtjs+`Qc6=DVTqCpU*@
zVrYs!l`ruR_YPje`ytp?KJ6=rTjPneP*%b&KJspR9HVd3EwD#Jd@AEFRq>zgl=wQv
zKgD>9){4vFVtnPA0$3!?+ifqOvrKv51;iP*BLd|O054kpRlL+U9`C3x2M@i}tgojn
zK1;>-Wqg5(w=aOle4uoB??I=>w$9PNybTJKm+@)JdE$AJGe<sFN}JGfxHn3ZLVK37
zp5B<{(C$}W6F64kq315L?H5Jm$GyMIze^JLJk6Frr1H;Y{%--NzUFt<PK8?cYt~jp
z(k^HYA39j<r-)SzIfD&37oX`Z#;?9by~5P65ar^g<%{k{BN_>NmOaKxJ8k%02EOSq
z^nXbZ=56)8QFRHvkv@b%G4S+Oh#i~&jE99F@Ny&DhnT-oAPM!HfIV<)+vo9D1wn5j
zNM9C`=v-TLNJTGY^dAsyUX$y^g4(<$cf5L~rmzgFIJqkKB8(yOr>A$Y-kthREd7lK
z@e!t~V&ggcXE~;Km>REq!?+Et52wOOWUoVoKZVAbl|KoY!L0m`>Xnsqy(yJ%zC%@h
zHqgh)Ygzfp_y`wUY@CZvAmZ9rA{6@o-a)Z9;-UZTCUmmNY>R7*MeMv#&mvoDze+uU
zspmzf?vEA_*+Zb=!Ly@SL*o>ShZ%ZWm?67^y_3UI^o<{bUYYt5nGI5q1-9MB=x<b6
zJJ1GI7BM1)A^IRiJsMq<%7L?h#xAsj6OJYe<qJab*n|b2l^~gKj8|1l2Sc%75lHZ5
z!OJrU((eokf%LUSSE}ec7(EBk;queEDt{ITkvz_nSe=bt5VR6aMQSw)@vUxRz8{Q4
zUBrB=ys?jeN6DXeG4H@2B7t2(c+tl<V|8W}O7qkd3EgrH>QY+msVqP?FvrOoiyB}q
zBH)D_H+lZiEYB(M(L=#7mgvA|m=21K?x;XtjZvViXJQz@*+{HEc(B;rpg4^9&%xx?
z&^;F|jbcKlAOK8g#!GBt&wKH(m{_6&<$bpSIQ9X@`Yj-F<5r09s;J+gP#N&+W6PY4
z7?gRUDpRwS`3wpJxj)1*T`Y4b%Y=<kq`{FvnG7=2Pi6w%vk=BejWLZ8ag6X%*Nr&^
z<ymvU#j1)-Rxv3c1gxJ4v_=gIIA}f?9Z&3L9};OYipC|oj1j0*ntLs0OxRt6M_JVR
z0gT|#tfR@VjfFOHbXnvGACaa!m1gXV?%#`0u%Z~;V`fs!KzSp@{3D8DD#7VVfjhz!
z^D{6#<0E^CHuiN3J6D*2HGm=3qcH~=!Z)I>2seOhK@NJj2Qj`yIX3*?ri#^HrO*=U
z`Pljh_Fu<4Fu`k6^p6&xgMY@+DP~_Wggci_PzS}KGxNZnBox1%`~YhtOY}7AQ4?x7
zg*<R3xGv__t5gkllLuap8f*g_9#XqriXISg#axOu?vgJNUOvNXxM9UcIjSQc@Xsa>
zfT7#<usRqmV}CcMVPgJtLWHq@>+QgMAAIO3zdc9`3mAJj28=NjWU3eA4Q?LMIAOLf
z5xsBx7<}9K2>T{_F$jkx-vz=Br%F)*x`6_W3X=V=%v7~VrJo`n>vS(alFW|@{5T39
zoQ5vMM?`6P;}lGP22CZx72uQ&Rsc5^w6g+4=r01N%-M^@Jl|w(;;c1hz`nyWx|?o$
zud%&X+1~H8y?bo$0o(hnw)giQi<G|<;k1uoeahZJ9ai}l&Cut4S&reM1y(Er+H6}w
za5E81$aod1%Ldbg4By{j6W+2Mqlu4YQMMen<wY#XLd_`3W=nPlE<?rfri4l;LUAqp
ziJs7zrFg~Q5P1ZfuT6B_j~z3@;0dM`)XVJ{4s2P2OyLWpoHw-;tm4g=MWJc=^dM1k
zBRh7Mh8TWx-`;~DE$|NPmqU3jDqYiCwY{6fm^rI2zXNZ<9zgGN8tRfAU&@yaIjH7x
zku;m=kI55SQA|F_lc@raJ|3ioQ_>#-7i3wqgDZR(mngsnu5cL=F&E9>3Y&qUdSVO~
zAdvLXAez0^Bx!*=cuSlXybzQ1eJmY*FFb*C>=WwR%`Cj<i!TdepCG0@p(k)*j<83N
zjx)7sGT{jozyR(E1)5Zeh#ExT!A({DPZ&~wasmp{LLuJ$h+RhFQwzM>Chn#XJ^~gt
zcPvKSx~9AEe-eSjo+T3pq?hn$=GVF=xZdJvBgBX3XI}pkX2s<{V?ZeDB5XX<sOcpc
z7jmh>WYD%{;FbKjihi79DAbgXlG@~dn7lohCY3cQ-}O!M6_ggB&K#>w_Rf<a2@f{m
z5hnI-m?4zC+gbt<49!7=GI!fXbGK^1NBqAeh+IMv1OlWnV(+%3SXJD@$Iul-G!MTE
zX2jm@kM&Z(P9P9yR|af?8a)|{4%m`gooC|J6L=;;%0wBg48y@B2K_d`;t71b{7Ho_
z&nLuM9?_yN0|&Rx3xSi1TM_HKtw~2gb*rlVNKbQ?Wu-oE)sP{jQ&8|$))Bf-J?@x{
zR<t!42;+EMp+ZA0gqrl**5tqk5T11cRI)IC^n^mMBDphl1@kf!6L>;Rudv`i6Z4|8
zn)wLaCoKxxw^3Eflx<BkJg#eNO2q?OfED$4vKYx2xI<78x`Uy}UoVB;w@c;Em%ye)
ze1wE}L1#58B?)0#U<H%V%rFp|BUwX92uObs^)xfxx~6>y20Aggq*I`G;p-MNv`z5C
z5RwJcwKa{vo3E{DBm)p}7oKLWzGfPNZ^E`1L4fTwln0H+eQEM}-$p(H@3skcSjaL2
zfG+u(gI84%iy%f?@YENa;&A+03-vkm4GzS-yZSjCzoEO+*BV7?((l9s=Ls&6k=ei#
z$T&?A!Z{Wmmy!c5Ja~NE4WzipZD6}uB~CL4q5kW7l#u(qzXcajNHM_`L>7p84xksJ
zcJZ49y@<4+S82}B6~wi~OQRY8h>Dkf9=d|PF7b+<5Fb$S(vL$|uvc-t7bZ3g9EVkh
z7io<17#Ld8j6m}B`_WosrYNZM0eO^#(>t7ROx`3%{dPQ;f%Z+r?(G!?lL~%QP%_Dw
zhcXwTk<O;KQKE4i3?k@n{J|EMuQ)WS@dO?u0`$M%Btv)Di*XKe1aj$Vf$`0tYPQHW
zGQl7|S`VEK8911;FmReZg&8*OlZ8VgFI(%0L;rWz|4dF6OgLkmFc)DSU2|z+HRc4;
z6_@@NGBgT$x=p4(8yB`QTrd)gVVk@;q_|X|X%U+fP=hpN@@=@+6>h5Uhpe&uR8mSP
ztKcKWf>Rqvb$iIHSI$NBIVyUD8T~UvgX&2RN5G+fKw)+c{XV~Gkqk`(dN`ItzasKw
z!iQ_sRM*V5*g3R)a_f>q|8R{UuL!*m!=dM6P(^WQ#el2{@ZN(l>MsCN5iIUQJtyP%
z>hfZRop@jeLwCwn@arw}qxtojphSyb+vxqNd7$@G5vRX>trEQ4(6&8>Jxt4L=QH!2
z;WNYUZN~SuG^fLGkm;Fbt@#eY#`waPX16U(i!DvJ2P1X;8tKjROFX6PTnTHW?j4hh
zBU`7kZzQ0}Dc<fK&6HTWh_@ZL5_{R$Z?Us7rd$nmmBh=b;7}jP;ph-$E620r0ig)%
z>^OXfz2Lt}9U!*S_$$=6a=VTAi(}&5G4bg!@wfmps(jdw@f+=iFS>fv@96kM*(Y>t
zRnKty6}<V*mFmP?{USn3u>Z^#tJqsrFiyQUgo6RSmD)e2Q%e6N69!wVOFE~*fmeal
zma2c}RN?kC6AsGrhiH%BpY8D95H9UFVt#tKK8c+i9c$tlEtA+0(XqI-!-Q30lcQrX
zgDIT54i~=s8bNe4eIP7~KA6T%9};h)4~e(Yhs4|HL*i}pA@N~+POdpT`cU)?XNR5W
z8DUZIGL0QxiMPQk@iurR-UhG4+u)V>2)uoN0=)J0q*a3aP?%^j#BNo=pydVOVA8S*
z-srzB+^<$jKW=WYsMJbzY3Ecpfg77Dt#hhAW~z>67G?3%!=dN;g~-rFdsJ|z_L!;d
z?J-l@+he7&wZ}?jYmb%6)*dTWq&*BdRC{>ON~jN%2PTW0EbY0O4HM*{e=a~+tAAD)
znfdr$@2`r9H{nIcsrWH5@!2u?&y9&cf#19rTsZY)i!UOi<73Ppd~S`*30?DdoJi|5
z9(YR4&fl5FJ{~0AHXbD2HXbD2HXbD2HXbBCG9D6t!g$C}>jGYcMZwE7c6cS;2Cu~1
z;FWkAyb^DNSK=e^ru`In=X3!t!lK}18auoaZ-ZCjZSYFG4PJ@2!7K3*c<1~ScvHH7
z7hzHGGL0QxiMPQk@iurR-UhG4+u)V>2)rpj1>W*5;6+#zyi8+<SK@8(O1uqTiMPQk
z@iurRJ_2v~A>fVW7n*<^%`XrZ#V_z3#V;h@#xEq^#xEq^#xEq^#xEp3nqO##%P$II
z_+5S%@FOe=etbv4FYz|`CEf<V#M|JPcpLl@FYu4D_+7pQKSG4x@!1u>8|7U30vPu8
zFvHG~285y*Ho{u{{s_ax_j>=#2*Z{5@iFm5G4WY3@dIMwWu6H)Ag5!i$nl4QA2;xr
z^Nn-mi^A*cn+ZD^_rsA!c!Kj8?8XYtCa+T}U@#VTVL39A(myo}aX8fk6KGqiyw0gI
zI;X<C$LVN81VlD4AM?<L0(8tGJiF{4)hZ91(QBcEZRjO8ZhM!5j^S$aCwMG+@&uhl
zLH7it@lT717kaYAkB*5Kda}jiRFEm`sPd0BzdI&R5c$_!XB3dSMk)WSVXv(8>=x?1
zmkRhT*eSAU-W!vlqJSun@XS)EkZjub>+lVgm7LqAQp3`|55pE>MEmZEImZoW%+AK&
zLFeKQyi9Ms;nU^qy-mJ@vJ%1ZoQ*T6nc=x|p2BWnr-yL9md3poV^s@?-i0?tO9ze}
zD1r$%cov+yn6wmPE_D{XSv8^>S;W{DpZufVMNCoUEV$SgVgK3_2j6hr^&zvRntv>j
zhW5v4@aM%cmvusi;Y_6)KGCu8+^<<3L&XHU9>wc-&XFRDQ3L=tt$dSMG2Th-{3Fz`
zC0Z=+*d+tuSO;6;<wSTh6TxU}0!-*+TV4OtER;(JhTPZV&e9(%5x}#`fcyo$<tk`b
z*hDB_;<2BOVEu7~ndU_4engvR!uMkfm(d|ai{B93iwYu#Avq+=#^iby0xNhJ_F4Wx
zs1C=idQi5YrJ@Y~3W-G3qBAv|xy-St!pjJ1Fv0(fV3Do}V8GhUgn~L>1a(^Y*f&uj
z8pAs)B(pB~Gj(UcgMKQO_)5isKBJ~qe0Bt`h%ZJ}FCaB>YITYJJAgn9c#0I8wHumn
zXjHay8+oZuOn5LJ*jkc$*rwV)1@_b&4W!e+?kJOLq^G_-3B%ixuen)?+UMw?p5Sa(
z!>e)nN#)SPz>_4><fF!Cq}fzRCe<Kf8zS`VD=kII+IG3yQVXnYFE`aL?bV_b2&`^*
ztCzL7nG)NeDS9<YQ|#fm6JT%rTHu)oiE2Okn?&M<4wU&J0q*qU4yq5>@#CZ@_-nL9
zD$kPn(gnTI2*TV$;<>{~RqcW4*zsHu5y!3bUHQ(XZ*f0QoF|SSE60|JLpX)F@aecN
z8Oub31KiG@A$I>=NG8TJM2}?I%yX_>SAbh{LJ2+`F{GCWj96zXT#SMM-gyco28Y3W
z6%@q42l|R08H3#mphd+wAd&%3>V^6|%bjY@Jk!|E!au?HFA)aRbY(b<YZ`<h-SBW&
zB;AN`7&o42U9={BvvcWt1+Aa;UAO_qd=|DjpWS@SyQ7Zzq5ymDfG~#Aa47(H92K?2
z(wP=4hNeAX04990#iqXq^yKt}AW6cggNZ<|!mjHS=7+x{dp`daG2fn;Z`^ell|G^e
zl6w8{R~8lDLwy4aN$uS_Hcu-6GfR^Hk2AW<P{FtfeAuy3aAZ)K9-$zFZ&u-}7!LjV
z<HEn;d=tzFgB<rcl8%E3pP+kxBHy1)iXi(%SMje)zMtZWgOO=>eFRxyFkv;ETwsEO
z@hc}FqF6D0xB)pd<NsqVHiX6d$4)xD0p|wWGW~v#yV<7#Vu`$}nSj~m1#3>wp8~sB
zel}x6SxqluFkv`;Fyn^~^80Q2NU;9pi7Ix78M_Lxc5cowe{gR8F)~NE`D%Vqan1wF
zHg4X*jm_T!6yWBsPYQ16X8fScoJUr$tT3lC+5ed;nNMD9V^QSh7uPYIj10CS=xEc@
z*CX8g8+1byH(L!5=)j|j!!DF{?|MG+_T!=JRnhwK;dcHU=D+ZN0j_IgAyQvj7FKdw
z{C9QxP>b76vR=#^Z`QU$NK^dN&LY2>qFAH>rr1^dCd3Q3-KKv|+5#!BL{jd9JS4Pc
zHa)7{KLEwVg`JVFuU5J1k(;2ITviKKC(*lzv;DHjctV9Z+6dE|!W>QRx0EeCe6T0Z
zBSdgug4n+1W*=S{b0%U|7PIF8WnU)4IVLe}!Ce5ul5uNH!<w|P%d5YlN8P~4M&4eq
z#SIxjM^*=4w(`XT=hY`n#1#og(bMjWr8aw6^0`?3z1;t3%7M~8_<Ir`mh^N+2JQ(a
z?}BeQXXAX<62#pEfmgM_e%>#HYbt^lYVO=ueZLa<R>LiexK{ys@G)BYW_RFyF}?8P
z&cZjbP6&LBl>YbJ!OP%AU~H0qlh(AEn~im7<#~ED<~;aYcplLyh)(lA&1evFU5Yz6
z*QM91_<P~63rmEsH#j(4iQvC5{nPf3BA=qKvdP=~#is|LrwcdE3O1P=Hd1uraXy5u
z^ApLjz?JZtM2zCum(O+QyI@g<lgsC)@DtB8d4idsi_dq7$p}sb>Sfc=Meyi3v@mhj
zFjxfPc)`b<ZCdqV&ZZON9gZ=f{+}@`IqZp$x8Qpf$JB6P<OXyX%(%2b5#qe06|gBD
z9~pW@a*{vUxDTL38c5p}Um0$4#Nrn5&c+)!K7lcADlez8jGimwA5Qy%LnQ!U>1QI0
zKK^#P)7KOQK6mehRm?6=;45vfKT*6a^9TUm-qqW=MTcJb{VI~DUP-wGyVC0jJ7)Mb
zS|}f$)@L9sPKLPgAG&6rCr<YrBX^2jhDOs&m1jJi&IR6X$i(YE<h~kM&2=wWh<udd
z`WDeu-iWsN6mZ3BUG5N2ih@&Eu!{wQN7)MgCrYvv{3)T4f?vR!RWQaNGP7VXVn}e}
zrckQDWz(qD58IsUQ}k_MV6b0=HCmJGU(&4@GA0(`;xx@2>amWlF(`n4F33f*u|f#O
ziSxq3zHt%kH_gW+%B*%5a~A<pkn9ATh!N>O`a^FzgDH@!udxsLWgJO3E{2zImyv7D
z<2+!V@5bDglD=7AMLs~xknE5@pC37(Bushwd{ChHc&BArim_Kp1|Z*o6orW|Lz(<@
zQkYp#$EJni-(<I%RGg^afHHVxPNI;e8`RmqgdzSU4<?PYLX4Mn%6TjXy}x(+#fvB(
z1u$e$Mj{!(d0OBr5LyswnF7td_g}bQ1{@;?GNS(tutJb_E+{?8!)u#zVTJV!s=qMM
zW1ZYXzQ^CnKH^z%4Hp*Ym}fEfUBTh9i*C+mCZbGF0Pz^iiN<I&Qtj)7@q`P7^*g~(
zg*5GW>TBZZa<2_fiN-9@JN+~9a)E$1%5NkX8plUbAQ?^Ihz!IT-%Ij*ki<r#2apMT
zD2@R9<QRit{b0glhVzb^9AkMHPIv3}JlvP@8Keo_0Z81fU-J7+e(%C8i8(e<fpK>`
z-;uYkQHWY}|0Id3V${`Ul*{ObV%6NauDu^$Fh1L}Ia5H4MqX#rZPL3R=}i;O`G%s*
z!}!owg9n*e^x0hhfiF3Et^o-aOtzLwtJ|Ma6|8Pg(2J3bg{QD^m+x2-nwsb?MTwDW
zRg9YGh9L$;%3O!pLGO?6_S=|K#rMOvaXFgEYK&p#3;E*a%b63S^&}Hg?uygTOJp!v
z``w|0EVL7@Zk4^GZF*4q7ql^YOABnEPb7zNHPDVuYQ($J&wx06-Hks3l4*-<dBggf
zm5Iz&3wW&cS7hx4l@8T8x-g1UUj0kV;!75`xMvjM8U}GDX6#Hffop8@{45-jS~{Kb
zheX`#dL^b@^kD|m9|8QMaEv~aqY*v8W>78}2a^U~%|LGDhCMK@K!V6y08K;R-h~S*
zlTV_QNPxm?Z}>7G$C<Z=;78A}7JdkI4qaA{{WS@a9gfN24$A5qQ7@H6lc_99CJ6--
zo_Nr{Xjn$9K(eB;h>L!T-^`_yMUYQ1>=H&Gi^${n<ew)v85|%#iS(p<0%g3Zmy?0g
z7WF{mkty^*WYwTVW1G6D26e)45|CLHI#ZR7h0jH(L-&%=0rgzNgTt1yigGQ~nB0O`
zb8(P_SBA@~VKF}tu?$cCVb##Wr+zI)<8tMUKWpboqH(AGS29ge6Y2MYs+OOGgpi>>
zGou$Wnp!q=O)(<qB@<mkz^DYuXRm53T;`xp@Iohm^>~UN2#XK3a%jPe7Q9K$wo?^^
zv+-O!{95=!!j*rvEgl4X!HO?}6?NWS77tPs1=E`@6a0Pv{6cV6Vlhyo_l7LYQL9r>
zz|6eY>=|GDWOT<;Bv)xw2kT9QA@I)fegrX;bu3yA&URo05cVjTb7TlpzmvlO0fG`B
z;ah6LZ@?3VAW*+yg2jl|w%3`A$oMVFRPSqzKbY@ee8zo|aCR-%OsYgVxQBHT$erK>
z6FPvJsMcjudb71oh)y-_JxLqZ^lU^4I^}Wl%Mx@hnKWdvooc00EYkh?moi@TK39-l
zku>ulN&6uwS#EEMS41Uc9EAv@7)XfV>Y)6M1}}+r`<8?O(8Q``3%dnEFmx+(Cs4*J
zmn%)!SkOB<{GRb_I#sKbOwd6P_$%Rr10EPp$DO6mqo8`Y7H&KP!!uZz7%J`vj87E#
zmUQTbQVhopDmf>#`+i0L%XHBS5f-+M*eAeC!&;YAl!1zn$p3*#j`A_Y#UJ8M4c9#$
zFK}SyB9O9LlKDl=5_`t|&@mBc;_~J@yhm^%#iP^(?GFpTc4O=m126@I2oh>V2JH@7
z`vfwtbTJxz(u@z|SfE(V|DX(*1rAM^hbH@fjYj);DULLl4%3`XPSPp8?PI+dEbglf
z?eioYF8j}hI?CWp6zm@nvUWLJ3ll~068`Mb{yeS{cm=7DyJKT}vu*taio6~L9bSL6
zkG1y~&TvHcR}bjvq`wAr>aTYs?T_>qyhGahYd$Lak^Y)0a2%w+MkA6`JqAzdx0Us%
z9Gy1+0n$%qzd4((1ybAZ4bR7i=)dik*!piPJ`8%bi|)Txc1l<M*Fo$@_1_LSy;lAA
z&(8g~4nZ;fw}K^f>OZcXJ)!AI;wjvINfwqA=`EhX*$|AKjbv4H;7(!WU-~%Eo)+Ll
zjhh?yIcJ~GHKRWrcYin*j>U{KGO?g7J{}~AGQ#t<9}Dy~ebu^RwJB%9Nz118<S_oU
zz~@}&eWc$;MjpV$uWPg*Rxbh0buga&f^+$05M>zZ37*k@yM7Y-1kCxk5^%OtDCC4M
z|1sq&<7_Y|8@+;^h#&f>wKQ5vSs@<d4;P4lcXxxxils#S!sBH{fkyN`lZ$Bo(P%`{
z#MS!!=suCp#vV>_f5#%gx$^t;7C32%hs3-ZQ^P?D)=;n|zsN+<@MWi73BwAA9QtT9
z8<>udDKLTPUajxD_$QPnZ9kg8E<I?0f0Y93SvBDs4=Y4ypuUV&PIN-(Q|N`Em1ul-
z@UAs7Pe&1l;C0e!-Hp=2^uY2D3%_GgI#B=|gLNOHPelb?pv3CDPm75YC2yPBBZ$Ed
zqeV8VfF>9LbcvI&2Kq%@X_SW7R;n7|KZ|h@s<;kfD6@mb(;z&2Z$)T8fa%fXQb?en
z_7&{99w4(GNbS&rpaZ}O4n247aYz<OC`Ecb8O>?GmHO?3UQ0O}^c;eRrQY2mL3w|_
z9R<7#bm@K293@lW2+&63*=q!a(4#X@>61L@B;Fh-#F}GkC4^IH;#W{5qKQTI-(&Zq
z6709}J?BEY>(`$JFgaVCH&^2W%zg_JUcghl7#=BM48Z9oh4F?pIX1<?A`HwwAh}+K
zW}@*A=rd3jwt4PFAddco@W6SG8~Np2xeFQ--aDPTQQaoo=3MX%s&vlBCNgv+E7w3<
z7Kl%7YMJXbj$#DvYYrZLK}cz)D0U_hr&=69gU}CcXNS)vH()p$hYIYJh5?4<7~w6X
zriZ8CuoS2bJBo9lwu~E<o1E;t`2j?&%E#d$J(beX%D7GUewCJo!g<JQtv*f`v<VO5
zXfS17mxfCj+6r@ek6P%>#hN;wmlUm2S5A@#y4L0Q003w*4qrk`=xiE^fo5cf`$?y~
zgf7yJd%&Y3EDRlg{}OckiOd|z%2OR*kB2^QjQCT?%Y6d){Ra3oczVRzA4jFuJv(P|
z5f5R7`%@GU_T7o&1Wi8!JaCs<-2;*V)i@Lci7{v67xKkjD!G9~<~zASWUe?~JymQS
zq@*XMB$7Q!8i<1Kj%2+BuaKugJ$AcuUvoB!3(UYPGJl~5%==$(x#(KxT6g-frP6cZ
z<8cn753+GSOl{+`EDutZs0l27$ovsU71VoKtvVgB_(!0(d&K!45it(E-^};`Jw7LD
z!5+!&i!|HaH^%ws0~22mr{TWxOM=hj8>Il|cv64PV~wrV#*1^T_qE0=co+U}TXW0z
zN|KGy?Lv$}`3~#fO8~?`3Yuqy`CC``pwzO>9mK`H5Hf!wH~vlpXMmJR`SH;U>{Hyq
z@o^lSt@~5Dzz1;AewT$$(XRvFs>SUag^2)XkuFLQy##LH0wTXRQGc=)JTHRzhwng5
zXQ`U7+c=cN&WY%W+z&PFeI}aoJ?k0T{(89mQXlkgvt#d?b_e#WdrrT`%mR%qw-9j=
z3_ax^048vSoxAnGQM_O}@Pa4M;z`}7c*g44neds2tD)hAoc>|rzSmRG6YjtyUJ-2y
zA3U&wW`RG3`E>5P&iU71NZ`6<=<0Ak^lDGsF6aD-WE)VWs{6{#TSUNy+@WEa*x&d7
zqz@k$9xSsH^%nGuCwD*h$P%56ZHRG)F58FIL?Z5(cjvBgHr|Rbb)oP=zIAb-FfLE>
zB!JiT>JcvsQe0U6Wz2=;?ocMKlGe4{@P)!28hpqqB%}d^xUYFH^%|%y-=}H@7{^uh
z&dL{XaSZQ)#sVN35x9s3XH3y0z(>q|zaHKrPBMJ7{caV%oB{&=poQIt>%?JX)_X15
z0xtNr2K6)a>bQns52hn6*gLrhn@5n~-0*N;B-d)iCz*=+_+;Zb4z}g{RB?0LS7NI8
z6AZrf3XHdC`uK_T*!MQj<8Vcf15J87o8%hm1Fj9gWS8E5f{fFN#;?z})i-hVL{hAb
zhoSBNknt;b)0y;3dd0T=vu*#zcVHJYNA!2G{6lLP?(iM0=u?}T@D6rRLd9``J{$3Z
z>*;OD6@Re>D0n&3*x;iY%l;jk{!rh$ok<Bjo|ds9@<eQUrC&U5w8ig=Ct%hZ3f;wW
z4iC&)r#{9<o)<yfq0`p^<2HGm13XF|=_EQ%yXZ&d(GxqBmzZCFptt|zz%J*?A0VB=
ze-0_KVNGIqyoY)ke*=U=6ol6kv{2r1JoHpMJ+Qx}tP{dz8FNH-RQ<>Hptbk0tTa_t
zx+-fP%ld#wiCvEf4vxp{O@IO~2zf1hL-n`tLzF3Xo*u4KPsX5#PHv7@_*-J#zf^TE
z!8_XFWZk8xTj(LZ<;UsmQ1qetH)E95zd+~#;{o_Kz&IhitEfISZso9*`p~`<sxzbd
z6xW_=fo;xI+}Myf@`nywy%guXy@m1+`rcYvalw+ZT0g%qiavy%ZeuG)s8&9!T6ruH
zO?sN$Mf#s3ayjHR(%%A={##mfF@0C%OM2>}k^VB3{tT=96T2w?7c4(hr9WGxA7rK9
z22|PTTir*rCv5J7g@)25Lwu(F3bLVADo&yGPJ;rL(}`U|H%%3T373Yz0m7>HV8So)
z-IaCahc`3)pts}<#sU*^e2b*AZppceGo7jIewgo2b`vtC>|Vr>V8VE0p&EH1SqlDQ
zEo9IH2)no^hZZs}l6WxT{+k3y-jnluay>HHRNcj>5vuNSY`dyX%x1(qQm@CTNB&&o
zr-=jf+)_14NWzb>LW|vuSHKF5Gu7YxqFREbE#f<1<vNXj`l2Gb!pJV7HX|Nu6$+R)
zffjMtl+!0HBA>+cNpU>|4jm-f3bd2Syiv&(n90(tWV6g<!=sW-Hj`mnSd})=Om<>a
zvRpG+O%Ym{L<LeJ!7U18CUf&6NvYAf-$}`0vgpvc?J3qbFxx~Yc#_L)wv>I*PT(yy
zD0s&vHC4%i4MDaLnrUwnwxw)lIn*#-m@k>{6{Lq1=$ihzbOuaNU{9FO?y$@W^HJ3-
zq-K9uov0TI2#RhoyyHc4;%r*YTIQ`G^+#{lgy%!*)B97rFy}}125Ok2gQiS-Nn<$H
z1TV=lAs=n;6#pXQ8xJ1;2;;dN;rhRe`~k-AG51NS*C*<z4Xg)4H6B1l{UIQUoDZ^1
z`-#G`1MnI>EZLoVKpnsoLz12PFEIj`a}PRPC0pBmjP-Ik5H~3R#6@}s2qYnQ?v*8g
z3CWK}m)aEnnw}>^47~ypa7ovIhr|P63IF`98a4J_`5OdbP`W-H;K;H9cfH5yzh-+c
z6r7v-5M`Gst6%5~UMD|~<>mVEm?b-{iiZ=NP{J+q(JHKpAq=Wj@xn=6Es7Jvi(-&G
zR~%<^>yJIcDM8VU|5#25VPaXWpK~5yH|21-=rO$P*PA(zaE=^8Bv$FrOzIyB+WM>#
zEN8WVYjJyPOF^Xfj<b5>Eb9eVrtB2SP|wd(y_YkB&z<@%ppU(|Mf?t8wXDBtz9s78
zFdY)LJ&-ms;hXbeDB%u}5GHtq-$Tu}?!dnEmgi1JmanzPu(pYnw|?rkdBb^NL;8!x
zFbX$efy9JQ_K&BawLi9hoH5Lr-!1zj-2Dudad`9icd#<Ik%`)?1ts2(@W@7Qc=obw
z91ldtSkqYNX$upOdA%DHC;l-)BcYzupU4LBGKaijzgTQYVI|XPm-u!nrn=fCrhY>9
zhq1f<rtSi1k@{qd_^wV{#Bm%w!Gzx~uzPHPr&cgCYu;qfk_T=ip9?0OjilkdWA4f#
zntwmOMQni0AoG<?f)sV2pc%Pr)8=995Ra6>a5y`}YPMXA8oJydz8v6K8^o^y4vvHy
zF@nqu;%h-R%bm;|$~yBA6+NBNhrL1kF)~MO5U*zdO-om?*v=coISN7UMnMqfOma|x
zzw-{UbwykYi{kt?BUiBdJg{ryq~-e8z?XdsHF|^i4kWiGICidHMi{U`Z0Y5o1B2ZC
z!JiNnprCBcU!2cq3}xRR&cW<q3`L;+n<0^Tp;j@V)kXm2o6N|d?80|UG{(J=@Gka;
zhb4!%hqnSTQEfXhrfs(E;pZi7^!D&Vh`Z!v<bPdl52xfXUbctxI13<YpMxabKUlHM
ze}Q>$V78zQ(hZ*?qSN*;^bc&T;xE?ywV*fIn3`bYEtn*+F$HnSMKVj%md*0k)>Twj
zm2m!q1ifdOw-yyqF$-}jr^-Kb3a`Q#MmCgkEMS0KHQb^9Nn{<^0>LyVyak>n@Qng|
zTEq9SnS(SA@sKLsWz(u_XO{X(s!n<j`G?g?WeqRIu?p1SmC}2v((~Y7)+(KQWYz||
zD*f|p5F4!jra7xtuj-2&C$%)JKLuZ~ote@KrOgei^{2^=LOo}|KvD}#5Xk^eU-CsZ
zgs~dl$W6$vl5F!R)V%#5{bCB`glb;f9vlgsGt2=?$ex0@Jddj`Ld&XAI*tz5H04j3
z#Yzd5ru+@7E*5JlZ_i-mcFVzG$Dp)m%Rzz9)Ec@dGotdNOg(hXJnEsZN#HF>GoZCy
zY^k7q55<=J37V}Gj@07WOcSm1kkC(Sp%T$g|1zSPJXUH*ks8i{dYYb45KGPW$#|un
znbL*{A4HEUfR`|Q8|w+*hIkV`XVdqP{Rks412y?}`I5pu!K+%2uPO(k^gSF7fft@5
zF>u}cHYf7nRM6R2ml!5DVMZ#bzb<*Q!*>P^g<v9yeSeDklVLiz&L_45#X^b8GwJ>o
zvy_>%FO#M)X-*_*iX^=SNy)=xkE2kWKl_JEfYk%63q4@z%y-OX3yq`MLh8+Xsd&l*
z3Qda`0NJd=hfNYTwFqiy+IluV2wHLnn@EHt35!l<0M`iKM{=<RHv)+W)?<PnRH?Vb
z1n+>I8&a>23I3}pWLZq`1eN+>1~12d{A=McoSSapydMt2{oOSZybmzrYU&)AM*#2r
z+uXT3oQ>Vt9!@y?YtfzqdGPPlaDG>N573LGeH%%jc;BZrY)^!S^!hKf)7ES08?kTY
z`$%DZOK=Y8v8G2uN1PAJa;yNMxE`8t0|v9RNfUZJHxBmV!89#63%2E1KU~k40Aj>D
z4GjoeGeAULz#&G2;rRusRkzryto1?M-l^X+6cz^9e3E(k1{AT96iKo5DwYRUV0?>f
zzpVU1|5{)~lJQI0exko;=Ab-H%Qefp5asb6Zy*b%ErKlH&4)nL@ev7A@O2pwc|`ac
zBUssn4&K!ejjxxnF$w3(polQOewTh2_=>)Cu;GO`)o;#5+KnrB>Ef0xSd9KFm+Re5
z)q)r@a<AJ;srC255oYo(r6tIvqS1+c21|6n4%826nCEVh#biDD0puuTWnOw{tDiTg
znf1Hz+I9WIr2^Ph03TBSFQe*z_F5w4czgZVvHpGNT5DfWE`I9|3NZDD@_pTRk?&!$
z`!y`tcoBgZg&b}s`#8%D?^J9=n1MVYHTIAWlAXjlRAr?ilbk9Fck&w`DB%Q2eln6v
zujwBS1az1tSr3_jtsZ_x>M4|rPh&g_qjENVfEJ)0NyU0z!M94zdY;5i4sIMla#hbP
z*7K-j5!Pg@XJ1r3A4tXtk$TQo^++n#GdQZA(USb)a6QMep5Ce+axYsw!py>W^Gn79
z7^Y@>l2tvDiuJINNP9lKn(f((<f=Unor`+DQvE{?Y^z5Ya=4zJQcrQDp2?DFg<{*N
zrx@RndS*#I*k&;68P0kJsCxA1_V6>(KMzXAJqR<~bC#+{Qn5WJMAdVwBzJ}Dd20ab
z`5M;S<`xpR*yEE6xL3#>r65))78iF#IHI8wM{3XM@G>j_5FHz3jx&BPwXN%TtIMJb
zrRamj?nPdfBWN`KjLBcV*jVES>uVXmJ_l_}90}FP$7q!=l9O?_^|hW~-{4Ch1AJo}
z`dlgBc!_&s8;$qWqnir+RDIor2a$w{g7-FJJM9Yr*&Z^%Y{X{cShKO4jqS*1EiO8K
zqgEfX&)V>1T9kmDqB@s?qc2m0MhF4t@Djyf)`@el1bz|QX*WVEss8~pM8ljU+}X9-
zm)3A??^AH^B3i_mh1MaK?(aQ@HLOXK9p~`+h^ICo+kTFz3lhyQ9T8F96G#?8Hc4c8
z3ageF-;bheBE@*UC&hFq-r5{-JVZ8+l_rTS)Nl?+stO2Hd@j14ezg%{K3os0u*CR8
zQ;a_a_P&tSBULyX)t#wUHT_skFI1!7+CRLOGg>D_-rvA`WIj}jxqV6{Q_$h!cdR~V
zWnU7+3%|1TEm{zULCA6<kC#EDACVXbT~Z?wsr3@iq~Vy1?-vc<b@TlayFDV;CK%94
zLk(|!2L^K+XpshMmq7RSckTJ%Um54jLVs$8^uhg+_7fuTVx_r^77HWMyY<)RJA6AJ
zLZr2g%op+QSS^H8TVUs?2v@5iN<_~MX3&Q)<U^<`!oLE&pEz-nt+nR<k*XgRYrzua
zf6sTSt$uZXF)^YqPDM|s`_izb4%=Al@Hec*ic0N3>-p#l1Op8J7TDl$I6|dSxK*B5
zfE`Z&c*=0%B%=&viT>AwsWUzf4Ug*(@GM3T9xgm@q#PU`j>O2S>(KTXeSQC)Jx0N$
za_){7i^JVEw0v&h8f$P!s`|aaF^#5iObKRC+TO3-;5|5Rxe?16-@u|^GAu&hazh)P
zrsclkOJ@LM`bZG-tY@Cww|rY+L;=&<tugj;?%?=D_fQ-Y@vkAZ!`K=#0PEK?Ft_69
z#uN5TN`F~!nTAu4wBn;ve*pXq{6||v1v@aFisXa`VmR#6kyucjkL4QX6>tT8H0CGt
zBm`{e3HHr8PaTk~$Goo&NIr@;Pbl*PQX035_5fB-V7Ht?(;q?Cng=8?9yA!D6y<)2
z3)PSok~WOtcp3~0P0Y^m;v!wToh$n#6JJ~0fp@gMdx7S+ab^pZaO+0~-zY7JQE*jK
zdp{ZP$7oG2`qCJ{Y`d5T9-vOua=-R}GHQrB(f__@XjyWi4~`XKlmR=p_Ir>|v=91d
z!0SeLnuiRZ6J&9r%j1UT*-4yH#GJgaFi|fjN918tI7UB@;KR5ksVMFME$OIcNJDm<
zsDrr)R&hsr0&roEQ-@N!e?G>b-9S*C@6(uL6}Tw>oUmZz39TgOf)NO}noPX@5%Z&k
zSDx-0b2j_MpA>zt&vzwH`{BqRZ0*`lH09iwbAIRB&iN&|_kMNIbAC7gz;k{#0qOR0
zeipy5pZ7~K&-*2wNvOEc4s79}KZPK4h!(g$(P~ZvUrpk%S<dgx#QD8Kle5BB4;?Az
z_xj8EJ(oLhwDA<g3!x8KdGaKd{rp~vaTk{Z>(q%p8vUghA0j<ylxOtPvtTIdAtkrA
z^ZPV@55u>T2UfQqYrPz&pUH(InJ%@xImztroq7uwSTM~_0pC+e*QwnsZi5}Ge|!cz
z2M69#xX?uUMM%$u=ROPotLO(KtX=Gh7>=R(nYpJxqc3;h4H~J=`!a{)E3jk>8f6nr
z#L0hAmFQ!`cR2RJx3z;nJ*{;l?M;}hMs^fJEhvgnTh)$&xVVQ!??CL8@oERU$<~5-
z`S^r2>U%Q$*lodtGZ_?&-zdjCk@8GVl|h`wy@SxnC_KxwpW-;g{<0*98jH@RomiQ1
zJia?Bry|k+h9@Ao0l-5I(Ab8uR^dT7_7aXz{Cn<Dzcx;~&ZfNX=n5=%2tO<<;L6*D
zjulR@PMn<c)eR*BT<eQtZH~izC<ZQ{`Mn&dL|-02cTxbI-4|!plUP#vi=rPwHu1L|
zSVE8KTHp>Ak6P$u*{nsgUNIHr!6iS_!&4BHTj2OoK1&9-B&$!t)%H?SP?dy>#93e<
z!~&%SfhFY(StmCIunK@s%n)$>)6kUyHUODyW<WGyGVc>mMKmo`MbzVA7E!|xb^tGh
zLNQBJvqDV5OeiKFug=sZOiARniH~=vViDOm6ATAQ#CJFVIhiF8Dl_Y3m6g#j)Pn<v
z>KQ_6aNk`mnLLny;8mRooV|`Bsl5PFYXj5{X6-dH3ETHRXssoQB{|7TM>QsBhg5;_
zqJXi2@S0$-F=j0Q;aUih)icsq*)AI#X<;}<x)|3Q4G1g}<a*p#K<Za4Lg^s6e2|<L
z6l_uY=mnV55M#kiX-|5@icE@$jPjqZu`oDJpBQ$RZrZ15>qaICfd%UN0%PobBHeUH
zK2~TH9U_SDHI(f~jx?2HWDXFD1qyl_+cWHKL6R1@kOFF)>R4>R!CQS$T}Z)EkwUJi
z+_6X6BHr4`S1}4t!+s?vo2S6$m_Zg+lm<gh+tIJ-FSZGx_Ad5>@_KS_Go<cm2;Ffp
zP=GrNb|4k@UvN*sPQ2-V5V17x@+;%S$^LRkXYj6nK)(vp1nbG=5~2xfHC2D!AE1;4
z7*F#7-ygp+!D#1dE|~ZC3f9=ihf=UHq4$?UI`nTKYZ~jNka}V#YE&O=96Ot!Ft(4u
z^gZBt>zX(+1;y*+mJ9QhgN?7~j$*vxQG*Bel$W9qYd*9g-nbFbVgL7nBg_V#z><x`
z%HKt#N*D&L#Q47#jw)sg>e>bPzek%7&;R|yM1344TKK=GbPoUb<uWtKJpm*iPiV;^
z!WDpn9w7=(XvG8sVAjaTh+)e#bw7&PKs-3!^xY!OCp<QthzBFxX7hv(Xb<N8elZ|I
zfz2#8;xJ#22XtWQM8)#KKq_`mAAyX)4o}5&yZlTQ_KF$ShhZdpr2E(mi(}X?R9JIJ
z<=u`s7{pG1(Jq&&v@ae=X3YhjfZV-cT@XPMpU0WPc5g{0zqtc@4ql?X;_u`M-DfG$
z1{2m#6VyJzS@p-f;u8gOMZ;pk!%f)Fe<1}7M&F<#)W)j1x1>@A%Sf0R$(05`vlhIM
zy^2eZEsyvy1fp*NIG%v9{CrB0p{#F%eB=?G`u?-HtQy6(oA)rp(&0H$VZt-$R+5K#
zQ%*o`=`aCn0ob&HD{Mxi$0DJ+gTeF(xX!vb$9@lkjSf#188zV^uOMp)k#T4`R2-#C
zI=oUL{|uD(AE3kQfFO$whbUS)3HT!&-hUd->GKqLhko?wAhkX!dWIQ2AJIoZYQLgL
zZzDBa;HOB9TCmQf<^g=b`lPZWCADi$k^){9bVZR`oG^%=Mrt#OE|3~EsvxzSu~!dL
z8;&F0`mGp_-0gn_+>p;S&NFH#nlq6sj;^9AE+i9`ZTgugq2^U2JM|M>;O2LeH(S?}
z?9{t4>?@F>o^o<Tp8l03SP$ciJJ4oOFQnerv`;r<4IC5$a>MiWIa0Mpj`-s+FFdpf
zOYr2zT*WfHFOUa`Kl>CSvFkJ^GmYY8`njNWO8iG7J?Ul#d(XU_L_Bvz4P_7f{z31L
z0l{G&;rM!YE}&5Ds(z}W#xH<NvrUH2d~e@vhM<gmG8PCaJ<WE(cw^5Y1Qdc<VwJxU
z`B|Fr59@`8XHDdBj((6-1c(M8hkXU3>=j@yr_>E-5Cw&X1Mpe1fhZ5`!fHn82sY!}
z6YBpjJTSa=>YGm`j+7H6ETHuHz@sNL9xHf1FLx2C2<uu;s0cc-J-+_71#230D4}>b
zq=BhYhY$YVPqaI+5l=PfZg>mIecN1<`c7a2CMCj047IS(!%-4jy-4fi)5g~W>)bV1
z=(I0{et^$<JUyZ5^fM)rS5e>-<5Khj<rQ-+`6fDA--Rtig|l_~8-1uJqBgoPML!Wk
zLq)9BU*Zgf=Km8!jB*)VaGBCKU5Z0J5hH0B9a7`u4i2|1{;<4AZ_!UgZxnH3nk%%m
zZqbe;b9n~*hSei*LkRy)kWw$CD$3cooBa{|1@((?8F676^R8#!+>O2uSLGm9KMxIX
z2T$;X=D<3F{e%u&?Fr1m#^>lH@iyC&<32DJP+f}E{aCDZF2zCT(Mizcz>GxnChm2k
z6PZ;sS4flOrnLpu?i9&V+D4|J0BDc((}l?2!g{ai7D?U%$eNMnj(Y_g)zkqv8(~%s
zkdgi<2@3ReHvj=~P^bbmcEfpR)8Co2VNHtd?4`_m1dy)FoK0dT7aUIKWpZ4JqV^K~
zeaysiU<sq=ug3eJdQ)E#f_{f%<Rx?@4hX?F+FoelECd@xX8(U(;H(}E2Pq?Q%_}l5
z61me28)2Xa(tHaKg5?9(l3NS(+eGUH(Q+6AQG|X4iZ@9E9>8!EutTntQf?Dg9WSG9
zrF72I^08BY6yqc^1XbRz<P3!t_-^6LtEWXGP5Pq!s`J?sqRB`Lj)pq#^J#(JvdfUD
zf5H(Gg;l?K9AQ~+l4?ZXZrMMI@#7X!MDUc`Cs9FAooK71F?qejB{VaxE4#<RFv!8G
zCB|V6R?jvaMepSNJ!|89C0f9MPU8GHZPT}b4Go_h)$nmQ|0M|7Jnyr8?E5&JncbSG
z9hLAiZ24guBmw{3E)ax|S2&xdQlrv^1JQiYnBM-XjC18rlg3rWWK%!K)Q<`H_TNP0
zqr>!~oQf=Be-AA1uanhdF$V%1<A}KawM;PyzAFSJN90Di7#tO217s_Q1tzd~97R&5
z?&d)NvQ;fWzSNl9%y*h);>o|TC(Y@AW%+Rk!4#i`e&-Bun@oj}hW6@xF$u_d4eVF}
zb0N5-plw?{Gr&*`e3G^~{Nf;f7cUXFna}f!lC=z2XiCHhGNu;F)g=lCH4_B~reR|<
z3zEgjk2^uWHAy<^pFkj{UNxPf{rbs}8#pR;Zl=PL8un~u@Rkzr#TTPrpGR_)wlTQ{
zHR(aZ0K>*ftTvS8Yv3bq1RnZg@Cr&c^yp=c2ii3@Gbtv=$tOZW8OBk4W3TY2Hm(|J
zy{|SFhTqp3cbe~*`;2>R;on7BP><JEJoc#i8_RL5pCdTx**Ke2o_7Zxo?yZa@^#8|
zK5&eDVZstDP2eyK+h(3%A59YVOy?u@1ub=p*7{w%oECNd?M)3PI$Fa<bZC6NC)96*
z2TJzT^@xqvoPS&EiQA&(;u!i83>@ct#gBU~=EhpE&;7E$wqBi9Pt<a8TC@tuW}QIo
zJI7+&3tH|bSW*T0+#opyOqWN3CXZ8c<&ign5Bg91!V~Oy3O)ew(F{xdR!e<XYyGwx
zz&%QXwO|o8PZ~CN6s2OQf70FEy04oSx52|&c01>f#V7-u><XC%Ed5F_p@3xuUer=s
zQ5akGiWc{kbK!AZ+BJN8jPsWAc=mDcStvE$9k+?hM^2Ew!#5q?0uy<j6bD2rKhQ$e
zaMV-zxiht2vNN>|Tt4@MS*L+-;F#sgtpI_k_3p=lhH^ic{RKKB>sM;wGYm&it1F?@
zPt9BJ#}X=az$QG5CopM<91-ku625_bg-<kXmviA#@~eh@bDg(*31}hmJWY^x3tar~
zJsuwiJUa;BR|2WC@@r>me*nkf1K|2u`v0LT9a0+Lo8Hre4<IEIE^aoYp1Bx06@d2~
z9u^K4!3C(6`Z{|10fxd0TWXuudZ3%uup<tdT<3yMS!Ucj&V@sz@!tYenuFlj`(5sU
zP8eU{e=qXCj|k;f6Xcfw_8RQR=C=CIFlCK~mUsvmIB(1O9}Uw&iv(nN{r|w3+OwB{
zmvC|fJmMYjgz`>6nBIz0=S28(Q-Rl;Ayo@rDPw0%W1iI2B+LkHYYjTuu-dsmrm(no
zapi(@;p0L;-+?ObW`jb#C%OaQ5%^b09iTV1VIKoFYYpp&`?neL9bR5j^{p2{sXuaa
znG4>M=IoWf5x=eY?ZRmCf2akW%&di;BFVJB#CE{P>E5@r)*apP@PtPF3zFsM-Q5sy
zQ@_I2wBRAs;}%xD+IUomrXHX=+{<2hS^a5yA<%7f@7>B{)%fs)dcR#1?B)*Dvl3=p
zk1>8L?`nu^(ZB~ny)R*#Fo49NtfCWJc}N&jq!#M;I<noW#lhs_C@po9C)C632`yp;
z?$(bncv_8>q70J}jeC4s4YUV0S@e5JYyB8jP{#xRp3rYqbK>V41EiZqsg83@A^r7v
zq?V7KPvc?SEE+w=13#??x*H!`<7N5uY3k3zEme5XB+ohztfCv*X3$lT+K&i)2>Lh+
zWr0%kb9<nqV9)dM39hLzG??X<N8SZ|1U8UEl0AYo8ruXqh3CK)l=GXb;vDYOT{v&9
z*KrI56RtxD`0(}e=yQWS2F&FHEDanC?8|sxP=a|Be_9xB*$Rd`pD4KLKf;8Y+^W0L
ziU6BLw<zN07uzOpqaN8W0saA9GkBnW4g|{jWl|R$fQ9mc3_O8CY{N2<%VEtDwf@OV
z9}j3lix~;U>MtOZwbe6!oPN>I;e_QM0uNksoH=<i85HP?K5>7U_d~}8p?;q+B`3eJ
z5E8K%;bwPI!}p2KhHd}~YmKAY1h#mBaSPIzusIu#Q$V+0f{0nn92EGpz{3Iw)^!Q`
zhd+blx%8B6fC<i6ZswU6G-(;Lj6{}Ep&1>8&ZVE3Wukg#)9EZWn0LiEd!t4}6tfz2
z4a&rlSt?oQKx#)}<925w?-^LZFH~?DEBJ)+c5orng$m=eb@2vc-Q2q7DjA$+V?ztW
zH!(bCi+{TfuNX{bf1A85bbTTyXx*p;3aj95+hEV4YpMG}J&sCmX*G_5fwb`^iw)-8
z4^&0^BM9Ny0)5pnRC1Fd!Y7!t(11s<$Hx`rR}ea2AaD){V<m#HIo(5Fu@|?5COnBx
zm?!Pje}g2(W>9tdcB40ti7n)x107sMPz<>UmP|HcBR^x*c$mkC*PKNCdgO(F9A96G
z;~*nhcPu9Y7QRf=mPPtdh+(*#;UnAmUt9h>Gk*=R{1uc)`gO;JNy0Y>f$uYLIs$>;
zH(5!rUV19ndt<mhsmikF4CY;h-iqLBA_O`MU$-QJejgS3)gN#QTnoBtAFR4epIC|d
zJ)(8W8%H=k0=lfRbQXX+kAWKyIIZ>jL;(1n1zn*37S;Ky(SILbY=`G12(uPE9o+%X
zvo;7U`wiuRsUYx;+yw!506M6fh;}v}-;MM<z=xeV*hIXqEwjWyaIV=1B#zGMIf-vd
zx>+yN@iXWXk#25&l=SwaT!+(w%n7YpsJ~ll{VZNPs{dQgrG+@1?N1?tna47MeX1)_
zOR#4>9}rZ|Ntnhrk;Aq)7p6;<6jr+PEfjyZ5C)v-C8_0U!y5ST{b$1`r!IUSv&frB
z<ghK*t#x-i9HDQ;OuN<wK+{}~d=PL?^dc4&O{kYnqmQQc0Ln*zv!yV7J2s?x2t*J*
zxyA{_--kFvj)HYbFdp_K>3c2+4NGw!b)G&FV~HiTGeNM=?K7ll5Aq>R3-Zls+72cX
zIZP=MSzg@6^vw&B+>le;^Y)XE`0p)fJ6eX>Ud#=~%N(bWpE#XB27U|vKrkPE-d#)G
z1;FNuYatbnS+?~H%#)kjw=B|Dh39i?{?+zwGpBUp5|*jx`Sl(+7J)IlRpuxFeXx3{
zsj~Bnp8i=lxj&aJ3T54acMx_B9{RmBcBO(<$uTc*_BY{r7>weRgb1U^hS{NlgZu&a
zm*{0s;5t`+o!&9gaC3lI{EzWr|3BwZTUW%zm_{3_6j+#J-SEMzWE*^TIeJw$@Bzxv
za<-V3P^1@Huo-_r6yaom&7w~HFZ|T{!gBFN2d^*lN9ywgyK^(Sw-%g@wf8ztYMYTD
z$sxDcV3U3$KIl9Mi0EN-9K}Y|qbtIGcqE<&^yx3%@(DE8VX0hr<W#s7c!UP8{GQ-o
zE%3T0^=o(F19$8GL{BPqj^Kp*{G{-*tj&2N7Jjh1_{ur|7g$Aja<Pcl;ozqYUf-Sb
ze_%Zv2+!jMwpNAd9n`wUt$xl0uy}aqBg{?0?hR;lKS7`tY9PHsqQfY_wz|3}tNQ0S
z*MVSXuZ2{#VNV`fw_b~Q`48SFx}3f4a;f*JkcgYPJ-~hYd}ptR^VNG@v9s4=x=cm*
zwUeE_mQ7agBWs+!>TA?{upR(7dujDZbK4?$FGi<V-_tDb6Poe<z+!oyycq9~wa9yU
z3*P^}QQm7d;{6|6<$capyf^HUcM9jN&ZZ1hRD?bCTd?U0hrN58O?TnN6StjmeoGPj
zXC;9}CVJx5Ql@?df`2QV3W3{zz1i5A*{@+N@(7d$$(M+0*bYnz<3T}`0OMYXjF;-4
z#p3Z28y+vW;c<!$k5}68SZl-MH8wnsu;KBy_0nd+qg;~g>?L@-yIH~okAJq|@zE9u
z7d$>~!{hR;5-xb0AHgF8G2&56#f51%rIH@Rqje`7h3Os}792W(4f@YIzK6aBt5>L6
zJgEr9W(N4J{`7(Gm8RCH5gw)FD@loD?QdHrOXh;qm1idCAYdfNeVA#Oc#DFW$wg2&
zA*{t6x2`DlE8_(4P8{U{boiUw4p{R}nFLO_O-64$TIS$*bU$a!xv;bh?6TY?(r8$Q
z;YAuoJ~YzUyEtb&>@Bg@!2$wf()qVd7(QBS2h<ne#^G4I8;X{TplBHripL$z<6TG(
zV)`zgDc#UuE;m&@-M9*Ym;d<>k@Q04$9RI{F^OAK1IO>)Pe3>xszUZ1#p!|jEU+Cd
z3VD)I(?)k{OHpp??EHVA0A93k%<N;@PZvE!k+y29l-mCteaz8dunnt`0j7$%@7H}C
z(?Qb~6@{)F7I$GN{yc)S0J1#N;S#BhZs*7~(2;iQMrUENARs+sa+UoZ{a=_Y%xl=7
zZQn)jWSewPXjmK8_P~}OtREifa(DZFuBGl~5B1*vJ!D^H)~tmx*P<|ozUl`CqVqig
zUUNZn%I;9O-_RX*Je*2XJ6uJeX$fqOtZz9Mu{RCQ827dFhPX(~M#Q{;m>ltN$9cn(
z`kmI$ny791q9`;B3+D}lBlo~8xb=F~cNZa=kILI`=4Sv_-f&^|wkNJbuSBTf2f4Ug
zchHFNeN4mp-gvY&l((oFFhAz7)P3vp*S`P0qu_bzf!y~YGo$UIB)T9tY%48L^$(Gt
z{h#_?$Rfh{;j-Z|yi?LyjdIXmLVwGIKyhX&G7WsAHr&KW`}`RCTMd%JD){R#?d^k9
z@~zge36_sogva5s#v+g&_SL9dg9?pH)lHt<kDc@HAZ73#oL~_U{sMrUiyIEQds3kt
zd9et4B5^4DD0k|nqTCI$FBIa!iqD_ao*I_N(Z7M$S8-0oX2}-49dn!*b%w^qfyAKv
zXs~wQOfPix>}&XbA)WQkhRV>n0Pe;?XSn)sy`r=0;y`C`aL0w`KM@By+s|Bj2&I#p
zV7!Y2^oF-dM^S6h0KgGNV|(?*-*rJ_6A_4eDX_4GRdDcVk{3*NL7)k2f{+#TZoedH
zfdVx}Ua-0Uy%pWYG@HW&rVDcdeL)@90{Sw5#-dQ>OG00JNMCKfJE=uK9Tm5S__;7l
zUwic)2$e0~qTF?}67=h_I@9l>SQ5%hpNR&0zy%eltkY|7kTI6L2vwN8<{|;e>n?CV
zi@ZE|CwY}w<;Z?5{DKjk|M&NQ82CR7{2vDX4+H<-V!&NBtF*GBY+UK|6vvd(x{9eK
zb-vn)s_8?9l-AUEtI8Zx{dK<TnI)xuU-k6b(z23iRVB!PlvUHiaZ~CXrBmw8t@isI
zSNTh;d==MtOKQBeB|dLeU3IOaWSYOuTUJt6R#P&yyt-nl*HMk4YrWGQrAXm$RObB9
zF$}*R_`MHJ!e8*a4nGfm{qXw|=WEvBcQ1aG_>ID^2Yzp4AuWE_;pf4xAAVnEA}xM5
z;dcdo!|>~Y-}8eIhVrA^SW;9nwYmx|_D}V>s{ECeuBvLEOE6;g_`vEK-$0}(D;-!_
zF=b%Q)xPrTszK+brw^>F@OcN+lun&dI^A10&{thMwR~Vj)znIVnRj5pIBykl)p`e3
zpfJDp+;WS6(p>H_;|ebsQ{ZtGj2&Be$vD^O0=K7dl&jR|I{iOY+37>FvoiX-YAU^@
zbzWDkx2C$*=PGqg@lSX8s$Iobk0Y?>x*Xm)HQuQ{Z<(v2%IEM^yYYXa6;$J^Eg3Mp
zqON3GWp$}9C6zg+cwIo1w?9hqxuyeC(@HDryyv=HDI=@>m1Qnpc}1PeTU%RQ>ze7U
zs{>+~rDA4HZS^c~*?BIPdz!1Rx)WekE^k$NY1LHkOmCGB;9lkT*7@v^raBx)hMKAA
zX{sB>UOjU(+Y#=F|586>Wu|p%Zs&fmpo?@!S+##kr5AlNO1*T^FH@?kEC1KK<v-9b
zBhZd;w>bVw{gOR6`|$dO&_%k1I7g>Qpc>H^dTXn^m9A<2s;NFWTyjk<ts)Bn*^`}M
z5Sas?30$Y;S)u=vTwqG|95CmB3bN5AU?muZK=bKCGP1J{qvehx8*dT5_hUd1B{NHB
zct_V(&m3Qcl-{yS{FpUndM#)j;i<(o?bLdG{@SX#k~(kYG)GC@)Y3{!DrMEa60nyt
zFmQ94DfMAeVPBMh>D1O%lzB@zl+3@Mny+LEIQ;C=+A=f$z}eNcGwQ(hy#q}?I<{h_
z?RAuQ7AClifm7>hrw-)wHt-_Mb(eUj7tZn3mYzGcrp7S~Jh!@b$Pi3EwcgUo%Ic{l
z6;*ZKS{w5vfq5#x1T98u5*@=F<7z9wvcZlqEun?v?2>kNg|FP@nC`9e)?ym>RhN|3
z)|Osv!d+2kl?k5lKRyMaL5kSYvIZXxwH+yckI<W%XH~syvddd3(@7l>RqC2n0U{ev
z2f@T8SaDTjCgPWgo~TYKt(xwIq~e`AV~DGyq{@{tyS!p*xeIgKnb-JJUBmNaq8l)r
z1pu4T(N=-xbIc&jK^bY7U~Bz#6|*20VP5r>)=u~O3YkRWv&+1`(u&F)cB32PRAN*C
z5}%$4BC0I~)u=%o8=uL+Z5A>p{j%E98fze7O0Tw$t?Jqmck$>k8Ku5#h$s-w3dd%a
z@Hb{`N#=CFuP(=Tb&a<YaD-pn7cdF`2vKz2v=SRA3gDRk1^%B3-z9bcFS!u)kIe#<
zSw9ZtU`b^GYQix(wyd<0vFSFbj|jd_;FVPZf9Yvs!IMgB#{q+dhs0@iNm=FC!4bs9
z;LZAs!Z%QnhHQ!^qH42}cX}RC`zkNF(B8zc*(Lag%6&PIhQV1&E8V1tvMFeQyR2|Z
zA)9}2@{#N;q@V;@MwCopADM(RI<~HilCc@Fb%D%~ageGjx`{{BzOlY6pC8h(`+}%u
zW|4ptm1RRD!`LY#)K+j|Ay9yiT>gu}2FSB)$`mvZ=qLnMeiR>40E-y~d`uZz_+vyc
z`XI2TlZ^3C+8>5?kQqu6C$vb?Ro@|&6-(zGVgUb7;MYQAPRZEtKVO#cW%m@%%uHY~
z9G+8BS&6<^-!}RqnzMf#&9*X_zra>oRH-BX9sFBp28#;+i)l+ZJX5treMh$?ny77G
zX=!PkKL({`re&pNrwvZaNy|-7OHWVFNFS7*nVyxNojy1{Cp|YKEh9Z6BV$lTW=2*<
zcE;e0oQ&K-X@k-SWegfLD05KOpzJ||2jvXP%}mQo&&<dil$n{Cm6@G6I5Q_RH!CeG
zJu4$?P*!GER#tY_;H;di-0ZaM^z4l6LD`wvS=reTmvgdn2d52AADl6G(BRC$S%b3&
z4<4K|I5#IPCp{-4XHZUNPF7BK&fuJ!oZMW1n2YLjQFJb{<s!&xlBG_VQkSA8Agn!W
z{Jzuy!(H-i#UDiO>y*i+jxgg=N>goRs(Sa$t@HXy&aJEV*G~18jKxHXwJ9_m>RCN)
z8lqLoBbM)qDyk}GmR5>{H>SGE>liC~6vv3l>M4%%l~5+)oB2MfVye$E+7FS|>nNCA
zTH$jPUwzS(D<K;?##f;%Uj@`G*LceuBTH-Q{19L-xx_ooaj~ghsdW^>`;(*2KjmD7
zMu&H1jqmDnXL@H=*Ir$s9J8u0QRtMRpi+p=fQhfQUdk>d4hW!i<)t-VrP6Ug8B&AH
z5X)_^C8bqmOot4P5(ulMl_Eoz)|Qk(g77-*m^ERs2bsur%&PFt28t)V@k2*Geku4R
z;kWPgA3C<;w;sR6_}ztHJ$~i*72}tKp9{Z4{B~{rp<@evE%-f*-y;0#@vFgaGJeJQ
z<>TkTuLfnaM++ze19L+_6Qb9_v&=ik<(B{RTuj5iBU|rKzu@;(R1UPN!9sSZ%j2z@
z4pCeRr#R&j$sE!;guE(>?g*=Ml~t69wu!40s3yu>Q?VNGc`t($KhjD%#3jHTF?<pE
z_{s`j3G2mr1{#}kM;Dcaapi!tnw?ofm5`?zYCirQIetV9rX(ZO8`dw0>WISm|DICm
zv0v=(c>kpR9iQU&41V9>7k~2p4j+DB;&+j2e@EG=`#Z*+vcKam_+`So<T(7+;CB<k
ztIycqk$k%P`ON2E@cah9O#DC{LxxPPhHUPgGfZi8aFJ2<NEKpfb{vi&XYcR0;;j80
z!;(=he*MnGJARMj*NAXMKV&LGI^YOZLOH4rT@jp`W6i#5w1!_ro%#0PT>ho!rXR-Y
zFH8~;xMU$~TlAViz?u@sgo9cOXfP#j?JR1^R3z!NqR!0n-)(2kpu=b=;R`o5(jFJo
zrJb6N{t^k44MtP0yAI3G|CN^I9+tW&wq0yf1(dnruC&@?Ys-JLhcYq`QxEaK4d<ZT
z!y}~sC2AOanBZX?Doxmbqm!xD%pG(%BM|y@(a&)ts=u+m^z@P<zt1~IW=6MXOm0cZ
z^eTVJ)H!p~GfF(wQ)iUaR999^y}B^NTC-Yt`5f!Z804*@hOq>aR4p`;6rW1C6XKS&
z6SM*nuCxO7tn!Zfu|mVfST!`D$j#~brvRbqNH>{#jPeLIbcP`s1c}d9+OzaQ=G3HB
zSC)BeWo4-H4;~ARa;3LW>D1U1z&N%fhi?}`peoER<R_VxYG-&=Dfxe@{>Q#5XStSi
z<*vl4&$e(vT#V8MOIieGdrGgyW~8jd<y-Z-`kun|IYZMkibPr-Ew>mzBY_nhf`3$x
zfkpcWuG_I_AAA0fts5;YN9k__89&mR^w{QP63}#8d#r$PgAjlQnZcb}WcL4m6#pRC
zf4>V32iK^`hHjhA$igk3hr%y3B2myDDT>1cZLb{f|2BO?OCw|#Aqx_)n*2$VK!L3R
zP-Gk~oeJ&_g0)B^qVNY;(v(KY4$UAtIMKZiby!zSF;e`I;ir@O5h$@kWCzHGoG!=o
zA(TXl$PTUo8XW^m#v$+%1<s((JjR9^dHpZY&*6+U5KC^C?Chu0NAyVf&$JJ!KH+kQ
zV$V#*zX+B)!w>cHlt_aA7<`9=<?PNV?h?KTct<vVgoDUl1eCz`$+Qqk^dH4xrwxhG
zf}m`*evd5;Xg(tRMeWIehX@Zm<o*s+@14MNMDOolt`r^?LEwJ`KA>!{DPe*89%35`
zVA}e*6MO<tq_ha7cK!Arho4(D5nf7x3RB8qwW(5R;iCuGA=-!9GGeO3lwoV?5w*`w
z(ubHo4`*0*&G{6NoyVs@9Kqli45`R;ZEu<s0QnvS5s~k){p&7PMTDj0!(Vf3HO;cO
z^5irAcFvVY$IcEOD&S9jckno${9QS#h8x$kN#Q23?1@7K2?a4WvAwRT=bYmzD0Xu@
zTxs52*3Pl(YOn8Hhoe~8kkMd|+vw0xLSY@Y(SpIt`4v9RKgCrs%>|L$Rpq6A6N?{Y
zmA@m?8$vM;YdU2^VeSMBG<OE3nFcMU$<R;N*H@MLun#{lVviwkUhId}gozkSQw-JJ
zOti4fhgHsLW_wgviG-EVpeM5DJ7masD3zy)iuZrU!a}Ln-9BXtz-2G?fDTVzaAf@;
z`s3JI1rr5R>RdCF?NfDC<<+kLN8bCu$312L|99s7zWF!FbV_MZcF<i#wdo&(Ez&k=
zlQuLc+qGRyHPv>d?QD}0w!1>BinOwHDT*vziXtdmbZt;{DT1OX^1<4o2+DTl&-?d0
z_x+yCWHPB>_p?6VUml&l-gDmfocsUWbI<)V74svq%djGBwhOv`N(opbt2^OJv)3Rz
zo4uss?-f`0tQqs?&hKa@UO1K-!c5%aSY+9u!e!*A=Ge6ut%u60DTC&)$){}AnP*O)
zCp&)3Bz%xr2l-EU9eUIz*<W>3k7{I^q`z#|yvlRgvqF9=ru+a>-ZNL9sR5II_83)6
zWlzfwAnCb1>jnzdCA-Mz={V8Zt~+<$JeJ(pmdEcE&J#5#o1ACSoGQWtrbqX}lH)kO
zjvy@Icx?}r$NpXPY_$LLEi#uK!m=gMl-Iy;MWtv)Y5YW45n)R)J{3{Kvb&6=NTWYz
zs;jD}hEJV3&opSm%zv{gCd!V5|3t%%EfU3VqM{`a8QSv(5#4Sz$lDZk%rW63{{8#R
z;`(6OxpSt@p3Cmu*^ybY5iq(zU9{qmp*cPGQ~n_Q>_c;Um||JSk^TAqs|{gTZcld0
znx<;>!Vgg|4;#{xo!IIJsF#Ng>B%aZrzn0{dYSzvJ=DwSH)A7s>s|jD`zh{3Mr?=b
z5j|2B*zNyObrw}$UCmsKyrDfx;t#z4<^G@Tf4Tp6`(KaNr9JDsY%Z3zOcZBg(=pjA
zbC=3oV6)GeQ|Fggs&ccm@_qM%cykJ7q~1Jk;!KQqv7j)^+!I8z<`{FyjgNB`<<sVu
zGj+qDDxW)J)--P-zyYP%XO)-Dne(3*5Ic?!#U~f@VpvX(+Ol-W(e8N=c0_R`DsFF9
z_>8%15h9PS=m}<Dt>nHejM>S0<1F^|&zmt{jzs-5dXlRQE31-CAXt&CG|uNx8cXbm
zW<nU97k!U;p3#n@W!roFx|ml#W;?uhpl^!13%J7U44W?J8aRB>DR)XmH>|V&e(G6b
z=yuNMG`CoICoQe>JF{ELi(-(R2AhSwJVoqs$!i!BSq`BM3Qv^NX0s~h9Msvh;rcDu
zD-m5aG}BFXUYWKfn-Yw%?xld)DCgs3Cy6Y-itwpn*CKTHUVXRn>}WWysAx(kOIaO`
zOWVoibI*~}kY;oLfP=kb2qVtm{Kv88got}G1nag+#Z9^0nMp->s5i4V=P#$Y^KljL
zqL441eLQbQg^D^)!fK#7zZcuKVP+Sx?HZMH4MEMHBPSd?cG#S1j;_SGTw64r&85gz
zMb3|wpF2e=(@YY}Vq9)@%Sa#38KdG1kS%&R0%SrqPmW`#%BdA+vBECL3|T9m&17l%
z6xUPWMHJ22)N(Tpdi`$dv}qhf43|x<oXUiZ%6BJuZd`6d5Va?5$Y{N|jhC+9hgsmD
z8G{C*f0*hGJ^E;mvueQTl2Z=GwwX6eJZYT8;UJGR^V#z&jUPf{95L}&v=R!d>R=_`
z%^56{y4Zg!D%zZCJ~R20@X6&9=99*!d0v~dg--*Y)qHCCRPib0Q^+TWPc|RLXWOsa
zoJKwkeCqks@u}rg%_omfJ@R6)#-0u_zEsyIcBFXuIF`%4hq1*IMvW|<GUWtTu!fH|
zM}S9qj<_*%MtbLe#o5^d{v#9lPjU}(oy<9|F8<P3%G{?zPB`35`f{TN(vhPhkrU>Y
zO)vW9gzqw9Z;|Nx$nU;jVk=OGoR*k@+^C23d*|SA)5VHdZWwox_}QaJdhpPnaB}5Q
z-nNukXBJI5iea*1>g?%L&MunhE>Gl7oK{{j@8r=_j`Tbb#rbmzDk9U$k1U%$t(={*
zMH4x7A%DZg{aKJ7nZUM(f>SuIadPDm?hd>nS)86K|Fep4!jyt3iIpeq<Nv|@li!?Z
zUcX=dcF}_VFI4XLuHPSE{^KiG^2Y)Wtvt4X<Jw@di1Qy)OeJH^E|G0N!$r^0wWb!G
zUC{G2qRyT0Ka&1kR9nHnGx21_=kwcX{Ik<SKA+F_^B;c(pt|2;|Ho{MxDyxgMur$2
zrhsNu{I7c&IJVP6X9y|u%u{cRH@@Sek9L)Am=lIGuwpP`=Y(fZt(xLGT%)x<8Zrmc
zD~hL9R1PZ(M~xeAc_wz|)!lyoZ+EkF7`~mS5X_l!dgh`3p7wU<XgLgB$w6MXM5dHe
zB;E;_iYddq1Kno&!sG?(Ec}z7F!y3M<9_9Q@42(jkgcDxvy+n(ZlM2Mk-4U&Y~eIM
z>?3=$qcy(1hyH--yGK5gBn7;SDjO<Q=Gpc0BN)^&b)18WJV*|Cvq@Ln0-R2oGcP>#
zOfmFj3SG{jnzC~-?qy;ZVT10R%0Y4s$vH4{^!^<B(X5Jr@|7NkG@J`eH|B+B%%94{
zqjI{;wpq-H@_JE89OA1eKWFN>6|4uz!5yzlaz6}rt}t&wJSnIMPn$Wbyo`9=9=bm%
znc}8n%Kwf-F*-4lM#VwB9;%@4)(_*>XQKA{;8^_1tu?MDh_5WST%!B#J+1uTI`6<b
zOiwP(QELCnLD<xDxTuD;r%L0x{!e(I;tF(^3tpVu>{-Qr?|SV%8Kxq=0Si^?E^EiF
z2U1#9l~XwNc361Gf3)d$pG11Vu%6U!`P{i@&5uZiMtlB~aI`&pdIB#JZm;R#`P^K@
z@WB56aM7u9)f39Z_%CrEO`BU@F6TMrZ&u|{*)8I_LZ+PUt?r@<=5brg^ol`Twe??Y
zKSPK2bcF%qW00izT(8j&@>ab6i;2%2+LIj@XG~|JJ3a356s!T%y6Bq`>R55%bbP_J
z$ds2|C$lQNJtw%spp<o3kKgq?WNfKjM8$3^k@bB~^ZncHi&&fq%lkP;_3%uTwBM1Q
zFPbs5g3H`YdC9@F=#B-?TR(eh#Pvk~0NcrtNA<LklV8zQ;Su73uZXI!e|ydJP`M91
zs>iGEw4<}R_GiW*Jo57~g=E!|ofvW7{(G6|iT!Uz2%}!4&B-4oqM6}T&{@POJR;md
z`ES?Dkw^AK@0ct_)2vV_a&sb^;6xh@IQU@2(gW>ax|>@WCuEoqqx7(2|GQ(tkep#X
zX?M}XxZGytEfG%fZjLlr`fuGkbwm&6+T5=a^=8KJDi*h<cZbR_*;39qEV;<(2zQJZ
zmyEj?T^V;rf+xuH+nD^9eBTJ74Z}l|j3GyzFxOlZ<!$FJkSiBmSGt=_?>;HHV<O>5
zp!?n(2@W@FJiB(m|G#$sh!i7NVtap0$FX1f{KQZ6t>DpIn~}I)XXxH4VRUIvR&BBM
z+>z0{>Vc&ncE<7rqnOF6tUfWdtT5ug^S<Tydq$8{5RKMdrk6<xYf@)3ALY(fZVX_E
z{;%G#z^w)Bd^ER?;JK1hnP!WD=a1ql67TP1b2W>%oEaV<cl>dUi@P7eGck=}k&n8Z
zCCgRFb*Ek70c91HNyCchiP6jQyp=#%JFj3ddp4WdS-DeT_ineSpUm6(;Dul<3^R?5
z31zYy-lWuhGi8uW<IUBs{|Uo4X0nyd;;uoqJ<BbEW{tRji{#FgseI42(|pH$8sD4%
zuiDf>(Mn1@UEsFIu(zb>8ig=<mnKHJSTj9~(ltx|Q6HtLm+v-D?QlOzY6j<<4J>2v
z<dlgPq#IA4cP5iImf^k4EfwN!N(koNn$vkdLkae+OEL35)3Mn)KqF)wSh<a&W6wkQ
zAT=#Ae+n+va=AS+T2x8V=wov}cj)khgmk1gUPH&i4>{sgwvS`wb!0F;Tr{$8=|Ar?
zhI<G-^ImSIlsgCMBxjsk$$W_1de!L_-ykJEGy+Xm-%6j&b8q{MUXAJIdt@{|Z^wz7
z1C!tA!t~IgbOX6eGl7GQPn<g!Z}^;ZqZK<a?1u5S5^%>32NTYT=^M;0b~A-X93%U(
zk3O3H-{#iL@>#R-*^Qtb9es410QO?wYX63={%vjsO(=orl5mAwAH{~ho=;T1Pq}xB
zRk_(^l=7~gGCH>70wS}XBCSDslG|Us3vQ!bxwA^|7?)<1RmqK+IpSTH=Mi2Fxi=pC
zw}+?i)owe=#9aDiG>O?G=QYxf*U`S13_G{)oJ3YBZPTqsp-7kybY0Lnch0n#^XAT(
zb)HNs#OKUtJALYG(_71o0vyC7DjI0ITg&lw!w=46FvawJ+O<)LXc10|$!@mkWhY0K
zyj)i1mB92WSs0qpQ6R}G2U@548FEClexhSag34_^uDg1gJ@wo(rWbUIlkHyL%a7z5
z`GZHY#i`si4hnHwinZD?%q5A{%V&4#>|87+iH4)Q@U;5T^EuEm$17Jj5=pz{r$Q3>
z6G?`@R4&Lj#v9vM!j;?=ZS1!a*Va9yV)_u*#5m*Ja47?XG27x^b}ku&(UrtTaIc&%
zRqE#)UUavYu^-T750`c?;QQEBUcGfIoR-Vz#JRV)5Tj%Oju|U*b9M=Li_fYUc{T&K
zxSx3*HGdWxbUXjaq;}}x-L{d0_S>0PzPbEyDd}92-^?%5=8o}uh}VA-g~SXDR?p<v
z>G#shg~&*5O}P0PGOSaGUFM-v{~sFF&t3V8%<(VTJuM~?hK5YLI$DxxxEL66rK3A{
zVKI5;+_JD@q3{e>wI9zKw7XGJ4B)sD=Z6|NN9mrgHS?2s)7c134BUPeC(+v}D;tJc
zgMh{SeK$Kg&NIoyfrC3mG&nMA+F2+Z*G!wpe9D`Lc`Cx>qZ3E$cKqlTOk?8H`{d^-
zm^e#slp-=kS5Dtm?pk=9M7K}RJ6D;d+%u+g964IL)5|($7tvhDaLn1AZ$LPS{XE0w
z&zU=q3yxU1!(Uy*Ub!QunERf%RW<(iL5ijkcnHG;H+IkChR%wjY2x{rR!~rI#>th#
zW#Pwce&d1`b}8d-<tH>`Gn{ybDu+j=&YD*=p6}*LifD>Vdm1t%E@!)JehK4GuGfEh
z%r5?^A4ma2PgS!+82hBC4AaszRf@`FaYc#Y>0UGCj?iwnXt{kKd%%@sj5Jjyzb;~S
zR?nG!&Xnk>#VJ!O%|-w0H`>Sg>Dt}-@m`!np>tcDMxIMAYjJjh)nKTPQVrL)IE%8C
z+5)cFn{YR@a7QF_5O5>-3b-Mx)casR4vv2XR)Q%vwmAI{R4NBt0iFi7{zR$%buG?h
z+VEsBJY1<q!Q3O1`T%SK{{`k8sZ{zOTb$*&tVx19!QX=65lWqLbBj|}sMJ+pRgqH9
zfqhDp`UKp0BDku>S@jF0GHz*c223Wsw-Wyu=t6Kazf4nldyA7dOQ{-g4Y&^M|0|{D
z-PPihGkuNJw>X=?Irk!;-!xnVZU(Of7qCitJGcpa2ps-vrJe`ZfS-Z=`2mb;?`v_E
zfM>03aa0B0?*|#E?)Wou<}3C4b(9mA!>)Uf@~h&G*oVmH0;Mi|xW#Dz*Q{@Gwu7(y
zrNznqEq5F|(&DVGR%)NWwm1VXWb6hPfwf@6LViTLp~YEw5$OhN7gKMbeKEd=M=7s^
z`2INgI!>v&4J}SH`Dz6F%fR_Wi?ajF1GkfodNA}GrJBJ-#GChIi?i-`@Pow?-&3>?
z<cFVze^2Cr5l}tT;?#j9;0CZ1Z0-vmXlKI*W`lKLG1$C`-yx(OmA*`Tw4a7oXfI&i
zYb{O<?MS`e;=BhofFo!})o-*oTMhu<f{%980G5HZ|I^~EraeVIpq~5i9vlJIfhAxA
zSOzwMRbbACloJ>M>xB+(0qZ`Zoc199;Bc@3ECd_D$zT&00h_@^pxRD*0z=>$Fbp<;
zIpAh658MWpfURID7@}Mv;1;kB+yOQUopMqiQ?FnSSO}JaWne8>4K{!)!DetB82*HK
zz!Gqklqc8(=6y;#+Y3H042GLqoJn9kSS#N@r+<LeU$i*uDSx$t@6w;YB;Wgxo}ILB
zP<~f&9T?)*In(-)9<V>y0Oo<sK7NM^tmDTSw^MK7K&#V#KjPtMzbAuvp;l)(81B>R
zya%f6R%iH+kheF#X9b4)!4Kwu>%e+&o4o%~tCJlj{Qlf)3^pFXy~tqBfykpBl=g3R
zHi4m^@cUP^htL4-P?mN9uBAQX9Ks#Q{fU2It1}Ud4CXhqz$Sh>YYW(LEO#sa7`c<U
zPZ^B-n)HEn6~s$B3ZIQ!FjU1o%V6!fyg!8ZVE=*S8!QCXdE^r;0jt3}aFx6VH-dS;
zp<LuWs174uFbB+A(CRdT;otHbUFg5?!d9pCaMH7g^dCX^iwJ)t;lYwz(tR;>u=x`5
z{ZsOPHTOQ#pKHNtun}AZHi7HFW^fZ&vI02+2oLrFBj5nA7R&?dz=>eqHLcDfumM~P
zHh~+3el6(-Yr)XL(81wg&ULNMB(MZ52kXFEupV3mHh}BEMsO3@1U7-q;0{o&<X&sa
zGX(Yn!(a}W1LlKy;3TjFEC)-$YA^yW2W!FAU>&#~tOpyx25=|X2xcBiykHn?28V;{
zdg=`ffs?^77y)y@MPMGd0xSX7fTdsq7y&nf)!;U;7HkFUz!3FY50+A&P2d7BbOUmv
zp24+X1gsWY?MBK4tOK`!4Ry!?n{Oh0gTX(wI!nO%TZkVlxt;pUAsu(n{sr$O--36;
z4~Fg`e?yRSFXaVB)>02(_yOt<Z2U9)LrM38+$Rp^JWPJUruE1HYagLrz`O?9{V?Kx
zl=cSJJx)5nh7J5KozTHnu;dBk3`fqBqz4Q=MLy;G)07j~1U7@s&moWY-1I!<4d%$b
z<Y3+_gcp33^nvwTs0Z46>Fe^Fccl9b`TaZM1=oOTD{=*!XfL#X^&WDMLf$s`z{m%*
z2eAG_+8G%7nD<AM-%qIzFz0jLgAF@~4{ZF3e1J_mX>atS#;;qQGO(eQbb-pD{f!`>
ziZeuDo}YWN!B7Tgj*dZYCihZ<jlDRZ1ghQJoPo!Z-)zn?fqC4Oy&eqh-{$ND8@R)I
z_;KWKAop75QQlw)*Z`J+&0rN6K8*XX!5pv-ECJVprC=ji4TeU-5B3M^z+A8%ECw6E
zQcw+QbC!T5;0`bXrj6n~m;;7$$QRfQt_Smmv^h<n8rtUc8BMu>C17|M=@C4f{DV11
zlh1tOJ*LfB0ER|$7x);`2W|r+U@KS+hQ^Xkus_%Y<`u$Uz<DgN1Y8M5zyU?XJ03X`
zNJlZ@Pb5EJ85lmEd&dPQawqsn$UTK~U!eMVo3lc`PbOY4uat5)8G0G=Cqkb=zJE^r
zfHQwVe3fm^Mz9Xt4pvuFUXytb=AS|Omk@6m<x@jFfZ^XGXD0c(mihym!LnI|zm9qU
z)dQsCSI7l(&LW>1DNnHaHO`q)&$ZwNunyb;)`Q!@22kZ82kZkjfdfGGI`6>}aE;U>
z*d+D%2I-;P>%mQwcjH#tQ33gVv(1?VmcGroIItdU0M-8>NAMl;Lp{`i6Ty=As7Fx!
zopejRfD6FT``r5uHvfaOc3|TNZO%ICCFdjZ4K{)+sh{fY$OY>@rXI%e{gXDQ9IOYc
z!P-x$2f=3I9Z&i`qaA|v;0mzzbMBZ2bH0GT7&%}e82%^i6jc8r9bhfEP2RWCPmZS?
zz)4_z8~qIox6@w15{LGD0`aJJCl71@7l2K^c4rM3vf7=kV71-uglSiily)Zq)&<&~
z+!FGY+3wVVs&~7y1FYM<-C08W3GLbLYzCXM+nxTjqwwDCP8k>h)hW=yK48f{?al}=
z(y!gA0>k^YI~%};AGJH7Q<1lSyE7SVKA_!ME!dxUz`UQ3-k%ZwLG4ZrSb7NQ1ZxMj
zI|C*m=P>xdhC#>!Lpj7tJB|!(cP4_hha(TH%WZekP9xk=$OF|+Ne37j(e5;W%_B)C
z?Ywa;`KF!M7m_cqx~Sc;e+mCM(hoL|Z+9xeP)WP95v-ov?({ny`ZV~!&~)MhOU`U}
zwh7LHZwhiF?M^Kin%C|$f+ZE?yOjK#OL>8H=e0Ylz^31{JMV$D3)`LisqigocNT%6
z#pDkxxtR3O|00*RJL|#l(srj6tX|&k%$!DhxrTIu&DXa(o9LH0w^D!8$@iU<9~il(
z-Dv`I?rnF5pGiLNZ+EJ}#&x7i@Nw#K26CSyKVWDh@qpD&6P|uux~bhM{}p`Cl3uX!
zIqCz<dl8&X`oYi~;s>k1oR?@fU=x@(7y8TX&Pq^iA%79#d7bZI>DG2<JJ|SUyEEz6
zU=!&9>;KWtxmC*J6WS?Qm*zMdD)_#e<Aln25B3Ke!CbHjEC!pwQc&&gIF(>MxCE^3
z<2Wn9@E(p+4>p1ugubWaYynI9I?i@^pY1qG@&on(Yxi=T0bt(Vjxz#m21~%YeH^C@
zEa~SsRbXge$Eg7u!8)*ZKgU@MMt<Zt8^Lgx^im#?{=_5Ya<Jp<q&(^mahx3T-vIWP
z{2l5z5wLoQ<K$KH9h?Z(fiuAdZ~@o^)`HF8Dll>c<uV^Wus>J}=7M>-@PX=R$5{b3
zf$D7dz&>E!PYDOsfs=&}RtbHC<180CxDHguIL=nE39PE({YcVx4*3Tsf~DY0Ff__>
zR)f`GBUqR3IE91{7dTG#xwH##AeaZ{fhFKXuoRpLM!*GNHCPMQf~&v=a2?nPZUURY
zCa@XY0je>MlXf2QgZ;oTm;>g3`CuM62`mB2!D?^`*a)r$!(*u@FamA`Yr!306FA^E
z!~>21RU!2chQKl~3|4_TU=5fD)`2D9TCfgm1RKHapeiDr3kVPP2TQ>`FmwXx{4L)n
zzz;^iwO}>45v&Ebg7siC*Z|tULoS#NHh}{{6uy%OhQWzo4mcC6ok%@Y^S+dPU4Z<l
zw0r2);C6Wrs`E)N*avI?2Y~7f@(oslGr`a_`Vm+MHp=%h+A-JwZb5$aO!$zWH=ptW
ztIwwXz=kT)wUBgz6TzHwY3E?w0^+@hd|gQT!0<(mvmPu3w}EOg=~;{%Fb~YTgmM8(
z!POU%f3OM6yOeT3UIZKn)`IVW;brK9MZ{A}zX#>7j|<9o-bu<=74q$qnVMGX!?y;#
zAD`Um_&|2!As8AH%pTV(a86paI&SZy4;j|~0AUsWd_JY;;41*5e?p(kXD0MK;rISR
zAYD}D;j__kv?V-`fEG=Ugj>m{Fo9MNZ6dVZp!|jTY~Uk4pkh#s35II?CkMkbGlHRl
zVA{w4^#XYtpBlm(D`EJP{I&8~2kk5&czlJW!IG1MITNGYlDGbphr+k?KwP-IvS8uZ
zU`bJM(%4|>*x<~u!N}NP)!1OQZ)|XhuP|8an-nL-O+z_ys*n?(?`mk33AE+VA_=tB
z(8?2N>!Hm|pfy4(OQ5|6trS{N($Ali$4+RI6KI)!e259OFtmvYwBgW75@?0ciW6v)
zp%uo_Fb!1%+7A4hs6nsH#|Fa_g9Gw|Ir+id{9xXgU}1i+BtJMQKUkU{oS7et<Oiz?
zDAyvwMQcT=fv~Sf)&P8?KkJY+J~&{a5i};4*ExKcFxv?~iSQ#k!V~wbm`o{`kcFR?
zFzFJ8zidAJq1Dn(_7{R_C#MC|ymxuLTh-0G$-I;Hm5i^7cbmF-x14uP-Mp*k-HvYF
zZQ@-T<((|PZM^H(%{$vh2X*r<%)9&~@1(3o@NOn`xu=v1e?lvPRtYUyenKmQ7Kx*Y
zUXefFc_qb0>!e<{|1-Fyqq=7}@A~s8hG(QtDf=0W1j>D>e^fBL#u^z6FSf@8n|*#K
zBN!eT%pMgCjS8j}1w`&-KEp4<PYl|BgpY8G?HX$-b)O4u0UxQa>*f6=uG|`belUBn
z$X#grf0q%=9%%#?2axSPy1H;69aqY?1YWltRAdCx#s((P8YJ#w-sg7nUh-7Vd+Dz=
z!e0;!E%p<^LMuNQp5aD1GMH8nDDQY4hwc)-Rq%;#yDQ&W9bt>&_(s832H#A|F8fvV
zo0Q=nF}x_l%l$>c>}A%dV0fur5FAjWM+S2ir;H5dE=;BK`D-(z6Veq25Q5Tni?%pW
z37wE5JJU&^=v>h`;VmuBFC?C0dKAxf$Sdk1Z>%d%bWR;>AZ2ti`?GSTNX6w+_g=b3
zNxDg;m+mpaI-h^GoA87Ljt+!LpXi+2g{%$nY(I)T7#5~3PN~sL?Pb>GexWZX%zE;G
znWpSN5vEi9AYEDl^)ogQEA!Emv+y>;t5|c;lK<JAcu_E3gMxQ-;MW~;3gFF$Et7t^
z8Q#Bw_Tlgv{X)C&+T++@j=#Ze=5A9g2w3;{;|Zezl7@-!7r*JX>(9>e<a~3xE{3-j
z-g<cdAnAY5<?T}5@$H22-jK9G6b7;-EutLXW(@*R#lg@*|2ZA{GQzvzygSu<x7hDz
zlxBS5dkt8{yTelWV%lm#oyVskVx6x$(ohiBSjPqY=Ot^b^asQ)Ko{lzopl@HvfpLW
zmJm;uK2c(Aj>R)3E}o)*RZc}D=@w#h<Pq04;wof~?j$MK-?(ut^%sf`qt7q4X{-x1
zCWGHf^JN4Fpnp1)nTRktMRd;E>J~@Zid~Ehf-lsrMiV_GIw$`@)^})w_V8$ah$FtQ
zm~vxyU6?vHSnIbkx=*$WBeM>fi-_|=i8EE=to6tD7tu+%8LTg|?j-uu?JKl3qsvb3
zXyc<3bdvD3B6|mXmq~i>n`O%1)9a>8L<wC^^F~@!ts5OUoHkL+dX(X-?HQl6?Q-~5
zeb(Z<Ed6V?%h#oC$8^cqV1v(pMN(Z-6cBmqkvIHHPhS4_lPBeonTq|&dRdl~$38vF
zm+<AoxAJR`Pn?ak<8LZQ8H<}~Se30Y8+0JFT4Y6_8Qu3NvVJao);}iWaT!%dGRBSM
zf4-o!Bzyzm8wg)2Y5s#7ca4l?i)q&-{=3}<z>tiB>@RT=zPX*XSm_7n65i-mf?B<3
z_pVlzaM@|tORCj*M#7yH7tY`A1}hAh_L)bxQh%#+8a(!7!cCR*&XyF8W6VyI@{zG)
z4SnxE2{VE)Mz6;7Ayw|5<hC<cuQF)L7`_s|jNh@IDtuY+1#=eKi>(^6YTDs?-uI!u
z8M$B1=&{@#@ICfhPwtcOCCKfQj&6#|y{qSPi{U#jD);=Za?5$YoORZLQl>Maa>sUz
zFVf!_(WJk*<&Yn6Z6x?=DC0YY?-=+hl8nLe`Z&*DPq!(0oI(9btb|Qy%4g}r&tXbE
zA55DNK>ZLrVSFC-_>-HSeeU_fU|zg_DW$CYqhHort<F~x|D$Irb&|wiAbJgDztlE1
zKz#fq{_VzeNr+)|G=`G+7+m{u32{u)tq!?U_Eh39?Y>gl(RkDBu}2DdS5LS)+L9yf
z{TRZbnU;w?vQ+G-8k<?iVm*cmO+kJ5t&>o%l2P<|hXqB;b?2(B$a?)ZEzThlX9!tv
zEKFIfYwV@gGUUtnxx5$l_`@yE5j?wn<#aQC=J0g{f+q6raY@UT>0bLo-!HZ%2J8J+
z0g-z(IY#0Xc@Z9Jqj5e04*{h~OWwq2-(a^ow^7cj<QwX^)C$vKlhot5Knd(yh-W}b
ztCK2eJKas&QmHe~W;f#<(>nhh38USZfauE3U`jClhyS++}My-oD@0j|vWvPb4(
zv0)dcFjKU4CwYlQUtsK`O5|+HXmw_#BIk=TuUtoGrHs1VDq3bQ)io)F3sbEN{ewdO
zMHx6j#s>WN4RX^^53ky-)p=cb)4E9m3PtKaVqIl*O+$WQ5h8XXXT@Hvj(>0DSgxF1
zjz!Jd8e<1{ij9DYz9eN-ur_5(a79XCu#W7mhE~t>x|9jQ2A((aX-xTL5H6cEB&jB>
z3b7TdvuKOMTAizD9QMJa!{}qmV$9{16vgp1DHO%|{y|}@w{MWLW~WGj6b7XIt%rBf
z_*Q3-)ba6d9%}q2N&EX1Q!VQzY9Kq`%%4pA6P{*xs*i7Vej_}GCGgN;K8f+jlt-4V
zRsSIMHKEn{Ncw+Pl&3`EIXzfyTQg%kCLS3-HsNPlaw73aTQ>UsH_{~r6Ex}yVr`bR
zAiIgZQg2IrK0#YZumO#3VRqupFHQ?K`>i`ul9p(Zq&LiGGcr%DYH|L?v;8tMVT#u{
zBcPY>PXD;J#W|F6kT{pgTx_X@qoT$xly-bVFmG0{&_98ZG2#mj7#F8NWNgYsFYKJ#
z>ik6FIFC4}w}t*vqZ`7|HX%PP!rly?-TX8CNbrL-6Q)05P($V~#Q%05@&JwEFn?11
z+0gnyV;R+JAJMsLRfm0-k5bPbZ3b{N_b-O0FiwuhE`^p4jp4-n@l{pwUmi5$Hz*8-
z#*qUFTjTL-H>j~S>OB7CLNj%WZ*Nv`0P42fXp97(P$S{jB3s<xZupYmfKuZVniR|{
z?Bqj|IMp8HpZz<1<(a=QpFYq+(99eq8t1%NBa$$=gefM>@jScnOzc$F5=QoXRS~AB
zBTPw`Fx7-vMVK)iVUF(<M$)*7FpY$X_j#^^wiQ}&2VYm8=T^d`&1>!CZ)}EUC(vxE
zhy+?Tv{vLt>q62o5ZcZJS{}3=3ABmOniFU<p>0o~Er7NyfmRFcy#(4SXiW*Ub<iaL
z@#)(HZA${J3EF08dr3L*C;5;)b0RO=-i4M%0}y$1WAi7pe$Yf-JS_*>suNlrx?)GT
zd}s{`;U+;_pFk^zwl0At`|{Q%(3V51PoS-awkCnL9@^>zS|c=RPw{Df4_aLUZ6~yq
z3A9YcgB1z1Ftp_fwBgWd6KI9dY7%IZp)HA{g(<NJv`WT;ky3i5Z*}#-E+<SKVWK)i
z_*O$(xeGoCvxzVpzadOBVRjHEpYU${XxTU)u*m5{V+m8oyGmm&VTuV8P2)iLilG%k
zo56E5Zv#$^>YmR2XbHc7@Jk3kKQ4UEFM_!^uV<LyJ5INeY_&$@quciH2p{u9Z{*$N
zZr;7eyGS?h6t>8sB=2&GqaW{PlD?NDK5SlZeu@3<`7UJgFg3w&8}It?-b2r1GJfx%
zgW50svPV9=Qy5|~qcG{>osZ)KVWJ;SpKbWH=fk@hc?mwe;e^>ldbcq?*6{4wG|@4u
zd;hA`gXkO?kLq}~Cm`FjOJh^Ze0Fk&MT7k$dO`NM))Rg|A7A7<pUudS{(i97I7hfL
zyz-)fi@&<j^5fc#IW|Tcn>mwyxR!mRUkhKh%NK7GCitVq2AeIbIJy5u+`cNzrxcl`
z>_J_}v)we=<fp?oQ0+4|&?3UL66RX5t6y-#>@r?#t`*7SHTD<Qh}we8!t<~#C>Q&h
zPBOcjvoRA$;%_bt<R`VZDE|?}lSgNeJ+f^Q&*5%7@pA&a*NgEFN5qlD!cuG&hW|qN
ztKqL-;PLnQcKlM;4e&>P*XrCV@qhY@&gDy8cjv>}3E#H!J-$sn=gXo0mR{iT-PUuy
znec77(BoU&bG|zGN)~y1(|XRg1-^~!IX2~z*K@uQ{Wgz1%7$;>p7Y7x|8+||zOPT~
zaXnPSmvgDd_iE4i*1@;>GLLU<&-t3+3orHfe&2Jx0sAl>FZ1~3^o*~B_E`#FsMgg_
z_V`YGX6?+Ihv$rQyGMtsH9z2A;F3E0(6Yo~{9*&b*hk0|zlEz?e~w?!xZ8}**-V_<
zd4CeG?GJzM`BGv&Im~iZ9!6J|w>Z;zR`g4mx`_S__2ct<_6qZCKLBrR?M0S9SbGwW
zhFKglbAmj=HxT|!34cA|Jv+kqA9MWEI`lP~sXt*NgpXW-{Vcj}aa6vj_9;>KqmjFU
za5MkV>f9xAmvxbQVuxIu6C!sb;kOY!!zTP$@p5s2N_z1hN&cD%*T|muzl+>+yU4}*
z@Mg@O-0;4PN!PYI1u`a_5ifU?D;FP~#9v6bbt_w)fW%)G3s>sD&kaZIN&FGQ)v;H6
zr^H_#jo<idrF_pL{vtDDKR#gQ-gShJ{E<D^qTlkOawlY^<S(<BsV>z1(RfKp#VH`m
zSjMNY8J>RFRW}RI-ccUczf!QwE?BAyYEtqSruqwEEfm(#0e{sX({@7pDRsfGIC~@M
z`+O3HLrRFbv%e-~sa|GZZcT(foG``gbH7{Uzd{&O=Ozzk-4?rHKBFrH*Nr(o;NQV_
zCfM%xf&j);%*!e6B{|xlYLMFp8;5N>%Dxb}QD4Z|8f&3FZ&0XQ(mK+tEw3Z2k93L+
zV4v=#(<Ys?D=FtKychegT;7lAltytk7j@J#<5)If(tgC5wOO9L{RofCK|IYOt3U5D
zZ*Fz&5m_0$GwCYMN*OKb@)ryW&2tmRdkH_0@GA*_jfCI!v(9}Y&;N_~xF!8N4@CZ|
zR%Zmy@|VwN1u}A>wMaktvn!*^xa#@ZSOWYdelCXEcsFN3K;$$br}<V-&K=)PP9AbZ
z=gg!{?JoV`)~Fn|A$fjR<NL#~@>>t#;^~wSW0TPBjLw-ov&A`@XS)I!1dn=K^Pts)
zDQoQ1fW+~OUBz)vVjT2tS2s2yV{|l*gSy7i%=@O>Tb%=>pZ1Ez5%V9LeB`rb$of%Y
z99Yw?eK~x8?AyCqox`R5_i<%-Hl5MwqRpOgJ3yy(vvT-0-`(mQCw!lt>a~mbb&wMO
zbMYO=^Lej=Z)<(4GeyefX_s%A_;TaRzN0Nz2b1tb$17UOZCB06YQDGCnI^JskIIVW
z-?P_@54Y5skVMpI<9p0K0Ke4zt<EhXrzV;g<9j?^);0IXKJ_%I8MktXzZ{<BoMRE+
zx?SFtN5^<feF{$<Jk6YW2?|efC!T5I<6KG#y?BhxEj*jy*~Ix6(~brv@HF|X^J6?l
z_XtlKeQ@<(JRfIfl*jXN!t<*bk10#x839k(Up=1fr+D?bYn})^Vb10pEBV<J<uT<p
zS5}|zip68}c?t2Xf~Oz4Y$m>2dyUHzKR-zDk)9B&wyZ6I?yPp=N2chUQ`k>v)&a+m
zUzp;3q_Q4Si&9?uSgSLZ_9{C5a=f8k*Rs9+Dkc6uB#n_;5m{T2wT`n$+eFqkuWjT|
z3`v~k?%KxM{>-mAzx0&&jIZpbJQz(qpYgaL^>>~KOg5pF<nT%RF=NK2R%fo%%ero4
znl&zO?Y`ReuO~^QX&)kUGBS5O$2lwunHP4FnJ@=4GV_^6TO;WMNo01kg?eN*zkuy0
zGEeFx)9fRPt;NOCnU!27BcE$8*f_B_ZEkhm)yO;$nbC2##9f0hOAsS`gF@DEKG)Fj
zPaou!TOPdCuj6|aeLAq4^ib|zU5u5w_&RxAbfjlFG8eqXIWWm%C@wwTnn+Pi%6Jw<
z_LUT&rLhS{1;&w}T+-ACZ|EIt6Uj>wUUxlZFV<81&%1-RKTq-^bIHtu@W1?{)ww}<
ze-sz*@p0v_pR7vVBdqy>(+8<M_|_c?@1hoGZ?B&uTT3`2SXjugaA)V01eo~?@sTr#
z8?R~W@L>r}&LeJsmMMYw6Plb^ZGc7+&7aV6p{<7&??Wwywl0BI3T<rytrA*2w7ufw
zErHf=d#m%~INC~Rxe2s-XcH4?8=yrJXj`DwK#QhX?5!&F!4hbFdA6OCG1h{i6TG#D
zw8OfFUmYL*rC9hd{JDhBBoCrzTw4_1K5nMyLX(uyWWvlOzMzERPiPTnWeK!J&`J|%
za{hC291ZoO)<ByC?N|{OwSN~{;y{g#M@G+XCY+Q<en+@*oqU`*(03AM<qFPYb%Zgt
zG_$hP7~HU#*_BlgnBO5+(ms$zy9FL;^YSM&X~<F^(X|Ol=R|0dpA#eyq87{k%7qpS
z8|Ox6a%P|A0>VjsoGNYD_!rGuvGJF&roK=YVl@{e%t%KDeg*|y7B_9}N-?(mR+0a4
ztMf$Xbc|=_Io8djJ4XxiK5a1h`()Q;Hjprr2-6_)NUyhVlrmdr%S4KuPq7II+EdD@
z6rT6s`L;e(DaV!YF8OrV^(}R<fpFDVw>h`-EPus(wm_?a_cO0-yNunjaV4M4)z-Di
z2R~zfio5|dnh~F|b}b!>ynFIuV~WwYM*p9{9x7`eJL#BW)^|l-74llYY<12+-^R&H
z7*D)$9?SSw$;T68=Zn0J$m{nnY|PH-OD1n@u+e9I(d`PkyLVR3;cvaF&Do9ev)`ED
z*&^7i-R+$%@vluf+K=nrml-DAne@Zu)Ln#U`yAq#8w{7>Svj7y)ih|5w^c)!KM<xj
zc7pVKZ%+k_nH_rwbF6_>V5c1v%&&_RNR*cNVRj%-^lt&rs?uG5Mb)Xyq4+bPPmnfa
z#$(fOW$wC?{(`ooznl;-<JJb!oyob@&k1TjPEU`ei~g{f{!rqdK!51yrJgUM6~4{z
z)d}AXE+0ElWL>|DuU*E+I?|NCvd!#6a&;bimt>z|p8v66VNtN;lwje=U>;k!Vm=h%
zle>-9U*G23D)EdZp7{KP+2Rq*hC+UdWDy%B#bcPTp<%?wdG3)sN8^iM9|}PpPM8Q`
z;@8j%p_N0E<|ls=ZZfo)(4uQZVQ3L(WxJ41Gge{xZarzgjA#4u<Gpl{OerJTqwZ-h
zcGwyFMe@55xtTY##jPco@XQUp@MDdKryy`*)Fa~BE;Yn85xy6soey{8TIy$x?d{oN
zN0VipN!toj?(WRe#3}7^1QGANvCVl?+6(m`KR+Dj`ryr;m=ga>=9tWh!AT>7B~(?S
z+klx@7!lcIenks9Sklr!3`WO#eyRq`_)JC+=Bv}VXN2(f-Gm{1;xv)*eG|mhlr8QD
zQ{Pl6<%L|nZsXmWB=5xkQH#vbA35_Z@lHc#bZm{5_xbK<VRkf*<e5DWJR4m$5dQqb
z@h9KZW_&Ng>-A4Fj`?Nj$t^cCCq=qa^uIjPaxsm?_Cbm1)0n=B*{@?{fBUZ1#!8U4
zCuJhzjs47c%B1uDc8Rl+Fat^B9TMhl!o<>P{0C8=u%G<Nnn_2$9T!;8S+5`m85@vS
zgS<XcW@o$d%pOqbo7F!5@f}50VD`<<JOcar<~HXocoW--@qZX!f`1vYQ*Ism=Eel@
zx_0vHFnbOmVjc1_SGAeFFe0*y&m?rk0BEoBY`2Uv@x+gN32SMhLvR4g46TRf9hg^2
z84o`ay+k}MJa=1;Y})Unj@c<Bysf0M>`!gZpLn*XxM}Qk{zNpB+gA(Y&!0$pmpd)C
zz&o2~i8r1P$9)G|To*U<;P>DQ-O}c~=JCb%)3~x<oKW^B1g3T_dPzeLnOKFa>yxCx
z{~Hp4R;6#Se=??LWNeuUU*@fC&Y8j&)5X;Fbho7#Rg6yIb(n}s2)jV$V#gJEWlWp!
zbP}~C`{wif?8)g=#>|2xuFb?%d%LHX;^WHkcZ!T|uhdcW-!*N{#qt~rOAqif40jzP
zN(k0K`s(13vm5qpg<aBT`V#50&cbNvlt4B*vO3X;gzn2m<`U8|`R+D{WfU(RCz|nJ
z^0SI}rM&w<-kEy$#!J!dPeq;g?tHS5u>I@X+%=ZtrHvT<iJi94{;gZmR8K(wOWSpf
zBa$4bG#sRp;X9E2EOB^yZDab#*oz&0mwzU;?id-QjZ8#V-aTz*y`xKd|3V^=Pa@b_
z9!msOrfT5Z3Eyvc7QW8@P3l}MkwVIktrW1jT9tk!`bh3)8F6o$vxlVL<lBsYCg1)$
ziPPPaXncGUekb7v-q+^*I9d4n-SA^Q9iK;dxmRi6+BRpd=(|V9c<D3q721kthm8x?
zTUJO6F=!nrG=3D(!8?&L>Haq7U6E1i%81ubF+Yk|&gjv`q(<dPvv;77e!Y{sL=KX1
z<mbDRXYBGZ!Fr#ynAGJ`m$XspwnNXFeT<UEmB<{<7;OB-1EVs@r)T$$m9tlDzFW)&
zRL+z9Qgr>Z-J~&$yzR)l_hk2Mu>EC$`20GZy%N10qVh{0=1vnSm)8hm+MlttWAls7
z?a#Bd6ZH1jZftGzw)ER7;u)~M%^7SH&-_mDByZ=G*ZSo14&%d>c9eGvvKrc)8$H`R
zUME}a6t*<HlY?m`0av$02;YbDodd7r(_3HRU{Q^n2U?6?!)Ef2a@$Zx{g^f+y0ISq
z{O`ywWv~GKPznE6DWv<#e6Ory^=rn+nC(G}_FI3llUA@vQy7_|b3Qqd{qa28^N<Ns
zykA#znA~sF>T_)@74r#@TdS{>l52sFH?~5UzL{fr`v1^q8e``cyY!P%Yn4<J$xdk2
zV~o5A@=8-Yd2f&Q^8DT9H6X7xz0LWv=>IEsCC~H`cD!5HO8UAUcq}yO3;mP=%JSrm
z{eJR9mv26fK1G|HBW?1V{oZqutAu=$wl$<}_+xF()5x;#9_5uC^Mm;FWd))S3f=Q%
zCDujB<rM^^{YLW8r5oCuA)<FVQ{s(J?mT5HM=!FAo@8{`6ikadew)uB4uZ=#Sugxg
zv^kWL87CL|e{06c1$^DayDIM1nn%5KZ&Q1^uoPcxGEF!#;6F9#i^72QV)C4c^F2hA
za+lUj(pJW^{PpLvolI|lHiKvTcLYds#ze+Gxpyn`$u=h}<zmJ@Gq!ZL%~`1HWIN!U
zk^0^MZ!^609`CN#Osrwu<w9hIsN#v-OSM8|eN)>hG3$OwhLe#2ku@1vtN+HGR6UVZ
zYIVE*BC=K^Ys52cJzW=Cp0o;?fr+dg$jaNqomzgrn6YM8YfRQj$;+$2<S~zixZ~M>
zPrBsJw4Kjki}tJ@Q<JC5)hT7@fDw&t&ZXqh4)Dd)WlYZ}(V-Q|<>i}v4nPerc#-**
zl-W16i&ATRa#>@|I&LMhN;mi1=9d06JP&^kJU3D{_7nyaQ*XvbNIrI#T6@LPfEm#-
zloraV)TC`z`e6vOV8_d``b=_uDaZdsLO;Z>CS&Us(wO#Yn{%p^`Cl2RO&X1^Om4^E
zcm0JVvg^QQ>{O9Ca3uD^Yi-Vc=rsExWPVfGBeN#C&0>6|ru~!0rZ-sk5k1*$yYR-Y
zh~>n}G2Ub>2#C(h!$e%f*o8>*C-%SGL09`0a~Y|tw^4wUL%g2nl$4m{Vj;V85O3a)
zJqrEJopff8!y{2X+HA*J_=xOmit&*nnv#;c1go3coFMTfZ5J_nrX*PEe<xY(OIfcV
z{zC4;drgz>lim2c*yk}>X2-^{VjFd}r;V>iWNt_1=J!3Bha$7v@{aksk4!Gp*qqWv
zMdy@ewmHAcB%KyAjXjlke(RTMgTnr4gOvYiSvZsi6^3^KyrYU*+`2gu-h{nY5~dH0
zq?$V^sYb8vn5M_=k(V@VK+YP%G$GgS)m<9wXd0e#7cbp3NTt0_8d9Wvbd!b^q+wwH
z?$c088VZp!iMs}0;n}2NLdUv9v78YtmbuFKj=74sq~|BAEbkO5zR7s2mH3ex#{ICF
zIK=*N+wtg_AC?_wUJLHnr?ZtbRc`m%{`nYBJ#71Z=s=UXC$UJ%<aGFGC!OcNrT%Sh
z+jr-^7#-5Sis7r|KE=Il_>Okti*K_DJ})`nZVc`uZMH+_)FCsEyBEu(eI40VX2RGp
zR%X4E(vtQAGq03%?vVU(4`YM{WBaFFWp+ttlixaAcP~@)Od<WgO1Hah`5;&3H|ZH#
zD)*hqe4yPqM#?(X<?CG5#=n%~UzMQey?JDV@TYReBKp&Qm&w4c$0E5D%1TYHlZ_8q
z%BRm*x?)DVGf&FrZe$u?6n#2h_5n<g^Qmk<u)LiIRxiJFsG`96F0e<IuUrk%S2h_G
zv-TnRoQcd_?y5BT6q%&6r}-5A)xy7<$A8(k<Ci>b6MpW<%uFFqGdh)1Y&<mj)EjG<
zZ&xH=movIg(li3Su!DOxO`d-C?b0Ot3xt0!kAK*=<DU%wdg13j&s(IPdb#||WPRJ)
z7Z<;mjqLmsRW6fK?D!4?glhVTAvK34A5Sha^P_&~n|$v7H2M3KiS@Uvv&q7LpvV8r
zx9698Tl2Yh)Wm=5x98s~{6B7Y_L9D_uoHjGPbO!;@gR1Z%XocD?ulKrANS%)`Ob9t
z<Mmk=-P7c=UP#`zOq&T)XAxvZ^4+`N>_TL+At+wcOS@Q3m^BBrJ9mriHn5v?j|<k?
ze(roqaPQ58e>4334rzBzlyvVKO?SdPIp)JM7X|c->9=?T^#r+MD2aYH>uwTP-gxZP
zA??o1qVwArjg#r0n4N-t_E}%X$|ycA9MViGlSC$kS4>=CI`TyBk6lk(-(b5KJr|0_
z6K^b<w2Mw?CGEo**H4nP&xod-wZ-^#b@qKMPAPWpq(XO|D>50!6P!C8<5dX>%^Tp!
z?Dzd;N*S+0W<T!9%}pnrFW|&9WgN5RyNp+jKI^iW-4<WQMFIcYNy}I=AY~F_*;Ve|
ztrb0ae>ARSa~n@j*5Q}z&M22^;!}pqf!yDFhxEf6yUH}_kF^oA4^XsHlDUZKM<R0_
zGM8}o?>e6Cvt60qcoN^Y^P*j(V-2xTHs5#Z?WR4TN>%kq=u^2*m~_Uq<JkBR>oe>R
zu>O)f+pe!_63hGpCbT<yNS<HE*`BOl#LJ7Ai<gqL{(R$KUyZz-C$>AX&~IP)jq=6?
z>n!V&<o5)aGL-yAPNr>fU-B6AOkDbs=eN|lJb8XcnK}7-WUW4(JC?Cy+`gHt&3N;Q
z((e5$m9VExq`#N8JF_HR_^7(*-1u?2)PF@Xolz9MYo#<^yISeV^WgeV<lgoCGq{iW
zd*(sf=Oq4HfgboYjrtnTqP$sGH-27IzFqokt!4dK><#jkpj$e|jy1@fcWjHZo3yDb
zkQuD)v}ZY=cpC{58q(&x$g_PpVQ6>petp(V<UFF;3sLI7u<QN@dP5lA>=XHn<TqSI
z&S?@?Y5dhNX0AJeFpHvb9sAwmT7;Y}d0t%UUE`ASTuqpr#C5>lq~~KS?r!^&(M#Md
zmSVjYN~-EQ)(TpYSwp-JNqKIOcxy3iV|VdMIV?XFKkxYN=YFI_<)b&YPi=Qbi48A5
z3PIWuZT~0YeFfC0@GXGP=ALrno2W|8m*|_2vREte!8c3N{WF)ZOCK@1#VZT!l54ta
z_{d|FQfc(xZ1Q>RZpb_enNgd+FuM1b15;L82AhAc;2M^{8si_IM;P9Gc-u#{IPH@E
zo$w~i<3;{V!t9*f=6n(&%+~JYe;k#+&f92h<gbEvB{q|he;v9nL4H5vkC;RonZ$i2
zJiBRk&!fv%qEwT4Uv*1+$KFt(MW8K8pe=${3r!0ne-ds5w3-Ck8fZ(P1x+Y7T!Sa?
z_Bh&RXkqSre>#q~4cb;{*T&IWq0L;~-mz~|;tl<rvbwD^tv|FnXz}uLp-sNMGp!g}
z_8slc{c&-WLfZf>K7EzYO4oFzErHgWKwAlI1$WlR^VUNfephGO2561Y;^W-{ZNc50
zY1^R<sP9Zur(wH7dpR!6eW2BGNB$iiEe9L`ZE{(=bCuBOYfJsuY&90^ma^9`$6rEU
z>lp88qf(ZW;n@feCY1Rr;s4fC81IptFZSQ~UFj5WJ*n2Vf}*2|Btb_Te`bGVZXlir
z@!%cx^Z+_v?!%Jff8)$v1LODKPPiq6yV(mD-+zn5S413(xx<Nd?P6vq?9@myGN7v^
z9~;n73;5lD2#wZ$V3?;H6YTw%eO1KF!8#<lJ}~(Z-IV<^>YMbX$sFWrWEy=?WbUZq
z4CQ#&3E;mUeXv62`{HB_!&d^|C+LDbdA8@lm(W*5b|qno&-7%U8kOyy&m1e4K8+Cv
zga5&(Y#9JW_8RzBM`aI5E<2xzanpnC?mlayht1v#cTKMj9~t>zzsCvj`#q$6EP$`-
zp;-HnboQf9uAPAmCuQ)$P_M0F*L3Xt$@6paIM&vfH;OKr2;ZKhQ{>t=!57SnUi)kO
zc$K_g!0%8D6}gx4KB{YrmU5<bp`Je|be@!{<XhyeBK#|i#YWyV!b_QI&U$iBn~eGL
zI~(O^wL7y#|DMHrY>j2oC%T-ElZ{I6cOCQb7>1FPNuT^3@$A8~J;bED)0i)9Er&3X
z@^<H+lJ-3alQ0f<vE^%hR-?4Fu6B*FyCvR5$lQl`50H4@8Y1Ixr#Z32E58}Ca}K|y
zA@h>Q2osApW>1+m9v$;+D@EqCUE`G{Qj_0SWIn<8f1AWxFYz|n@#Fsp2}eIIo!{=f
zB=IgMOth}Jt1bR4vGqKsar}=-wT%k+{|B#^8%HVp8RYE+iQ`iE6Z(VXZ4qI%@;f7$
zdlRODFeY!^u8(2cSk_6YN%LlWoMB{cLFP2#9V~6)Q)DLCipKt*BIglfeI;H8CHOXE
zf8te1W2Eg!9twYf&fqso%o(c7qj^X+2Qc-6_rhmIlF!eL|2h|$E0MXG-%<Hk=Dk-U
zGg|hY?uIMU9D_>XZa9CQl)h(^Y=QR*@-O?;?YrPjDEDmSM@s2iC0%WX5btVum-`p$
zglG8^T3spQ9kh7cU@f#9XnlDve-dtEDfS4o-a_C{Xj@AuBWO{3fhoPwIr-4$$@sH-
zH)R&zrt56~Uz1isk-6(QZz^)g553YWtC?m$g2X?Wcat9Noc0K`iO@16Jbz;AFM?JA
z&D_D5xOWXBHrw61HbMH3gkMYe#>d*72Yq}=49{jP2_N0NCUyEA;Y&BrpLll1Zlh1>
z!@Ic02j|29g0J}{ze`G;zR$D0M~;`rQ_WofOYLzrn$1cR28B&|7oR~NAfD)WABLwC
z+Gc1Ic$T)b5FHt2vucR_o}!E-%p$_5C-}`9zzu_0!S%DzK+<kj6J`Wqqz`d!E6k+@
zhF{)o;9Z1wMczBMp?1h_;@v9VF^qX>huJ^6Lv|RlJ&o~;Fbqd=@pC(dmq^h`Yfoof
zc(UDjUBVwY*p#C;$HULnv3IM{$6oPHZe0eHFg!cppG`R)M4j3HL{)JJC3+5>*QWex
zNXO2<b#5<p&~`wJwqZ$29y+8MT0PJ9Q^-mfb7h~WyM=f{K=yNIyQ{252aCN_4Sy-W
zWi&(hFM~hHo|H=7^(&*?3D?)pms2|X1iH+nOSsH1i6>=D)F;qsZO#7~U5eRKf<;Gz
zkoFxer9bc+OXu@!e|nggKJHK8Vpfa@Q@hdoh^0@L5WbZ9n_?0Eal*&ODe7jaJw|N1
znBQj%%g>XO#*Zx#MqVBAwlPN4@oX<cUN90rM45gezb92(+~V$MHsx>j<em@=|Hhn@
z!QO$}t=BTbD)JNU^P(H7ptU~JnI^wYwKIWM2W>|JZ7sCs1lmSu+o9opHh&VwR%qLx
z={Q<5wD+J%JCHvKXP?P`q%S^PHnha_4TP4MzC37>zIfh=&?J5Fw3*N(eetvf&?J3X
zk`VqREw#`jeevN|L2H5*?E}*1*FoC~?O<U<ub@s&4<eu+Uq$Cg{Gt!+w+=P>BB~SQ
zeKqgH<aeyRf6TlOaVo{wev*bhbPQ>Kqj^?kLHX%wH2_)-w0(reNvI;~SJIIOeK~Za
z-=K>e+3P7hlX<sdoO@P9cxJ#O@++a&!;{Lh{0VIdv^CJ;^Su(<YH0E8rXJd=gm4?6
z)j^AvkI35sZ6&n$vfU1CMO-*Zzg&th>GyPJPPCllT|eG!<XvJJ%e!3OZSCgWMBX)b
z^De?Wo8L`Klv#qzZ1hbw@7CCa8FWZ|KaTlF&Dd;wnyfe2R=e0UUF}V?t~vmj<;Yxr
zOp9}{=w12ISYyu=GJc8OU&LKDS1@`}zkF5@_91+ICTue)+(p<8gk5)>7k108!!{H4
z0rZKH+t7Vj(e3@FGoCdvKbG{aCM>$e^Rct*-5amRu#<tGmFTn#h3=Gkk>8mtD{L`s
z+fBc_KT_JncHXUfp|gFZe#O`ZExwKQfmRPKzAX-bwg#HaQ{+$NjesWd;=`3d6M6Bp
zvLy1VJbBVj<WJ<)c=F<HxH@PXkQZ&+qVGlLG(bDiri^Z)<`VS1v7_VL2{zp7q|Ics
zx!1=&3!N~ws|^=TSAX8uzQb?&34uSsTxd1W;`3PyZ3(n!S|wa5v_;TNU-Nh?J$bo&
zmp^HHqI1?0u7YP<D5mW(k6viWdF9E`9nv|xUkiWe-S!@zSrMLX@RWZGp8oKxCQTdR
z0hAp%D5;$w_0dRx(0lFfy;;SCqdvU3F#XoM+nDheQ}JZC)nb3TyI2YX(@Zn&Ak0Kw
znE&!N<68+3ujR^%pC=jH!}w}qd!}wr?pxW#ZeG{+tRb$^^m!Q<>|1^!bGA<NV98%S
zVVWq1d&IuDj4-CX#e6-n^q6^;Ttt3lcgZnzPCqR9N4s-hCi2cfo~i5D{w?>WEbARh
z=2qN8{tlC^bC_(2ei#Ao<}bYU`J3Iec>2NY752_XCEdFsd08O*JKCKiu<7jcqj@pw
zugP?*>|O6>zwnsAaO7=5p8Zz4bEfF4d{<t)A6^dio*D0lM>CtUreFVKgtsQ&4?Z?u
z#sAJif8Gzo|LhaFa#=`B51M>(;f&A!k}KG4N+Sbd(lZ&pXOWA?nl->5Be%Na_n#n{
zHGmy+sM8e3JtKwO_%j=F-+*T?A0O|plDrO_OF#OSe4+!&;0vcZ?!B#i7g`mx5zzMH
zS^k7p1Fh=Ic4s$3VJrmepe=${C^Xi^qw^lA%R>6}I^LZr?ed@WSko?(&HugelHCh(
zaHgx_U0}*?1@bojw8eP_+4l3u!)Y0B_*Oxe-{)LK8C*h~-?&a#(p^7bFa2)5WC@X-
z{^kcEI!zadO7fdqPJYuJ=eN>!k0gEF?xpnlVTu2Rq}j!BD)QvFP`3s6txu5`h{{VS
zznCvT{4vRkk2zkxr196-5dY-2X;b)O_GFIt&Z=7np%0{4VWy4^SU-(@<COlj9k0AS
zec{=R8*S~*R7uy>2O8gEd>?1lWX{5g2|#D)IA}I1V3i=b6HN>u>2H<D{PWSCpYdnN
zL>+rSBV@*(w0ZO$r`<V<u%14Wv@;QxYY%ZUkaF=i58(B1!bS-D`z~S4Y}36bfHIM=
zrG$NturjX2jcGUmywixDA3*9On|_(A9Oog)_rnJyZ_lxDU96xj)bEL1=8^a=!pNJ6
zye&t0KHN)?m$1Q8`j;YMi~Nr3OF|DX^dE0#ZaKVK4gWgqwr52Lo$RKk#y_rOKeF{Z
zsYf~;lfBcua^4D0C2=;wZ;zHZIe6*)evCxtu$x+b)9GEGzTYq3?A`eW#((;~9f0_c
zOuZ?jUd+;4LTYD_e)m*J9))LES0177cAZbZ<W~(o|5m@c*uRW)dy;$qLO<zOxBE0@
z`<_qV<wpHuSImKD=pDBDn@{tDRg3+4tF2!2>qcAMYv~QP`q0wrY!mANV~o7szxV5x
zY_-~wkc+e{BPsE~<$k>>RekE$kEf_7E&X^Zzb&OVrKlz?l=Z1z{5=cSd#6wHGfqn^
z{g$N`+g`lMh?3e+DZlQfF7fHBY_-J3h=8`|udG$D`t+Ta`p~a=zR@Z$Z}ymGU1q8C
zH9x&b-g#W?y%08itiILMk1f^GBSOY;_GNx`tK61C`1sF0GoXFp)Bn(_&8Oec>TkCG
zLaP_Na<uor!j_~?B-lTE`bMoj^XWUZYOwX6wY-Uj=#!5Me#xibx72n&Oc#13vd3B0
zJ6g3{`XjA&2<^(0eT~TDb5G)D*?iWrEcLZdUm8?f{Q56Jb-QIfouxKf`lT#&z1B|#
z)f=h$>MXT6?GHY_To}+-2GymR8n)XqH9<eme8z29$5rSTda0E@y(XwG@#~v|>R!Lz
z*h}4NSxb7U$1VN)Uh19{Qu$G;zBZ_Kq>-4XGd1sC%KFk3ahvZUAJkj?)(3;s6Mns8
zu=-5ya(Kqp*9=zo>tFq8u)0akDLfa_FAi24dT)g9#@*%R*M0OogVmLL>K6v7Tld`e
z%R%bizWT|*>aA>j!C>`8wmiSFw_Z9}?bt{EWw2V(Z^)a3+*dz1NWHM1ervG0ZU2XS
zPYhDd0f!?~`oe_+G|7155DD`9VGuqzY&;2gFGo(%Q1p|p@u4}d^IP{1RuA~~9fQ>S
zmaZP6uGKaPzav$*4OUO3wFjQgQ49C*^W}lP+S6_t#N~j0@DbpPL;bwDZlJvRaG-u<
zu)6dxji}28$qk~&4?Q7;@w`u8ZL8b7>I|)sMnr7y+3Kq(jV8>L#HTk~k^;k9dItEF
zPc!?d_UkW#>P=hU-Ag&bvM@zI)l2;~ML*n2y_N#u%~btTFZKI0{g0qpE!A>ux_%?5
zR;BCbgX+D2zO0wJC=>F9S^DZ;>VYhMGm_{C+4p!VMZWbLdnW_kKU4MAfO;oQZwsg;
z8EC0j1M<lGsFq3}=|4}QnM^X}g?y5$syz?8k_evnjr_<`TP*#KrIy=nf5?8-r*GEk
zeZRg|t4lap`+KeatVbJ}y$4c!WAvG7s-COv^CHTA(<eISbH9ElMg76L!n!9#y_(`O
zNCNuO2QcU__v_V`YWC|JE%k&~l;7}L^w6hhS-;-tS9KoG|4)d43w*_|`snR$twd`x
zioHtpAN^{9zTK}@`dyv7$BFD!RMk=|_xd!CTf7Pyc(-pP)#uaew0c9@IKu=t=)EKb
z`VULrqSe*5y!)f3QBh&(BlXoj72)U?UxuFi7rNw)*4P_URf}+3ZtK;l>T%JcuWEfy
zs=75r2!BqI=Rc&*H(Y!C%-Y~nOC$lP9Uhy!wEW~UdAQN1w_B=39%&ZuTIv}|*4uvl
zx}{#W+*AyVMtoB&Vq$nnBL0Uy^o~?@qlt1wDux9Kl~2YrKZEZ_B8+dL>H2C5CBnDW
zW~hN?XJVsyq3w~nLY||8jh&-Q<G*M8-2Z!@TH~cW{W!adA9!;`i+yq3QNF=9`}CKo
z>JguQGgZCf*X`POt9|STsp=zJzmTeK(E6qnwLC>%lcJs%Z9tbZf8UEYHp*o#@@cLR
zUS;WvUAtjlcl;6h7v$X1j%70B(F=~F``+ri&393%`pU1DrK&3}&GQYG{#S~+*_P+~
zY<+#Ix=tU)bEDQ*r}9IRr2Mj!y&zmH8jKR_H6B49G2;6<YK|c*q*0I<ZEx^n<^9x<
z23+D}YOS8b#Ie-hr5Y}<^lO&7+0sv0YOSRo5$l3wYS@1gu`aaLH9o!ER&PilUuNmc
zY_-y&`M+ps-nCh0nvkKJMbe#qeYvz)H$9<?g@(2CPYjVL?KTmElKQ8mE{Ur1RYJSV
zr*}xq9<OPoS9DVGp=7kDg?F=0-)gG|J>Cy|W1(&E>6&bEtvBtZZjglBWa);zB-yQd
zsqNBdnr+>@msn){K*&RqzDH6tcldvjqPOm?mWXjylcpQ@R_oLBhP~A*Qp&HVYi_~6
zJwvbATRoWp;p2d=+gtrJpqKBhUS(|GTYVw}$Eu*N-AmmS)Qk2~OM2;*d#M|HL4G8p
z`4NPTA-z3Y-PBt*Wvd5!L*B8Q-kh!4cheiQ)t24$`fT;NSjAuU(W|o6EqmztY_(wz
z2s`)GE3(yvef8SC)XRPK>b<0nbcj(<9_9}r$jSP~fO_56_wG!!(685J(sA{pnd&)P
z|2<2!rRYzx)D5Zn<)FGdO(@T#N%-w)`l^6hmadlu)HCV(5&ohK89Z;zkl}A_hJ<`M
zLq^YkX2?*uBw(ZkWaBLn@10A;Gp$E#^_->ex7AkKkgZ<#My~KNyggknWfbz+i&E4r
zT3?l-)@sf3y(#+{3H$$!FznKQORGj(zpT|Kwit-5wuaK82b(}6(NBNKP_JR9r>o6=
z|4ZqrP3x`c>W?Y<k_`2Dif&6+*QV;v($x+I<#csLntm!>txeaDq^ox`+zv7PWZj&i
zp7rbZQ>F2vAU0Z6n^V;$E%cAIekN62o}!;gQTM0p3H?cF2IMh!H27P+HkD-Rt5Vfp
z{kkDleQ4{YsbXV5xfuysJ)tGxo3$kTMNuyBj{U@Ie#<h{THF3GU2RCQS~Ao}Df)|a
z^=PVYO;;bL>NhgfO=+4RW&Au%|4+KQCtVX}Yr0g;?HL!jMLp&jX}I^<*2NiWaf<#j
zU2RFx_hqO*rdqFLs12!ld%C(NO<$0q9!k>}1=NRWx;8`Iny%kXSFfdO!rUb`ib<9v
zE&kK%2o`rP_UVUx>SO6tyiPw>cQswI4&Y_+UejeFVM6u!CHU(wvj&tcGl<pJ!G_-J
z$Iy>Dc#W9|taGPa0ryo#?+X~e8vMAwiDu8JB}Im5k)LL$`_kPJD{4bYs9rMbJGI*1
z<QJ>PFz!(WqW%BoQxEubgHOgS^CCW9DKGg~*<Glvw4J^!ewhln(tZxUWlW)JOJ8NF
z8jEFfV(xqT65gyx(eJ0Lhf?&L>FR${o^(51d_;Sga?tnr)ureozpC@;>)c%L6HC>8
z+kE=6H1%eRz9@}q(@m-B&Qzf=>!3{%+u-Xg+UkwAev!F`t)I-ozSa+AsmHXwD@(nZ
zu0PCDU#9D~vecrCXI#TvV>5Tjp~wn+;Xi!~mYt`433J{4zCEw<=?84zyVl8QPFsIv
ztBd4)bQp|gY0qh*RI#t?ZS{pE&o6iiHsf-M?i#Oo@BSmcoJtb-f%hkU>ME}^v4uYH
zsk3#HPt^-a;y&27e~k|<<GU#x%0tq&eJB%_Rs#BofZ8mNZZb?*^gUt!Bw-)eW!Txi
zvcvFSowNH5+3JsEE?fQC*0*Q-?$YPHpRI04*N<hZuQJZ&|MtLci&3wc`txjceJ_0l
z`ZaX&bJ^<E-um@yb<ge-`2W#9`EdPfkNbRld91JI%ZGg}7`zg4uG`nwM}ByElN7^i
zz6>rj`dl76-cR*Sa?`_4t&yHpeI-5BeI-5oXc_5Q(O1&L|LuWZ^?lV-LA|-Jdc4;!
zH$n;Nn!f6_-6rt=2fOD>_&sj&@#V!mHD5a7*w?pvHgW!#%8>S^)D3Y-h}K~@brZVI
z$G)$(eEMF$y3FsIl*Q)_z9R+a$dAy}8=2gt$gJQ+^q{40Oi`~``fn*R35W8wts7F*
z1xAlwqW2@@qgua_qTbbdZHoF@ngw-zif>##PYmI=_7yYN>Fc9+F3I*?ouWVPs~%3%
z5ACU*$kdNhL_z&kUv<Tv1iyAqO?lMsDNHx@9k0~G*}9>l<VEf=zG3AA#5eG8mT!ko
zKbz&d4M$BDUJm_AmbxHK-<zdw&boktdNe50`HzAG_lix*C0UsucQ0S>qh9sZ`hBbX
z`Zm8>CvDc{f~VgzKD|-PxbcEkPuu$|^)GBm8Jh^@Tz8^x44#NTrTK0bt`GeBo;3BE
zwP$;(dM4#{gfKn)W0q=3(~oAUPtx_hLAvffZ)d66O#N_BZOzmhf@*u_;a-IOibUov
z(zB~~j_<h9s3yD;ZwJ)FzTKFMzUJ3Y1=KyZ{u}C5>qi3WjTGIGp;o0G3WdQR%1h}I
z=0WLz-zV;)R!M8Zi2NI~X&OaPeW0KA^M(G4n<JoK45}{zteGsylE-_pB*<TbGekXn
z_jq?*9{V3jD_7TAx=pKBIZ=e?Kx*wqEre$64oEqt$&D|Y{Gmm?nDGqV8B~8uS?mMu
zNYhP0bzj;x-}a#TJW#3BXIays(<tnHX$Rn8hdd51ehRGqm;Kg_wrcW=^X*grB%E)5
zxAY>OEz<CqjTah*!WVoO_%F{=TWxVbT}ihI%Cu>7mRgvqpUF}WrV8b9(`cI0^olHX
zQ$R1yQcnhSO8}R>{ygBCnaDceq%+@6Q!BOR|8*()l{EE0n!X}kZAuf$QYm&q={aww
zc77;6aYkcrwO;FGy(R9ed#f$!`XeZTY0y8&)bI3Gf6vmd^;Rue8p_u}p}f<pKhIZ$
zB>bY@LVrdKSYkN3(WgJoP`EO0$#fkx3$?}yy*3qh*1A;vUWRl{+L+kX{GTr2E(kn9
z$-#Zl>H6vvagyAYqL%n|U5a=*?&VoC(^y9$Q?RCZB&f*_y^Sx|P4}Hv!ox51lY20O
z(mVQySK{qH>TXM~>f`&XrSI(H!)df|clEN?HM^@<({=mq{Qj@LxQ}X-gkP|m{-}@o
zaJLMY{v|f|3w`t}eblvLZId$hk4Ufb{rX<t;@;{ZoH8NxtfjvWDe}k35^4<kdujTy
z-fCx>ey%rHW9ch;tLM_?xhY*g06ksbPDy15<;@KJZg2HvhJKa*1KIH6;l81_+L|SD
zKFkt1tyv<OL4p6@2<q2D>WiR$Hl*(DCC?jr=|@9qYcGAjafa_p?J||NWcKt3UQb4Q
zTQ@OZqrV&Lqt_@ImZ$577>G@=(Q*%hg>3EO)kFT%3A5PJj|~-{MEfxHg{_|(s-8sc
z4_EJK{r)huBSqgnRMn>H2Zo9n`}$C|B28a9Tsr&b!_)=6tQ&@^I~in$s<%Tze|C54
zxnWYypA1pY^wnSIsO$H(77tZV?5%GfrnZWe^kP4K#W2;{PcI&(ZrE49K1^M+pT1$3
zy8cJ{{h?~xkM!$9R9jd-H$+`JK$7#=fT2Wt&B6MyVd~+7H761NbcjUt{2}`GA?mY3
z^bJGQzxaQcy5dm8-*c!YzQ#i}Df{$LO==bolyuY&)NJs6aiD%+h}r>VsJi|ziSVhz
z^c6#JgX+aY)J20bi2U9`<GvoEt{=P~BDW4ck;os(kqF<*k^DF32rngTPjgGSj~GXD
z^rt?x$fw(U>Mx#klxz@!ZRjxs|CRnricFDl-z{QluEjacqQQ2wMT%Ogb)!~yo1yk@
z&jS0(cj_8i>m&aCZs~7ag#Fd^mVW3!G21>oP$qw09Y|Nw7xtHV&a(chCPm-dUtN-R
zf$yyY)nn=Ut^?I28T-6=fXr;4JV0&CIDqFr1oVmn)TaSmbAWm|Q-64XdJUz0fXrn7
zKlZK!KCY_jzZTN6N?6NI3do{@Gg({Q(q-C)ZlNh<mB}=jwxMY<Wa%P;VUtD004}JX
z8Ws^*B!UYfY7iF?mxv4Cf)P<sTq61*;`aT|J^wp-@6LO(H7(%B`$2E!KWEO}&pr3t
zbI%(-+`D^^iW?92?ys!4_6VpfSL|@%5#BHMt$6lu@2dT#K7P3Oo*6p~9PZsP<8xD=
zn(2N0kcz*~^nP|o1&E$M6gB+dVHL>up2M)>{qry>=trw69zMdmud3qfM|fjZ^5umy
zkHjx;ohdnfW~Th|2Qw>xJZ6x`4I=HF)OMa|f*;M4Civz|X@cKXNfUf<rZmA#Go=ac
zoGDH4rJ2$MPs}t;aOF&Cg2$_*2`;RXCV2KRX#)I>CV1d5X@YwXlP36Jl{5kHckfYw
zmpt9PN5u~g^X{swc%aJrBbwmOD(`#yR@_wOU9jKOE33SVXY4S1n0L*L;i-7B)RhNS
z+<2IG`#}{Ty60e&`Ns!WAmir`ekE3pL!_YJMf1$`{(MNqvsK>Uq4MROhaQPv{^U@}
z@mGh+FE2i<0?6fu33BCOg7})?wbxCp_~dl&-W@8gnC|^#hgsK5_kOcu#mA<5mrSep
zpXuHc(<*@cW11k(OrLA!|7$Oq`em$34`REzv-jg^72nz!){i58xU=`G=@s|y>|L~T
z#XUQFx9%)|U$_fS_Ecz`b`mDilw<!Xc4PRcN)fmS?6GR4e05~VS3)NLSJYKpu4XaO
zH3V2taZ>=uuLr=KxCKA_#FU<2=-#SQ*&Oh;Tc-TH!u$GEIrsVZ4)2!__f{N%f2Dl8
z!dD*Y8{a+U+fyFc-TUQ^6*odvoK}HX!M}If3Gdw9`{MM9;oU_~!wcU3X_j1wYaqYf
z<;(Je@9%aT{uNI1(}q*^jZ+4ueq(nrBz<9b@2cth;qM2gSKP6??Bj0O-Men*8>ayJ
z$Zi$*_piGhjDPX#oke5+3We}PQ$9WA`W=2+gC+6{Q-^B2TXw5>s>b`nZWXuHc#rHc
z4aog_Roo2;wRgp6jd#Vq?LdCIf5kwp_qhYleou|}9A*`S-k}v=t?_<**ekJjuUZF^
ze^t!@&xdCw@b{yKUp5sVet+bF`0)8x9*MvI`O14GvkxD;5&ypb)fM>ne_maIFP?sN
z1(=STzb|;rXN2f0$G>|DuusP1n}5V=@bCBH@{6CukHg>BRUeJNAE;i4UrXf{p>j7#
z<z$>d|6j7H`uUx`?@q1w{I1@O*g3$OQ2`xv$`_|qd}Y@u7w-g1&6JyV-UWYucIRC{
z`Hh|B+b4Dw{N`OMZihibOl%MDTJgxPb~Q#mE!RN3+}pbsV|#Cyi;n!j-op1&d&^S%
z#l5{Jcd2;S-rnza6XcfN@%zueqT;uEd5`Z=alzi+GkYBQzkA6R>lb@_ckNa2!@a#9
z?<IeOZN<WG?qIt1_zS0;i$A7TJTuK3+_3@{_8WIR^3v(v^%WKWz*fghqhC&wzu&o2
zv-!myD`oe)x#IRI-px~>|C;*mIc3_wlm!n>0n;hxd8guXp6K*@yjD&K`YLdy{?>};
zM8%kL-s|PNpe<&`ZBq}z^`A$exlhB|-T>iVQ2_vCyB%9`j`yaDbskq3`_os!aDUBC
zVxhfaC-3$h-tyz=vSR;oIuxfg{(fk>{C&wz?@@)TGE;H&RI#FuPl3r;cYs@i9e4~{
z`VAGZ2!4A?#q(3-eE-i=ylbY$P1TM!#^Wc<e)7+-e^2u++40EVOcNCf`^M{G2b(7L
zJN*6hbZt^wE7O*+W8i!hcO+n{-A}&RaRxr2J!eljK3#XgbU8Ww_B8J=Q!Bm-8{Q7z
znEKqbDfdj<1z-F{R>*6n%TdS;V#)LCbnVYfor>3}e`|-SFt}pW!ASeqZWY%Y;Qej)
ziYpHAu71Uw3l8u;y$9?F-s6=O*X{59V$X`7Bm2E7zPX?G!M%_C(SF`X_YqsegZoz8
zx1aak{e<!p`w8-~8E?hPwm-Tp?>_E7Q}%>81bfLTw@tww0XsC)TD#%o;>S}xi1Y8L
zh&vFr4b*y<H{p9Z+c1K3$2dL-d?e1XZ^6!ehl(Fh^PbzG@+;G1WAWK(vN`<@l<H~n
zw=`N>nilq-88{mJ%#Kq&I~8NjdveE$-|px=Ct2IZF*GA$^q=yPsm}tvf5(akcJv+z
z(BL5FS}FS{VXxg$thS~sJ9?_5A1EF49M+=>?{4|(3fQhHyid>!WsY{VEc=LEb$!KW
zc9>#}L^yG}ZwK*tJ5M?ukJy0@si@dzhdnCJ+TpZ{o2N(v;1un1k~cQxrqq^LfAaFb
zFSo$UE%0&+yxamWx4_FS@Nx^h+yehaEl`zGEw%MjO$SfcbodNS$Cy^0sh^vfjxhD!
zOq}T;)1kBY`&LbDlMScg<A<m9tFV*c8DyGaI?QyGsUgBY5Ab&};zLXunT{|WhmBUA
z5fii@_iTExU-Uh#Yc>DDTXBSqXZ#LLD{s+sOfC!JspRjyJN5Gr4sGQb`?RK2Ok3~b
z_n**o;Io>>$ak2(kCAWlr}X>5+cj<dq^2Xxe}sI7;P)ZVI6r&*{(V<LMaOeH(=Rdo
zCR6ie$Ug(K^>-m(g+e1m(zl*aB;VH+M>kd^-N@^U<5%^@;^<C2Q^&=~r8-_?mj}nK
zZKTUc=Ut@VS2DHV+jN+8;}<GjYXse+q#Gk$a|E6DW-Z?+=^7*Gjv?I$>0%Ld&7>P9
zT~!2~t<Sbdb*3t3|KEa4t#on}|A3IEDgtkf!j}g5j5(ie79-1obdAe{pDmpy=ViX9
z)%k3{syZo1H{g8E(ya*6wK|_IT~68({9VrX1cr*FYn1x}MLc6g(hZ#)&aZJ*5Fc<p
z+agq*5~Q>Ee@nW5Qs?*3Yq>7gx-&v+8Y7*p_wecAboTpF)8QKBH^Q{g?-#K=8PX3#
zlsiMZR?=l6=&mJQGwE6*=tfA_NV?_-x~E7NBVA(zU1h7be--Is5p=UiS4q062)b6%
zd8Dh1pu3E8<JIiP2)bd?jgfAgeBJ(elysw{8;hXxQe5vyHyTCfW>D<si-PmD)%pBD
zyXIer-Kg`sLi9u8%M|u(2*+C;ytOMx*VyfR&ICVKofG_QvmT4U8>z<*{$BgLS%R?T
zd76Kj|NG2gs^7LQ4eGg7@!;oEnQ!H|@*Dh1e!e3G@*O=w`41(7`FPdN=R9=_`>N9J
z58y5;&*(K<$Is1U`10`n8oDAJ-^~5O5DfP6j5EbGy8!D!;=}fPemn54aK5AD+x)2V
zX=NH<x(z;5;XXmw)4ERQV+?mN<r#Wh(*dS2rei--ypeRxOk0^|m<})<WIDujl<AoL
zp8Ur78Q08A{*_G42mDjT&oQQrOq(4#OJ68$y*j)d)^%_l__>xN#?<9k8G*ZW?&nti
zuG#s0BR^ODO7nGp7xlU7n(%tW&enR2{hIaoji!xEEx!!EA7?tibdc!~(-EelOzrQB
zO^4naUd}?lf0XqYd``<h#MJ$r-L3VXqu<!i7dfA=aXx?8`Fxx6`5x!<SNZuBEWl&@
zd<Z{(7x&ZU*-aA|60{^XZqjbA(snq2-)}x$`w3SC%+tyg?!@MKl;6vpdwF)!#6y?{
zL~_9Wd5TBdx}!MzmPffF4h;*ki^AS7S-xHQ+1y#j-&rY}(O=rtp_K*0#RBud>(M;$
z|1eMNPnudh_F4UG%PH?emS<;8ObiP76H)L@`A0vkU;Kxk?dTup=T8so_kZW-y==!S
z{PMpqx4?fz3pB2`Gd5VA80AMJ(`KfvOfyUem<}=>Vmi!pgy|^LF{a~8y*+heRWhw&
z8e`hXw3%ru(+txArh`m}m<}@?VLHlmjOjR2j}yC+X%*8L(?+JvOk0^|m<})<WIDuj
znCS@9QKn-|$C-M2v;Iu0n8uhkGHqts$~41tfaxI9A*RDjN0^Q>9b-Dq)Z2&oGp%A8
zW7^2HnQ1H24ATLogG`5*4l^BLI?8m6={QqwU*^xWifN2#BhzN4txPja2bc~r9b!7n
zbcE?B(=n#wOuhY>Khr9vF{X`7o0+yU%`hEcI>>a0=`hm~rlU;9n2s~`W-x!IRZL?{
z8<{pUZDpEaI>2<0=@8RlrXx&8nT|0XXX@?G{FzoUjWKOx+RU_-X@=<l(?O;~Ooy3{
zFdbz&#&n#icL4KeTE#TRw2^5u(^jS#rUOg|nGP`>W;()jl<64Lai-pZ%%5o$(-_l6
zrp-)SnP!*{Fdbw%#B`YH2-8ufV@$`HdIvFord3R1OdFXtGi_y>VLHHckm(T9VWuNY
zN12W>9cStt%>0>FF^w^8WZKNMm1&0Q0MkLHLrjO6jxZf%I>vOIsdotTXIjNH#<Y=X
zGt*Y48KwhF2bm5r9cDVhbd>2B({ZNWq0FCY71J2gMyAb7TbX8<4lo^LI>dCC=?K$N
zrejRUnR<sYf2LJTV@w;FHZyHynqfM?bdc!~(_y9~Oh=iHF&$^>RWX02RZL?{8<{pU
zZDpEaI>2<0=@8RlrXx&8nT|0XXX?#l{!FWw#+WuTZD!iaG{ba&=^)b~ro&7}n2s_X
zV>-^%JDmA5tzsHu+Q_t-X)9B!Ej4Zl*Prb7xLj@iY-YN(pPLcENuJoPnr<z>0iM4N
zGj%@?U99<Jn2s{_<RZH~1?pGzDW$8tgY^2V?7u^_M_Is9oyz_9e-_T0cS6-MC$H}B
z>g%tH$B(Z)K6YHZ-+YXpUtN8CtoHcoWBG+;5|SQ;jo^;npa0{2CHYPDUg5=3n)7B7
z+)jL$_$kC|KqmB+tCfBZ@plt%C2sef@FzF)8RGJ86L}<VrSKVa=-*F#$ic^m4-=oG
z*?TqEKntJIS_SOB8khJD?zJo4&W-V1#GfMlEG>>VMm*E0^ef2!P;88ae^s3V^GTl~
zJ}NH`!85>(ySNJoeLLwdB)ynV1aHnL{awTtz<?n5uSsu@Uq3*+@|++&E`=ESXC3;v
zn1;e<)S-VH@oDwS$MV0A_+XFX|6%zLfI&g{>__@nv78%;5B4j)#O0LdyTnI{-%mbO
zXm8=u=<*>xv|0JcbxwJ{O8jinKd7)b0|QC;SU;IplYay7DTs(T_EwcYcHEv%{L%9i
z*IYbYAXWNS;+FnCi#vRFf#JsRKR3vK8Sz#Jf0w0yOOXEhy~F3-PeCvB8miO^?ZNuq
z1Ot)a!^Gzh|0i)zUdn+7#Wv4E*qMaBlDMTGB;M-aKO;U&+>Yle7|Mi?x0mv_d@ds1
zOx))CPvWh_*RnkC1PkFaO1z(V11czZ^WMt;`^4=$jqRiOUujUY^R%`>d(h6)%gLvc
zc$$0=uFmj}9jJV6(4Tn?81Rzs_eg&X>2D=Ic97D`I|}4EdQYYQJL&CwxtVzD!Ajq$
z<leuC?>tL0wDcP}o<~W~YG=p!0i-{Hd`{U*^Bq4_`R}TUcRleIhyEGj-b^K2hyIYK
z5#vqj`!3Qy$#%Ggxb-u$>i1`W%QznrL4n843p<al9SF{&XGvd0+|t|mQswYjxG;SD
zp8&(Alruwt@k*0BE;8fZ#T0Lbm$Bz&#Cx!C2>p;le?1JMLO<Zpzioe|A9d*OhXGjV
z2OauP!SE~mJ^HiR@;nYZtk4fR^moHBEcDhdj>Fbl4TE$TzT2USTfaGL#~mTw%6u)K
z4TmYciyv5}c!u<r&-aKA61R5p^Jgmkkb@s{xZ=YO{&V6Z4t~uMN<T_mZ-;r_8AmE!
z`86GfR-W{t0i_*UiDR0W=Uc=Fh%Y0)|54;md=<y-GsI)xQ2t*def`l&j}RXQvH32>
z1ebi9iI0%p%AZR5DSn^$mq>5@7j3;RI#&6_zN!5G%<{~774dH=F4wu_`4jQR?<u|-
z`h+|meYMgL|3L9J(zm=u@o`*`l4m~opLU$$JtrtoYraM7>Ejh|L_}D5Fg(ojzSk<=
z>fm3C$+)fbhI%wZkeSEI+X2#F%JNuwdlU1u^7c32BklP(@hb9>*KwJ4{-u_ui}*i?
z+jDG-cT_9=zsbicqQ4>@ds_LdWIOCzqxAN?{;h1!e&8}6$2cFY{P{ryeJvVL%4yHv
zEuX7^3!hWh1m&K*9yN^qL|pL}+@9Zmj^+6{aqCZD@##>I44;2!y{ubng81OS6;H6;
zt|UH2{Cup_@|@nF<=p=aEvN05DfmP9|NC{?VAh=RF!AACwSZ$haJmlXL8je!u4VZw
z#sVaK4t#_1na1`%>~)G)y-@)>FItIP{|9;hw>*CZF5|6{ejsxc_GZ0a`B=XNOaCBn
zp+~GGLuvW%f^#$}PmBj_@(y5m_6C9AXR$ncvHfShDV+YeM#Zf^#>0k-=iNa(!wm$c
zp?T_2VBv55C@lSFi2sj6KYOmykJ0Z!-mNRo?}$G@dU-#tJVW!8zLNeTwtZGYa7n(_
zk3`-fE6;8i7=mAzm|{NvTNCd=;@1BIku}V7{sN^RSQw<Ah4okXSif?JBc-R_J8+SX
zE3)@a0=<kA>o2oE>4yytdBXlWllUHsl-~NeSo-w_N8#w_a~kQtLEQTP{E_(GO*$T|
zAJC79zxPDNtv}Gy#4lP*oPI%%5P$O$Esyngx8<3>ROzkX(8o#tDTB|*`ng+rc?Y?S
zPwP)~Bk32c3fE`;L;5}F&tlti#&T`{^P3dek9_VX{vHQ^W3$rVLY&>}T~7QS2Y-%u
zrdKB-x1-+aE0oVSNe{KkJa%0eqd$&a7w&6O`k#~jaMJ&W_>>d1{GT9x;!2j2`1gq4
zLHyjsO8;6m#8beHp2_jXY3;S1toR7=S>*F0;4*G+S)vTMUGW;w{({^45$<??&Kb`u
zBKm7>1pZdwQce$19p$m*yovZu#3}Z??*NzbT(?}yc{J;L@hRcs<S*n?>F|FQ=q2AO
z;&vR)1upy_XjaBUrc|ESPu$)MSwr0F?X$>-#qn;AD9;DT$KEHwG&hgsbArR?C`{xq
zJJRRLXAdfFXhHLIoT}x2HE}EFXMm5?%idd=!h+g-U+3ieCenA1Pd)h$oe^H1PS8tx
zj$fhuXxsBW<TFG*50TGTh!0EfBs@Fn&%Ec!r|KOFV7i${-lIIlJIEX2e&4q9$p*)M
ziuqdl%YaLLM-EW_DP8`(`$#`P{xR}@8u&Ex_Av!4|GyEp<I<+bpdEz&`cstB&i6IM
z2Z&da|5e28eIdkgFwdRD2T4Cl`riN#>vwrqsN`$!DOHnxDi#Q7p8>XyE$3|DVSHW%
z+_cY`S`esV<~h;uIoO-+jEA>G@L5kj&2LsdwmjEF(0_#VnN~Ic^Bp2S<luJ^A9L^r
zfy;OsamK?B4Za`VDa!#x{{JK&dk>7Tx6_-2&%U1ZPqKJa`HwdAdwI2o2KmeH!rHkG
z^wQ4O?`?Nx&}aA@<Yk=x8Z`L9UdEBzPmoXLxjK*!r$GGeS<1h;PVt%KKRc;-EA8eV
zCf-WCa+A^{^qhIVLVWB`+Ar_t1U@9Ce1_hxg#RF)6N!(;l+O&}r>)k0S>%-e6Tqb%
z?7beHm7dqsM!Z`2Sp3(-hrXf}{UyKnN4wIGT&CrDjQPGgE%`!UqW|3y#8&{9e64?`
z<$o!0>sN2_6V@ob^?SGYr-_fZXgPT-;>}n~dit|NqczVpz(udK{^wR69<h$)zfbE0
zwa`5M#O*!1&k+9<aqE9?`M<hD`HX#D`wL>qJR6A*|3vw}jrbpdn{}4`V&&5Ovz3qa
zFSY#71}^k>U#R2U%9F2<-rg6yh5Vm0_#U41`y=fAoAfdIz3##<j#;ny4)NZfwO5=D
zT-t4@P4hj5^xr4FJ=ba`esrhuiCwAV;b+<$-ZjKW2NbaU*KS~aYl7o#&o0HSKksz%
zZw4;o%HDVSB=M=;N}s9H5rW}lo>v2x@>{=u+s@s@?R^eA9%g2gkN37<`TL1me_`8h
zcM*5T=W*vKpJCR^_TyWLS6!_2ZPUcNs7L#^)#=|K0hjqQFsSYJUbg4!dzF6Jp?@E6
zp>I7`%l~n<!xxEL|5`hbo*{1U|5$vVKINaeAUNK-fE&4Sjq?93`CmbNY?F?2dDpr;
zqvR7KAKU&%^s{{~)C$>r_ui=eW$zu@ez_C4<m>kDPYivfXZ`CB)oi_gk<T&CxV>YO
z@)??`Ihyx>;Wz&zKDI0<PtM$|^d8SgZ8^U~+}>Zb{rjaYN<Y4rw!=*vxO=`uaqG`)
z`K%y5=*X)+;4*G6<2@qVZciBeV6T#PW{bb>T<x!Go&NeD@nNU^e*s+D)7}fdh3$6m
zTZyy2wq746KJW#t7q^|>ABo$0&(^K&$@7$ty$5RfeENLF-S#{Rf>Fv7bL7Ami8sDY
z`-R7T-d6@lPycnx{|~@LPCh{Uyp@xQ3zWXmk$dL@mv*-Iwd{D?=R&0)VZY2|`>Z8C
z{ylAnorzyz@c|vr*oK=YbCL2HJxAxuEaE#}tavN56nT0$PMU}hIO8FHiPGEq5cA0A
zY~WIUdymhS{~e?sifg+yk^Zc=E1&VRHQ$}|YwvmBB3JEwZL2@*d#U2~-m~2=JPBO-
z%ibHX_S?fQ6Z(Vk${iH|w*7m63%$KxVaNY1hJJ=;@5kHne~a|?KD%9ir(Ul77jb`K
z`OgO~{2MRU_P70Yt)Yiqj`ut3*$xjAxA$XDB7XE0te2y|olD%_Q?T-X1h~|9<osZL
z4|<2<83%7CKIY(e6MxESw|(EK<;ghn;$Og}-#7FA%mXa{;qOxV;peq~*Al;+xb?@_
zo&Ek3;8Ok}=4<82Ypzs2<J2ctk<Vf8Ry?*;8CbpRT;f%D_lP{J$p06_?Y*TQEta>>
zAnR4B0L`!7>#tJW#yPP4zKM9NBbS~bKDJ!>*cSk=yqe|b_+P;CA9ank+W_y|TuZze
zxQw67f?&NqMI3MEF_hNM{2ck%dpmaA9{wKX)6Dh7j>EHnOMgA;$n(2MZ{sq2ljZp<
z>GyBfadiyw{`V?>S8w0<ea!a*TK;cPp0oj%{vCJ5)fY(bJ*@4voctdpZtrp0dD{Pe
z;d3Bfn5g9;x%Uo(&+zQMe%lUjLBmRWT7M<H@XkEnwzwmQU-1ELAA5h*^0^$i@Ui#7
z4r0c46Sw!7to;8cagXxt4Wv(hQ26ifx&H25z>VCvLJPRY@;CHoH)kE$>qAN(;{<wI
z#f|qA@v-T_@w_WGdeZ*GobPsiy_tBcqyOIoT*@=@b1l!eSf2lUL~(aqz3!vhZo{jC
z<K*xm){FPlY(Flt_&r+i*RcGR*C`)+KhU=SY=fixYzJHZD@bqeVO#oht`|Pg&zEYB
zcE9si;8HJpkJ0v5)eTB-;~C6neb*Tr_CU6emGgH2H|@i?BzC?$Mcl?uxrKa=xl!q3
z+;`6;zKQtA1v+1nns^s~OzB;H`*+0cJv7^%N8UvKpVNMyuZedSaGCEzjCWz@?{^K3
zeKGCiN3%T76SwhCHV|L`apgaXmpsX1^`Bwj!oQODH)5oJ#Ng<qC0fEmh(AX@t^ZO!
z_Tp9D&BEtkZ_tqsml+)Pf5t_@@HWrUpHTWCXZ<=Cxb#aELI%rY<@wd<2*K?=R@-i0
z0xtPBI_u6Jw<`Vcca@Kox5oiD_5{X}fVys;n@K-7L+R%bf7;L=?Cs=~^Mu=!kG+p-
z$3xQK2YHt{_Mc4#KiHe)@cFvIp(Q!{i|*UC{H?T~EMa+0`=sKrb98|4Ywv#IjTcNY
zly4;c19vEWV@&Dq<+zFuYrBm(^YKF9ru`9>R-SH-=Z}0!=y9IqtRpAfsqJ9nfnd2b
z&nR%=Gd`jjevSAi?@~VYey5%9j{rCHj=gBbrxnle-sW!Pa~W{yM;jmKY2ur(&<h{$
zG%dK*D;^=9Y18t|)n@XZ1upfnaT{!TKKfbZWAB@u#C97c{)J`A;MdIecf@0#(eZOF
z`JeeYmd74ovOWI{T*hscBhP2w&H6g)+9u-T^3ruYYnkuk#H;pD#^*BMFMnS7*!$cV
zVdnWe@yt0&k8U;3v-hyRO<K`D&BnX?3yP2RDgHNGe&W`DcP9CqaIey5{dtLB0bJ@^
z>FghWM*1=KubtP^zo>i$KcofuCF^wx@e#)!awG9dyjxD5HKbqqCFNt|WLbHBIq_k}
zD}kD1o*#W#>216$OTW*E;+4+6U@h^{CT$oy-tGr3<LXh)mnu)6o$eDH<Cbx8YKb=*
z{2<Tu178YU#&hgR<^Sw7{lQl)e*F{!yo3ER^DCO~y0etft{0W}v;5uKKKqh>6>ur%
zI%i)0n)KFBdnU{I>aPkt_9>1&a|!V(NB;ku_&P`K9S4CR^=f6DgkO<=6L8@_?${qb
zU~y-@JWW1h2Wh?Rda=(pl>cC}R@C;(8jCZ|ft4p80WR%e<7inq@Ojd^_T<h7gdX|{
z@8KWC^1lzb)GITl<zd(~?{VU;-M->MrLUs>p@Mv3#9cdi;+sl8&VwdI@;1*I4=Fx&
zlh*fDI&HiUd`s(P<7(M@t^2mpkF;w!?fUpu;L;A89ev~dq>u6bz16$!BVOg~E8p-P
z;d8L}oYRl*050t_;>h`fzpJ<_pI=A3YEiHshky(JQD;2-o%A*?g>BCxMwS0C^>gdL
z)B)Vg)3bHH*!CGDedCH?zP~o~&^Int`kmNcv%aVNs~*$#Tub~q;KJX=U9<f6dYJT%
zUcC;u)GOx9qc4zt)RB8jzOQ`7x$a{bGS59fP`r`*mm7$`;)ja2I{ozo@y5GUo@^q0
z(~n5Mi`Hv5t+w}J;x^94a?<}4xYXCikF)h%^@#Erp+a$oX5)R4_!#xBL5}CY1DA8p
zLF!SOyXP%?RQWeM`qGoY&A57l_SXkF{*QW0@n)yKqrjz|uXW_op+8po#!4MIYshCI
za5FD>&&1ZNhxC<>J>f}14||9+{*U{K=G*GvR{}S3;|tm_x3hhoG4xnRT7%>B%Ac~F
z_bK6Yt+w}`pD8}Pr?!}thu?l&@`WDGcxUIazn1-6@PoXY+0KN$(+!UK<;eNlflE0n
zpU@27#BwekBhHSscA@Kln|bQ!=YJ!;jT>X-$;@A9c{V%x^BaLnzBYc8o!2A8M{d-D
zWY@tbl-|an#PBxH$;1Z+g5^Ksms(D|1jNwUc|8nV^0o0RZ8^VBe0Ylv5ZhnJKB@H0
zrzn5B?p#9LJx{rfxGVo3vGk4}SoJIA@1B!31DAGgbjI@sNI(1wEhpWQy!$PF=Z>bn
zR?m6H@WFo28E+eYt^BK;_W26&n3M0`ztM7L=4(6rOS{Kg4_wBfjYs$~?G|sJr&zB`
zm2f@jW5jJdGu!^R5g(YT<-bF-@ecm2@)`c9;(yp#GuTW#cBJAbQO?Z%oz`o_sn_Me
zO}{(#^Bu9#5WJc8%q=Yc8-B0#vT*{hCH|Pf5B8d!b*kzQLO;VBcjVXy4Gup6XTCg5
ze1PSwB>#DT)bfux>&5lJrJOcCE7S(_JVLy=OB*c1b{_ka(vLgq%j$8(8=ZFeJ8>JI
zHbee@`m@$+|Fd)+9Yg%eXIM_&8`_EZqXx%#pj_HWe8peb&a^9@O1ulWjPq97?Pn4H
zq``6S@U9Ajyps4Mq#ttR*sK1k{4?|$--YxS5+Aul=L?o0^L&E%nA1KB{+I3ZMjcm!
zT5azR;4;qf0%r63e)2iyS*0IX9Fz~20XK5e8He8^eHF_&hkOqG8_VykYtx=1exH{A
zD9(#M;x=xTt?y01CEo#OeEz}Um7a|wdN%Vt<L}Df#<AL+_|?Q)smC@Ee+amYa~q#)
zf8yf?hyKic&?e&RpI82)Kh%La^+5gN*Z)x5#y7r&^auY_%irqApHAR1{%ySD&yfC3
z;zLhse`R*nFW&qwrMGbmwbwlFQQ*=(G1?E&qvo0UZ^Z|l{<@HOEA_*t$mn6>$2jx!
zRsYd^y%V&fo@aku4P5daap-?XdK+hXPnPEmpf>I7oC}N)A9wWm6Q?NsQ`BcRlTR=4
z&5m69BXHB6j^3V`s(fr*Pust@0~h*1>bG`0{G9mMRa#MN_fGGi^y3$5zuWd7BR=qT
z#XBgscdgKTA8_h*32@1Gi1vhiS+CC#Z*}BYYMRow($C+{uOZ@<Ja1h@K8H?M`T-tz
z9?$-|llbU03cQhc%T7w)%=4qA<a6xKijUIJd#g@Y?*qhJ<-#nU1BgEfT>96>Tej`o
zwF~*s@91mfzh+nB)E{hneiXQjtIM2m^*7SH`+|pd(|i|kf3%ML{|elU2lnqm;tjhi
z?wzR(u!8u<iQ72Qwmj9ZP<k8B{DY)F4Y<^|nfEp9yxVyX!7Du*S91pGj|6VU8|5UI
zeQ%|q$2~aC3)^l-S86*9IPGu&aKq;YEq^oH|AsxaJT{K0<#Qw)6@)%BSLv;uvzfS!
ze{SphA>dN4aq3sLUOyzgyDwgY_&UPBxkJap`&i!_flEJDviz3M{~8?jCED$0lfHT%
z<>PHoKK8u+E{i+s$ZPji{>_wocD;U!!4LATb@cNafJ-@T{P<g$@9zvfdWmx87~&QC
zDgF4nwcV^7xY6RAr!z_a7vR$GHjZ+f_%SmS_jtbWcH(y$9Qv>$pLg3|=x2D(rGom&
zalobgL(V#T266WsVGD8hJXhk>NWY9RU)wL!5hq6Y+qmp!s5tYEAYOTkwuAMDJoP}O
zckL=aAnwkiBn&u4KCBMbtNLK&W8*H^_B_|%&;l^t{m%uYxA7_Mxc#`roqg`YL)dPN
zYiRZJZsMcdki9~y=N)>e(vRPv6D!W_F9dG-m+>H9$wJ*q`q(SAB<GRO(-!Yh-~*)p
zI^vN?`#<3H`#x35-^THsL;BNzOTUc&K<9-G2YV^$ZG3f@mCSScOr`hglztWk%^`;?
zK6<rQ=s4<u3yHgO;B4YHUjKTw!`q1uovih;`^P^JZ~nC6-=bXF_ejmx#__P{<DJ9@
zE>OnnS^n!@sqM4QX&>(><uh`=)(h_zG*2t>s`E6%@3EX$5_kQ$eh6IJf1G;09sh?M
zt^5b%f+U_{^0}J0YyZEO_~5IxJU3|Kz3mv~f2t$5A2B%efBNUV)8qoNbFAj;>J?8A
zw{cLdK781#lzy1|4!iEp2QKsL0Y@+TvZdce`-N`YUhAusPvhl^U#W>#V-Cvq^eW$P
zXO<Rp0dONfIlpGI{O<rR?b+<q_g4mozZT_~ZRZ1zQ~E|nZg&C~{*}F25bGcQO^fp!
z-0IcO5+A-$JIa>!*T*ZL!7nTR3HIZP*DCJX^Sg-K_c>TTZ;UCuYkyt=T;za#|G_WW
z4znRBq#gEm>h(V0QZE~K#<ttfiH|w$*-$NfV4rmK<hL38V9#^*5u?C`zm4B&%l{m4
z_dNQ@8l@k)L<j6{+`pVkd}N8@`*Qw{0GIOExa?N1m|e?!C7=o(YoA|Ay!n1@_!#+|
zSjT*Mo@L9~N!-RIw)imdDjG_=*^j>mF6~_DwDaP6<?o*79@{{i`u0BTuM}{@$7#=7
zN$<+Z9}=&+TgT@eENJa)ZJ#QxYlGx}4sbJHxL(_O-A{b%UCL)y&b#wZ(DDpA<@pY9
z;nVEw3odw_<~vBerIGpG23+VpM_)ST^@`i~8DM!b&qm^vJU6lXwPE1WKAW9(7KaDn
zKl(*2_(xc=-Zv`!bB-P3A>cwk_!k`r8Sak`f?-z1RpVJfec*S%g?{Kj9f!!yJbTYk
zyy`c~r+qiY+lkxv|5*8OjivvhGI*AJo+du{b}c`Tqr8XbvYgZtY=1pZ+_ksNoTue)
zrT+hb7RQ@sa7a7G)o&zzDd{~&ul_P{sjn-4-aTLOaq87}zwi+8v9z{N2j|xw38f$6
ze(Ff(dm8a7%EPt9?<em1+w8PJ`NW*@^HJc^Up9Wcl>_$>x9?f7c<n+R58m6ff31H0
zZs4ZA&OMZ05Fd2p$^RNYkhgpf(nglQdy(>K<bL0d+j&ijXPo&m<3z<rd9G^dTZr5E
zUo*-7THrGNn;kuQob>Mb-GPgzczf};{v-~3Cf19A#X6#D`DI50{Y8czaSu2zW|97b
zzz@eb$vmq4K1}?6;#F7c0Jr*pw<Nqgiz4tFfD8X2r``Tw@o#Isc0K)11fQdpDxcAe
z4opsW?@Zuf?Q>}a{v-17*62L_73=%g2>RK}!uekufqxCS)YltU{!6ub-k&1qcU!LX
z!`ExQpw5{mPW;y^6@c1ip0j|5)pu<Ker^Ol7=aH*;7<TI_2N0>-C7*)_05V`-LB<l
zxD&4*cvv~#5rKacxScy>ZITDcf5;gpKOsIipbcZ!U+<*w^2{baM#GiWds~QC-mK;S
zsCJuoF7UAOe;@+)R)o`^0o;s#9=uz7_$Ppe)%T~QAE3WeI~(I)#K&&Xa=w=1>WCJR
z1K8KokHyN#dBDTUpN_yEi@>L1{7Lz%-l^^UWzF7O4m_;<8-a)A`@snO8S)=uzcjMG
z8&1}ID<#k$o+0xu;;<1P<an_C_%QKdN6s9zD!e>#;MYNZ4#~U3@jOWWaz9+^JLcSj
z`yb+!@6dLr;C%eH;RF9qN1xg4l<@XB1o&RyUwMe;+sk~<05179^B(91;$O1(ceTSm
z&wNLT54~LhyYGA6@aK<xN&Tte<&n5}lJAHkS1%)8b%^F`^`+!#;q7@j@UZdl1K`4c
z*s-4<db+M(!_;T612NAVfrpKs50QT0_1bPVq<=Vq{tv(<-%6fS50QS4GuY0%DIZ&}
zSDvZ(DD5h*Cw({Yu<~3Ofq#R1#s+kvV0fEn_cv>O$GBeaP5ws$597Z$0`CVN*3Rz(
zZrb@e&G4<{|C0zl)6NRd_h{f@{1->iuaCgbkHD{rz&{;<e+RhKYxJ|)o?~p!*Cn+*
zhxoorYY$%qJgoeeM&Q>24{QG~N8pb|;E7g_D>{(w$#o<NJdFPpz{AS<7;us2i_TR2
z*4j6oC;vgtQ>(Whk<xrKS8KzpV!yWn56kzm2>dg^!^-p92z<xY;rw3{fv=9hF9R<9
zTXmK;*q&Tpt|K1fIyjH@`c4F&quavE^Jd`Eo}-M@6(gVXfSdVpw8{Y<+j!rI;Pb}_
ze6RNKd~1M9zWei>&i3Qkz$M?IOSPgOV|hLt!RPJ>{14<a_#c(ecX8i(e_HvEeM$LW
z!Et!%8rF9}$BEUK&I2CS?^gj28$TZ*pT-N7zqLRA9=O!^7~18m9#XS5oX-z|3w?&;
z+|rLn(C@WQ%kR<7a}wJvPP~%$Nxx5g6Y#L|-%k3*ls3#KN&f)xp+T+JZkl*cM)0ZW
z2<N{6xYTQq`q(V;|8xZXL%?M`G;h%IJV*K`$>#w_4tQrP|IN;R|8U@DTycMG_eYnK
z{sD*nHsE3Hv)lS`e02o=&Io+3&Tu~I2>i+j{1d>XUIUACoZrp%8IPc!u^~L)<0J4E
z;L=}>&OE&bxa3>;cg@%8Y3Fvaot<&=e&Av4`K<_ipKhVYx!A!ff9!eXn}AEc<COCU
zu)gm%^!(sU{t$s5mr*{0^yfK>zd0LtSUIl%F6}VnjFX!p=pQHlF~+;lT6o?+BIpl1
zN6XXrW*z5n*ECN(aIs^I)1Tb>Ei4BfR?hwi{I&@EKM{CsPdNXRfSZ12|6a;+9?~05
zzXiCIzl!^ab)^3laN(1=Nad|PAN+R&pSgY7{_C7^_(|Yl?eJ&>{x{$z-;FB35-iW&
z{o#BXfJ?tO^FFVY!)Hd&e;By%AL2pW&Xki!ZPa!c;(7Ej%y$KFL(f1FA12<pN%67A
zmBEjQe;jyNeMchjzeeCQH;1ol^MK2|%W&RZ#eBPfOF3hgYJb`N)JH7-RW0X7^=t1D
z;-d{Zo{!bUdzN@B6^fsd{~lYy>opU&=`Zeo?L1u^LH`!gkG)$9V%u{tg8mjm&qDZ;
z2O{YI6oDWAmhkfTMc|(UF7smO4JsMhnD?RQYJFX~bO!LSe!nUL{~GWxz2!ILKje(J
zDd&arKOVT)0je(6a#}mUnh5&aBJiEg*Lt<mZ>p2^eG~91U%T5sPXjLFaQOSWzMRP~
z&Ly8gt~*x#JY?wYuP4&SNgumQ$Agu_e<SYNF<$jHEoUX|QSTtDEyPE8PGR+ylc6ul
z_`J78`5&u^cPVk#Zh0H=(KB>Hyo&t)3_PrV_qjmj&oJ%MR{ks@evGpotphIQaqS6r
z8hZHoGhWse%(wAE(lg%9LB!9r_{BP}aZSKHL%_qz`6b|C<LyTg^iKdc>nZDH_5b~_
z9!U8I&>r&G{%QgqmhX>AKX$r~lLeGB7hSCLE5rI)ef~qh!}vT1T=E@q?2t1q38!y}
zz)uEl>gDLA>m%qd0WSHvdenlqE8fWaV#k`+@VxVYhn4@=5%@oWhxJPh<hW^P#v!rt
z{B+{pFSXx!9OzvQJS^WYl7964O2041|Kq^Tx=+1jCd<FWWvo|+cGRxgZQfD9!}u?Y
zz<(8i?|FGRpA#bRRlubk-1VqCg8l;1kI|pm_RD994_>70un*f|G=k6HBJhK+2rvKf
z5qJ}DDgUrDPu~Sx#$hA%7CXQ01YX6y@F(9PAMXQN@L?Vt{tkFpzx;#rL-%WatsH(7
z_FpNFtDnc-skl2%P696DWQ^yGR=#ZjF7n@_{BL4m-%UPM|ImSB_oqYe3NPo6BJjTh
zH{;*gH#J_V<!L=dCC@zOdnxe4{SC&yuQxc(*?GTj2IbFBiI1lg;59k#Z^SF#r}#pS
zlarvYgthZ(;8ITSF>U9^C=h=}eE4p~EuYzg;q$!<co_eo2>iYX{OJgMpR2T-L)@3q
z+~PG7Z@p8?S;hLk2Y6U{?geh-2Jd|w%65L7eB5)WX|U@_`;2}q*bdde!}u?az`ths
za9q)><~@2%IG>jHgyTIC_*KBAoK>II26&R~a2xT;hqe5#ApS6LX@>#c6S4N&KahT$
z=d%fpw`uPUFV7s{!l#jTb!*@22QKAt&&xg(!RK4Rt3t;APsnG0@rdkxeE0W-muJ5S
zyb-v_s|OrC=TzWQPIupNSp=UuNI(9#$_H!b`9%c%LGKSQ=bL~_yTvG%>{vP(xRmGW
z71~k9vcKLH!RO%!{P*PJ-5VUY3$G0?&uIpS9gOnu6!yiY2>M%qOF4&mK6oh0`5^JZ
z-BpnAYww4U`$9k9=mY;G?)pEx@`K^!yb!qX8KeEv`g`9TL4P0VtCHGZR13W4fXjGj
zrk|BvzYhP9@^7ZTaW328b--;O`G@I3(pTQC4e$`#p*4d365x{Wh;#4z!x8jfkHDXZ
zz+d-2Y-gU&TK)C{;9=!|7`R!-X*afZgWnmv67k)Y!%S9sj}I&VR@#B>_>Th*<KGHg
z_zyaI_%-A+@L?UFJO=PaBKZ6`0{>eCzTZc}%X3r&el~D39-RHoB@y)B0WRZk)RE`E
zfPo^c9i|K^y}M65koW-ieRdp<0T1J|<8@L_+>fKYwfl~Pfrshgvw%xED;;~<j@O6N
z9{^nFt9I7;@&=CQBW_T95PF$Bn>Zd`4?L`#=aAm@Tlf<3N@w5kzr;tlk8b6<v-^$V
z{O3pD?GgC7z@>kW;d^JS9(WCKsqe^pbR4ExuiGN{JVg32>LGUg{|$Ip`_KHC@^|+;
zrvVS6zZrN~dp;O}x7;N3=wIF=_%X}h3*7W0<0xAFdHKf`cl{jh1RhqNuaMsLbND^*
zuyP)7v+{TCW3MIdt{2OI8~wrYL+B#C>+kVV;9>dROFrY|W9P*W$Y+4(GFE>$<`dz3
zRsffFYn?;An&W>9a2Y@2&c6F&5qw4>@O^IKICSnMy#~1S<M<vbZx^v2Pc=9{_>%Pz
z_)jD7|Bb+B-m3Wy(jVp?&bx)c&G`Ag&I`LQ_^zS%6@-5td7IL&J4qYtShm|a#49%|
zpJzz_8{lF6xZCaF_z4mCnGtwT1pZs#VdMOePlo6Fh6wz8;4*Fp_#PH(H@=zp*j2%G
z^V`6sJ%=ght^ApFhvLm&)b_cQ?RG11*Z=b=;F9mydzAm>Y|s8-9arnPjt>%lCveF(
z!}WS+;<piZ_u~%$59^nmKNXIj23+`$U#sPRCCmR7;9>22R|KEOBJf@ARQ|4fJ_5M3
zgU5C7wXD}F(hpNUYb$x)Wx&Jgb*tsi^P1%>|AP_qzly*Q$GRo$;OeD|h!3#+_hUKN
z6R&dQ^TJPumoo)i_{ZL>?R<nCClT}yMBsmkz<>Xl@O*dutmf<5lbeByJlx;WV>^I{
zwe!1xhmD6XN6`N@0^k30TAtxsv>m!AuWlvo(Qan@{fEHA%JWPF{yOOKB8Qvle`xo`
zDd1+_F>ZjJzvls$ere@>WNWXx$I#=x58KD;bqhbQ^aBrS`482^d%MM*eeT`B!^;0C
z>0@tJJ?$xuhk5s~eH{DYdBDT?+)jFTz577~{qH02sxO4|pC5s50dCsu{n~-H{J%2v
z(5tx)+A{Bdua38Iwr3UR(M`aG&tOX1VT|pt*B2Fc{ZdW^F8$a_zqLciXA5wf7a^0}
z7J)wmJZwDw3AmZ3&UxiAUkc~H9JtWCcFr}od7=DY^W|_p?Gg9@aN$4B^G&O-ewes>
z&h-c2(mo@!kJ)j)^N5_&;r_nkXL}-WQ+~&v%1fTUtUA@#(49$l^`zIvtCI_tET21P
zNiw-C8LzM1)RW3&(ml!Ewp3@jJ=xyfm(28ZxA(X8CD-)#0!j7trMvn%y1SCy8`C{K
z9qsAV>dtg>b*gRsrc_URV#VTREy?<o>8{@Ho<vJ+a^k<fx>Qe3YD?4VB^zoJfv-9{
z+gFpWZ&r6kxTiXs+7o%pT32t&T54g#%D7^2i!bYz>SUsQ_3D<oWT8@2H`r2~)1N}S
zyd|BK=0h7v{R55B-np!%Egp-<TF|zso|RiN>HIZ}*IC}F)xF8?{=Oi;6)0ui7OKwv
zo_*Kbo=LW?>+Wbv2RUimhc$t=!0No&)Mh&(+XPyDUrQuYYu0phrIWpV{i}m{OD)<v
z^JWmwW+3gLEHKF8vvia-^`(1Kece5CQ@uG2x}>&Y<?<Et7A2Q7&0R5P#c9c=rOiu{
z9bFxL$z*a)e_!|7o>Y5tPsheoU;5<UbUay;Sh1`Q%-Ys9Ea_O?lj_;x_R(apJie{F
zvt8=ESVlyB)2dWwf4ZSBwRUkwSNq($w#}R4@x;dCJ9>Lk$H$7L%QE#%?<Ljs!TOu3
zbM6<eYH7a?EkKRa0Si{lS-3Q@Y^8PrOOmWBT$ZvKg%v9(_G+>drLDiWuX{r>#gUO*
z)0IqhcBFbcy4EIqaxL$0Ety*p1)OaqK}Z36(tZ6sU6>f@&NamsvK)?u3mTv07IbxQ
z(Pb+1<81aTf-|(XV?(C1sd;f*^8#7gvWrh$GLcBEXo)3PCYP^_B}HU3Bv+)@cJxAG
zEbm#B?&(;wCE=T8+k$Vl(EeFjR}-9-6H>`eh%nhyUwD$#))ivx7k1U#fnjL<YpbJ*
z2|3)A^p~-=h%#;!Hx|%pLB0K}lTFFx(2g)t5}W%Vf!i_}XTmS33(K;%FV)x478Gp$
zs#MET($cqNRx;VqErQtPTH_3h5QaIcQC)3FQ~G}xwa&Sj*Um1$D5Pt0csPa03VVNn
z>=3WZsY`vVKoRUu!)J$-#%Ngax!s%7?TgbrUFpvF><y{)&`$+{8r0P$+GW$JsrDuL
z6yfSvR*q%*lWS6)ouc5X<`S>P=r9_`qEuIVXFAc`(pSGR-3G0pN!AEm%@ZxDrk2?v
zf$(f>7U7fqot7-o(qLp#?fj13OscPK-Qx6?mULgDWnO-@E?#ZRwy`6<DX2}E`p=pp
zdiIL+8Wg84S>Li{LsM5DO5yZYa1E@^Dp|ehzGQEAe@|OFsR~vSQ*c73hniSO@60X&
z$<A)*x<)@w^(5Ovl;Zql$}UpLw(iW9<eDB?o4eYRJ?UO(+QF(riOtJwVs)8dmct>x
zrdyqDc`3oVR3@$3L6A+}K4_U$pp;ovMC*vQ_iw23^Y-mvr03(+OVb;=d$ug-?%9yS
z0La@s)p0wj(i<{;TcBpk-mX32<k+b@w?IzS$-tf_A)AI}7$o&eQd?x#G6Bp_Z<Ni4
z0ah0%Pj+La<=J4Wl3b&82`Sa26v-Antm#_%8j?w)n#ZeGLnUl8ZPc~4sja0i9&cNR
z5suZOqp!EAO(uU^A~CPJsig)Y0ICUQO-&L@O;>Lp_R|^F>RbA%tFhGRem9v(ZRzYz
zwPP#Yn{F}{%ud}|$)z&lARIOxpGo!f>Cb%{MQ|=p4E2=km$V~G7)P^`fz5D3GJjW1
zes*C>L$Hdq*?Hm<S1l01skO4p)U<oe8hk4R0SRW>wt>Wn&Sj7s6CzUvctid<x{dQL
zT=@y3LrCTGR5iZ<$WFM@n1&U1NllnC!ELu+B`RXOy%nrNi&lc}S&V)hrU;sf=BP^{
z+*rU+-E%3pY)Ey6X`i*urrvBH{3=#5phDNU(|1Yfx}MQpmqE}qBo|PLO0F~NPfybV
zpZ??De?b9)9gUX0#ObuF%(9#7S(XtNi+HlW3(E7xbhurzx?!H#m|MMX0hDvI>1~lc
zZ=w!1%g!xLP@GN_a}rb?lyr8w%UC8m)-;_uy907CwIQ9{*t9YUvP5G3%C^qlo|Z+)
z2COJzxLh*7E76;2>uhLGw{>oW`4KxtFiIq9`@~Q;KeN20DVaFA6M`H@&E{kxnTW?A
zmN%rjAaY>pLypNgU0agvslF75S|CH?klXro5}GSiMyQ^cgDaczwS~0cB(z06+QKZt
zs>4Z5@-|BSM5D|rrBUXyQL-v9lwPa)wk$(4iO`7{H-X)HJehxp#3p5k%$6a7tO~S-
zDDE60iBei)0b2t(VEE)6AkF{J1_+uomTxd@dm&Qm3XwV1wD<o^f)vpZ|L?>{5pDAS
zL3R|;4BLBX_yq^n793%9_jmQl;bgiq4NGoQv#VJ&%*$k!b+=;|Rd6xQQ{P1M0<qh~
zleNMNdQ7TmV`9m&*~#SEu70x-tWGZJZd;$sba!^NZAnyP$*4|j<a+mFFDcHKUK%Ae
zi)cGcWZgL4+h%3XuVB8&hQj0*b{md{NiXb6bwI?n-RPZUDYgwux{XFd8dk3EEiaV@
z))Bm|%RKo_U5owhT3FM#@WM2WO|NW^aptwA`~UGqZ#EjHqibXLdO4Hu8v>R?eUNrB
z$5J&NU41?My|8j5GkxZ)C?2B=3l6a5m^?oF<gWI#c*?Z<zDU#?+ix;!{Pi7KBeznm
z(hY6h8(>5>MXgOv@RD*Tk#~C%CKJD!gABUF?B1E|psNiI+XH*|iOqkF+5C&qTXxv7
zWL*-bk6sLT@zQ}SS-L&f_)%S-g56|es;9#kY<kk?^mm}fN%75*_MBxl)iArj=8{Oj
zfODe!2m1`{=k+}>LdmI6TU)xfH(S6Yd}$_X2Kz&)jg)FhEJuz39-+3cKs#)$4I_^n
zw>GZrx(O*bu1jxNnDLiFgU2~Y51a~{nsH3B5nb9Kof)GoF4&DZhbHkCxo#aVC?`=@
zOpN}nwf(R>Vkgsy8IR+mj;`KxPhY{gSH^HoOjuW>sbQJLp-ym$)+XVFmh7~%$aD+l
z*u)Hg5w9khOtqg4Q*3fwI+fX;1?@zEeewh+j3Q5RO53t9xiX$ySsg5PJsd7f(~8?F
z{4!stCJrri%K{N)fo3-4g+e1m#Swb<_HF4x&8!7RGf$NHzssJzY_T!(Pb_MVMZ2vt
z0n0Z2!|{#(Jon{Rh-FXQA}szBVkod~{<ohL<yl8@n6&C;vv}}et(fMjZ-YyDx_U``
zwp>+EcY3S|8(QEP)N2nsorAQxhElHzY@2%x*;%4~tl1OM66}`wH+S)d9@s|VN0Qa5
zdv$MIeujBe)5zR%t*^DSO6`H5|C5}()+S|-HE~ZwPJFYMHKLj}Uel7=m~LOiO@>kL
z{VUj3n2_^#1x>;$S8cN#%wR9SV+$0YmcjD*=NUMY(KdP5Z)L;e;XHqvwhGN}<u`eO
zF3p<vtk>#lYzS5-5>W5Oj3Ijv+<@4OJp+)rxGH0}{i`Q5KI}p4uhmt}=k+Sb>+x#1
z?RBQ$Ln@B5O(*J^yBxj4lVYV$b#$kN&nqNXd<1>Ks`K=a-7asx!z!40R<1&my~sDG
zVvTY})=Rd@qlA4a70N$X{)>iD{)(1y!nKtnDSvskf5T@7Q;67!$;HF<=01UOuME9<
zm}RionO~Eq*42pVwnHxFrh3-)s;6iBmaf!>?yL?Oub)R6Iklc6WFnbb3Lk~%?GWkg
zt*BQXNgdy&m5R<dWcXr1QOAL#aVX4Q7lAim;pU6Z39bVZi>3I*(<M*LaIkH|&^#vz
zU%gl|US}N~=XJxwVY6KLh25)(>$83^gw<A&ZNkBexoQIEM58K~Z0QculKy`-M0Nr@
z^LMLSBk!zw>5Ys@lVvL4G4>;;hWKEHOw#RKf~Y}JLNW44hV9V`Tb+GW6V$|`#AOl}
z>T#oen@UYcLx@cyA9Hpfa!?pDbY9#A-j>b0RZ^&omM&p-<S$yhUYFuc9ewNI!VeRU
ztciJLhPkZhYDKeJHl;EF6ObAS9Z~<1D0kFADWULrWWtNL#b2O8v$=zpbo@8NS2npj
z>JoY)|L~{ozO6>7VZSw{&(kGaUG##yx5|)_zumY0_ES$lvVtw-7R}0*|H2Ae=kBen
zT%NJ=U)Yr|!rEs0v1wbFK=V)5@@!WtTV66H-6n0CXAr%Z8g$Z&X*LT>;??VY_mItI
zA;j)C`^6A@Agk_$6tHHy1#8GxirMAg7<W$SYidJS1+Vj$e3w64a?JNwa#5;xU7z@w
zHZ^aA$xUo<xT%TbEJWpLPxqxdI%gy3pM*H<+XB1YEI6aWEv~QIkhS#HuEX#76U4p1
z1>bIjBT023>VSNsal)#T)v@IK^qN$EXP<uAZoWk}<|~Xm+(qt5%P;KhA^8dprrF}z
zSQ1NC#}Z3c$Ibt6*i+XC<0goGkOZ}o=o`s7orttzt~{Ya#Do<bcX160bi#%YDv?Qb
zi2pHuTAP$#Wl#8&RB{vY)d?KctypY4ZQ~8_UQWX$w^O}05$q6=98oZT0Iio>*tGFB
zYT@ia4iN;ib3^So@p1C}y0*kXLU~xlW0MtfByT49LipGpE?i|IojMXGUft$IbTYpA
z+1N~BF`Zzt_Z@E=&)Mn#o8)RtzxAC6EC;L83zKW;0mWogmCYI4{`!McAR3m&N^47-
zP&OuF1wk9)p_gGSq)(*1C&E0mn~*?yr!%oJQD-i9npP~1E-yR(Q1G*>uE#YN3TT#e
zVY*9$dL>ec+BKc2wJ5`q_$=JUFj9Q2{km7|ipBE^Z9O+jZJb~WlV(wO_j=oSfo$Wm
z;Vh383rj%<LR7xR1ecH#AQ28cU_wlacuha<ZnURy@dBYL;7DA$$9`gUcG;Vo3@(ZC
z84D;El*KJ`lW-$R*j5jgR|0JyM&*mOw4trv;xBHVUdo{hZ-PLq<s}Wc0+CK_P&=dv
z1e=`|HSGNsLs74Uxc+0q#`?!!&?ja|bzGv)EbQ!Fo$Az|w;(JbZ_eiVUA&qzQO3TO
zDIaybtA7JVpM>)&Ox%#^>FDl(LZ#C@-Ba)<)zwmqKr8ev??UJ>a{)L{!DgG-J5mSe
z^vS+X+#-@Y7Ve~V8ZBPe*5A_urB9+|W&0#gHqqq^@1q9D8|{q+h+)^<-QAh!KwwD|
z1vq@<nR4VCa#@0WZ04IDGTNbpNTw7HOw1+Kv9)HjNQ@c@367X{Sk2IUp#@tg%^Auk
zUb7ZTuZ#!@Z5@kZu&BGYFIb?qo<0P1UbE)-HcteV#%E3TZAi9tc6X&Aa1n+)$uhR1
z7QNz~=7}qW-2ZQvU}#|!a+j?GjuME4m_*n1Z1L9gNDQF%{tX+p;3vLM@Op!ro1Zk9
zMC{6?I7eDGKWW^v0~545?ibI9u9)sw17ElJ?1^Q?x1M<OPdUCPz2<lX4%oa|f@1ed
zd~=x&8zs22T@&kd`+@FA_bp9rS)E>hz?IUJ&3!!*P%jvy%xLJfaDDE?su_=UbYb~Q
z%3Vx5)J;^}nz%purG!uH@A8GhG!ZKiSVwcNiJyY#?{>*Xt-4nr&;qt927>f%LV#Ta
zg4Fos&RQd3p73Su+Wx-YSyB$shO&Ra0Z=?0@c7Cd8WhunshbFuNp(!_M!In_qr+}4
z=kcFR;CL;XldU)pK_sOiv$5agj$*P_zOAlp!QKtysk>`=PYYuC;RdEQy#}!|a!1YZ
zY^`|xN(mFrDB(>QKsNe=$#xd9^>v`ldLqyqa?Zy0L%S4H5Zr*A5P;jJe_Q(M`uZ`i
znojg95-s&f{m)luxkqMUaXpa_bL2lcpYwTVBe0ZsXU~;T`U#8zsc7UNF${!FA}K#Y
zIHm11W>#^`_|6ukAN`8I__e~Z9y-j%bYeqbo>iNWr*sihzah*QJJ&GgN}6OrEzaD0
z^G$wVABg|RL{sAnH*O_k%tkjjIuWgnAYSsI)e|aGLZ_zicPF8Y0>c2!p?YztVhk39
zf?})$0fhi<xg*yZl9)$$;RGD3+F{~A{Kq^4(vEOc5<gM>HeubrWq_DzSVH+$1WH#Y
z(;+L4q$m>tLLyN@Xd)zO-V)ZyH0%T=C9IV~jxENl`SEJ3TRox)EW=8efH$_N?-<v`
z*uOxQ-3XYU)I{08YNG95=Bse~7ubc`ztr|+Oic)>8pqvp<4lfA!%fKz=-F8!aq8ut
zY6+<pX{VA2>zm!qnQCpKeJxr9%^?8<*~4&~ls?Yt<+Hf#&xKEWx~qLD?1fNs_!@^S
zRXwSUs?-E6MRaQ@mJ*->ib$e)V=5p#N*O)w;0}x!e3)2L3QC6h{NoBEAFk{YD>2&~
zrZGw#@c!|RKEeeatO$ja1vBkT&WxyJ)>L;XFI2@c7u6E_!?d`$f0m;T)#Ptpy?;Gl
z?_$7%pd49})3ni}p_mpIhdVqvPyB`h*yOc<uKcj(a3dOgrgHhLf+ZUg%jnN$?8NI=
zb@cSXsy;D4YSWt$KN&`J-6KiA)$`ACXG%`nls4>mN5%}ht&iJbD-q9SWu-=S%t{Sw
zX%fj{T7uen?L_#HKDd9II4o+M>gw;r5e3v$6<!nZLXdAUi?Tb9{BJ1oE9@IJ7^<y}
zEqXjh;Bi=&k#{y`xNo13yT8ddm}C#;8%DY##h{kmpp-nYk;^T*hI=*Ye!g5nMLI6|
zP0BPbX{eg{YQ|~kmcwTbEN+t_v0DPWokI5uU{UYx?8N&Ik`k1^X@g`Fh`x(a-wR#1
zw_`2h+L#c$$boTnq21WwWQ4;cE7Fb>FuGC#%?kk&trJ6)l4!Vi<OL3;D#Yg3fjX74
zp7^@UESAv{T~=qG@^7@iB9>MB3je`yi^r;p1*Swizj&sSRH|N(qo?Pos;{1)%H8A?
z=up2Z!{x269S6G%vp>5Vmc^v88D>{4z2LbrUu<hEa<c8M5zQ0!8JU9q-J)6GF%HUH
z+h54$!62+Njs+oae=yfiYV?3XUtQ4aY(jC5CFI2ug~nJM-O;m!f*X~8!8HJ0@ipa+
zH>h<e(V5-=4PGomO-p3F%3t!%wnV+G)U%er<|PpBzt-b$e?d<=Z7=HZ0)BRVn0(7z
zP;@}lHQ+R+(C!i2cOjF<x8jCw|4pZCrDYBQ!DRv#XY%xmHyIE1bs}afXl({&y%up>
zz&JOrT6)O7)UVcb8UnnUc=32YM<6e=n#O&v<uN<)vnQe8@T#x{cWt)PeNNo>xW2!|
z=*d=AOWCaAmDL#oHuZR1or2m@ZR^q#_QHC-lM%0x`+=S5&5LmOknSPdvO2__=Dq>s
zXo_3`XU*?G$S>qh{m`t=!8(`fihw>TM$44R>3b`ZZ<`!l=u#K72-7JhqWRA(7Wm5;
zYWT17N>txh=9Q@W*|+$K1+~Ch=i?!nz&7VYd|U$?`!d(4?O|&xa=lhFfnOEL-DV1_
zGF9{!ay6c?Sz&~kGb-`YYHFU<-J{Fpk_|P8J-uWDEUBWu$SDCVX-jcDtJ_0)qc4`I
zti?JWwowQOKnPpzo3Jim?i4&C$vv4UQ|Xn`7t&%Sv#qm9y(Fh<AS_6I5f_~5SEu1~
z0XHy_e)g>^P9PjJy$Y`>)ioM|6k#^|H}rSHp`aZ{<*7cORui$lSOZt!)tp9Io)E}l
z=|<pCpP-Yl>&{EXYUK<R?}@Qm07i)dcBBdbO{T~dA&{F-b@Ho0IfjQ~mYM+L+|5d0
zy>WDaQrWLYVSUFZSyd>n;bJ=|+IZ77Dju*R6`IracnLH#Q8Tf9wl_5xHZ*lfgiP&d
zwHlgjNRA;<Ub!an%1SSch2Y7C#>488*BBv^aaVGqaRpAA7s%?x`s7kL?b2-k=iM8Z
zg9COn^@xrM*Cpd2VqPxRBprF88OZT?3xeZzuClLytj($yOU|xCjr>N4M8fwjH{AJw
zgKZ)SKdpw1${q*hP|1xY_C)*?=Na|J?-qyqy|QD)PPJ*niAy#ZZ(exsCO3BUN`xK_
zK8yoAywXT&o!w`YAcqS`MQK;QOA<<mDmD1m>NUo&fUU4^7R$GCj5|lqe$DGHqs=aA
znZUpE&DpOB_M5PpVmbBkw>!pPvFnQO#GzCb(4S*r^G(45sog@&@;PUY+hqe;-2PE!
ztzT|#>+Pc%-Pui0xG&CDn4s+oQv<@)5!Hpv$Wd14a?L^8aAajwfKa{A%z8cHDoz!Y
z<I`L(z1oH1a4XJX^W^O<iDqMq&(7+}HTyb^Rfn}DcR?y}tll6C5`C7+F$44Wx&mRj
zv2InxzBSK?5Bvj7-wb!ZY0(s`S-dvPn*^d>LAOwj!^2-em$jA@GVUW-ys_=e@uu{a
zErxkfZnpbMSY-TtgH8!5fxfjPx5MDCA%iii|M7EH{gXgB;_o6<u4$r3Q9D-_A6Y~f
z1~icZwoAGG#8OBB`6qgw+v;%5I6LjkBD0MrfT&hjjZwVl_1BmJjVPCKU+S@k`L&Gp
z5Ife3;u@Xwhq`lEYm``P%^~Z=SEVlaFGiKg46MMAjdEA5!%^w7^d`TnYPq&qI|~DA
zW^hAd7wp_K*LZ#JmaevSJ>6aMR&V;qOw1qM1mr6FGTQQOb?VHm9Jt%K#|dKXRC967
z*&Th=HCUT`9zEimyh*%Q#j$p_R!ofgSzPiJ(HG`g8r=01WjnD8=E{}@LFce|QWYKF
zn~JL)fe8iM4BiEffvQ@6Iw4(`VS{b8ciC3>y#{M?#bvgNVWuKTk*v$wR_Fu~*6Zq?
zbQ^B0B-0!R@W&FVI^hmdhOIidzxQpK9j&f(*6n`1cf#sjIwSwO7|yd)>i}9UI2r?U
z)2X=`X{Aa(f0>cn#Ap+PPGE>RCE+T)UDtA2h}A_SS{7xO<q%(;c-$u@jGdtr*Bm)7
z4!Guo3lV?A!QNk#2(dmJWekyQ9|R0jS;?JM@wTyo5*R*azHixp8wjXBtOv!YE;#4_
zp*-T;P=@wm7-Z#M0art-<RLp0ws`Zn<8&IIWnE{w5bZIUh6`$M6JAi94Flt!Vb-=U
zRjd{*H<p}mT9qw<iL7jEU`KehxctifOkK!i5o;3%;90O($zSwjcf5t&_dDs0adCEt
z+Bt*-*uo-#E9iLRlkwLzXA?Me@*Cu^eZd>NVK<9owZ*`nxNMIRopm<G-^-wsM_jf~
zL^~Usufm;;!7kL<ST5V^g?lu!T(;L+`UyB1oAZ-=`bNB3^{HI%v*2d8gxyj=AuwA)
zv8ffI$%|bMBO?EqLABm$ZWeFm9$e$hM)dZd7vzQ-HM}t*+SZHzOGD7!ILUHCR{n7R
z+TyMom9ur9B{kwfVRjmYTbU+Sivj{8xDuBrHH;88yA@2HQr~1IKJ&&P!`)~xHJTU-
z#UD9Zm-(h`p^N67b`8GWLxclSxyQZ%`5U~fQE(RGte<X#RwkFKUu}=`b-!PUZnSFD
zQ3NqhB+9f{QzjI<z{>2?7);DX#z;VI%TRxFl~tdwOh6s619*1Uq?l+f|3XEc9-f@*
zIA-$kD(&(G3tI33!<BR9E=d?6Q=Z>{0jq;Ok?}cxWT)}!B^zp?Klr}#`GojX_DT0N
z$tNl4)u3f7PBvPNw~!q^?u+!0yDp2IFRu&CL<nD)in{n=nj>JO$u*1UZDO&Rt3f?L
z9%0j|mUXCEZUf_X(uVGyEm?(6dE{E><VGXRE(oh+oGuwU!t71_^S&<9d{CETKJb$B
zPg^!;Y12FreAX|okfj+fa>}IJOe~9Z67T6-y`>Mz@VZ0-=9`r=S;`VBS(}GXSmhk+
zo61_P3|Fbwq<gHnz6CyX8&Yy<pO>I<CLA&S%&UpyoKdtQd@vSK{%aj~IQ@KIndzsW
zgVPVQ6!EJ1yE<^{n|)twZLnNIQfu9O^sT&2=F+S=fXUVA-R(#4rAVT2o5)a7e4OFQ
zoz-Bk&DPO@w+f(ia5chCt2lLDPG9Y1A^g1zX7&hmetxt%Z@vmw=fSQVb-v{Na-<>2
zu298r46LXyl>~;SWrTJOMk`y)nH1^C<L|w&&Wq>WqKOr<v@KdrpS}=Vv^rd8?M<8g
zA!4qXSZ3y>jJ}9w^GxtF)5u2ZmZjVVtVgBlaef`c?R>wv2j2zv2`_-1EW1uyH7j>r
z%5g=Q<UOBnk`=St)vc~L#+GdMF_(|reNp&DJS;%P%x$MTeanx}>TKtDxw?n1fnEy3
z2?ra286ex)3DqLUR$Q*<9~LcaR`6@*_DRl&jxveSTH+MfoWbX)LxIy2`04o8ZW*(Z
zXL0(9+kpWiy&WidkGbOh{jXa25@tV?chfwpA17{U7=GGYB2KTv&C@7vc2Gm?7Pfub
z<1NF^NX|+3Tnbpp)IomYfJ#3{AA6(o)bG&04sUZ0CaV>jh@-ioWMHyE+ivY`71QKa
zjz=7~)vIG~CY@UEdi2H{QoXRj+j|o&I2M&sO-$XuSimKkPK^38EiHZlAv!evm%$Yo
zC<Y|*NQS`}4N;TZN4ho3cWGtQF;B}~Hi)Dc<fL+PsGKkEknC|MTGaMqJR%5%b#4f~
z<|BNfc4D!tQr}yTmyNIQ&&XELx5kLCPjSM`F9v0BSYMq`4cC8J!*0Hch(NVHjAGJU
zWz^W$W)LPJ3x0@FZyX8O1;!a=LXD3+Y;m_?IQS}V8<umbGB0WyHZOV`X1)sFhJjr<
z+pwUVia4;M)>)1c#zoOD;3tmiojGcqKdsSRrAx^oBl_H;c=M19r{L|lns{43qG4}9
z0KC3##A0qwuIa)FRbPKkS1<g-JJ*O@vC%&<hKp;gfcY&jZXG89N7Ux-3=-K;ch;^}
zm+0x~?veRaR#|*DV7Dxj6l(zu7O|koJ$b)cCr7M>mL==4%?tKM?7r|v=mn1|V1L@$
z!*leoSu>mgyA9&GOHSLUP2IJ{*yXSR@DBrM_lN{GtC-+nSu@{U2_80&_pHTlw+#OT
zWo9`^BUl|WXrs%00jTxiHAgvCa>Rq@y2i{pVHrViEtvO2FD`oK`aQ}fwg{ms|3vpq
z`c}ZNx1i(9kzK_6E@4*qYO&3B1}6oyy;2vP;1(!&QaLH-STYdG)YzOpF`9iupU822
zu`<?f09>WYj4n4))k`nZUG_ZP)W>T7;k&rtI#Yr&?G77ra#3(%;AERu^>|k*crs-T
zXp6dik$&Uxp428>NtTekJ?f88o@y0w`eN?b8`F!K_Wl#j7t`Tqm_Mz5c@h21s-VT4
zgvw4hd*7y3_nQ8V2yR5m*OOptG=pRN=u7#Bd(d|+9`Ej%C#I|gczKJ2@`xqR#{aTH
zPAq8d;mL&OoY<$ZYY|T`;}VCvl!)-!*Cf|3H!hovd!k+a$+pd#L4&uc^x&rF#Mwf#
zgq)NP$hzL;<meuu#p{-ItiknZb(WBmG+Ak7s~=j374XCpWyjvTDXLJoOD#4?Td1wo
zyn|Lq+cmk`e-Zv~Ax^ct9aFbb!Va7LODRsZ=Gclhtchtr2^dEdd7R}Yd(9zkZG9WM
zxKE>5cXW469PO^`Np<w~F2J?+J_O%uU!fZ+QJ0JXDqgn+#?n6g#J-d#>Im2>e7;#M
zg2=+=WEdZC&1kM0CK&=%uFB3cS7l{s1CO|S+-v4}P}oN7mS?vi)^)Nyg#e|^8=Dtz
z%m#ljipAs}*aR056Q3k&e$CZAWQ8vf!^D_mJ7xc+cVCSUaH%S@pNYtV2yHp!P_!<E
z35xe41?Szy70YUR*Qeq&2o%<5UguQy3pJo!+rx0|1t9(>{9ffV<N*)j^){M}2B|@=
zJs@8AwLkJhsYvMMPJ~O4L0-rZDW2=uB_{XQi+{B#il24fDMm1%d=a{0z*3QwQBqec
zqQaFW|M6wDtj9@-J=&GVyCOPMz2+4V9f;SPtZ8pDXH(WIR`*0AZ~|pA@q(q5&Acol
zQdlhxtnuD42p;JNUqlUCjrfc8)hXFp8=;aLaAzWL;CyrSR~pWG&f9Uh4dju3zY*xF
zoTEnfxE;HCDWWx7%P!wnWO195E2fI(95d=>Gy2E^f**Gb=9&AY3nWg1%<BY#SeIiB
z2y!ae5rw?qqD6$%KdX@9LqTB?IPc357Ot*qo4PnbWOdz@#tX|%On-li@GLekWAIoS
z+ASPNM11(=O^vVi;$a}K8tLukC2nc<T%lY$3kpZ76A^OSyZe%vp6>R3#714y-wVWO
z7)F!m-k9#`ft-<75e8lFpitsJ6vuCMLaT~;P$%kvojUm&Hz1XZ63e19nyo^4rq!41
z#w@oB;fsi6p|`ug2Z2F({W9NsC($F5aPC=Pe^s1e#0Zth5mMrUU&a}jPh12RXvchC
z9%n8i!NWC&b1_PVe`?7MPTx_T!%dYy-H}X$FVrn{>2;8RP0>dhrIy?;_kxVG+EA_&
zp<k)BYUa!Lx^(@^cZUZvrzjz6b=#n0dX!f}h*gaH4h}hE&yhgzc1o9Q7!Dle>(0x+
z!1>oV--3gcK3Bmhylumlje#X;b#k0D3f!W~?%U-(Y{^Y{8x>wx(sZI2bzsvl|1p5-
z+&VGI#KqMbF!p-pKWQYp&x`P^@oWE9!H)2yx1`P_(y{W=f0Ih}R=r6@<cJxI+x`e=
zE2H=LMD8{_yz#$rY}>R-&?N^ZCCt!r=W&5RY`Rf_B|@I<(RwS9m8ekVUBxM&cltIk
zA|%{%0%K%ylx=gwyH#Q9Cd9Q7;HxDrp-;7@oHOylZ3G%4R<aT)^bEyb=grQ&5>9&j
z%7Rfe(HPjv7I3pOQdO2d`$nWvb(H)XkEwI1D_TxXqGv)e^<#KF?H?^r$RP@&!Ar}B
zTGIR)rL3KS;FrZ|4t8@}Y7_h0qp&WUGs3dbe`Fxq+n{z4E!MTb>|o<bpJUp@e6~48
zIWFpYTIgV`RQ*eobFGDm@za3IqQ2lAffx2#U1DRRx;Bc&s$U@o7;(f8;=7~GTo!>-
z$`M9RT#z#5pnn|5p;V!{E#nQ3)GTnMkbAIG9oLvV^KqPjmys^-^53#<AW9<!zXI3N
zw@1=)6I(prMk%|WnH0%4De}S%QaiavDiLd%)pN~35<Zy4i~l-T41HgjD~9^nr{T*D
z-vWw&kB4M}V=Eux<4RSboYzPC7#7Gv#%YM9rWWV6z3Wn$G+tZWk?IYa(u`>FYe|7y
ztR~LGRxd$;(IggT^oy^V@pW)9K%)nZ%%O1%3(j603fe0sW}(bwyJIoSRPMXR5U6;|
z2Du|QIdUqy)hcO~hz><=VoDYeMFox&$`qY}6HL{PpaT|R&=->_o_woAoc6;>YV(jO
z5KFD@<*R2+toQgV_2_THDW-(_nJYIWQN)6B9_Q#1dM(lHgv?7}5YIMo`EW*#tG}~M
zY_?>FyjFeN+dUx{ikS^Yk_37q+M1wN12NT@Q)R0FWH$%4KewT3D$>p;CUB@!vUf3O
zd#U=O6SB@Vcm^k64WmctvR)6fOggnew=%k0v)Wyhg)2KZwx%tMS7*3IDqZ)8h^1dT
zBN6z{4a9;5<`AtiXNxFT8*9pY2jpZ(MHPm5{5mjz3kggU5y$tc;^5erbVHIW)lK#V
zoloGe!>~Qu!}MVdM~;mVUSGO-dHJ@#u|Sy_6c&)JOqb-q&~S!!al@D^O>*T@iD@@K
zImr=9vOAvK^}ep5)br)!YQ$>Wl)}m$i17lcI`N;geeiu{6n*{dTd;h`(1o_{+aZc&
zP{j_&+iXjUQk-3_tbHx8bvKL0{8NtaNv}Cx%#oOsH9d${-?1T`#A$tZPq})*<P5if
zc3-H`{nh}CRjf({JvZ$6fBxI%@pNY@17C)fXchPt!u=^2d~4f^amZ#UZ=#$RK8zBT
z3eR>F<HczU=nMaz)OP|GJ#dZol<Q$-Ey9oz@tD5$ud`#l<4~dow!H7`tjW<?>`>3|
zWa2yRFK&!kBWKuFKJdCjpTRXdX~Rzst<5s1xYpa(Y4=0!$=<8F2Cjap#!5J+g`M52
zQ=QG--TIDgnw`uRG{=WKy9fup#>;+pqT_Oql~ASzbm8~4?CeZ`?>ZyH6U(6lO3x@|
z2db4jCgL&kD%FTM>ee(_#7%3z2r1{tK-9afpAjKuz~Y!X*FJn=AUptAs9irWS7F^L
zZj@UmfHJj{oQ4mS0v{$k@+M&$nbpGGEr*?0lVWs*ye{du$+fS><9*%YAKR{nGv&lV
zHH)D6`AVz|%dB@N!&a&lCs*CV>&-L$$u+neB3hJaP{z7e3;ANKD~nQH?Qr31_DKz_
ziT1vR?;2&4xh!Vl@3!MANN>DOqUt7lQyay5P@MNAyGimoH|5Tj%Q~P-bZx%_x_tII
zpc_9^8TLibWxrL!*;KoxhH?g$t4bNcRE)~b8*A>knq0fRf93`ztr&)JuI^Ky@pVvZ
zwd2vc#M;mHL_|L06s*uD8@U8MjivVz@<y9dZ&2iL!Wbx1qPe|>%1^#2p&Ub)&-2eN
z!)01D-71ziFlB<3S^-%J>}Uy99m`;tYrEQt1vqdgJUH3|<#h!^UiBv8Wib;6IoobZ
zo2^G+UP~8b-Q&f--#5d>a9zxF#jmnrHc@7D0LufTXrduX(DmHW>z|+n>!5};NEy2w
zDMzsTMW>lNdD^+e69HQ+bPnA2jbkA0b@Gjya<6mkOZiox5Mj7H8u=>FGR2u23d5W?
zqF?;u8=E=AwZQphdt!A$Zu&)Ee$nGf{5wZNVa#PHj1rWJ=pZ#&xn16Ua(CpHA-Bzp
zv%A#U6Y=!KFcR=WDdSC3TyQP-?3K2F;i^3C%rCnF;@GyRnUcYPE|@KPlcl#kgL`w`
z5}Z23sOSt7b9)7fiwqQzP(EE*L`7C9<>ei}p{vwmBey&Kf-LtkY8i(XR+vP%NQK-d
z<$Jj0)IV!U2noM3(I_kZ(O1gUxN^j)oN5$u2{N-(#7TBf7`iF;3jrBKRtG6=b8JmO
z#qQ!KlnBR}FyAnpeNI1K)+}+&u}w?jT6Zs^Pz44myo$|6&hM#L?&|4pZ0ThiGEpy(
zPE-Kj1+}2!oIyzCs%FJ>r_VvezSNSIwTa<o+QFTJ;n4^do6B`Y#6>{76)}R?5Cnxh
z%yMm3rK+yZskR*(PB2$f)nNDF`hagu+qxIe7p!K4@jPI<jJS;uk9YKHV1KyaC2*x5
zgOvxnc5%q{+>?1ci2|e)3r<#q7zsY<rvfiY4>_L=a+_U65Zji>v3z@|$T!xOE|VQw
zb}2i#;w|zbsI!B7_L}Y<Xz>WrVqO-JlIyf0+QoFIj!y}^K`ma>k7zCJ=^j|pHl#9G
zd`o|G(&}uO2E6D1Qeq-ZgM6N_k}ak}Tki7V>MSLype6VLxr<Y%+E@A@kaaD`=f!6?
za7INyC!Pe)>{8bX6xZix>@xv5<<x~yH;nP>oI7cvPLw{V=yQe<D>v8XmhCqXR3t(>
zGOr9M&#8N5(+eKUz03`5Kt9MDPQtF06}%3Em%^(d*{rO=mx)I2%_ghJB_5&#m*DMh
z4dR_AbrdW;GRnj<mX-bW?x}7(-flvR%<G1q#zc@BTi3!b^Kpg+revKlPKZNR886=v
zlV3Je&Q|SR%_3kt9SXm<UEPp#w#}2Sbh$<$b3)+k^--3si;ucJl%y6_lFFQyf%Urd
ze#%ZuJFK&UV^Oy+VxqkM%MHn2>WFeicY)BG{*4e$n_#>yC=ccQ+3YdJXkbkvxreCL
z4IzQ6&4rDQuC;P5Yc)4fp)mCtlJ-Sc47JkJgqUkl0beBKjOhma6T|Ha*QR6~fttm#
zE^l=0=-SwATx44MW+&%#X_PcbR@_{-3fxwRkC0Ub?O2{Lk(kU`VN!gV0>e|gFwbF|
zS<cEVAk^TA0-CY8jn7`N_+k%QSRe|SJTi*QI`%YX$_Nu*)N6D7(?=BTY^dd$rmjqX
z-%=brm@|E|bui7IeTnXDGxT!jP2BGj6`$STg>XWBcwcg|V@<Lvoo-Llr8CYq2-vG0
zWqN$)h>MWXTv#RqbfjfCG6=lL%*@W0et2Gl-dmbn)6vzQT%E>-G>zDx?Rb|qBv3p#
z@up}i;d3oBb*u6E?e%b=RHNj?waIceGu8kb*ad`bX0~SikRcr~Y(iuPEN5leHj7zf
zkde=hDf61HNnF2eD~GS6MX8a+z78q(Qe8E4%)Xxf-j0ptHI2zWyfio6XV$c>x&>_J
z$=39jSrNBx0$UHTe6}QGY1LH#!y)8+wc3Ba+O%;w7Wh~)UY}f*f&r&XRGjXf)Y^2D
z=zS-ubcnYg%x>o@2)b^>;lw`Kp0O`EyAHkY7gM;6W@7guU{_*e;^fZduzw{_gaN35
z=etcO%Gz#J04)D4$&Gy`<QkTG8b-OWW}`cq@scZw8|0Q<cYj~-6k!E?y@XYHp)33q
zc`;2dX@at>K-r_9;v~_eZ&3UVit(Q>{69{ji_s*EV}*#E?czsnUjk6xa0)7Z!IuCO
ztZUYZBfCAxas76OXmRI#)){s}pym23bL$H0Qm_tjjmYR%U#p!}2>VT|wr{nx3{(&w
ztk^;C7gl?WyAPHop0DLfkm`8SL}gwB_wyvYM&;%@PAG!{1^J*l<tPF3HO6W%1jxiW
zm`c`$7HqiSuG*R2knZXehuWqkQfv8ZF|?UPy{ud&FpWT{xHN8VUE(!*+W;3r5yY}R
zVYmm?16&yjs0Y=qegJ=fW#TGpwHHiR1#C?OSY3YaO8;%eE4oW^?+=U|ID7hx7|yGv
zvdT(zTdHjxdLy|GyM9D3fIcX<wM~8VEx1MA(nh~gJqlgm%pREvN*PwNKz;FHVy5U)
zTjqvRg2!cdykYXk_GodU$Ht;{PJxY?z!|ZAEkj5BWSyICoeCzyp{%7i_dv#X(2*4j
zYQbg<{&>y2Zd@rzZ(h{hiIcOasUNepoYlRq=|)CX?gbVGztRPueE)s2DOEN`d5!pH
zKsFGUwP2Zg$n}~RSVQqkP_w=iVf(PIpcusgIke4AoM}we?v+NfrmF35$H?wbvK_WH
zd2iiJwK`0B(Ubj}Ye2Zl&4Cwir=4AXLp{N8wHJjgc(ar*5a{UY=o589I?FDDxt5Ww
z=~DNA0u5&_jQSK2C|FzPwd$7<*;<6Wl+{mXN9AO%v;p~`2a{#4yotcdn``FCQ6jb`
z9)%MX)?UV5<Ujr}0}oz4?eMxt>wai<cNgMu<NX`@(i|C8)iv1tccnKa@nR{_XnPT2
zU9U>MP;Wdb=4LXFkjt@7V?;NUvqQ*#6>DWnbDr}RX=vz7HFEVj<aou@#+}#IzE=*|
zLt7gr&%B;|64a{~ot%s~Fv#Z<Y_yQ<6~q_*B{RNLHYwWdITBT^scTKQIkk|HU^lqg
zB{yU{?~A5s1&7i5pd9#r9=fLZWg2?e)nkiWajcW^gi<o;nq1IQu`OPE<l97UGv%!r
zJ!<D}M0W(PUKH2-xi+84Js2$D{fS{?H>8l3qCT1bj1<FPMni@y^J@k0j+Yl~%r)=d
zf?yW4Ci3m`s$%2c1a7meZdrgf#8b{4%N$PVi&8f@A(JGp`8C9v6HDR!(KO`*yyVUN
znXv0D!~VmztT|l`v>_IP>`<C5R<5S5K6_%+(%HSqn3gOczVVjY81CW?O<kRM4U$@a
z7Mi#XdQ`Bgqo=Pw)u|9fNFsr^b4wr7P*ilA1x9k9jqHXP&Ieg911ADgF!)2Le>|*N
z7zd@lHPEjRhaX~Dr8r`Rsh5p=2i`gJf9+jMY$RD$HGdETGk}l~AZVcx0wgR)X8I?A
z)b(+9d+KME-8~})6rD~-*;$pDk;+WF$|I154I4JFKp=LA4X}a52)469njN#8<scR;
zK|-*kIp^MYU%dAs@@Ko7ZbYf7A~WOnKKI?vxu3`Cj?5&l_WVr+%-toFBTdD!$hmD<
z7$3{;ReWr;joA&T+vYh&)e`^a9q1Ea35z0RR>y#2!Zzi91#>u(DbgG)xTVlNSe%Oh
zb8>g3e1hGJ?(}HVA9g3#53mDS@n$A^S!|u{o(ZZ9;2QCDIgNY4=WZA{C4Bi7kgOk9
z8`9<WRB92z&9R+st^{d?khLAnV+32fS+q`ML9O1vg@!*`8we>f`Y>IIrUnr606}-(
z9bc655(sm{E^&`Cz_IzePv6fE-pk*4e(%Xa{@%mq#<yz-Em$+!vnn2NGhlNyZV<;Q
zV<|1!+>4~O8qSIiRss<nj^s$flw>j0Mk#s%=g()5uQq&Mxn1NYVECY;1COY$>mZw>
zX2}{7mV<G{7nDr&f0H_L{`b9H>d5K$fTmf7dPym-;2C@mRHopU;HlBhGoc_4MIglz
zt(a!{%`+>=?=-tjUZBntA4U+`Id?qn0m)FYO{YUbGBGp3D08x?soqodcGbGy4@JEt
zV@u3FG3KzVh59=0K=ejS0*fkaJkBL-Z>192(EQ09Yxd(xo?I9Ls<pVRNwS9<XfAQP
z1AA_b5>#vQRAo6$<pFEsxDv1SKD2c?DoHaT!PS{8I8S#n>0Te6J|1>f;IRvV*rr-?
zbM{R(J-f+9s=W6#m;It}(FXL#b{K{v@j;?3iHE{&kOH=V3n7NdOp!Pt{AR%+p`M|d
z<&6@;)w?&a4Q+(lguUH4z7jT={0sj;!SjuBRe?hr;bQ|$9)lyA4%F;BHSQm2;&op~
zwmZEZ^)4pm2!S9j1~d%_l_7TeWMdZEFezh>*_AYwl{K1hnbw%ZLZ(H%GJmf-y^x?f
zCCtx$Y_BnaX!4rv@}_;(S5qTv94^S0Y(gWbV>F?$0o3~OuR~Brr27(G3wV5uc%Z4p
z28#Rc6jQgO^3tU(c}7(HZ1-d;B%jDi|3Fu|vv+Tp!&Vn2?LA+gLTgh5Wh(OS3&fQd
zz~e4YV?o1U)3bqRJS#`<fTOSpE;(BHXf=(Pw8{WUXEVv_#i-xVlq5z<J7m?xQ5DTd
zDmye;SnKN7$$*Ical-*wX1qWNf<Ep{6D>ZNMy=0LMZAPrw|6q3w?J7U*|k2&??{@w
z+lfb@0N)eS(wcc`w*9;qmy_AO(b+?|b7w3s4T@Ps?N0g9LZL-Y`oF*`Z_g(kH?3PV
zOEe>}&wzK@&hx<-eQb)-SCoCOGKC4}@hmB5X5DC-1Bb^MS|pBnYPQe7lCaww!kSB9
zl7PW}a^||MxkNp~t`dS&IJoKmw|{27mc)syPe*A^Ura~~D=#$UtXHO<&X5ZTFtCa#
zE~qw)ZJeH>aN+srGUx8cC|rmVC&-k^`);SF!y{l@Q2rsMB}gPR9ooyW@=!}iIcl;z
zV_*Yh9jkebIJCQ}#=dT>Q+&zk6w*zTElt}@nnJ&1nB8R|7pTWJaA6^yICxa3TRC)5
zOfdY1VUZOV%%?@mB$YI9@e+g@!{-I*x+=k^kwe*})V1Sm57}J#w7i@k+)K6ATy{1y
z6&8afX;&+h)RuLLojeRmImMb1mk1IsPbBL$vhwOC`s(5KGSv-RIc2_9_@iPnDs;nv
zJID=mk<`A{I58P4+oj$Y@&_{_)+MBnMPL>dxl0+Zyc*WZq8U|Ar&}@!oG#1mX2@6M
zRE6~;?Otkg0#$cmdN;CK!hs4c)w&J?Asy7J0_8WuDCTO1ivqm3Dn-Tbj_a7>E()$t
z2XMMuLG;nk$+)e8#Cd;oOlEf*0u(h3yb-M<NVvq81anMNK@Uit>y`$PyYFJ@)TC>S
zmE&|GU{~^Ztg_tH9yRZVth=Bjz6sHJyZOyA(UkLb;BjQq80r%~D96_*_PP=~qR(wr
zJ1^Bj5_lqV#d;8p%fWZdVvVEvxw3G|$x(?^?qc$8F}pWIFx2VgteA?l#}dW3q2Mi}
zAW%^56sj2V<^o-;l|M#0e?Be;{oeIKt75R(*3--3X))oza0eO<pKwcj>-eIJ0(FvU
zN3yEK6xqkVZ)3H>9oiSv9ot4@`aOx>8l9hBzBufmwOJ1j&S#Tue>Oer@%El)uz1V9
z-qSB}vVQf!c5hI0Cq#N$`C&Qbzgc%yy#4Ou!}|}u@|C>3*|9}n=ywMpB)DkA9d?k@
zw$sBtkuc!#E@-J#R<o=XHni!z+DhRWcs2NMHQ?jf<nTzStV&H>JpN(y{q_Mj?=pQu
zo#q_#tka(!U!KnJ@!5+5#J_-tY~@GCC3}}fe=)MEwKMKcrv+o>&^wOrXgbCWj%I^v
znOJFJJHtGqLg9<<<p4)MK4*NLKr;NMm##lFo$2fD7{v`cc{mlDt&HuJZn^ZhY-?7+
z$a`l{c3n}dM@e_G_AxgZF`v^8DvNM;6O8*7c2j22L=HK_?>g+b(lcnX-F{x2_orAT
z2pzh@AiO^<viHjJ6=H<(<BKlpK@&}AZO=cil<w)m5hG}fl!O1V!{~hSaPsWwvp}zI
z&B{Q-8-R}BGd(ys$-!1Yd|j02IPZ^gq*wMX-Ub2RyMLHxo9%{Kj>|H8ib2O0bZECf
zy?@ysaHPx06G`OUKqr{csq>V#Hd~w9c-`6H=xP|WU~^x`V!wFPE5<Bj$ybJxi}S~`
z7K*#i;3Rs@`U83SA_v`_BN1XaJ9^anT6p<pYl}||NPJ&c;+Olg3r2Z$fh?b4noS+%
z47WO(6g(?O&n6W|0vkWXSy*RKFHC1l$Vi9-YUl7-{KzU+(ry$cjA~0<ca9GZi{Y@m
za%{zqk;;<&_eWSjY^>`4hlo6bt7$6P%V?<AEZo`-T)fdP*Vw(sF(%Y~a%-Wn57`QR
zsD^cadOaK#AlRO+<>Zms_Pxtlc|Pf$xyU-BB#uJX(X3>SL@*kdw#pK^$)-b@Hv~9g
zFcuF^rJ8Z}un#5$(tlK3`(YEkztvGcj1R2UwLR%hG`=?5((`O3Dh?~HB)3${UP-wk
zto1oR+f6U7(_>+WW6CK#x~KiC3>9}EUqwRaGqea3imZfSHN_2ZhF-9FTHvIw@W7Z4
z+MAtx-hW!oNO_W4gmptPB10&YunQJ@9~;PJTjvZ`&QH0S99E?F(0ExT1W&*2`eOoK
zy40Yf{lv}-b+xFz_+)Z%CP(wAqm*U&$t5}_A)SYhhb+B9@-rV1U?lk~&onHSZe=^+
zXDOiCR;#u-fO@g`P8qU2lS)~pWA;KsR5)nf(~apx>Go;wq8N5tTf^=vaJ<V=k6hCI
zO7${y+hlo`A^7={iC_<l7Z3pumHyo_jM;v7G%6vz<tej?hO|QN&-U!$tTb17pKdZ|
zsz>OYR6JvEQjF1`*~GRRr_zhZudP~QQ9NM*<k_TjTF33xrR16A<ef=Tya$<#HnNev
zuuq0hvl3fBbSG07DM~*t4-T%5dM{CWhu8Jbp>=BCwl#at`??_|P~%|O?;XB8csxWx
zy2%z}kmyN5nOPHMdSw=LmGE8>Nv)lkl>7(7U~Pq7FDElbr3EM|C=;|T#VgQ62lc5C
z@oUH%3vF2A4=yKE@LUetv2K7ac?t2}=yG=@1*u7e^+S_ZX4_Wa@MwZ@E=Bd~c%4z)
zUn+HtnTN)`6MD{Y<EuiYP1t}d2q&!D7;0^-YH}*mqLZ@!oYYT3JQ4<slgX#5)W8b8
z!oDB5u3*N5qml`g!imb4q^mL^S*t&#zT7LM`jU};7(9Z#WVdAF3+Nmn#qA?z5fjj#
zUB|!&F7x>ZatjKK04Og}S4Gq#gtxQ22A<W*TU$98g6QLN5~P5$c8XE(1^U3>%!K1z
zK-hh3iy2dNN72j_Q?qGDYBW9BH5kGa3@%pFsIr6j0)eo)9d}TF6kmE<gGo(G7t5K=
z6@5+JyP0UKcLTV38n55VF%iO_OHq)!15rv5uiw^qeW>`b#$bE*E?!}q#{NJb3w=_I
zRN=n>Q^dFsbqjQwa^R2*!#P>iWUV}DEFR1i3y=v9s<<X78*92Z%cirKiiuenw1>}6
z;i;WN#rNh@v7BnAVmz*qib=sr$q<vQWvJxuJz|*Vbs-N>;FJ7y(S3zy0YHGa>S_Zz
zK6t2~5;cSBd?*Tj-2PfIDG@*5EP-ck-UG$6(!s3oCAGADR@YR4bfrb+TEj~TOw+wI
zSsw0+!OTY|ps15Ass={xbtHy(#hG>orZBm0M6Vg`In|C7sm?km-qX8ya8dMLy#q4j
zng{8=#6P}=n~AqV^DTTDA9a%fry@{|H69|=?zVDiwV`kzoZ_TZrA|bFXs~7$!=^0I
zM_BQ1`m;ch-rV&q4#yyD@uv14Dr2_u;OKpiRB;|UQ&c7up>ftdibib%Yj04DQgXmk
zI(a#swozhFAz4vdO&RqAtzNW_ux^2Bblkkj5h}1)g9H{E!~AhC&#fQmZ`6`HbU0jz
zu|jxb0I+DTse+P$O@b1nEs%u@*X}7MFqjz|3L|@AXof0=y1Fyq>UA6<>lo}Nw@jrD
zEGt*WQcGBR1Z#9T7)Un`LCY96r|B4gvk!F;^GZ=m%Li};j5dWUrq@7<hHzcOZtBP6
zQx|~i;Xn;S&=p~vQ;?HGC(A>Xr&jCr1xiJ{1}?%?$-R9=JyB)nMshiX9J8GnOjK4>
zJ1JH6aIC8`Yitb{&-Rq%Em+oKBrtKdOYD%v;rb&WH?(A=MG1Lvh|OM^PhlU-qp%Qa
zm{-@*Z8VNX?upGk>S7D^XX7rKYaChm%Jbp{wMJ(-dR@^OjoCg*77tOhm{!B6IEUqR
zMZsXR;SkH6S^2mG1({;l&hKAI`A@~#AyVPA%~<T?Op!IB%l=(DrNE&u#(-DEaSS|O
zw}!ciKxgx)@Vdp(Y71`{tOMh44;b)dmN0q8=e7FJBK2uyn`lXGljBmWnX@g2hyj0o
zT%N?hFwsSbl}xnQ{PY^`4D1^<j1WjK##))I*{4|S4<Sc^{wdtSK}M{?4fHa~G~>;Y
zNgeR%ngPgBA3*pYmns9Wcz_2B%SqT}0%3#W;f~<Zrvcv6e%S|tt0a0KOT8GKu^hL<
zyRNl-VFZ&TkO0dx2hYCq4ThkWV2CQOF26&B%@DrOpBz3yp#ce6I5<8QK%*Min@~5O
zOq^|=aOAlagroD2`#s*o?#zJj&|^SFAZrb~*QetA01Gib#>+f1cHkj%2R0nf#)J9j
z|NP#b<WT5NDAu1r`97jESO^#Ne@nxNyZ}=#8cMdWWiO%DmXqrog};Y51IG@6I;3B|
zx`9--#8`l5U@#-{2d7B9yAm!HR>HI&9=e*IfOEkB`M=nuiQ7{;D(|m_ny~T7l1lP3
zctr(L!E^-?_EDKs5EJ9aChJ#q;yuU|ynqgjh13?BO_r}<30a%COdnD%FV4=T6l$=_
zeAAqe*b}c7M^rd}`aqJM;Ya}Ve4AIdtlQ9I5<Op?YkvN>hL?j}@%si%6-<JIZAB_M
zFCW|m0g6T_?72x7%#I<QSV0?0&gmd&Azjgysr0f*NaPD!8FMzL+}(E3U_as~*jAZt
zP)2fd?@hR!sS^ooug!=<IMQ0b(h_N?3k9+YLg4fv|BKunbT>f>SPGfbJwoyd$Ye!h
zA3pIX2n2w)dve*E9Sj?QolpgaxpO|dP!%{qf%tw>yR#o%Mcs2&$%0===BPrq;n9R&
zHOr9#WQAhIyVD80JI7LY9w>NNBp_xzI9X>NPEDkc%rf}1r^T_L?i8r&50Gy0+q5{d
zHVmTv`6ySPIaYBwQThG=LVi}zNeL7Kw!-XMC#O+;NFJTF8@AVAioE9q$qfQXc1GP{
z5#T`Yk6v>#hv&r<?t{1AeeeKL4&qDQ=^-DdXo>1hogr(Jius9Zl7gdHQgb3K&0XSF
zoXUgPkAjtfs{pptDivbAGHF{j*ml(FvSPzYH6;PRnMrW?&d05<%qjSi<<M0gny&H;
zlkj-h0?cpVb<<4DIoIB@nEluZ_YQ<Dt+hrOPy@876^fC2jrlGsx8K$gwZJo$%JF=r
zg8jj`99$g3{@l<TD`FwC-)bF|*u|;WyM(9!QNIfTHy7ZiJ#GCbD{z}_YhiH2ykh<N
zt&!>sA*6z*aCJ2;Z9KKz8(#vGnZ+@*^0=_DdOPr?F#~_cX`0+x-nkbf{%J|zljz}U
z3~7rw06^u7EMpsr5gh=!4|po2unt5}$&8@kb!dqjmlE!U@#kc%At4lgcAs(7tx;Ky
z!&IWAL0j-iSbIdHQ20#6J75jx-xW#&^urdlsisyenDihwh0SQxp%Th(w<sKY>RY0!
zHNL>kA=6iR@uujhXIomdKKrFn=%TNl!_*66w!%Omq;*{nPZ_O-fDdJ7t&H_qbBqa`
zdn3dtPjdiRPS>Iw&f2PuK0AFG<4#-=m8azJGwH}NVd2(Gwp&$1%cs{v+AoV%BbT4K
z$+33gv{)=9lBQ%0Y|rR+KWz54OMyBF1C-A9f8#<3)$biGsG`>e4r@eRk}74wLSxL(
zDZ09CmlnWC4PFBP2CS+22B_F8C^!)bWqa(%QgL^O9l?Z%>9u>+N14}d6F`hXu`a7s
z(K@;Ykpe$L)w!AR*GF8x!9H2r+<d_~H$lM>IL~{+eoLpx02YLS8TZL{z@%a}p|cT1
z(-5gjn8^h@5oorY!ljBVEneF5RD2ulc{#I!H7_LtAdSY@L5JNvjy8G~zKr_VLRO)A
zDZNe@z|d5ZNBFc*Sgz&<7d(QUM4$@>$dfO)PP12S=zVHK+;D%MNA3Q6kD8onSR4Ad
zMvof3Y2Kq|L`TBPA8M>d-o1PyyBxvaX7L*?s<mPrSE%(t=t4|B$T@mugojrFaw<x8
z&SI6OMTv5RQf^WRZ~`mYg=2~wA??>3HA1Og2#MDQ*>L%we~NfwiL@h*4;N8!=G)M2
z*;0$trN$@;QI}StV)*Zk0S5~pk(qIDrck};Pc9}p1TkQrX{aLw)qyVs4d+K-D)T-B
zYzma~zU{1My^Ai*cAb|E-Zo83a(sgvK_kc%TXb5Z1jMSoL#F~=BLNt>`?!CK7`tnu
z5nWTd3rZ+hg#^Zz*jw&O3t%S3zc92teeoe?o4beHhud{e{Gf1Jia)0AY)yj`g>sNe
zI6~+SOdUqM^s6Y1=Q<14=xvRdf+_oqEG&C49$O>}4R5d@3lM%b`2dFOiz?=jz!T%H
zx&z-!+_4geQ-<^j*;wl;ye()3PT>_AtW5N558BkU57o2~7c_&aRgKs+jGgqBnwYH}
z@tEb8qv_?CE?k6T&p^nCIPD%FNQR?9e{y?*bOeU3$mNN=0;7zyvj1nwp2c9*I!nu~
zQH>Q#Qw+reMu}Vr9DJ$;zrpp=9Q@`1$Q6+#YVHb<kf21T!5r@7c?qluP7s&{;_GQj
z*${_>CRn(u4+RmY83!=Dox89Y&EC+s^BY9YZRCe13^~vgfRiT=2IZ(YA+s>PX;LBU
z9K4|&ab7qK+id5?T(?j)g7QEQzxeMl_LWZrst%O^YCOU^0mAo&V=tLpY>9BwNbz;@
z0o-HdXhaPszr8O4^H@0}xFVZ2HvYJ=8g6tKjw8Qg9KCPpp5iq}$0%zBmhmJXc1M7d
z!KuO>lF^lCDE7nPBt4CY4ayuabk046(E{Z-%lgxMR|t!#1j|C{ZnBX~EDWQR+3lFb
zaWEw=s)bGpmAIKl)3QwE<P(GuPqIv+iD$Cx)w(A;T{qrI*l=A-hA7ZY%kaosx}Mx9
zaCe4}n@FHqM?}K!A(8AWk#MS$i4!>0tjj=qTS@T?%@><<-l#5MDik4W;M*(ay~iCn
zrAiWUWUF>6vjPOZt;mi+SF-8e?4mzyNWJq0b93m(T<uQ>BGQ3?eyNSr(Ohq%R1U5W
zU?gzw3{|L4$3VopGpKoV6UNkz>#|jdohhvxo7;?jqw$DEk0yn|#ZW)Yh!))%&dr+`
zmkyr@eDTtOYI2tWmPvQl$H|1eX<3@#A*^Lb&4G8svFp{;036g9G!&tlGWCNlDaSw}
zM!>u&wosnArEcK(X+cPQMHt4DK`ftMDBsL2Vfetg;ozIl^bn)Ro1J1?I;$|$05~FR
zMAh8P1I3{iRa^suIt=xaX$rW<$|cy8(&AOXUQ{s*9>SN-DTO>}!By`e!N(6rSSLY}
z9ISkBS9+=V_=KZoTk18H%u9IvED6(7R8w%aBV)y>7f51xp$v{Ci{kBW-@k|DB~&>b
zDX}i=kik}uDxVsGvMku=-7j}1BUriMeprZp<7LT`$dDmAapz$nf_>~gL?TR~(OGta
zfFZg4T?B85sSWDPxCPNNAP!>+76(8^BNhkcIg+Wxo@(&cvag-j=&Lj0to43kB7A{L
zBz4sBwbxNIjY(g!bUH`}t*OqBo(I&$a5-Z=0Znhq{<3vDh9iPr!XVLJGO8DDw6d3i
zoTJwf+*|8c%EQb*D~xU>OP*v;tkTH!P*K&nd<5i5lLOmPE-*S$At?`kfea=;@Xk1(
zGz&cyjzK#Ng)Ss$Z)_rfB}h|m_|3~f!DZfDkz3$|YtBLG%*Pt#R5R9y$IZnWeGZE0
zP3&B#D<@oKeaEEZ^rXASfJU#L#=AB{X#g+_&CbZDvI))Mb*6Dm&czkY35I=L!<lp7
z;^aP?`4=5zI-kMZ(xP3-yw!Z#S!ap_tvt!kO70S^e2<6`(LiPCm3kOyqsg7~<0;fC
zJ~;-V-O{je_%^!T9l#4yU#6b1fK@is!4PHDvgNwtG2%HD<qZWJ!o@;?R`uRh2x!!J
zSvg?o20jA?A;eWuPTn=s;{nTqkm##d)kJ%g=^{EPG3>^stkM#)D~WY+ZkyoMFTux<
znA!Mfl)YQ!=`VwR0&|R09I%~vel_ebNpo=S@w&Si*r>Ui0oM1=FUw1U>SOgd<jCA9
z-T(oGkb0Gp#~jt9W!%N*pI|g%zaA8nM65ffc(MjGXVE8-0)oHL)X`*iZ^Ek}U1)Z|
zr2xg~%6TW|bZLuG0_BSr@F>a>rdLkX0F6<g8iEC4i8774WQz7z(jr`A_6W*8+T?NT
zbaWy^*B+C&2Re|{FAkomTun_^31CaYPibL>4QZda-92g3QX`z>P)H=X6Xa@Xh+&Fu
zxXyqImI`w8*$Z-f=5F9~1bKVDAeU3kf*g-)6yz$zT{0<b24J-uewYV`If_#PJ;Z+!
z7>0!o4GwV#1)@a;8i9y<JJ=ur*{ELBX6+~y3;DbZEtz&-pq_LXI)i~ta)ThVX|QN8
zVtb`*AQ-v$lPuZ#f(`STdJ<fmI<f-Ib}rL)^&BJz1ZcQMLF(#h@>NrV<6;d30<>lt
z?J5jY^t&!*xrc_N;+$R%$MB6!kbz8%g#vJGcqw5pN%DfzhJ}g+8EW^+*V#5>aZvTe
z`}4#`3S39|`H;mBkgqk%B?N-2B2WSE<pcau!$vu8vVO(;3HyWtMT4pHyy%saGXt%%
zZO2hh#0!jqUebMspoO-Mr@y5(06o2`M?8%ZSRi+i^U;vN;*jOJ211spGAY7$&eX;c
zqS-zGEv?CP)*t4YXT&?z=k6#Akyx%KPfV=|v0>RLorz6{#ot^MNo-N&PVXQeirF<%
z+&VDcvVC6@9>99;&}o@+<IYV17@o*w==VV~!}9FXtoVw#E>s6@pba3NQsr`75X|fP
zTjV!7HM*}H1(n7(b{NW|CbacTC_jpJX1~PpNTdj|yq;muc{<|jPr75x6~oWBFd%k-
zb5m|es0pCXIT0od>GA>tpDC%D4Zr!~f@hkg#R6er5hlX*`)@E7K29^jX*qNuLgk%c
zp|EYxtZi`$OJ1DWOwwF`sCDAPRu~PK?x9V{X9z$P4t#bt@fx5M?W%}6)PC%CQFt3{
z1D75u?6p?;$m5otlex-v+fpV++#7+W?-!lRwT>i9%IYJP9~x-HL>mzBeZhX2Oy1h^
zA5R`1lLbzvs6fOAIKif1BUU{+P$5E`JcT~-y1RnabmHok-^0=3N_^r{JVOntQYr$-
zx?NI$5ZXt7c5npQRpsu$0j|{zp&uoT2Zo7oic}(jRj-g@A{v4)&uj;dXBNJ(vUo1a
zN3K|)S&>RLC|Rc#?M4Wt7%*_juTm*e)c+thv6W3P&(9GymIFYnGR-uKNctL9w7~n7
zCgjYvrv2wAtu{e<rPs80oNUviau>@KiZu&^I+`F;w?`ZH7|V}N(WH_jeZpBg4m9YU
zfER8f%pgjI!84d?6qqN93pu@Pea<0tguV?}V{kac>m;rV?Fk`0rG(IF_*parvRfDe
z`T3hS+)if7NGoL0<X2naRGV0hLL^Kh`}7AWXwLIh`Ko{d5ymILVmp3hY5^*niZakS
z1S6q5Njb*x9F8GybUjj1q(VYzpUp7$Nc>chYK<LyR70=gYIy<fiZJx;{BQ_8l45w}
z>NkL{ycUJepIG&qC{~vwl~lhWBXy!3J0K(c$||j;vb_UOF&si?_Z~hju8IMQz#uJ{
z^5FD3@1MO<P?Bu)_T~0g^}FKjvEQ|(E}#=o7Xp#dLi9wH)Ix<JWT33)8c>RasjFG-
zoZ+bLr7rGR1KFdo9eIqL3KpqWa}+MGukP7erj48|<63O1;&!x~yQLQO;N)me)2Yku
zjY@2uan0PMJ)Z|eEhm_LBr>D-7M{J>jeBwu>ch7#XClpLWpp>1!?ZZFuu+*)$X3lt
z-_oq~LH||Jkd+*P{Z4t1TEv2rQFNn^JHal}_Ji81Skm|bR+0O;Da(vc^_E{ttv;=m
ztz)0C&i@la+AO8Y8euO3^rgl<*RT1ftH3N5nJ=e?*NnY5D#rjNQ7ju`#DV@1TMB@8
z(7mn@i}bcf2*spEcxFbp#HJmkU6plcF-a-&n^)1%D<M0~a2^yim68ST3NY0f2!{Ta
zE~&biV#`at>TY3?8>qXO`UgV&bUwU2E4)Lwp@>4Fxo#wz)Zr#X2OLq5C^)qmYgVtX
zI!k1+dda9Zhl8sA4`)Hu$2DBoRaq7UaiqmBAtM`tajE?)h|tQEN8!3Y@<|{g=cTzr
zk-J%@yBoGiAwdoeI<HA|xEQ#U$l1m2J2-F!dL6D5c3KKD9ZUSR0C&WX`!1H*PkJg^
zB8z{qlP@w*SMkZ!JbwD}G#w}t^twM9qqvp#0#jr5jShB2tIBoLmq>~x-&hbt)M{yp
za)5W<XL%uldNO)0&n9rns)gN&%kPQ~AH=#vqH29QXA}_uS<`0Y1c|yGAj>QWiQ$gZ
z$b5XAYt@E;gf^M?sF;f{`;4McSm;*lrc$SNp<G33Pr_IQlR-=utAj!1T^~Y=Mv23n
zMK!)@@V`WJQ-3XLhE12aYh^$fI5Q9Z4)N*^v!N78AM{ZPA%d!f3?e{Rfl9~0S3uz`
zl5H;XuY{`o4`4LxUV#WTZOz>|xiI^|+L4#Gj^S3bOjFP78~yF#)LR}`K&V23qbhH{
zhq5aq41sow_xW%(-tng>Xxt#Ww9aM#zICFcJqNwXKqtzFDPAU@#huWg_a-Fx6bh^<
z^2Q<j=&q-&r<}wE5lCYjV>4~zSE?xro<R<JM6-^vg%aA>aUGS=V$D(wLF@s(--+97
zUrTSjhWi(;%V9nR63A^a;8~WEx}vwp4e(eEE(?cq8|k2-YeJZb3=|CzMHc(09cjsv
zNyL)OeN_>@iJhjnC~OQSoP@Znn^G;;AQ7{EEW9VR)kc{vBqK_&k+@W+ng=SOl2NLs
z6CGYkQiE~&o;esyF%D>VH=^BRgf#shwtK`V$tC?5v9dkXhzJBFsWoAgBKa3>GD01h
zgG<&I2cF<bY73+*j<1pBwtWm6e(-_Rio8#-{pG>lIk!p~muC=!sCLLp6XHY=fh-JE
z{)B!+$rUs?nQ=CX=hIEvbUPxmx}^MwhqCSy)(SG$_#OSj;o$g72(#D^O09&#Tt-0O
z7TI^uX`DsHmW28ej2`(b55~!p9JLvMgtFB-6&^o<<1{s=FjKL`DnpvETXdx!LLg?1
zV-|{;CdCYrmkSWvWYD2L+7f3_$_qI^>PJR_OrZ|9wGzC|NRZ4??M8*aty?x_o*lf6
zva5xQ2nlH5CQ2D5m~mar@?T0AER8GVTyoPvE5fi$4Oc&zVL^2PfhiaAC($ga`N4W^
zBh+p7NN7?!MhbtZZjZdb5u&l1ETdCeMoEMOs1jauONvvb$&~2%KG<w`dOhl4!=n)x
zsRN<p!5@EJt2NT#0&+>&OSAo#vbJdHEeh396e#leaz6zXOBLoCG$`R=T&>M?$ul?5
zOVvD_tC>6PqbUP8i5J{Q$$nSQt6on)d59OqqOGLRit->K%zfGKEVjIa+(IVZ+Eshc
zVyd_~2)yc=@*zSS!Cg%Ami@LQQ6jMlWx45dLx=P72d8q;lu^?w?sSGYScb^9xFKDI
zVPK~_>_gi{h59|Kzo)C~Q<Y_U18RPL2WHlax%vpz6d4Os8~jZQoI1<4Ifqn)@j^BM
z;RpjdSh%903!RyY>o#10>jtl7t#zp`8tqdM3QJd`KN!e`4e&TsU|-cm3i9CK-Etwx
zO%C`bz34NTQxo#srebL#dTYhf^0qpgSnITyplTS*j+)Y?B9MHFS@G|WV9@Z3w;1lG
zLV|XnAp}O`G3oK$51<z%B%x-T4}32XfdU3OXZNIlp@mv(NW@;R7oytOoL&#HBL|MN
zi5@TP6L#e)HnFw+ChEl_dI1$cH~mki*k1AM*ZXH1asjJDY;3~*z{B)7o9G!tyi8e-
zKrVA%a-4$!+T{)I(|9o3*kp*&#wI>)o|o>!wCHVYBJBWQk(tuR)g{Hm#%6JmznGw2
zXnt{qr`b2W21BD^qA1?SA{d*q)9F;7op&iW^hx%aPh(^7U}<<4J7M7|<&5pXFY+>Z
zLa)UBf=DL0^`D=^6MhK)zSlkJyY7P>-*Ug(;fL*mJ8t3_-oNpE?u0vjF}eQxlkfNN
z{l7l?sI>S0io4H_U-=Qe!R|}n{|Kjl6#wl0uYXRjx8qT!-+usy@4-L2|7Y~)jbj}7
zO1uBtzo{qe_%9Q8p*!#y;h(SK%SS=`zwvi^){Z~#8gA{k_t^0Qm;ZNg|F^%P_uKJ9
z_d>Rx&#?CERT~?x-S76it-seZcHG$4@AiJX-}?Xi?*99F*~Zqtx%=JmLlOx7@cy60
zzx4h;#BXfk58|Kv`GRY(JG$HLtUSkm!G}M<zwrLI+<kUD`pNL>jg2q3uXg<7<o@w5
z>HF>YR(yNy{y$Id|08$59p7H^{=ZD_|8_^;Z^z&GPe;(M9oBAZ)?X+0f9}^f(;FLh
z{4x1f{b%!Nzx*ve@QpVA8(Vt69Y23=SL*St`0K_$;Cp!g7yeZ5x8pZ{D!w6o|34=8
zf9r4bemjn*@%=_W_Uk_-_wW6JQj#5i`<LuWJ*MyfXZ*}(exEz>=C}0zH(%0YI^OB~
z{}uQCIPb?l{z>ouP=2lcv-|D%A2{`S{ImPN<KF)r_kO$1o?zeY_z_NU{O$hly8FND
z?*B09K=O6tbGVs3wfo<GrTu>UtM2Fs!%>A_zxJFzN#6hUq2BlP-_he6PguARf2`eh
z{As*DZ2!hs@7wrLk9&@P>)rMHe<8X5?Q6a7?Y<tne0cEZt>801ei7%{7VG~vzoz&7
z#Y;W@yLHsv_m(@_;aA=Jx9kUZwEKTO`Q4fm9+M%GJHGAS|E=HE2mB%qzl=uOxEcTD
p@L1dRO8x)O{#D!md*8G->F@Ekjg6m}bN^5OmA<K!+>m_T_#b#)A{_t#


From 174e10f1a6ae5ff1d8701b8b7d81745564c09555 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 23 Jan 2026 10:55:25 +0000
Subject: [PATCH 090/194] [Device] Rename deivce PyTorchSimDevice2 to
 PyTorchSimDevice

---
 Dockerfile                                          |   2 +-
 .../CMakeLists.txt                                  |   0
 {PyTorchSimDevice2 => PyTorchSimDevice}/README.md   |   0
 .../cmake/TorchPythonTargets.cmake                  |   0
 .../csrc/CMakeLists.txt                             |   0
 .../csrc/amp/OpenRegAmp.h                           |   0
 .../csrc/amp/auto_cast_mode.cpp                     |   0
 .../csrc/aten/OpenRegExtra.cpp                      |   0
 .../csrc/aten/OpenRegMinimal.cpp                    |   0
 .../csrc/aten/native/Common.h                       |   0
 .../csrc/aten/native/Extra.cpp                      |   0
 .../csrc/aten/native/Extra.h                        |   0
 .../csrc/aten/native/Minimal.cpp                    |   0
 .../csrc/aten/native/Minimal.h                      |   0
 .../csrc/runtime/OpenRegDeviceAllocator.cpp         |   0
 .../csrc/runtime/OpenRegDeviceAllocator.h           |   0
 .../csrc/runtime/OpenRegEvent.h                     |   0
 .../csrc/runtime/OpenRegException.cpp               |   0
 .../csrc/runtime/OpenRegException.h                 |   0
 .../csrc/runtime/OpenRegFunctions.cpp               |   0
 .../csrc/runtime/OpenRegFunctions.h                 |   0
 .../csrc/runtime/OpenRegGenerator.cpp               |   0
 .../csrc/runtime/OpenRegGenerator.h                 |   0
 .../csrc/runtime/OpenRegGuard.cpp                   |   0
 .../csrc/runtime/OpenRegGuard.h                     |   0
 .../csrc/runtime/OpenRegHooks.cpp                   |   0
 .../csrc/runtime/OpenRegHooks.h                     |   0
 .../csrc/runtime/OpenRegHostAllocator.cpp           |   0
 .../csrc/runtime/OpenRegHostAllocator.h             |   0
 .../csrc/runtime/OpenRegSerialization.cpp           |   0
 .../csrc/runtime/OpenRegSerialization.h             |   0
 .../csrc/runtime/OpenRegStream.cpp                  |   0
 .../csrc/runtime/OpenRegStream.h                    |   0
 .../include/Macros.h                                |   0
 .../pyproject.toml                                  |   0
 {PyTorchSimDevice2 => PyTorchSimDevice}/setup.py    |   0
 .../third_party/openreg/CMakeLists.txt              |   0
 .../third_party/openreg/README.md                   |   0
 .../third_party/openreg/cmake/GTestTargets.cmake    |   0
 .../third_party/openreg/csrc/device.cpp             |   0
 .../third_party/openreg/csrc/memory.cpp             |   0
 .../third_party/openreg/csrc/memory.h               |   0
 .../third_party/openreg/csrc/stream.cpp             |   0
 .../third_party/openreg/example/example.cpp         |   0
 .../third_party/openreg/include/openreg.h           |   0
 .../third_party/openreg/include/openreg.inl         |   0
 .../_C.cpython-311-x86_64-linux-gnu.so              | Bin
 .../torch_openreg/__init__.py                       |   0
 .../torch_openreg/_utils.py                         |   0
 .../torch_openreg/csrc/CMakeLists.txt               |   0
 .../torch_openreg/csrc/Module.cpp                   |   0
 .../torch_openreg/csrc/stub.c                       |   0
 .../torch_openreg/openreg/__init__.py               |   0
 .../torch_openreg/openreg/amp.py                    |   0
 .../openreg/extension_device_interface.py           |   0
 .../openreg/extension_device_op_overrides.py        |   0
 .../torch_openreg/openreg/meta.py                   |   0
 .../torch_openreg/openreg/random.py                 |   0
 58 files changed, 1 insertion(+), 1 deletion(-)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/CMakeLists.txt (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/README.md (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/cmake/TorchPythonTargets.cmake (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/CMakeLists.txt (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/amp/OpenRegAmp.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/amp/auto_cast_mode.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/OpenRegExtra.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/OpenRegMinimal.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Common.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Extra.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Extra.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Minimal.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/aten/native/Minimal.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegDeviceAllocator.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegDeviceAllocator.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegEvent.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegException.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegException.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegFunctions.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegFunctions.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGenerator.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGenerator.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGuard.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegGuard.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHooks.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHooks.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHostAllocator.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegHostAllocator.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegSerialization.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegSerialization.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegStream.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/csrc/runtime/OpenRegStream.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/include/Macros.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/pyproject.toml (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/setup.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/CMakeLists.txt (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/README.md (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/cmake/GTestTargets.cmake (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/device.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/memory.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/memory.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/csrc/stream.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/example/example.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/include/openreg.h (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/third_party/openreg/include/openreg.inl (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/__init__.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/_utils.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/CMakeLists.txt (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/Module.cpp (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/csrc/stub.c (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/__init__.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/amp.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/extension_device_interface.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/extension_device_op_overrides.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/meta.py (100%)
 rename {PyTorchSimDevice2 => PyTorchSimDevice}/torch_openreg/openreg/random.py (100%)

diff --git a/Dockerfile b/Dockerfile
index 1b4d08f3..1c52d32f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,5 +12,5 @@ RUN cd PyTorchSim/TOGSim && \
     cmake .. && \
     make -j$(nproc)
 
-RUN cd PyTorchSim/PyTorchSimDevice2 && \
+RUN cd PyTorchSim/PyTorchSimDevice && \
     python -m pip install --no-build-isolation -e .
\ No newline at end of file
diff --git a/PyTorchSimDevice2/CMakeLists.txt b/PyTorchSimDevice/CMakeLists.txt
similarity index 100%
rename from PyTorchSimDevice2/CMakeLists.txt
rename to PyTorchSimDevice/CMakeLists.txt
diff --git a/PyTorchSimDevice2/README.md b/PyTorchSimDevice/README.md
similarity index 100%
rename from PyTorchSimDevice2/README.md
rename to PyTorchSimDevice/README.md
diff --git a/PyTorchSimDevice2/cmake/TorchPythonTargets.cmake b/PyTorchSimDevice/cmake/TorchPythonTargets.cmake
similarity index 100%
rename from PyTorchSimDevice2/cmake/TorchPythonTargets.cmake
rename to PyTorchSimDevice/cmake/TorchPythonTargets.cmake
diff --git a/PyTorchSimDevice2/csrc/CMakeLists.txt b/PyTorchSimDevice/csrc/CMakeLists.txt
similarity index 100%
rename from PyTorchSimDevice2/csrc/CMakeLists.txt
rename to PyTorchSimDevice/csrc/CMakeLists.txt
diff --git a/PyTorchSimDevice2/csrc/amp/OpenRegAmp.h b/PyTorchSimDevice/csrc/amp/OpenRegAmp.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/amp/OpenRegAmp.h
rename to PyTorchSimDevice/csrc/amp/OpenRegAmp.h
diff --git a/PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp b/PyTorchSimDevice/csrc/amp/auto_cast_mode.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/amp/auto_cast_mode.cpp
rename to PyTorchSimDevice/csrc/amp/auto_cast_mode.cpp
diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/OpenRegExtra.cpp
rename to PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp
diff --git a/PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/OpenRegMinimal.cpp
rename to PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp
diff --git a/PyTorchSimDevice2/csrc/aten/native/Common.h b/PyTorchSimDevice/csrc/aten/native/Common.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/native/Common.h
rename to PyTorchSimDevice/csrc/aten/native/Common.h
diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/native/Extra.cpp
rename to PyTorchSimDevice/csrc/aten/native/Extra.cpp
diff --git a/PyTorchSimDevice2/csrc/aten/native/Extra.h b/PyTorchSimDevice/csrc/aten/native/Extra.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/native/Extra.h
rename to PyTorchSimDevice/csrc/aten/native/Extra.h
diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.cpp b/PyTorchSimDevice/csrc/aten/native/Minimal.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/native/Minimal.cpp
rename to PyTorchSimDevice/csrc/aten/native/Minimal.cpp
diff --git a/PyTorchSimDevice2/csrc/aten/native/Minimal.h b/PyTorchSimDevice/csrc/aten/native/Minimal.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/aten/native/Minimal.h
rename to PyTorchSimDevice/csrc/aten/native/Minimal.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h b/PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegDeviceAllocator.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegDeviceAllocator.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h b/PyTorchSimDevice/csrc/runtime/OpenRegEvent.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegEvent.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegEvent.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegException.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegException.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegException.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegException.h b/PyTorchSimDevice/csrc/runtime/OpenRegException.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegException.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegException.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegFunctions.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegFunctions.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h b/PyTorchSimDevice/csrc/runtime/OpenRegFunctions.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegFunctions.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegFunctions.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegGenerator.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegGenerator.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h b/PyTorchSimDevice/csrc/runtime/OpenRegGenerator.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegGenerator.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegGenerator.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegGuard.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegGuard.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegGuard.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h b/PyTorchSimDevice/csrc/runtime/OpenRegGuard.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegGuard.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegGuard.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegHooks.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegHooks.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegHooks.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h b/PyTorchSimDevice/csrc/runtime/OpenRegHooks.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegHooks.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegHooks.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h b/PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegHostAllocator.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegHostAllocator.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegSerialization.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegSerialization.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h b/PyTorchSimDevice/csrc/runtime/OpenRegSerialization.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegSerialization.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegSerialization.h
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp b/PyTorchSimDevice/csrc/runtime/OpenRegStream.cpp
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegStream.cpp
rename to PyTorchSimDevice/csrc/runtime/OpenRegStream.cpp
diff --git a/PyTorchSimDevice2/csrc/runtime/OpenRegStream.h b/PyTorchSimDevice/csrc/runtime/OpenRegStream.h
similarity index 100%
rename from PyTorchSimDevice2/csrc/runtime/OpenRegStream.h
rename to PyTorchSimDevice/csrc/runtime/OpenRegStream.h
diff --git a/PyTorchSimDevice2/include/Macros.h b/PyTorchSimDevice/include/Macros.h
similarity index 100%
rename from PyTorchSimDevice2/include/Macros.h
rename to PyTorchSimDevice/include/Macros.h
diff --git a/PyTorchSimDevice2/pyproject.toml b/PyTorchSimDevice/pyproject.toml
similarity index 100%
rename from PyTorchSimDevice2/pyproject.toml
rename to PyTorchSimDevice/pyproject.toml
diff --git a/PyTorchSimDevice2/setup.py b/PyTorchSimDevice/setup.py
similarity index 100%
rename from PyTorchSimDevice2/setup.py
rename to PyTorchSimDevice/setup.py
diff --git a/PyTorchSimDevice2/third_party/openreg/CMakeLists.txt b/PyTorchSimDevice/third_party/openreg/CMakeLists.txt
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/CMakeLists.txt
rename to PyTorchSimDevice/third_party/openreg/CMakeLists.txt
diff --git a/PyTorchSimDevice2/third_party/openreg/README.md b/PyTorchSimDevice/third_party/openreg/README.md
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/README.md
rename to PyTorchSimDevice/third_party/openreg/README.md
diff --git a/PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake b/PyTorchSimDevice/third_party/openreg/cmake/GTestTargets.cmake
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/cmake/GTestTargets.cmake
rename to PyTorchSimDevice/third_party/openreg/cmake/GTestTargets.cmake
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/device.cpp b/PyTorchSimDevice/third_party/openreg/csrc/device.cpp
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/csrc/device.cpp
rename to PyTorchSimDevice/third_party/openreg/csrc/device.cpp
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp b/PyTorchSimDevice/third_party/openreg/csrc/memory.cpp
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/csrc/memory.cpp
rename to PyTorchSimDevice/third_party/openreg/csrc/memory.cpp
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/memory.h b/PyTorchSimDevice/third_party/openreg/csrc/memory.h
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/csrc/memory.h
rename to PyTorchSimDevice/third_party/openreg/csrc/memory.h
diff --git a/PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp b/PyTorchSimDevice/third_party/openreg/csrc/stream.cpp
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/csrc/stream.cpp
rename to PyTorchSimDevice/third_party/openreg/csrc/stream.cpp
diff --git a/PyTorchSimDevice2/third_party/openreg/example/example.cpp b/PyTorchSimDevice/third_party/openreg/example/example.cpp
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/example/example.cpp
rename to PyTorchSimDevice/third_party/openreg/example/example.cpp
diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.h b/PyTorchSimDevice/third_party/openreg/include/openreg.h
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/include/openreg.h
rename to PyTorchSimDevice/third_party/openreg/include/openreg.h
diff --git a/PyTorchSimDevice2/third_party/openreg/include/openreg.inl b/PyTorchSimDevice/third_party/openreg/include/openreg.inl
similarity index 100%
rename from PyTorchSimDevice2/third_party/openreg/include/openreg.inl
rename to PyTorchSimDevice/third_party/openreg/include/openreg.inl
diff --git a/PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
rename to PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
diff --git a/PyTorchSimDevice2/torch_openreg/__init__.py b/PyTorchSimDevice/torch_openreg/__init__.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/__init__.py
rename to PyTorchSimDevice/torch_openreg/__init__.py
diff --git a/PyTorchSimDevice2/torch_openreg/_utils.py b/PyTorchSimDevice/torch_openreg/_utils.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/_utils.py
rename to PyTorchSimDevice/torch_openreg/_utils.py
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/csrc/CMakeLists.txt
rename to PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/csrc/Module.cpp
rename to PyTorchSimDevice/torch_openreg/csrc/Module.cpp
diff --git a/PyTorchSimDevice2/torch_openreg/csrc/stub.c b/PyTorchSimDevice/torch_openreg/csrc/stub.c
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/csrc/stub.c
rename to PyTorchSimDevice/torch_openreg/csrc/stub.c
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/__init__.py
rename to PyTorchSimDevice/torch_openreg/openreg/__init__.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/amp.py b/PyTorchSimDevice/torch_openreg/openreg/amp.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/amp.py
rename to PyTorchSimDevice/torch_openreg/openreg/amp.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py b/PyTorchSimDevice/torch_openreg/openreg/extension_device_interface.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/extension_device_interface.py
rename to PyTorchSimDevice/torch_openreg/openreg/extension_device_interface.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py b/PyTorchSimDevice/torch_openreg/openreg/extension_device_op_overrides.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/extension_device_op_overrides.py
rename to PyTorchSimDevice/torch_openreg/openreg/extension_device_op_overrides.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/meta.py b/PyTorchSimDevice/torch_openreg/openreg/meta.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/meta.py
rename to PyTorchSimDevice/torch_openreg/openreg/meta.py
diff --git a/PyTorchSimDevice2/torch_openreg/openreg/random.py b/PyTorchSimDevice/torch_openreg/openreg/random.py
similarity index 100%
rename from PyTorchSimDevice2/torch_openreg/openreg/random.py
rename to PyTorchSimDevice/torch_openreg/openreg/random.py

From 89546d788e599edf64ff08241c094b83d0d218d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= <jamesgyun@gmail.com>
Date: Sat, 24 Jan 2026 14:18:36 +0900
Subject: [PATCH 091/194] [Test] Add YOLOv5 test file

---
 Dockerfile.base      |  5 +++
 tests/test_yolov5.py | 88 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 tests/test_yolov5.py

diff --git a/Dockerfile.base b/Dockerfile.base
index c5f200bc..0fd950d2 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -45,6 +45,11 @@ RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2
 # Install torchsim dependency
 RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 && pip install "transformers<4.44" && pip install diffusers==0.34.0
 
+# Extra Python deps for YOLO/vision tests
+RUN python -m pip install -U pip setuptools wheel && \
+    python -m pip install --no-cache-dir --no-deps ultralytics && \
+    python -m pip install --no-cache-dir opencv-python-headless pandas seaborn
+
 ENV RISCV=/workspace/riscv
 ENV PATH=$RISCV/bin:$PATH
 
diff --git a/tests/test_yolov5.py b/tests/test_yolov5.py
new file mode 100644
index 00000000..197b597a
--- /dev/null
+++ b/tests/test_yolov5.py
@@ -0,0 +1,88 @@
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+import argparse
+import datetime
+
+import requests
+from PIL import Image
+from io import BytesIO
+from torchvision import transforms
+
+import os
+import shutil
+
+
+
+def run_yolo(batch, config):
+    from Scheduler.scheduler import PyTorchSimRunner
+    device = PyTorchSimRunner.setup_device().custom_device()
+
+    torch._dynamo.config.recompile_limit = 64
+    torch._dynamo.config.cache_size_limit = 128
+    
+    model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval()
+    url = "https://ultralytics.com/images/zidane.jpg"
+    
+    response = requests.get(url)
+    img = Image.open(BytesIO(response.content)).convert("RGB")
+    
+    imgsz = 64    # 이미지 사이즈 줄여서 시뮬레이터 체크 가속
+    transform = transforms.Compose([
+        transforms.Resize((imgsz, imgsz)),
+        transforms.ToTensor(),
+    ])
+    
+    x = transform(img).unsqueeze(0)   # [1, 3, H, W]
+    x = x.to(device)
+    
+
+    model.to(device)
+    x = x.to(device)
+    
+    # Compile and run the model with PyTorchSim
+    compiled_model = torch.compile(dynamic=False)(model)
+    y = compiled_model(x)
+    print("Yolo Simulation Done")
+
+
+if __name__ == "__main__":
+    import sys
+
+    base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
+    config = os.environ.get(
+        "TORCHSIM_CONFIG",
+        default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml",
+    )
+    config_prefix = config.split("/")[-1].split(".")[0][
+        9:
+    ]  # extract config name from config path
+    sys.path.append(base_dir)
+    args = argparse.ArgumentParser()
+    args.add_argument("--batch", type=int, default=1)
+    args.add_argument("--dump_path", type=str, default="results")
+    args = args.parse_args()
+    batch = args.batch
+    result_path = os.path.join(
+        base_dir,
+        args.dump_path,
+        config_prefix,
+        f"yolo5s_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
+    )
+    
+    
+    # setting environment variables
+    os.environ["TORCHSIM_LOG_PATH"] = result_path
+    os.environ["TORCHSIM_USE_TIMING_POOLING"] = "1"
+    
+    # only timing simulation
+    os.environ["TORCHSIM_VALIDATION_MODE"] = "0"
+    if "pytorchsim_functional_mode" in os.environ:
+        del os.environ["pytorchsim_functional_mode"]
+
+    # Clear extension/inductor caches to force rebuilds
+    shutil.rmtree("/tmp/torchinductor_root", ignore_errors=True)
+    shutil.rmtree(os.path.expanduser("~/.cache/torch_extensions/py311_cu126/npu"), ignore_errors=True)
+
+    run_yolo(batch, config)

From d5be66ec37c8cbd8306a324addc4908f734c158a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 27 Jan 2026 10:26:48 +0000
Subject: [PATCH 092/194] [Cleanup] Fix indent error

---
 tests/Fusion/test_addmm_residual.py   | 3 ++-
 tests/Fusion/test_attention_fusion.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/Fusion/test_addmm_residual.py b/tests/Fusion/test_addmm_residual.py
index 917628e3..a2c17207 100644
--- a/tests/Fusion/test_addmm_residual.py
+++ b/tests/Fusion/test_addmm_residual.py
@@ -36,7 +36,8 @@ def addmm_residual(a, b, c, d):
     y = addmm_residual(b2, x2, w2, r2)
     test_result("Addmm + Residual Fusion Forward", res, y)
 
-if __name__ == "__main__":    device = torch.device("npu:0")
+if __name__ == "__main__":
+    device = torch.device("npu:0")
     test_addmm_residual(device, 32, 32, 32)
     test_addmm_residual(device, 128, 128, 128)
     test_addmm_residual(device, 512, 512, 512)
diff --git a/tests/Fusion/test_attention_fusion.py b/tests/Fusion/test_attention_fusion.py
index ebbd3037..93a17347 100644
--- a/tests/Fusion/test_attention_fusion.py
+++ b/tests/Fusion/test_attention_fusion.py
@@ -67,7 +67,8 @@ def test_MHA(device, num_heads=12, embed_dim=768, input_seq=512):
 
     test_result("MHA Forward", res, cpu_res)
 
-if __name__ == "__main__":    device = torch.device("npu:0")
+if __name__ == "__main__":
+    device = torch.device("npu:0")
     test_MHA(device)
     # test_Attention(device, head=16, seq=512, d_k=64)
     # test_MHA(device, num_heads=12, embed_dim=768)

From 5ec144d92b62a993bbf20d5c57baf5d879aa1e47 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 27 Jan 2026 10:28:48 +0000
Subject: [PATCH 093/194] [Test #204] Add yolov5 test ci

---
 .github/workflows/pytorchsim_test.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 8444f318..2d32ab5c 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -684,6 +684,27 @@ jobs:
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py
 
+  test_yolov5:
+    name: Run test_yolov5
+    runs-on: self-hosted
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run test_yolov5.py
+        run: |
+          echo "Running test_yolov5.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
+            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_yolov5.py
+
   test_accuracy:
     name: Run test_accuracy
     runs-on: self-hosted

From 730fce9bf81f6ea3a94778f2a3428728da3021d4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 27 Jan 2026 10:30:11 +0000
Subject: [PATCH 094/194] [Fix] Remove comments

---
 .github/workflows/pytorchsim_test.yml |  2 +-
 tests/{ => Yolov5}/test_yolov5.py     | 30 ++-------------------------
 2 files changed, 3 insertions(+), 29 deletions(-)
 rename tests/{ => Yolov5}/test_yolov5.py (57%)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 2d32ab5c..9589384b 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -703,7 +703,7 @@ jobs:
             -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_yolov5.py
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py
 
   test_accuracy:
     name: Run test_accuracy
diff --git a/tests/test_yolov5.py b/tests/Yolov5/test_yolov5.py
similarity index 57%
rename from tests/test_yolov5.py
rename to tests/Yolov5/test_yolov5.py
index 197b597a..d9e6b261 100644
--- a/tests/test_yolov5.py
+++ b/tests/Yolov5/test_yolov5.py
@@ -16,8 +16,7 @@
 
 
 def run_yolo(batch, config):
-    from Scheduler.scheduler import PyTorchSimRunner
-    device = PyTorchSimRunner.setup_device().custom_device()
+    device = torch.device("npu:0")
 
     torch._dynamo.config.recompile_limit = 64
     torch._dynamo.config.cache_size_limit = 128
@@ -28,7 +27,7 @@ def run_yolo(batch, config):
     response = requests.get(url)
     img = Image.open(BytesIO(response.content)).convert("RGB")
     
-    imgsz = 64    # 이미지 사이즈 줄여서 시뮬레이터 체크 가속
+    imgsz = 64
     transform = transforms.Compose([
         transforms.Resize((imgsz, imgsz)),
         transforms.ToTensor(),
@@ -48,41 +47,16 @@ def run_yolo(batch, config):
 
 
 if __name__ == "__main__":
-    import sys
 
     base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
     config = os.environ.get(
         "TORCHSIM_CONFIG",
         default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml",
     )
-    config_prefix = config.split("/")[-1].split(".")[0][
-        9:
-    ]  # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument("--batch", type=int, default=1)
     args.add_argument("--dump_path", type=str, default="results")
     args = args.parse_args()
     batch = args.batch
-    result_path = os.path.join(
-        base_dir,
-        args.dump_path,
-        config_prefix,
-        f"yolo5s_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
-    )
-    
-    
-    # setting environment variables
-    os.environ["TORCHSIM_LOG_PATH"] = result_path
-    os.environ["TORCHSIM_USE_TIMING_POOLING"] = "1"
-    
-    # only timing simulation
-    os.environ["TORCHSIM_VALIDATION_MODE"] = "0"
-    if "pytorchsim_functional_mode" in os.environ:
-        del os.environ["pytorchsim_functional_mode"]
-
-    # Clear extension/inductor caches to force rebuilds
-    shutil.rmtree("/tmp/torchinductor_root", ignore_errors=True)
-    shutil.rmtree(os.path.expanduser("~/.cache/torch_extensions/py311_cu126/npu"), ignore_errors=True)
 
     run_yolo(batch, config)

From 47c563e14530b1940072d53ea98154b0a4e137ee Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 2 Feb 2026 06:46:36 +0000
Subject: [PATCH 095/194] [Frontend] Fix Identity handling for index expr

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 3 ++-
 PyTorchSimFrontend/mlir/mlir_common.py          | 9 +++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 1565a26b..c5da1f56 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -338,6 +338,7 @@ def convert_index(self, expr):
 
         expr_str = str(expr)
         if isinstance(expr, ModularIndexing):
+            dim = list(expr.args[0].free_symbols)[0]
             replace_str = f"({expr.args[0]} floordiv {expr.args[1]}) mod {expr.args[2]}"
             expr_str = re.sub(r"ModularIndexing\([^)]*\)", replace_str, expr_str)
         elif "//" in expr_str:
@@ -1233,7 +1234,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                 if isinstance(sub, ModularIndexing):
                     if not str(sub.args[0]).startswith("index"):
                         continue
-                    dim_idx = int((str(sub.args[0])[5:]))
+                    dim_idx = int((str(list(sub.args[0].free_symbols)[0])[5:]))
                     floor_divisor = sub.args[1]  # y: floorDiv divisor
                     mod_divisor = sub.args[2]    # z: modular divisor
                     current_tile_size = self.kernel_group.tile_desc.get_tile_size()[dim_idx]
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index be491925..f101b7cb 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -852,12 +852,9 @@ def rename_indexing(self, index) -> sympy.Expr:
             index = index.args[0] if index.args else index
 
         # Replace Identity arguments with Identity.args[0]
-        if hasattr(index, 'args') and len(index.args) > 0:
-            for arg in index.args:
-                if arg.is_Mul and arg.args[0].is_number and isinstance(arg.args[1], Identity):
-                    index = index.replace(arg.args[1], arg.args[1].args[0] if arg.args[1].args else arg.args[1])
-                if isinstance(arg, Identity):
-                    index = index.replace(arg, arg.args[0] if arg.args else arg)
+        Identity_args = [expr for expr in sympy.preorder_traversal(index) if isinstance(expr, Identity)]
+        for expr in Identity_args:
+            index = index.replace(expr, expr.args[0] if expr.args else expr)
 
         index = V.graph.sizevars.simplify(index)
         sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name)

From d3cf8633f463697ea2660abc8ce22e813532ff1f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 3 Feb 2026 04:33:22 +0000
Subject: [PATCH 096/194] [OpenReg] Add Python interface for device stream,
 event API

---
 .../torch_openreg/csrc/CMakeLists.txt         |   4 +
 .../torch_openreg/csrc/Module.cpp             | 292 ++++++++++++++++++
 .../torch_openreg/openreg/__init__.py         | 117 +++++++
 tests/test_stream.py                          |  22 ++
 4 files changed, 435 insertions(+)
 create mode 100644 tests/test_stream.py

diff --git a/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt
index 4ff321c4..2a29a89c 100644
--- a/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt
+++ b/PyTorchSimDevice/torch_openreg/csrc/CMakeLists.txt
@@ -6,6 +6,10 @@ file(GLOB_RECURSE SOURCE_FILES
 
 add_library(${LIBRARY_NAME} SHARED ${SOURCE_FILES})
 
+target_include_directories(${LIBRARY_NAME} PRIVATE
+    ${PROJECT_SOURCE_DIR}/third_party/openreg
+)
+
 target_link_libraries(${LIBRARY_NAME} PRIVATE torch_python_library torch_openreg)
 
 if(WIN32)
diff --git a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
index 052a9ed4..31d0c6a8 100644
--- a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
+++ b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
@@ -10,6 +10,10 @@
 
 #include <runtime/OpenRegFunctions.h>
 #include <amp/OpenRegAmp.h>
+#include <include/openreg.h>
+#include <functional>
+#include <memory>
+#include <thread>
 
 static PyObject* _initExtension(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
@@ -135,6 +139,274 @@ PyObject* _getAmpSupportedDtype(PyObject* self, PyObject* noargs) {
   END_HANDLE_TH_ERRORS
 }
 
+// Stream functions
+PyObject* _streamCreate(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  orStream_t stream = nullptr;
+  orError_t err = orStreamCreate(&stream);
+  std::cerr << "[DEBUG] Stream created: " << stream << std::endl;
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to create stream");
+  }
+  return THPUtils_packInt64(reinterpret_cast<int64_t>(stream));
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamCreateWithPriority(PyObject* self, PyObject* args) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(PyTuple_Size(args) == 2, "stream_create_with_priority expects 2 arguments");
+  PyObject* flags_obj = PyTuple_GetItem(args, 0);
+  PyObject* priority_obj = PyTuple_GetItem(args, 1);
+  TORCH_CHECK(THPUtils_checkLong(flags_obj), "flags must be an int");
+  TORCH_CHECK(THPUtils_checkLong(priority_obj), "priority must be an int");
+  unsigned int flags = static_cast<unsigned int>(THPUtils_unpackLong(flags_obj));
+  int priority = static_cast<int>(THPUtils_unpackLong(priority_obj));
+
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  orStream_t stream = nullptr;
+  orError_t err = orStreamCreateWithPriority(&stream, flags, priority);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to create stream with priority");
+  }
+  return THPUtils_packInt64(reinterpret_cast<int64_t>(stream));
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamDestroy(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "stream_destroy expects an int");
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
+  orError_t err = orStreamDestroy(stream);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to destroy stream");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamSynchronize(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "stream_synchronize expects an int");
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
+
+  orError_t err;
+  Py_BEGIN_ALLOW_THREADS
+  err = orStreamSynchronize(stream);
+  Py_END_ALLOW_THREADS
+
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to synchronize stream");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamQuery(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "stream_query expects an int");
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
+  orError_t err = orStreamQuery(stream);
+  if (err == orSuccess) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamGetPriority(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "stream_get_priority expects an int");
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
+  int priority = 0;
+  orError_t err = orStreamGetPriority(stream, &priority);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to get stream priority");
+  }
+  return THPUtils_packInt32(priority);
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _streamWaitEvent(PyObject* self, PyObject* args) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(PyTuple_Size(args) == 2, "stream_wait_event expects 2 arguments");
+  PyObject* stream_obj = PyTuple_GetItem(args, 0);
+  PyObject* event_obj = PyTuple_GetItem(args, 1);
+  TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int");
+  TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int");
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(stream_obj));
+  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(event_obj));
+  orError_t err = orStreamWaitEvent(stream, event, 0);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to wait for event");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+// Event functions
+PyObject* _eventCreate(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  orEvent_t event = nullptr;
+  orError_t err = orEventCreate(&event);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to create event");
+  }
+  return THPUtils_packInt64(reinterpret_cast<int64_t>(event));
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventCreateWithFlags(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "event_create_with_flags expects an int");
+  unsigned int flags = static_cast<unsigned int>(THPUtils_unpackLong(arg));
+
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+  orEvent_t event = nullptr;
+  orError_t err = orEventCreateWithFlags(&event, flags);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to create event with flags");
+  }
+  return THPUtils_packInt64(reinterpret_cast<int64_t>(event));
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventDestroy(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "event_destroy expects an int");
+  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
+  orError_t err = orEventDestroy(event);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to destroy event");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventRecord(PyObject* self, PyObject* args) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(PyTuple_Size(args) == 2, "event_record expects 2 arguments");
+  PyObject* event_obj = PyTuple_GetItem(args, 0);
+  PyObject* stream_obj = PyTuple_GetItem(args, 1);
+  TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int");
+  TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int");
+  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(event_obj));
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(stream_obj));
+  orError_t err = orEventRecord(event, stream);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to record event");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventSynchronize(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "event_synchronize expects an int");
+  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
+
+  orError_t err;
+  Py_BEGIN_ALLOW_THREADS
+  err = orEventSynchronize(event);
+  Py_END_ALLOW_THREADS
+
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to synchronize event");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventQuery(PyObject* self, PyObject* arg) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(THPUtils_checkLong(arg), "event_query expects an int");
+  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
+  orError_t err = orEventQuery(event);
+  if (err == orSuccess) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _eventElapsedTime(PyObject* self, PyObject* args) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(PyTuple_Size(args) == 2, "event_elapsed_time expects 2 arguments");
+  PyObject* start_obj = PyTuple_GetItem(args, 0);
+  PyObject* end_obj = PyTuple_GetItem(args, 1);
+  TORCH_CHECK(THPUtils_checkLong(start_obj), "start event must be an int");
+  TORCH_CHECK(THPUtils_checkLong(end_obj), "end event must be an int");
+  orEvent_t start = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(start_obj));
+  orEvent_t end = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(end_obj));
+  float ms = 0.0f;
+  orError_t err = orEventElapsedTime(&ms, start, end);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to get elapsed time");
+  }
+  return PyFloat_FromDouble(static_cast<double>(ms));
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _deviceSynchronize(PyObject* self, PyObject* noargs) {
+  HANDLE_TH_ERRORS
+  torch::utils::device_lazy_init(at::kPrivateUse1);
+
+  orError_t err;
+  Py_BEGIN_ALLOW_THREADS
+  err = orDeviceSynchronize();
+  Py_END_ALLOW_THREADS
+
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to synchronize device");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
+PyObject* _addTaskToStream(PyObject* self, PyObject* args) {
+  HANDLE_TH_ERRORS
+  TORCH_CHECK(PyTuple_Size(args) == 2, "add_task_to_stream expects 2 arguments");
+  PyObject* stream_obj = PyTuple_GetItem(args, 0);
+  PyObject* callable_obj = PyTuple_GetItem(args, 1);
+
+  TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int");
+  TORCH_CHECK(PyCallable_Check(callable_obj), "task must be callable");
+
+  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(stream_obj));
+
+  Py_INCREF(callable_obj);
+  auto py_callable = std::shared_ptr<PyObject>(callable_obj, [](PyObject* obj) {
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    Py_DECREF(obj);
+    PyGILState_Release(gstate);
+  });
+
+  auto task = [py_callable]() {
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    try {
+      PyObject* result = PyObject_CallObject(py_callable.get(), nullptr);
+      if (result == nullptr) {
+        PyErr_Print();
+        PyErr_Clear();
+      } else {
+        Py_DECREF(result);
+      }
+    } catch (...) {
+    }
+
+    PyGILState_Release(gstate);
+  };
+  orError_t err = openreg::addTaskToStream(stream, task);
+  if (err != orSuccess) {
+    TORCH_CHECK(false, "Failed to add task to stream");
+  }
+  Py_RETURN_NONE;
+  END_HANDLE_TH_ERRORS
+}
+
 static PyMethodDef methods[] = {
     {"_init", _initExtension, METH_NOARGS, nullptr},
     {"_get_default_generator", _getDefaultGenerator, METH_O, nullptr},
@@ -147,6 +419,26 @@ static PyMethodDef methods[] = {
     {"get_autocast_dtype", _getAutocastDtype, METH_NOARGS, nullptr},
     {"set_autocast_dtype", _setAutocastDtype, METH_O, nullptr},
     {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr},
+    // Stream functions
+    {"_stream_create", _streamCreate, METH_NOARGS, nullptr},
+    {"_stream_create_with_priority", _streamCreateWithPriority, METH_VARARGS, nullptr},
+    {"_stream_destroy", _streamDestroy, METH_O, nullptr},
+    {"_stream_synchronize", _streamSynchronize, METH_O, nullptr},
+    {"_stream_query", _streamQuery, METH_O, nullptr},
+    {"_stream_get_priority", _streamGetPriority, METH_O, nullptr},
+    {"_stream_wait_event", _streamWaitEvent, METH_VARARGS, nullptr},
+    // Event functions
+    {"_event_create", _eventCreate, METH_NOARGS, nullptr},
+    {"_event_create_with_flags", _eventCreateWithFlags, METH_O, nullptr},
+    {"_event_destroy", _eventDestroy, METH_O, nullptr},
+    {"_event_record", _eventRecord, METH_VARARGS, nullptr},
+    {"_event_synchronize", _eventSynchronize, METH_O, nullptr},
+    {"_event_query", _eventQuery, METH_O, nullptr},
+    {"_event_elapsed_time", _eventElapsedTime, METH_VARARGS, nullptr},
+    // Device functions
+    {"_device_synchronize", _deviceSynchronize, METH_NOARGS, nullptr},
+    // Stream task functions
+    {"_add_task_to_stream", _addTaskToStream, METH_VARARGS, nullptr},
     {nullptr, nullptr, 0, nullptr}};
 
 /*
diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 81c2fc60..b7d28291 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -65,6 +65,118 @@ def _lazy_init():
     _initialized = True
 
 
+class Stream:
+    """Wrapper for OpenReg stream."""
+
+    def __init__(self, priority=None, flags=0):
+        if priority is not None:
+            self._stream = torch_openreg._C._stream_create_with_priority(flags, priority)
+        else:
+            self._stream = torch_openreg._C._stream_create()
+
+    def __del__(self):
+        if hasattr(self, '_stream'):
+            torch_openreg._C._stream_destroy(self._stream)
+
+    def synchronize(self):
+        """Wait for all operations in the stream to complete."""
+        torch_openreg._C._stream_synchronize(self._stream)
+
+    def query(self):
+        """Check if all operations in the stream have completed."""
+        return torch_openreg._C._stream_query(self._stream)
+
+    def wait_event(self, event):
+        """Make this stream wait for an event."""
+        torch_openreg._C._stream_wait_event(self._stream, event._event)
+
+    def get_priority(self):
+        """Get the priority of the stream."""
+        return torch_openreg._C._stream_get_priority(self._stream)
+
+    def launch_kernel(self, task):
+        """Add a Python callable kernel to this stream.
+
+        Args:
+            task: A Python callable (function) to be executed in the stream
+        """
+        torch_openreg._C._add_task_to_stream(self._stream, task)
+
+    @property
+    def cdata(self):
+        """Get the underlying stream pointer (for internal use)."""
+        return self._stream
+
+
+class Event:
+    """Wrapper for OpenReg event."""
+
+    def __init__(self, enable_timing=False):
+        if enable_timing:
+            # orEventEnableTiming = 0x1
+            self._event = torch_openreg._C._event_create_with_flags(0x1)
+        else:
+            self._event = torch_openreg._C._event_create()
+
+    def __del__(self):
+        if hasattr(self, '_event'):
+            torch_openreg._C._event_destroy(self._event)
+
+    def record(self, stream=None):
+        """Record the event in a stream."""
+        if stream is None:
+            # Use default stream (stream 0)
+            stream = Stream()
+        torch_openreg._C._event_record(self._event, stream._stream)
+
+    def synchronize(self):
+        """Wait for the event to complete."""
+        torch_openreg._C._event_synchronize(self._event)
+
+    def query(self):
+        """Check if the event has completed."""
+        return torch_openreg._C._event_query(self._event)
+
+    def elapsed_time(self, start_event):
+        """Get the elapsed time between two events in milliseconds."""
+        return torch_openreg._C._event_elapsed_time(start_event._event, self._event)
+
+    @property
+    def cdata(self):
+        """Get the underlying event pointer (for internal use)."""
+        return self._event
+
+
+def synchronize():
+    """Synchronize all streams on the current device."""
+    torch_openreg._C._device_synchronize()
+
+
+def stream(priority=None, flags=0):
+    """Create a new stream.
+
+    Args:
+        priority: Stream priority (optional)
+        flags: Stream flags (optional)
+
+    Returns:
+        Stream: A new stream object
+    """
+    return Stream(priority=priority, flags=flags)
+
+
+def event(enable_timing=False):
+    """Create a new event.
+
+    Args:
+        enable_timing: Whether to enable timing for the event
+
+    Returns:
+        Event: A new event object
+    """
+    return Event(enable_timing=enable_timing)
+
+
 from .random import *  # noqa: F403
 from .amp import *
 
@@ -88,4 +200,9 @@ def _lazy_init():
     "get_autocast_dtype",
     "set_autocast_dtype",
     "get_amp_supported_dtype",
+    "Stream",
+    "Event",
+    "stream",
+    "event",
+    "synchronize",
 ]
diff --git a/tests/test_stream.py b/tests/test_stream.py
new file mode 100644
index 00000000..70077abe
--- /dev/null
+++ b/tests/test_stream.py
@@ -0,0 +1,22 @@
+import torch
+import time
+
+start_event = torch.npu.event(enable_timing=True)
+end_event = torch.npu.event(enable_timing=True)
+stream = torch.npu.stream()
+
+def my_kernel():
+    print("Task is running...")
+    result = sum(range(1000))
+    time.sleep(2.5)
+    print(f"Task completed with result: {result}")
+
+start_event.record(stream)
+stream.launch_kernel(my_kernel)
+end_event.record(stream)
+
+
+stream.synchronize()
+
+elapsed_time = end_event.elapsed_time(start_event)
+print("Event has completed! ", elapsed_time)
\ No newline at end of file

From 5224cc965421172e48b3b1607cd0183f1b5e3c33 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 3 Feb 2026 11:45:22 +0000
Subject: [PATCH 097/194] [Scheduler] Reimplement Scheduling mechanism

---
 .../torch_openreg/csrc/Module.cpp             | 180 +-------
 .../torch_openreg/openreg/__init__.py         | 206 ++++++----
 .../torch_openreg/openreg/random.py           |   6 +
 PyTorchSimFrontend/extension_codecache.py     |  79 ++--
 PyTorchSimFrontend/extension_config.py        |   3 -
 PyTorchSimFrontend/extension_op.py            |   7 +-
 PyTorchSimFrontend/mlir/mlir_autotune.py      |  26 +-
 .../mlir/mlir_codegen_backend.py              |   3 +-
 PyTorchSimFrontend/mlir/mlir_conv_common.py   |   2 -
 .../mlir/mlir_conv_mt_template.py             |   3 -
 .../mlir/mlir_conv_sb_template.py             |   3 -
 .../mlir/mlir_conv_sbs_template.py            |   3 -
 PyTorchSimFrontend/mlir/mlir_conv_template.py |   3 -
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  12 -
 Scheduler/scheduler.py                        |   3 +-
 Simulator/simulator.py                        | 388 ++++++++++++------
 TOGSim/include/TileGraph.h                    |   6 +
 TOGSim/include/TileGraphParser.h              |   2 +-
 TOGSim/src/Simulator.cc                       |  50 +--
 TOGSim/src/TileGraphParser.cc                 |   4 +-
 TOGSim/src/main.cc                            | 165 ++++----
 TOGSim/src/scheduler/Scheduler.cc             |  25 +-
 scripts/stonne_experiment2/tog_gen.py         |   4 +-
 tests/test_scheduler.py                       |  44 +-
 tests/test_stream.py                          |  16 +-
 25 files changed, 585 insertions(+), 658 deletions(-)

diff --git a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
index 31d0c6a8..e4f3e8d1 100644
--- a/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
+++ b/PyTorchSimDevice/torch_openreg/csrc/Module.cpp
@@ -145,7 +145,6 @@ PyObject* _streamCreate(PyObject* self, PyObject* noargs) {
   torch::utils::device_lazy_init(at::kPrivateUse1);
   orStream_t stream = nullptr;
   orError_t err = orStreamCreate(&stream);
-  std::cerr << "[DEBUG] Stream created: " << stream << std::endl;
   if (err != orSuccess) {
     TORCH_CHECK(false, "Failed to create stream");
   }
@@ -185,171 +184,6 @@ PyObject* _streamDestroy(PyObject* self, PyObject* arg) {
   END_HANDLE_TH_ERRORS
 }
 
-PyObject* _streamSynchronize(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "stream_synchronize expects an int");
-  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
-
-  orError_t err;
-  Py_BEGIN_ALLOW_THREADS
-  err = orStreamSynchronize(stream);
-  Py_END_ALLOW_THREADS
-
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to synchronize stream");
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _streamQuery(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "stream_query expects an int");
-  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
-  orError_t err = orStreamQuery(stream);
-  if (err == orSuccess) {
-    Py_RETURN_TRUE;
-  } else {
-    Py_RETURN_FALSE;
-  }
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _streamGetPriority(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "stream_get_priority expects an int");
-  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(arg));
-  int priority = 0;
-  orError_t err = orStreamGetPriority(stream, &priority);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to get stream priority");
-  }
-  return THPUtils_packInt32(priority);
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _streamWaitEvent(PyObject* self, PyObject* args) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(PyTuple_Size(args) == 2, "stream_wait_event expects 2 arguments");
-  PyObject* stream_obj = PyTuple_GetItem(args, 0);
-  PyObject* event_obj = PyTuple_GetItem(args, 1);
-  TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int");
-  TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int");
-  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(stream_obj));
-  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(event_obj));
-  orError_t err = orStreamWaitEvent(stream, event, 0);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to wait for event");
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-// Event functions
-PyObject* _eventCreate(PyObject* self, PyObject* noargs) {
-  HANDLE_TH_ERRORS
-  torch::utils::device_lazy_init(at::kPrivateUse1);
-  orEvent_t event = nullptr;
-  orError_t err = orEventCreate(&event);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to create event");
-  }
-  return THPUtils_packInt64(reinterpret_cast<int64_t>(event));
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventCreateWithFlags(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "event_create_with_flags expects an int");
-  unsigned int flags = static_cast<unsigned int>(THPUtils_unpackLong(arg));
-
-  torch::utils::device_lazy_init(at::kPrivateUse1);
-  orEvent_t event = nullptr;
-  orError_t err = orEventCreateWithFlags(&event, flags);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to create event with flags");
-  }
-  return THPUtils_packInt64(reinterpret_cast<int64_t>(event));
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventDestroy(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "event_destroy expects an int");
-  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
-  orError_t err = orEventDestroy(event);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to destroy event");
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventRecord(PyObject* self, PyObject* args) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(PyTuple_Size(args) == 2, "event_record expects 2 arguments");
-  PyObject* event_obj = PyTuple_GetItem(args, 0);
-  PyObject* stream_obj = PyTuple_GetItem(args, 1);
-  TORCH_CHECK(THPUtils_checkLong(event_obj), "event must be an int");
-  TORCH_CHECK(THPUtils_checkLong(stream_obj), "stream must be an int");
-  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(event_obj));
-  orStream_t stream = reinterpret_cast<orStream_t>(THPUtils_unpackLong(stream_obj));
-  orError_t err = orEventRecord(event, stream);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to record event");
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventSynchronize(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "event_synchronize expects an int");
-  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
-
-  orError_t err;
-  Py_BEGIN_ALLOW_THREADS
-  err = orEventSynchronize(event);
-  Py_END_ALLOW_THREADS
-
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to synchronize event");
-  }
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventQuery(PyObject* self, PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(THPUtils_checkLong(arg), "event_query expects an int");
-  orEvent_t event = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(arg));
-  orError_t err = orEventQuery(event);
-  if (err == orSuccess) {
-    Py_RETURN_TRUE;
-  } else {
-    Py_RETURN_FALSE;
-  }
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* _eventElapsedTime(PyObject* self, PyObject* args) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(PyTuple_Size(args) == 2, "event_elapsed_time expects 2 arguments");
-  PyObject* start_obj = PyTuple_GetItem(args, 0);
-  PyObject* end_obj = PyTuple_GetItem(args, 1);
-  TORCH_CHECK(THPUtils_checkLong(start_obj), "start event must be an int");
-  TORCH_CHECK(THPUtils_checkLong(end_obj), "end event must be an int");
-  orEvent_t start = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(start_obj));
-  orEvent_t end = reinterpret_cast<orEvent_t>(THPUtils_unpackLong(end_obj));
-  float ms = 0.0f;
-  orError_t err = orEventElapsedTime(&ms, start, end);
-  if (err != orSuccess) {
-    TORCH_CHECK(false, "Failed to get elapsed time");
-  }
-  return PyFloat_FromDouble(static_cast<double>(ms));
-  END_HANDLE_TH_ERRORS
-}
-
 PyObject* _deviceSynchronize(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
   torch::utils::device_lazy_init(at::kPrivateUse1);
@@ -421,20 +255,8 @@ static PyMethodDef methods[] = {
     {"get_amp_supported_dtype", _getAmpSupportedDtype, METH_NOARGS, nullptr},
     // Stream functions
     {"_stream_create", _streamCreate, METH_NOARGS, nullptr},
-    {"_stream_create_with_priority", _streamCreateWithPriority, METH_VARARGS, nullptr},
     {"_stream_destroy", _streamDestroy, METH_O, nullptr},
-    {"_stream_synchronize", _streamSynchronize, METH_O, nullptr},
-    {"_stream_query", _streamQuery, METH_O, nullptr},
-    {"_stream_get_priority", _streamGetPriority, METH_O, nullptr},
-    {"_stream_wait_event", _streamWaitEvent, METH_VARARGS, nullptr},
-    // Event functions
-    {"_event_create", _eventCreate, METH_NOARGS, nullptr},
-    {"_event_create_with_flags", _eventCreateWithFlags, METH_O, nullptr},
-    {"_event_destroy", _eventDestroy, METH_O, nullptr},
-    {"_event_record", _eventRecord, METH_VARARGS, nullptr},
-    {"_event_synchronize", _eventSynchronize, METH_O, nullptr},
-    {"_event_query", _eventQuery, METH_O, nullptr},
-    {"_event_elapsed_time", _eventElapsedTime, METH_VARARGS, nullptr},
+
     // Device functions
     {"_device_synchronize", _deviceSynchronize, METH_NOARGS, nullptr},
     // Stream task functions
diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index b7d28291..66ec022a 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -1,6 +1,8 @@
+import os
+import threading
+
 import torch
 from torch._dynamo.device_interface import register_interface_for_device
-
 import torch_openreg._C  # type: ignore[misc]
 
 from . import meta  # noqa: F401
@@ -8,7 +10,9 @@
 from .extension_device_interface import ExtensionDeviceInterface
 
 _initialized = False
-
+_default_streams = {}  # Dictionary to store default streams per device
+_tog_simulator = None  # Singleton TOGSimulator instance
+_launch_context = threading.local() # storage for launch_kernel context
 
 class device:
     r"""Context-manager that changes the selected device.
@@ -57,43 +61,28 @@ def is_initialized():
 
 
 def _lazy_init():
-    global _initialized
+    global _initialized, _tog_simulator
     if is_initialized():
         return
     torch_openreg._C._init()
     register_interface_for_device(custom_device(), ExtensionDeviceInterface)
     _initialized = True
 
+    # Create default streams for all devices
+    num_devices = device_count()
+    for device_idx in range(num_devices):
+        _default_streams[device_idx] = Stream()
 
 class Stream:
     """Wrapper for OpenReg stream."""
 
-    def __init__(self, priority=None, flags=0):
-        if priority is not None:
-            self._stream = torch_openreg._C._stream_create_with_priority(flags, priority)
-        else:
-            self._stream = torch_openreg._C._stream_create()
+    def __init__(self, flags=0):
+        self._stream = torch_openreg._C._stream_create()
 
     def __del__(self):
         if hasattr(self, '_stream'):
             torch_openreg._C._stream_destroy(self._stream)
 
-    def synchronize(self):
-        """Wait for all operations in the stream to complete."""
-        torch_openreg._C._stream_synchronize(self._stream)
-
-    def query(self):
-        """Check if all operations in the stream have completed."""
-        return torch_openreg._C._stream_query(self._stream)
-
-    def wait_event(self, event):
-        """Make this stream wait for an event."""
-        torch_openreg._C._stream_wait_event(self._stream, event._event)
-
-    def get_priority(self):
-        """Get the priority of the stream."""
-        return torch_openreg._C._stream_get_priority(self._stream)
-
     def launch_kernel(self, task):
         """Add a Python callable kernel to this stream.
 
@@ -107,75 +96,149 @@ def cdata(self):
         """Get the underlying stream pointer (for internal use)."""
         return self._stream
 
+def stream(flags=0):
+    return Stream(flags=flags)
+
+def default_stream(device=None):
+    _lazy_init()
+    if device is None:
+        device_idx = current_device()
+    else:
+        device_idx = torch.accelerator._get_device_index(device, optional=True)
+        if device_idx < 0:
+            device_idx = current_device()
 
-class Event:
-    """Wrapper for OpenReg event."""
+    if device_idx not in _default_streams:
+        # Create default stream if it doesn't exist
+        _default_streams[device_idx] = Stream()
 
-    def __init__(self, enable_timing=False):
-        if enable_timing:
-            # orEventEnableTiming = 0x1
-            self._event = torch_openreg._C._event_create_with_flags(0x1)
-        else:
-            self._event = torch_openreg._C._event_create()
+    return _default_streams[device_idx]
 
-    def __del__(self):
-        if hasattr(self, '_event'):
-            torch_openreg._C._event_destroy(self._event)
 
-    def record(self, stream=None):
-        """Record the event in a stream."""
-        if stream is None:
-            # Use default stream (stream 0)
-            stream = Stream()
-        torch_openreg._C._event_record(self._event, stream._stream)
+def launch_kernel(tog_path, attribute_path):
+    """Launch a kernel on TOGSimulator.
 
-    def synchronize(self):
-        """Wait for the event to complete."""
-        torch_openreg._C._event_synchronize(self._event)
+    Args:
+        tog_path: Path to TOG file
+        attribute_path: Path to attribute file
 
-    def query(self):
-        """Check if the event has completed."""
-        return torch_openreg._C._event_query(self._event)
+    Returns:
+        int: The kernel ID assigned to this launch
 
-    def elapsed_time(self, start_event):
-        """Get the elapsed time between two events in milliseconds."""
-        return torch_openreg._C._event_elapsed_time(start_event._event, self._event)
+    """
+    # Get TOGSimulator instance
+    sim = get_tog_simulator()
+    if sim is None:
+        raise RuntimeError("[torch.npu] TOGSimulator is not initialized. Call torch.npu.init() first.")
 
-    @property
-    def cdata(self):
-        """Get the underlying event pointer (for internal use)."""
-        return self._event
+    device_idx = current_device()
+    stream_index, timestamp = get_launch_context()
+    # Create a task function that calls TOGSimulator.launch_kernel
+    def launch_task():
+        return sim.launch_kernel(device_idx, stream_index, tog_path, attribute_path, timestamp)
 
+    stream = default_stream()
+    stream.launch_kernel(launch_task)
 
 def synchronize():
-    """Synchronize all streams on the current device."""
+    """Synchronize all streams on the current device.
+
+    This function:
+    1. Registers TOGSimulator.device_synchronize as a task on the default stream
+    2. Calls the underlying device_synchronize to wait for all tasks to complete
+    """
+    # Get TOGSimulator instance
+    sim = get_tog_simulator()
+    if sim is not None:
+        # Get current device index
+        device_idx = current_device()
+
+        # Create a task function that calls TOGSimulator.device_synchronize
+        def sync_task():
+            return sim.device_synchronize(device_idx)
+
+        # Register as task on default stream
+        stream = default_stream()
+        stream.launch_kernel(sync_task)
+
+    # Call underlying device_synchronize to wait for all tasks to complete
     torch_openreg._C._device_synchronize()
 
+def get_tog_simulator():
+    return _tog_simulator
 
-def stream(priority=None, flags=0):
-    """Create a new stream.
+def set_tog_simulator(simulator):
+    """Set the global TOGSimulator instance.
 
     Args:
-        priority: Stream priority (optional)
-        flags: Stream flags (optional)
+        simulator: TOGSimulator instance or None
+    """
+    global _tog_simulator
+    _tog_simulator = simulator
 
-    Returns:
-        Stream: A new stream object
+def set_launch_context(stream_index=0, timestamp=0):
+    _launch_context.stream_index = stream_index
+    _launch_context.timestamp = timestamp
+
+def get_launch_context():
+    stream_index = getattr(_launch_context, 'stream_index', 0)
+    timestamp = getattr(_launch_context, 'timestamp', 0)
+    return stream_index, timestamp
+
+class launch_context:
+    """Context manager for setting launch_kernel parameters.
+
+    Args:
+        stream_index: Stream index (partition ID) to use for launch_kernel
+        timestamp: Timestamp in nanoseconds to use for launch_kernel
+
+    Example:
+        with torch.npu.launch_context(stream_index=1, timestamp=1000):
+            model(input)
     """
-    return Stream(priority=priority, flags=flags)
 
+    def __init__(self, stream_index=0, timestamp=0):
+        self.stream_index = stream_index
+        self.timestamp = timestamp
+        self.prev_stream_index = None
+        self.prev_timestamp = None
+
+    def __enter__(self):
+        # Save previous context values
+        self.prev_stream_index = getattr(_launch_context, 'stream_index', 0)
+        self.prev_timestamp = getattr(_launch_context, 'timestamp', 0)
+        # Set new context values
+        set_launch_context(self.stream_index, self.timestamp)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Restore previous context values
+        _launch_context.stream_index = self.prev_stream_index
+        _launch_context.timestamp = self.prev_timestamp
+        return False
 
-def event(enable_timing=False):
-    """Create a new event.
+def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs):
+    """Launch a compiled model on TOGSimulator.
 
     Args:
-        enable_timing: Whether to enable timing for the event
+        model: Compiled model (torch.compile())
+        *args: Model input arguments
+        stream_index: Stream index (partition ID). If None, uses context value.
+        timestamp: Timestamp in nanoseconds. If None, uses context value.
+        **kwargs: Additional keyword arguments for model execution
 
     Returns:
-        Event: A new event object
-    """
-    return Event(enable_timing=enable_timing)
+        Model output (same as calling model(*args, **kwargs))
 
+    Note:
+        This function executes the compiled model and automatically launches
+        the generated kernels with the specified stream_index and timestamp.
+        If stream_index or timestamp are not provided, values from the current
+        context (set via launch_context() or set_launch_context()) are used.
+    """
+    # Get stream_index and timestamp from parameters or context
+    with launch_context(stream_index=stream_index, timestamp=timestamp):
+        return model(*args, **kwargs)
 
 from .random import *  # noqa: F403
 from .amp import *
@@ -200,9 +263,10 @@ def event(enable_timing=False):
     "get_autocast_dtype",
     "set_autocast_dtype",
     "get_amp_supported_dtype",
-    "Stream",
-    "Event",
     "stream",
-    "event",
+    "launch_kernel",
+    "launch_model",
     "synchronize",
+    "get_tog_simulator",
+    "set_tog_simulator",
 ]
diff --git a/PyTorchSimDevice/torch_openreg/openreg/random.py b/PyTorchSimDevice/torch_openreg/openreg/random.py
index 6817bd79..3f2e99fe 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/random.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/random.py
@@ -11,6 +11,7 @@
     "manual_seed",
     "manual_seed_all",
     "initial_seed",
+    "_is_in_bad_fork",
 ]
 
 
@@ -59,3 +60,8 @@ def manual_seed_all(seed: int) -> None:
     for idx in range(device_count()):
         default_generator = torch_openreg._C._get_default_generator(idx)
         default_generator.manual_seed(seed)
+
+def _is_in_bad_fork():
+    # For NPU simulator, we don't have the same fork issues as CUDA
+    # Return False to indicate we're not in a bad fork state
+    return False
\ No newline at end of file
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 5066d214..d6b47123 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -2,6 +2,7 @@
 import re
 import shlex
 import subprocess
+import torch
 
 from torch._inductor.codecache import get_lock_dir, get_hash, write
 from torch._inductor.async_compile import AsyncCompile
@@ -144,7 +145,9 @@ def load(cls, source_code,
         key, input_path = write(source_code, "mlir", specified_dir=write_path)
         new_input_path = os.path.splitext(input_path)[0]
         raw_tog_path = new_input_path + "_tog.py"
+        tog_path = os.path.join(write_path, "tile_graph.onnx")
         sample_mlir_path = new_input_path + "_sample"
+        validation_binary_path = os.path.join(write_path, validation_binary_name)
         gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
 
         from filelock import FileLock
@@ -177,9 +180,9 @@ def load(cls, source_code,
                 val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
                                                    validation_binary_name, new_link_option)
-                target = os.path.join(write_path, validation_binary_name)
+
                 stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
-                spad_size =  val_llvm_caller.get_spad_size(target)
+                spad_size =  val_llvm_caller.get_spad_size(validation_binary_path)
                 spad_usage = stack_size + spad_size # Spad usage per lane
                 if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
                     logger.debug(
@@ -188,6 +191,10 @@ def load(cls, source_code,
                     )
                     raise SpadOverflowError()
 
+        # Skip if TOG file already exists
+        if os.path.isfile(tog_path):
+            return key
+
         # Launch tile graph generator
         gem5_sample_cmd = shlex.split(gem5_cmds[0])
         gem5_translate_cmd = shlex.split(gem5_cmds[1])
@@ -213,13 +220,10 @@ def load(cls, source_code,
             cycle_llvm_caller = MLIRKernelCallerCodeGen(False, arg_attributes, cycle_sim=True)
             cycle_llvm_caller.generate_wrapper_file(write_path, cycle_wrapper_name)
             cycle_llvm_caller.compile_wih_kernel(write_path, key + "_sample", cycle_wrapper_name, cycle_binary_name, link_option)
-            array_size = []
-            for (arg_name, arg_attribute) in arg_attributes:
-                array_size.append(str(arg_attribute[2]))
 
             # Run cyclesim
             cyclesim = CycleSimulator()
-            cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size, silent_mode=silent_mode)
+            cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
 
             # Create TOG
             w_offset, x_offset = vectorlane_size, vectorlane_size
@@ -231,7 +235,7 @@ def load(cls, source_code,
             tile_graph_generator = tog_generator(origins)
             tile_graph_generator.load_file(raw_tog_path)
             tile_graph_generator.generate_tile_graph(
-                os.path.join(write_path, "tile_graph.onnx"),
+                tog_path,
                 cycle_list=cycle_list,
                 x_offset=x_offset, # FIXME.
                 w_offset=w_offset, # FIXME.
@@ -247,25 +251,18 @@ def __init__(self):
         self.cycle_binary_name = "cycle_binary"
 
     def mlir(self, source_code, arg_attributes=[], vectorlane_size=16, tile_size=[], spad_info=None, origins=None, silent_mode=False, **kwargs):
+        autotune = kwargs.get('autotune', False)
         def task():
             key = MLIRCodeCache.load(source_code,
                                           valdiation_wrapper_name=self.validation_binary_name,
                                           validation_binary_name=self.validation_binary_name,
                                           arg_attributes=arg_attributes, vectorlane_size=vectorlane_size,
                                           tile_size=tile_size, spad_info=spad_info, origins=origins,
-                                          silent_mode=silent_mode, **kwargs)
+                                          silent_mode=autotune, **kwargs)
             return key
         future = self.submit(task)
-        if "loop_size" in kwargs:
-            loop_size = kwargs["loop_size"]
-        else:
-            loop_size = []
-
-        # In the autotune mode, skip validation to speed up
-        autotune = kwargs.get('autotune', False)
-        validate = kwargs.get('validate', False) if not autotune else False
 
-        def dummy_simulator(*args, **kwargs):
+        def run_kernel_simulation(*args, **kwargs):
             # Wait for compilation
             key = future.result()
             from filelock import FileLock
@@ -277,47 +274,27 @@ def dummy_simulator(*args, **kwargs):
                 # Dump arguments and meta data
                 dump_metadata(args, arg_attributes, result_path)
                 runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
-                if not autotune and (extension_config.pytorchsim_functional_mode or validate):
+                if extension_config.pytorchsim_functional_mode and not autotune:
                     funcsim = FunctionalSimulator(result_path, key)
                     funcsim.run_spike(args, arg_attributes,
                                     runtime_path, self.validation_binary_name,
                                     vectorlane_size=vectorlane_size, spad_info=spad_info,
-                                    silent_mode=silent_mode)
+                                    silent_mode=autotune)
+
                 if not extension_config.pytorchsim_timing_mode:
                     return [float("inf")]
 
+                # Prepare arguments for launch kernel
                 onnx_path = os.path.join(result_path, "tile_graph.onnx")
                 attribute_path = os.path.join(runtime_path, "attribute")
-                togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
-                TOGSim = TOGSimulator(togsim_path, extension_config.CONFIG_TOGSIM_CONFIG)
-                TOGSim.vectorlane_size = vectorlane_size
-                attribute_path = TOGSim.create_attribute_file(attribute_path, args, loop_size=loop_size)
-                result_path = TOGSim.simulation(onnx_path, attribute_path, silent_mode=silent_mode, autotune_mode=autotune)
-                result = TOGSimulator.get_result_from_file(result_path)
-                return result
 
-        def dryrun_simulator(*args, **kwargs):
-            key = future.result()
-            from filelock import FileLock
-            lock_dir = get_lock_dir()
-            lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
-            with lock:
-                # Run simulator pass
-                result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key))
-                # Dump arguments and meta data
-                dump_metadata(args, arg_attributes, result_path)
-                runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
-
-                # Todo. Support valude dependent mode for graph mode
-                if False: # extension_config.pytorchsim_functional_mode:
-                    funcsim = FunctionalSimulator(result_path, key)
-                    funcsim.run_spike(args, arg_attributes,
-                                    runtime_path, self.validation_binary_name,
-                                    vectorlane_size=vectorlane_size, spad_info=spad_info)
-            return result_path, runtime_path, None
-
-        is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) and not autotune
-        target_simulator = dryrun_simulator if is_dryrun else dummy_simulator
-        target_simulator.arg_attributes = arg_attributes
-        target_simulator.future = future
-        return target_simulator
+                TOGSim = torch.npu.get_tog_simulator()
+                if not autotune and TOGSim is not None:
+                    attribute_path = TOGSim.create_attribute_file(attribute_path, args)
+                    torch.npu.launch_kernel(onnx_path, attribute_path)
+                    result = None # No result for non-autotune mode
+                else:
+                    result_path = TOGSimulator.run_standalone(onnx_path, attribute_path, autotune_mode=autotune)
+                    result = TOGSimulator.get_result_from_file(result_path)
+                return result
+        return run_kernel_simulation
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index b0bcac7f..eff6f573 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -106,9 +106,6 @@ def __getattr__(name):
     if name == "CONFIG_TORCHSIM_LOG_PATH":
         return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
 
-    if name == "CONFIG_TOGSIM_EAGER_MODE":
-        return int(os.environ.get("TOGSIM_EAGER_MODE", default=False))
-
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
     if module_path is None:
diff --git a/PyTorchSimFrontend/extension_op.py b/PyTorchSimFrontend/extension_op.py
index 18bf65c3..e6351101 100644
--- a/PyTorchSimFrontend/extension_op.py
+++ b/PyTorchSimFrontend/extension_op.py
@@ -46,9 +46,6 @@
 
 class MLIRExternKernelChoice(ExternKernelChoice):
     def call_name(self):
-        is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False))
-        if is_dryrun:
-            return f"yield from sparse_mm_dummy_stonne_outer"
         return f"torch.ops.extension_op.{self.name}"
 
 custom_lib = torch.library.Library("extension_op", "DEF")
@@ -275,10 +272,8 @@ def prepare_outer_product_matrix(a, b, out):
 def sparse_mm_stonne_outer(a, b, out):
     onnx_path, attribute_path, c_result_path = prepare_outer_product_matrix(a, b, out)
 
-    togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
     stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_single_c1_simple_noc.yml'
-    TOGSim = TOGSimulator(togsim_path, stonne_config_path)
-    result_path = TOGSim.simulation(onnx_path)
+    result_path = TOGSimulator.run_standalone(onnx_path, config_path=stonne_config_path)
     TOGSimulator.get_result_from_file(result_path)
 
     # Load result data
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index 138bec50..4503584c 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -61,12 +61,24 @@ def make_run_fn(
         # Check already cached result.
         write_path = get_write_path(self.source_code)
         key,  _ = write(self.source_code, "mlir", specified_dir=write_path)
-        result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result/0")
-        if os.path.exists(result_path):
-            result = TOGSimulator.get_result_from_file(result_path)
-            def cached_run_fn(*args, **kwargs):
-                return result
-            return cached_run_fn
+        result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result")
+
+        # Find the most recent .log file in the result directory
+        if os.path.exists(result_dir) and os.path.isdir(result_dir):
+            log_files = [f for f in os.listdir(result_dir) if f.endswith('.log')]
+            if log_files:
+                # Sort by modification time, get the most recent file
+                log_files_with_time = [
+                    (f, os.path.getmtime(os.path.join(result_dir, f)))
+                    for f in log_files
+                ]
+                log_files_with_time.sort(key=lambda x: x[1], reverse=True)
+                latest_log_file = log_files_with_time[0][0]
+                result_path = os.path.join(result_dir, latest_log_file)
+                result = TOGSimulator.get_result_from_file(result_path)
+                def cached_run_fn(*args, **kwargs):
+                    return result
+                return cached_run_fn
 
         # Run a candidate code
         run_method = custom_async_compile.mlir(
@@ -74,7 +86,7 @@ def cached_run_fn(*args, **kwargs):
             loop_size=None, spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
             origins="Unknown", silent_mode=True,
-            validate=self.extra_args['validate'], autotune=self.extra_args['autotune'])
+            autotune=self.extra_args['autotune'])
 
         args = [
             tensor
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index c5da1f56..a60c706e 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -99,7 +99,7 @@ def write_header(self):
 
                 from torch import device, empty, empty_strided
                 from {extension_codecache.__name__} import CustomAsyncCompile
-                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, CONFIG_TOGSIM_EAGER_MODE, setup_logger
+                from PyTorchSimFrontend.extension_config import CONFIG_SRAM_BUFFER_PLAN, setup_logger
                 from Simulator.simulator import TOGSimulator
                 from PyTorchSimFrontend.extension_op import sparse_mm_dummy_stonne_outer
                 from torch._inductor.select_algorithm import extern_kernels
@@ -1016,7 +1016,6 @@ def run_bench(self, nodes, kernel_name, src_code):
                 "spad_info": self.spad_info,
                 "vlen" : self.vlen,
                 "arg_attributes" : arg_attributes,
-                "validate" : extension_config.pytorchsim_functional_mode,
                 "autotune" : True,
             },
             source_code=src_code,
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index a1a9d935..1aa99d14 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -82,7 +82,6 @@ def outer_func_render(self, kernel_name, input_args):
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
 
-        eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False))
         options = dict(
             kernel=self.kernel,
             KERNEL_NAME=kernel_name,
@@ -94,7 +93,6 @@ def outer_func_render(self, kernel_name, input_args):
             PADDING_H=self.padding[0],
             PADDING_W=self.padding[1],
             VALIDATION_MODE=extension_config.pytorchsim_functional_mode,
-            TOGSIM_EAGER_MODE=eager_mode,
             input_reorder=self.input_reorder
         )
         code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index 0bf01421..051d7a0e 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -120,9 +120,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
 
     # Launch kernel
     {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
-    {%- if TOGSIM_EAGER_MODE %}
-    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
-    {%- endif %}
 """
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__(input_nodes, layout, input_reorder, **kwargs)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
index 92b9a525..c742b3b2 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -121,9 +121,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
 
     # Launch kernel
     {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
-    {%- if TOGSIM_EAGER_MODE %}
-    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
-    {%- endif %}
 """
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__(input_nodes, layout, input_reorder, **kwargs)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index ab124852..07211bb4 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -121,9 +121,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
 
     # Launch kernel
     {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
-    {%- if TOGSIM_EAGER_MODE %}
-    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
-    {%- endif %}
 """
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__(input_nodes, layout, input_reorder, **kwargs)
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 66aa0a27..46a7f9bf 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -125,9 +125,6 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
 
     # Launch kernel
     {{ KERNEL_NAME }}<DEF_CONV_WRAPPER>
-    {%- if TOGSIM_EAGER_MODE %}
-    yield ({{KERNEL_NAME}}, <DEF_CONV_WRAPPER>)
-    {%- endif %}
 """
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__(input_nodes, layout, input_reorder, **kwargs)
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index faf5e69c..5305cbb7 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -280,11 +280,6 @@ def codegen_node(self, _node):
         ex_kernel.call_kernel(kernel_name)
         _, args, _, _ = ex_kernel.args.mlir_argdefs()
         args = ", ".join(args)
-        eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False))
-        if (eager_mode):
-            V.graph.wrapper_code.writeline(
-                f"yield ({kernel_name}, ({args}))"
-            )
         self._set_flush_status(True)
 
     def ready_to_flush(self):
@@ -344,13 +339,6 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes):
         kernel.call_kernel(kernel_name)
         V.graph.removed_buffers |= kernel.removed_buffers
         _, args, _, _ = self.kernel_group.args.mlir_argdefs()
-        eager_mode = int(os.environ.get('TOGSIM_EAGER_MODE', default=False))
-        if (eager_mode):
-            target_kernel_name = kernel_name if kernel.outer_func_name is None else kernel.outer_func_name + f"_{len(args)}"
-            args = ", ".join(args)
-            V.graph.wrapper_code.writeline(
-                f"yield ({target_kernel_name}, ({args}))"
-            )
         self._set_flush_status(True)
 
     def enter_context_fixed(self, node):
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index cdcdd2a7..77e218ea 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -343,8 +343,7 @@ def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE,
             self.request_queue.append([])
         self.finish_queue : List[Request] = []
 
-        togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
-        self.tog_simulator = TOGSimulator(togsim_path, togsim_config)
+        self.tog_simulator = TOGSimulator(togsim_config)
         if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0:
             # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).
             logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. ")
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 96a1fc86..2771d03c 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -146,7 +146,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
         if not silent_mode:
             logger.debug(f"[Spike] cmd> {run}")
-        logger.info("[Spike] Running Spike simulator")
+            logger.info("[Spike] Running Spike simulator")
         run_cmd = shlex.split(run)
         try:
             stdout_setting = subprocess.DEVNULL if silent_mode else None
@@ -194,14 +194,12 @@ class CycleSimulator():
     def __init__(self) -> None:
         pass
 
-    def compile_and_simulate(self, target_binary, array_size, vectorlane_size, silent_mode=False):
+    def compile_and_simulate(self, target_binary, vectorlane_size, silent_mode=False):
         dir_path = os.path.join(os.path.dirname(target_binary), "m5out")
         gem5_script_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "gem5_script/script_systolic.py")
         gem5_cmd = [extension_config.CONFIG_GEM5_PATH, "-r", "--stdout-file=sto.log", "-d", dir_path, gem5_script_path, "-c", target_binary, "--vlane", str(vectorlane_size)]
 
-        is_dryrun = int(os.environ.get('TOGSIM_EAGER_MODE', default=False)) or silent_mode
-
-        if not is_dryrun:
+        if not silent_mode:
             logger.debug(f"[Gem5] cmd> {' '.join(gem5_cmd)}")
             logger.info("[Gem5] Gem5 simulation started")
 
@@ -224,65 +222,55 @@ class TOGSimulator():
     TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH"
     FINISH_STR = "Simulation finished"
     ALLOC_POOL = dict() # For eagermode buffer plan
-    def __init__(self, togsim_path, config_path, vectorlane_size=-1) -> None:
+    def __init__(self, config_path=None, togsim_path=None) -> None:
+        if config_path is None:
+            config_path = extension_config.CONFIG_TOGSIM_CONFIG
+        if togsim_path is None:
+            togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
+
         self.base_dir = togsim_path
         self.config_path = config_path
         self.config_yaml = self.load_yaml(self.config_path)
         self.process = None
-        self.vectorlane_size = vectorlane_size
-
-    def get_togsim_command(self):
-        bin = os.path.join(self.base_dir, "build/bin/Simulator")
-        config = os.path.join(self.base_dir, self.config_path)
-        cmd = f"{bin} --config {config}"
-        return cmd
+        self._next_kernel_id = 0  # Auto-incrementing kernel ID
 
-    def simulation(self, model_path, attribute_path="", silent_mode=False, autotune_mode=False):
-        cmd = f"{self.get_togsim_command()} --models_list {model_path}"
-        if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
-            cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
-        if attribute_path:
-            cmd = f"{cmd} --attributes_list {attribute_path}"
-        if not silent_mode:
-            logger.debug(f"[TOGSim] cmd> {cmd}")
-            logger.info("[TOGSim] TOGSim simulation started")
+        # Create FIFOs for command and event communication
+        self.fifo_dir = os.path.join("/tmp", f"togsim_fifo_{os.getpid()}")
+        os.makedirs(self.fifo_dir, exist_ok=True)
+        self.trace_file_path = os.path.join(self.fifo_dir, "cmd_fifo")
+        self.trace_log = ""
 
-        try:
-            with ProgressBar("[TOGSim] Running simulation", silent_mode=silent_mode):
-                result = subprocess.check_output(shlex.split(cmd))
-        except subprocess.CalledProcessError as e:
-            logger.error(f"[TOGSim] Command failed with exit code {e.returncode}")
-            logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
-            assert 0
+        # Create FIFOs if they don't exist
+        if os.path.exists(self.trace_file_path):
+            os.remove(self.trace_file_path)
+        os.mkfifo(self.trace_file_path)
 
-        # Separate Autotune logs
-        if autotune_mode:
-            base_dir = Path(model_path).parent / "togsim_result"
-            base_dir.mkdir(parents=True, exist_ok=True)
-            file_name = f"{len(list(base_dir.iterdir()))}.log"
-        else:
-            base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH)
-            unique_id = uuid.uuid4().hex[:8]
-            timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-            file_name = f"{unique_id}_{timestamp}.log"
+        # Start TOGSim process
+        self._start_process()
 
-        base_dir.mkdir(parents=True, exist_ok=True)
-        result_path = base_dir / file_name
+        # Open trace file FIFO once and keep it open (after process starts)
+        self._trace_file_lock = threading.Lock()
+        try:
+            self._trace_file_handle = open(self.trace_file_path, 'w')
+        except IOError as e:
+            logger.error(f"[TOGSim] Failed to open trace file: {e}")
+            raise RuntimeError(f"Failed to open trace file: {e}")
 
-        # Prevent race condition
-        with open(result_path, "w") as f:
-            f.write(result.decode())
-            f.flush()
-            os.fsync(f.fileno())
+    def __enter__(self):
+        """Context manager entry."""
+        # Set this simulator instance as the global TOGSimulator
+        self.old_tog_simulator = torch.npu.get_tog_simulator()
+        torch.npu.set_tog_simulator(self)
+        return self
 
-        if not silent_mode:
-            import logging as _logging
-            model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " "
-            logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
-        return result_path
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - automatically cleanup."""
+        # Reset global TOGSimulator to None
+        self.until()
+        torch.npu.set_tog_simulator(self.old_tog_simulator)
 
-    def interactive_simulation(self):
-        cmd = f"{self.get_togsim_command()} --mode interactive"
+    def _start_process(self):
+        cmd = f"{self.get_togsim_command(self.config_path, self.base_dir)} --models_list {self.trace_file_path}"
         if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
             cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
 
@@ -290,69 +278,144 @@ def interactive_simulation(self):
         if self.process is None:
             self.process = subprocess.Popen(
                 shlex.split(cmd),
-                stdin=subprocess.PIPE,
-                stderr=subprocess.PIPE,
+                #stdout=subprocess.PIPE,
+                #stderr=subprocess.PIPE,
                 universal_newlines=True
             )
         else:
             logger.warning("[TOGSim] Simulator is already running.")
 
-    def stop(self):
-        if self.process:
-            self.process.terminate()
-            self.process.wait()
-            self.process = None
-            logger.info("[TOGSim] Simulator stopped.")
+    def _cleanup_fifos(self):
+        """Clean up FIFO files"""
+        try:
+            if os.path.exists(self.trace_file_path):
+                os.remove(self.trace_file_path)
+            if os.path.exists(self.fifo_dir):
+                os.rmdir(self.fifo_dir)
+        except OSError as e:
+            logger.warning(f"[TOGSim] Failed to clean up FIFOs: {e}")
+
+    def _send_command(self, command_type, device_index, stream_index, tog_path="", attribute_path="", timestamp=0):
+        """
+        Internal method to send a command to TOGSim via FIFO.
+
+        Args:
+            command_type: Type of command ("LAUNCH_KERNEL" or "DEVICE_SYNC")
+            device_index: Device index
+            stream_index: Stream index
+            tog_path: Path to TOG file (ONNX model) - empty for DEVICE_SYNC
+            attribute_path: Path to attribute file - empty for DEVICE_SYNC
+            timestamp: Timestamp in nanoseconds (default: 0)
+
+        Returns:
+            int: The kernel ID assigned to this command
+        """
+        if self.process is None:
+            raise RuntimeError("[TOGSim] Simulator process is not running")
+
+        if self.process.poll() is not None:
+            raise RuntimeError("[TOGSim] Simulator process has terminated")
+
+        # Get and increment kernel ID
+        kernel_id = self._next_kernel_id
+        self._next_kernel_id += 1
+
+        # Format command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp
+        command = f"{command_type},{kernel_id},{device_index},{stream_index},{tog_path},{attribute_path},{timestamp}"
+
+        with self._trace_file_lock:
+            # Write command to TOGSim
+            try:
+                self._trace_file_handle.write(command + '\n')
+                self._trace_file_handle.flush()
+                self.trace_log += command + '\n'
+                logger.debug(f"[TOGSim] Sent command: {command}")
+            except IOError as e:
+                logger.error(f"[TOGSim] Failed to write to trace file: {e}")
+                raise RuntimeError(f"Failed to send command to TOGSim: {e}")
+        return kernel_id
+
+    def until(self):
+        # Make sure that all kernels in the stream are finished
+        torch.npu.synchronize()
+
+        # Close trace file handle if open
+        if self._trace_file_handle is not None:
+            try:
+                self._trace_file_handle.close()
+            except:
+                pass
+            self._trace_file_handle = None
 
-    def wait(self):
         if self.process:
-            logger.info("[TOGSim] Waiting for simulation to complete...")
-            self.quit()
             self.process.wait()
+
+            # Read output streams
+            stdout_output = ""
+            stderr_output = ""
+            if self.process.stdout:
+                stdout_output = self.process.stdout.read()
+            if self.process.stderr:
+                stderr_output = self.process.stderr.read()
+
+            # Print stderr immediately if there's any error output
+            if stderr_output:
+                sys.stderr.write(stderr_output)
+                sys.stderr.flush()
+
+            # Save stdout to result file
+            if stdout_output:
+                result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH
+                os.makedirs(result_path, exist_ok=True)
+                file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".log"
+                result_path = os.path.join(result_path, file_name)
+                with open(result_path, "w") as f:
+                    f.write(stdout_output)
+                logger.info(f'[TOGSim] Simulation log is stored to "{result_path}"')
             self.process = None
-            logger.info("[TOGSim] Simulation completed.")
 
-    def send_command(self, command):
-        if self.process:
-            try:
-                logger.debug(command)
-                self.process.stdin.write(command + '\n')
-                self.process.stdin.flush()
-                ret = self.process.stderr.readline().strip()
-                return ret
-            except BrokenPipeError:
-                err = self.process.stderr.readlines()
-                for line in err:
-                    logger.error(line.strip())
-                self.process = None
-                exit(1)
-        else:
-            logger.warning("Simulator is not running.")
-            return None
-
-    def launch(self, onnx_path, attribute_path, arrival_time=0, partion_id=0):
-        command = f"launch {self.config_path} {onnx_path} {attribute_path} {arrival_time} {partion_id}"
-        ret = self.send_command(command)
-        return 0
-
-    def cycle(self):
-        ret = self.send_command("cycle")
-        return int(ret.split(" ")[-1])
-
-    def until(self, until_cycle):
-        command = f"until {until_cycle}"
-        ret = self.send_command(command)
-        bitmap = int(ret.split(" ")[-1])
-        indices = []
-        for i in range(64):
-            if (bitmap >> i) & 1:
-                indices.append(i)
-        return indices
-
-    def quit(self):
-        command = "quit"
-        ret = self.send_command(command)
-        return
+        # Save trace_log with same name but .trace extension
+        if self.trace_log:
+            result_path = extension_config.CONFIG_TORCHSIM_LOG_PATH
+            os.makedirs(result_path, exist_ok=True)
+            file_name = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + ".trace"
+            trace_path = os.path.join(result_path, file_name)
+            with open(trace_path, "w") as f:
+                f.write(self.trace_log)
+            logger.info(f'[TOGSim] Trace log is stored to "{trace_path}"')
+
+        # Clean up FIFOs
+        self._cleanup_fifos()
+
+    def launch_kernel(self, device_index, stream_index, tog_path, attribute_path, timestamp=0):
+        """
+        Launch a kernel via FIFO communication.
+
+        Args:
+            device_index: Device index
+            stream_index: Stream index
+            tog_path: Path to TOG file (ONNX model)
+            attribute_path: Path to attribute file
+            timestamp: Timestamp in nanoseconds (default: 0)
+
+        Returns:
+            int: The kernel ID assigned to this launch
+        """
+        return self._send_command("LAUNCH_KERNEL", device_index, stream_index, tog_path, attribute_path, timestamp)
+
+    def device_synchronize(self, device_index):
+        """
+        Synchronize all streams on a device via FIFO communication.
+
+        Args:
+            device_index: Device index to synchronize
+            timestamp: Timestamp in nanoseconds (default: 0)
+
+        Returns:
+            int: The command ID assigned to this synchronization
+        """
+        # For device_synchronize, stream_index is not meaningful, use 0
+        return self._send_command("DEVICE_SYNC", device_index, 0, "", "", 0)
 
     @classmethod
     def sram_alloc(cls, buf_name, addr_range):
@@ -404,22 +467,83 @@ def get_core_freq(self):
         else:
             raise KeyError("Key 'core_freq' not found in JSON.")
 
-    def find_zero_sub_tensors(self, tensor):
-        x, y = self.vectorlane_size, self.vectorlane_size
-        zero_positions = {}
+    @staticmethod
+    def get_togsim_command(config_path, togsim_path=None):
+        if togsim_path is None:
+            togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
+        bin = os.path.join(togsim_path, "build/bin/Simulator")
+        config = os.path.join(togsim_path, config_path)
+        cmd = f"{bin} --config {config}"
+        return cmd
+
+    @staticmethod
+    def run_standalone(model_path, attribute_path="", autotune_mode=False, config_path=None, togsim_path=None):
+        """
+        Run a single kernel simulation in standalone mode.
+        This method starts a new TOGSim process, runs the kernel, and waits for completion.
+        For streaming multiple kernels, use launch_kernel() instead.
+
+        Args:
+            model_path: Path to TOG file (ONNX model)
+            attribute_path: Path to attribute file
+            autotune_mode: If True, run in autotune mode (silent)
+            config_path: Path to TOGSim config file (required)
+            togsim_path: Path to TOGSim directory (optional, defaults to CONFIG_TORCHSIM_DIR/TOGSim)
+
+        Returns:
+            Path to the simulation result log file
+        """
+        if config_path is None:
+            config_path = extension_config.CONFIG_TOGSIM_CONFIG
+        if togsim_path is None:
+            togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
+
+        # Create result path with appropriate filename
+        if autotune_mode:
+            base_dir = Path(model_path).parent / "togsim_result"
+        else:
+            base_dir = Path(extension_config.CONFIG_TORCHSIM_LOG_PATH)
+
+        base_dir.mkdir(parents=True, exist_ok=True)
+        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+        file_name = f"{timestamp}_{uuid.uuid4().hex[:8]}"
+        result_path = base_dir / f"{file_name}.log"
+        trace_file_path = base_dir / f"{file_name}.trace"
+
+        # Create trace file in result directory
+        kernel_id, device_index, stream_index, timestamp = 0, 0, 0, 0
+        command = f"LAUNCH_KERNEL,{kernel_id},{device_index},{stream_index},{model_path},{attribute_path},{timestamp}\n"
+        with open(trace_file_path, 'w') as trace_file:
+            trace_file.write(command)
+            trace_file.flush()
+            os.fsync(trace_file.fileno())
+
+        try:
+            cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}"
+            if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
+                cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
+
+            if not autotune_mode:
+                logger.debug(f"[TOGSim] cmd> {cmd}")
+                logger.info("[TOGSim] TOGSim simulation started")
+            with ProgressBar("[TOGSim] Running simulation", silent_mode=autotune_mode):
+                result = subprocess.check_output(shlex.split(cmd))
+        except subprocess.CalledProcessError as e:
+            logger.error(f"[TOGSim] Command failed with exit code {e.returncode}")
+            logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
+            assert 0
 
-        # Need to set vectorlane size
-        if self.vectorlane_size == -1:
-            return zero_positions
+        # Prevent race condition
+        with open(result_path, "w") as f:
+            f.write(result.decode())
+            f.flush()
+            os.fsync(f.fileno())
 
-        for i in range(0, tensor.shape[0], y):
-            for j in range(0, tensor.shape[1], x):
-                sub_tensor = tensor[i:i + y, j:j + x]
-                if np.all(sub_tensor == 0):
-                    if i not in zero_positions:
-                        zero_positions[i] = {}
-                    zero_positions[i][j] = 0 # i pos : j pos : 0
-        return zero_positions
+        if not autotune_mode:
+            import logging as _logging
+            model_path_log = f' of "{model_path}" ' if logger.isEnabledFor(_logging.DEBUG) else " "
+            logger.info(f'[TOGSim] Simulation log{model_path_log}is stored to "{result_path}"')
+        return result_path
 
     @staticmethod
     def get_result_from_file(result_path):
@@ -482,6 +606,24 @@ def get_result_from_file(result_path):
         return core_metrics, dram_channel_bw, avg_dram_bw, simulation_time, total_cycle
 
 if __name__ == "__main__":
-    sim = TOGSimulator("/workspace/PyTorchSim/TOGSim", "/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml")
-    sim.interactive_simulation()
-    sim.until(4000)
\ No newline at end of file
+    # Example paths (adjust these to your actual test files)
+    test_tog_path = "/workspace/PyTorchSim/outputs/6vxl6mwzhfl/tile_graph.onnx"
+    test_attribute_path = "/workspace/PyTorchSim/outputs/6vxl6mwzhfl/runtime_0001/attribute/0"
+
+    # Test: Launch multiple kernels
+    sim = TOGSimulator(config_path="/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml")
+    with sim:
+        try:
+            id1 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+            id2 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+            id3 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+        except Exception as e:
+            print(f"Error during kernel launch: {e}")
+
+        try:
+            id2 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+            id1 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+            id3 = torch.npu.launch_kernel(tog_path=test_tog_path, attribute_path=test_attribute_path)
+        except Exception as e:
+            print(f"Error during kernel launch: {e}")
+    print(sim.trace_log)
\ No newline at end of file
diff --git a/TOGSim/include/TileGraph.h b/TOGSim/include/TileGraph.h
index 990c107d..4cad9355 100644
--- a/TOGSim/include/TileGraph.h
+++ b/TOGSim/include/TileGraph.h
@@ -67,6 +67,10 @@ class TileGraph {
   std::string get_name() { return _name; }
   void set_arrival_time(cycle_type arrival_time) { _arrival_time = arrival_time; }
   cycle_type get_arrival_time() { return _arrival_time; }
+  void set_kernel_id(unsigned int kernel_id) { _kernel_id = kernel_id; }
+  unsigned int get_kernel_id() { return _kernel_id; }
+  void set_start_time(cycle_type start_time) { _start_time = start_time; }
+  cycle_type get_start_time() { return _start_time; }
   void init_cache_plan(IntervalTree<unsigned long long, int>::interval_vector it) {
     _cache_plan = std::make_shared<IntervalTree<unsigned long long, int>>(std::move(it));
   }
@@ -130,6 +134,7 @@ class TileGraph {
   int _vec_index=0;
   std::string _path;
   std::string _name = "?";
+  unsigned int _kernel_id = 0;
   std::vector<std::string> _loop_index_list;
   std::vector<std::tuple<int, int, int>> _ranges;
   std::vector<std::shared_ptr<TileSubGraph>> _subgraph_vec;
@@ -137,5 +142,6 @@ class TileGraph {
   std::map<int, std::map<int, std::shared_ptr<TileSubGraph>>> _cpu_graph_map;
   std::shared_ptr<IntervalTree<unsigned long long, int>> _cache_plan;
   cycle_type _arrival_time;
+  cycle_type _start_time = 0;  // First tile issue time, 0 means not started yet
   static std::shared_ptr<Tile> null_tile;
 };
\ No newline at end of file
diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h
index 9c176966..f067fb2d 100644
--- a/TOGSim/include/TileGraphParser.h
+++ b/TOGSim/include/TileGraphParser.h
@@ -65,7 +65,7 @@ class TileNode {
 
 class TileGraphParser {
  public:
-  TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path);
+  TileGraphParser(std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml);
   std::shared_ptr<TileNode> get_top_loop();
   std::unique_ptr<TileGraph>& get_tile_graph() { return _tile_graph; }
   addr_type lookup(std::string key);
diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index 857923c5..b5b9c778 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -170,55 +170,8 @@ void Simulator::icnt_cycle() {
   _icnt->cycle();
 }
 
-int Simulator::until(cycle_type until_cycle) {
-  std::vector<bool> partition_scheudler_status;
-  for (auto &scheduler : _partition_scheduler)
-    partition_scheudler_status.push_back(scheduler->empty());
-
-  while (until_cycle == -1 || _core_cycles < until_cycle) {
-    set_cycle_mask();
-    // Core Cycle
-    if (IS_CORE_CYCLE(_cycle_mask))
-      core_cycle();
-
-    // DRAM cycle
-    if (IS_DRAM_CYCLE(_cycle_mask))
-      dram_cycle();
-
-    // Interconnect cycle
-    if (IS_ICNT_CYCLE(_cycle_mask))
-      icnt_cycle();
-
-    // Check if core status has changed
-    if (_core_cycles % 10 == 0) {
-      int bitmap = 0;
-      for (int i=0; i<_partition_scheduler.size(); i++) {
-        /* Skip this */
-        if (partition_scheudler_status.at(i))
-          continue;
-
-        if (_partition_scheduler.at(i)->empty()) {
-          bitmap |= (1 << i);
-        }
-      }
-      if (bitmap)
-        return bitmap;
-    }
-  }
-  int bitmap = 0;
-  for (int i=0; i<_partition_scheduler.size(); i++) {
-    /* Skip this */
-    if (partition_scheudler_status.at(i))
-      continue;
-
-    if (_partition_scheduler.at(i)->empty())
-      bitmap |= (1ULL << i);
-  }
-  return bitmap;
-}
-
 void Simulator::cycle() {
-  while (running()) {
+  while (running() || _core_cycles < 1) {
     set_cycle_mask();
     // Core Cycle
     if (IS_CORE_CYCLE(_cycle_mask))
@@ -232,7 +185,6 @@ void Simulator::cycle() {
     if (IS_ICNT_CYCLE(_cycle_mask))
       icnt_cycle();
   }
-  spdlog::info("Simulation finished");
   for (auto &core: _cores) {
     core->check_tag();
   }
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 515f6247..fd629f8a 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -685,9 +685,9 @@ void TileLoopNode::print_node() {
   spdlog::debug("{} stride: {} ", spaces, _stride);
 }
 
-TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, std::string config_path) {
+TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml) {
   loadConfig(attribute_path, _attribute_config);
-  loadConfig(config_path, _config_yaml);
+  _config_yaml = config_yaml;  // Use the pre-loaded config
   _attribute_path = attribute_path;
 
   if (!std::filesystem::exists(onnx_path)) {
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index bee1b45f..44fb5612 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -1,6 +1,9 @@
 #include <fstream>
 #include <chrono>
 #include <filesystem>
+#include <sstream>
+#include <thread>
+#include <atomic>
 
 #include "Simulator.h"
 #include "TileGraphParser.h"
@@ -9,82 +12,78 @@
 namespace fs = std::filesystem;
 namespace po = boost::program_options;
 
-const char* env_value = std::getenv("TOGSIM_EAGER_MODE");
-bool isDryRun = (env_value != nullptr && std::string(env_value) == "1");
 
-void launchKernel(Simulator* simulator, std::string onnx_path, std::string attribute_path, std::string config_path, cycle_type request_time=0, int partiton_id=0) {
-  auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_path);
+void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partiton_id=0, int device_id=0) {
+  auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml);
   std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
-  spdlog::info("[Scheduler {}] Register graph path: {} operation: {} at {}", partiton_id, onnx_path, tile_graph->get_name(), simulator->get_core_cycle());
-
+  tile_graph->set_kernel_id(kernel_id);
+  spdlog::info("[Scheduler {}] Enqueued kernel id: {} tog: {} operation: {} request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
   simulator->schedule_graph(partiton_id, std::move(tile_graph));
 }
 
-Simulator* create_simulator(std::string config_path) {
-  YAML::Node config_yaml;
-  if (!loadConfig(config_path, config_yaml))
-    exit(1);
-  SimulationConfig config = initialize_config(config_yaml);
-
-  auto simulator = new Simulator(config);
-  return simulator;
-}
+void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) {
+  // Open trace file (can be FIFO or regular file)
+  std::ifstream trace_file;
+  trace_file.open(trace_file_path);
+  if (!trace_file.is_open()) {
+    spdlog::error("[TOGSim] Failed to open trace file: {}", trace_file_path);
+    return;
+  }
+  spdlog::info("[TOGSim] Reading from trace file: {}", trace_file_path);
 
-int until(Simulator *simulator, cycle_type until_cycle) {
-  return simulator->until(until_cycle);
-}
+  // Read all available commands and process them
+  std::string line;
+  while (std::getline(trace_file, line)) {
+    if (line.empty()) {
+      continue;
+    }
 
-void interactive_mode(Simulator* simulator) {
-  std::string command;
+    // Parse command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp
+    std::istringstream iss(line);
+    std::string token;
+    std::vector<std::string> tokens;
 
-  std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> ";
-  while (std::getline(std::cin, command)) {
+    while (std::getline(iss, token, ',')) {
+      tokens.push_back(token);
+    }
 
-    std::istringstream iss(command);
-    std::string token;
-    // Parse the first part of the command (e.g., "launch", "until", "quit")
-    iss >> token;
-    if (token == "launch") {
-      std::string onnx_path, attribute_path, config_path;
-      cycle_type request_time = 0;
-      int partition_id = 0;
-      iss >> config_path >> onnx_path >> attribute_path >> request_time >> partition_id;
-
-      // Check if both paths were provided
-      if (onnx_path.empty() || attribute_path.empty()) {
-        spdlog::error("Error: Please provide both ONNX path and Attribute path in the format: launch onnx/path attribute/path");
-      } else {
-        launchKernel(simulator, onnx_path, attribute_path, config_path, request_time, partition_id);
-        std::cerr << "launch done" << std::endl;
-      }
-    } else if (token == "until") {
-      cycle_type until_cycle;
-      iss >> until_cycle;
-      int reason;
+    if (tokens.size() != 7) {
+      spdlog::error("[TOGSim] Invalid command format. Expected: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp. Got: {} ({} tokens)", line, tokens.size());
+      continue;
+    }
 
-      if (iss.fail()) {
-        spdlog::error("Error: Please provide a valid cycle number after 'until'");
+    std::string command_type = tokens[0];
+    unsigned int kernel_id = std::stoul(tokens[1]);
+    int device_index = std::stoi(tokens[2]);
+    int stream_index = std::stoi(tokens[3]);
+    std::string tog_path = tokens[4];
+    std::string attribute_path = tokens[5];
+    int timestamp = std::stoi(tokens[6]);
+    // timestamp (tokens[6]) is available but not used in current implementation
+
+    try {
+      if (command_type == "LAUNCH_KERNEL") {
+        launchKernel(simulator, kernel_id, tog_path, attribute_path, config_yaml, timestamp, stream_index, device_index);
+      } else if (command_type == "DEVICE_SYNC") {
+        simulator->cycle();
+        spdlog::info("[Device {}] Device synchronization completed", device_index);
       } else {
-        reason = simulator->until(until_cycle);
-        std::cerr << " Until finished: " << reason << std::endl;
+        spdlog::error("[TOGSim] Unknown command type: {}", command_type);
       }
-    } else if (token == "cycle") {
-      cycle_type current_cycle = simulator->get_core_cycle();
-      std::cerr << "Current cycle: " << current_cycle << std::endl;
-    }else if (token == "quit") {
-      std::cerr << "Quit" << std::endl;
-      break;
-    } else {
-      spdlog::error("Error: unknown command {} Available commands are: launch, until, quit.", token);
+    } catch (const std::exception& e) {
+      spdlog::error("[TOGSim] Error processing command {} (type: {}): {}", kernel_id, command_type, e.what());
     }
-    if (isDryRun)
-      std::cout << "[" << simulator->get_core_cycle() << "] TOGSim> ";
   }
+  trace_file.close();
   simulator->cycle();
-  if (simulator->get_core_cycle()==0)
-    simulator->until(0);
-  simulator->print_core_stat();
+}
+
+Simulator* create_simulator(const YAML::Node& config_yaml) {
+  SimulationConfig config = initialize_config(config_yaml);
+
+  auto simulator = new Simulator(config);
+  return simulator;
 }
 
 int main(int argc, char** argv) {
@@ -94,13 +93,9 @@ int main(int argc, char** argv) {
   cmd_parser.add_command_line_option<std::string>(
       "config", "Path for hardware configuration file");
   cmd_parser.add_command_line_option<std::string>(
-      "models_list", "Path for the models list file");
-  cmd_parser.add_command_line_option<std::string>(
-      "attributes_list", "Path for the models list file");
+      "models_list", "Path for the models list file (can be FIFO or regular file)");
   cmd_parser.add_command_line_option<std::string>(
       "log_level", "Set for log level [trace, debug, info], default = info");
-  cmd_parser.add_command_line_option<std::string>(
-      "mode", "choose \"trace\" moode and \"iteractive\" mode");
   try {
     cmd_parser.parse(argc, argv);
   } catch (const CommandLineParser::ParsingError& e) {
@@ -120,29 +115,31 @@ int main(int argc, char** argv) {
     spdlog::set_level(spdlog::level::info);
 
   std::string config_path;
-  std::string onnx_path;
-  std::string attribute_path;
-  std::string execution_mode = "trace";
+  std::string trace_file_path;
 
   /* Create simulator */
   cmd_parser.set_if_defined("config", &config_path);
-  cmd_parser.set_if_defined("mode", &execution_mode);
-  auto simulator = create_simulator(config_path);
-
-  if (execution_mode.compare("trace") == 0) {
-    /* Get needed info for launch kernel */
-    cmd_parser.set_if_defined("models_list", &onnx_path);
-    cmd_parser.set_if_defined("attributes_list", &attribute_path);
-
-    /* launch kernels */
-    launchKernel(simulator, onnx_path, attribute_path, config_path);
-    simulator->run_simulator();
-    if (simulator->get_core_cycle()==0)
-      simulator->until(1);
+  
+  // Load config once for reuse
+  YAML::Node config_yaml;
+  if (!loadConfig(config_path, config_yaml)) {
+    spdlog::error("[TOGSim] Failed to load config file: {}", config_path);
+    exit(1);
+  }
+  
+  auto simulator = create_simulator(config_yaml);
+
+  // Get trace file path
+  cmd_parser.set_if_defined("models_list", &trace_file_path);
+
+  if (!trace_file_path.empty()) {
+    // Process trace file (unified mode: supports both FIFO and regular file)
+    process_trace_file(simulator, trace_file_path, config_yaml);
+    spdlog::info("Simulation finished");
     simulator->print_core_stat();
-  } else if (execution_mode.compare("interactive") == 0) {
-    /* Get onnx_path, attribute from user input, request_time */
-    interactive_mode(simulator);
+  } else {
+    spdlog::error("No trace file provided. Use --models_list to specify trace file path.");
+    exit(1);
   }
   delete simulator;
 
diff --git a/TOGSim/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc
index bb5d29cf..b801fc05 100644
--- a/TOGSim/src/scheduler/Scheduler.cc
+++ b/TOGSim/src/scheduler/Scheduler.cc
@@ -5,8 +5,6 @@ Scheduler::Scheduler(SimulationConfig config, const cycle_type* core_cycle, cons
 }
 
 void Scheduler::schedule_graph(std::unique_ptr<TileGraph> tile_graph) {
-  spdlog::info("[Scheduler {}] Tile Graph {} Scheduled", _id, "FIFO"); // TODO: tile graph id
-  // _tile_graph = TileGraphScheduler->get_tile_graph();
   _tile_graph.push_back(std::move(tile_graph));
   refresh_status();
 }
@@ -25,6 +23,10 @@ std::shared_ptr<Tile> Scheduler::get_tile(int core_id, int slot_id) {
     return tile;
   } else {
     tile = std::move(_tile_graph.at(0)->get_tile(core_id, slot_id));
+     // Record start_time when first non-EMPTY tile is issued
+    if (tile->get_status() != Tile::Status::EMPTY && _tile_graph.at(0)->get_start_time() == 0) {
+      _tile_graph.at(0)->set_start_time(*_core_cycle);
+    }
   }
   refresh_status();
   return tile;
@@ -48,11 +50,22 @@ void Scheduler::refresh_status() {
 
   /* Remove finished request */
   if (_tile_graph.at(0)->is_finished()) {
-    spdlog::info("[Scheduler {}] Graph path: {} operation: {} finish at {}",
-                 _id, _tile_graph.at(0)->get_graph_path(),
+    unsigned int kernel_id = _tile_graph.at(0)->get_kernel_id();
+    cycle_type start_time = _tile_graph.at(0)->get_start_time();
+    cycle_type compute_time = 0;
+    if (start_time > 0) {
+      compute_time = *_core_cycle - start_time;
+    } else {
+      // Fallback to arrival_time if start_time was not recorded
+      start_time = _tile_graph.at(0)->get_arrival_time();
+      compute_time = *_core_cycle - start_time;
+    }
+    
+    spdlog::info("[Scheduler {}] Kernel {} has completed - TOG path: {} operation: {} finished at cycle {}",
+                 _id, kernel_id, _tile_graph.at(0)->get_graph_path(),
                  _tile_graph.at(0)->get_name(), *_core_cycle);
-    spdlog::info("Total compute time {}",
-                 *_core_cycle - _tile_graph.at(0)->get_arrival_time());
+    spdlog::info("[Scheduler {}] Kernel {} execution summary - Started at: {} cycles, Total compute time: {} cycles",
+                 _id, kernel_id, start_time, compute_time);
     _tile_graph.pop_front();
   }
 }
\ No newline at end of file
diff --git a/scripts/stonne_experiment2/tog_gen.py b/scripts/stonne_experiment2/tog_gen.py
index e8013da7..0e4b5812 100644
--- a/scripts/stonne_experiment2/tog_gen.py
+++ b/scripts/stonne_experiment2/tog_gen.py
@@ -71,10 +71,8 @@ def extract_simulation_stats(result_path):
         if "outerPro" in path:
             continue
         tog_path = os.path.join(path, "tile_graph.onnx")
-        togsim_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "TOGSim")
         stonne_config_path = f'{extension_config.CONFIG_TORCHSIM_DIR}/configs/stonne_validation_c1_simple_noc.yml'
-        backsim = TOGSimulator(togsim_path, stonne_config_path)
-        result_path = backsim.simulation(tog_path)
+        result_path = TOGSimulator.run_standalone(tog_path, config_path=stonne_config_path)
         nr_multiplications, total_cycle, sim_time = extract_simulation_stats(result_path)
         sim_time, total_cycle = float(sim_time), int(total_cycle)
         print(f"[TLS] Cycle={total_cycle} Sim time={sim_time} nr_multiplications={nr_multiplications}")
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 9c7ca255..724c10d0 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -3,41 +3,25 @@
 import torch
 from torchvision.models import resnet18 as model1
 from test_transformer import EncoderBlock as model2
+from Simulator.simulator import TOGSimulator
 
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-sys.path.append(base_path)
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
 config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
+os.environ['TOGSIM_CONFIG'] = config
 
 target_model1 = model1().eval()
 target_model2 = model2(768, 12).eval()
 
-# Init scheduler
-scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-# Register compiled model
-opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last))
-opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
-SchedulerDNNModel.register_model("resnet18", opt_model1)
-SchedulerDNNModel.register_model("bert", opt_model2)
-
-# Init input data
-model_input1 = torch.randn(1, 3, 224, 224)
-model_input2 = torch.randn(128, 768)
-
-# Init request
-new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
-new_request2 = Request("bert", [model_input2], [], request_queue_idx=1)
-new_request3 = Request("resnet18", [model_input1], [], request_queue_idx=0)
-new_request4 = Request("bert", [model_input2], [], request_queue_idx=1)
-
-# Add request to scheduler
-scheduler.add_request(new_request1, request_time=0)
-scheduler.add_request(new_request2, request_time=0)
-scheduler.add_request(new_request3, request_time=0)
-scheduler.add_request(new_request4, request_time=0)
-
-# Run scheduler
-while not scheduler.is_finished():
-    scheduler.schedule()
-
+device = torch.device("npu:0")
+opt_model1 = torch.compile(target_model1.to(device=device, memory_format=torch.channels_last))
+opt_model2 = torch.compile(target_model2.to(device=device))
+model_input1 = torch.randn(1, 3, 224, 224).to(device=device)
+model_input2 = torch.randn(128, 768).to(device=device)
+
+with TOGSimulator(config_path=config):
+    torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0)
+    torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0)
+    torch.npu.synchronize()
+    torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0)
+    torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0)
 print("Done")
\ No newline at end of file
diff --git a/tests/test_stream.py b/tests/test_stream.py
index 70077abe..70b2c34b 100644
--- a/tests/test_stream.py
+++ b/tests/test_stream.py
@@ -1,22 +1,12 @@
 import torch
 import time
 
-start_event = torch.npu.event(enable_timing=True)
-end_event = torch.npu.event(enable_timing=True)
-stream = torch.npu.stream()
-
 def my_kernel():
     print("Task is running...")
     result = sum(range(1000))
     time.sleep(2.5)
     print(f"Task completed with result: {result}")
 
-start_event.record(stream)
-stream.launch_kernel(my_kernel)
-end_event.record(stream)
-
-
-stream.synchronize()
-
-elapsed_time = end_event.elapsed_time(start_event)
-print("Event has completed! ", elapsed_time)
\ No newline at end of file
+torch.npu.launch_kernel(my_kernel)
+torch.npu.synchronize()
+print("Task completed!")
\ No newline at end of file

From 09753bc60be1bae818fffe429822bac619eb1722 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Feb 2026 05:04:47 +0000
Subject: [PATCH 098/194] [TOGSim] Rename scheduler_graph to enqueue_graph

---
 TOGSim/include/Simulator.h           | 11 +++++++++--
 TOGSim/include/scheduler/Scheduler.h |  2 +-
 TOGSim/src/TileGraphParser.cc        |  6 +++---
 TOGSim/src/main.cc                   |  4 ++--
 TOGSim/src/scheduler/Scheduler.cc    |  2 +-
 tests/test_stream.py                 | 12 ------------
 6 files changed, 16 insertions(+), 21 deletions(-)
 delete mode 100644 tests/test_stream.py

diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h
index 39fa310e..a0b8b9c5 100644
--- a/TOGSim/include/Simulator.h
+++ b/TOGSim/include/Simulator.h
@@ -24,8 +24,15 @@ namespace fs = std::filesystem;
 class Simulator {
  public:
   Simulator(SimulationConfig config);
-  void schedule_graph(int partion_id, std::unique_ptr<TileGraph> tile_graph) {
-    _partition_scheduler.at(partion_id)->schedule_graph(std::move(tile_graph));
+  void enqueue_graph(int partion_id, std::unique_ptr<TileGraph> tile_graph) {
+    if (partion_id < 0 || static_cast<uint32_t>(partion_id) >= _config.num_partition) {
+      spdlog::error("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). "
+                  "Total partitions: {}", partion_id, _config.num_partition - 1, _config.num_partition);
+      throw std::runtime_error(
+          fmt::format("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). "
+                    "Total partitions: {}", partion_id, _config.num_partition - 1, _config.num_partition));
+    }
+    _partition_scheduler.at(partion_id)->enqueue_graph(std::move(tile_graph));
   }
   void run_simulator();
   cycle_type get_core_cycle() { return _core_cycles; }
diff --git a/TOGSim/include/scheduler/Scheduler.h b/TOGSim/include/scheduler/Scheduler.h
index 39ab7576..c178a4c5 100644
--- a/TOGSim/include/scheduler/Scheduler.h
+++ b/TOGSim/include/scheduler/Scheduler.h
@@ -8,7 +8,7 @@
 class Scheduler {
  public:
   Scheduler(SimulationConfig config, const cycle_type* core_cycle, const uint64_t* core_time, int id);
-  void schedule_graph(std::unique_ptr<TileGraph> tile_graph);
+  void enqueue_graph(std::unique_ptr<TileGraph> tile_graph);
   void finish_tile(std::shared_ptr<Tile> tile) { tile->get_owner()->finish_tile(tile); }
 
   /* For other schedulers */
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index fd629f8a..882aba6b 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -706,7 +706,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       uint64_t value = it->second.as<uint64_t>();
 
       _arg_to_address[key] = value;
-      spdlog::info("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value);
+      spdlog::trace("[TOGParser/Attribute] Address Attribute key: {} address: 0x{:x}", key, value);
     }
   }
 
@@ -719,7 +719,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
       for (const auto& val : value_list) {
         _arg_numa_stride[key].push_back(val.as<uint32_t>());
       }
-      spdlog::info("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", "));
+      spdlog::trace("[TOGParser/Attribute] Address numa info key: {} numa stride : {}", key, fmt::join(_arg_numa_stride[key], ", "));
     }
   }
 
@@ -754,7 +754,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
 
   /* Get meta data from graph */
   for (const auto& meta : model_proto.metadata_props()) {
-    spdlog::info("[TOGParser] Register Metadata \"{}\": \"{}\"", meta.key(), meta.value());
+    spdlog::trace("[TOGParser] Register Metadata \"{}\": \"{}\"", meta.key(), meta.value());
     _tog_meta[meta.key()] = meta.value();
   }
 
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 44fb5612..cc73f6db 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -18,8 +18,8 @@ void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx
   std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
   tile_graph->set_kernel_id(kernel_id);
-  spdlog::info("[Scheduler {}] Enqueued kernel id: {} tog: {} operation: {} request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
-  simulator->schedule_graph(partiton_id, std::move(tile_graph));
+  spdlog::info("[Scheduler {}] Enqueued kernel id: {}, tog_path: {}, operation: {}, request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
+  simulator->enqueue_graph(partiton_id, std::move(tile_graph));
 }
 
 void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) {
diff --git a/TOGSim/src/scheduler/Scheduler.cc b/TOGSim/src/scheduler/Scheduler.cc
index b801fc05..0be42f27 100644
--- a/TOGSim/src/scheduler/Scheduler.cc
+++ b/TOGSim/src/scheduler/Scheduler.cc
@@ -4,7 +4,7 @@ Scheduler::Scheduler(SimulationConfig config, const cycle_type* core_cycle, cons
     : _id(id), _config(config), _core_cycle(core_cycle), _core_time(core_time) {
 }
 
-void Scheduler::schedule_graph(std::unique_ptr<TileGraph> tile_graph) {
+void Scheduler::enqueue_graph(std::unique_ptr<TileGraph> tile_graph) {
   _tile_graph.push_back(std::move(tile_graph));
   refresh_status();
 }
diff --git a/tests/test_stream.py b/tests/test_stream.py
deleted file mode 100644
index 70b2c34b..00000000
--- a/tests/test_stream.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import torch
-import time
-
-def my_kernel():
-    print("Task is running...")
-    result = sum(range(1000))
-    time.sleep(2.5)
-    print(f"Task completed with result: {result}")
-
-torch.npu.launch_kernel(my_kernel)
-torch.npu.synchronize()
-print("Task completed!")
\ No newline at end of file

From 235bb5c8f1e8d4429112b58fcc5613a71e61a974 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Feb 2026 05:13:43 +0000
Subject: [PATCH 099/194] [TOGSim] Add comments feature in trace files

---
 Simulator/simulator.py | 2 +-
 TOGSim/src/main.cc     | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 2771d03c..13f2b4f0 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -238,7 +238,7 @@ def __init__(self, config_path=None, togsim_path=None) -> None:
         self.fifo_dir = os.path.join("/tmp", f"togsim_fifo_{os.getpid()}")
         os.makedirs(self.fifo_dir, exist_ok=True)
         self.trace_file_path = os.path.join(self.fifo_dir, "cmd_fifo")
-        self.trace_log = ""
+        self.trace_log = "# command_type, kernel_id, device_index, stream_index, tog_path, attribute_path, timestamp\n"
 
         # Create FIFOs if they don't exist
         if os.path.exists(self.trace_file_path):
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index cc73f6db..7c596af5 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -39,6 +39,11 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const
       continue;
     }
 
+    // Skip comment lines starting with #
+    if (line[0] == '#') {
+      continue;
+    }
+
     // Parse command: command_type,kernel_id,device_index,stream_index,tog_path,attribute_path,timestamp
     std::istringstream iss(line);
     std::string token;
@@ -119,14 +124,14 @@ int main(int argc, char** argv) {
 
   /* Create simulator */
   cmd_parser.set_if_defined("config", &config_path);
-  
+
   // Load config once for reuse
   YAML::Node config_yaml;
   if (!loadConfig(config_path, config_yaml)) {
     spdlog::error("[TOGSim] Failed to load config file: {}", config_path);
     exit(1);
   }
-  
+
   auto simulator = create_simulator(config_yaml);
 
   // Get trace file path

From 9dbe03711484b48efd83b612d730b22929989b26 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Feb 2026 13:58:19 +0000
Subject: [PATCH 100/194] [Eager] Add eager mode POC

---
 PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp |  8 --------
 tests/test_eager.py                           | 11 ++++++++---
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp
index 39f019c5..21ab3fef 100644
--- a/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp
+++ b/PyTorchSimDevice/csrc/aten/OpenRegMinimal.cpp
@@ -158,12 +158,4 @@ TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
 }
 // LITERALINCLUDE END: FALLBACK GLOBAL
 
-// LITERALINCLUDE START: FALLBACK SINGLE
-TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
-  m.impl(
-      "sub.Tensor",
-      torch::CppFunction::makeFromBoxedFunction<&wrapper_cpu_fallback>());
-}
-// LITERALINCLUDE END: FALLBACK SINGLE
-
 } // namespace at::openreg
diff --git a/tests/test_eager.py b/tests/test_eager.py
index 7a2df6e2..9255b681 100644
--- a/tests/test_eager.py
+++ b/tests/test_eager.py
@@ -1,8 +1,13 @@
 import torch
 
+@torch.library.impl("aten::mul.Tensor", "npu")
+def my_fallback(x, y):
+    raise NotImplementedError("Fallback called")
+
 if __name__ == "__main__":
+    #torch.npu.register_fallback_op("aten::add.out", my_fallback)
     device = torch.device("npu:0")
-    x = torch.zeros(10, 10).to(device)
-    y = torch.zeros(10, 10).to(device)
-    z = x + y
+    x = torch.ones(10, 10).to(device)
+    y = torch.ones(10, 10).to(device)
+    z = x * y
     print(z.cpu())
\ No newline at end of file

From f9a9f5fa8fa83cdbfb1c7c589ba9f8fd8854e78a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 6 Feb 2026 05:20:10 +0000
Subject: [PATCH 101/194] [Eager] Add eager to graph fallback API

---
 .../torch_openreg/openreg/__init__.py         | 39 +++++++++++++++++++
 tests/test_eager.py                           |  5 +--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 66ec022a..8d62cee3 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -243,6 +243,43 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs):
 from .random import *  # noqa: F403
 from .amp import *
 
+def eager_to_compile(op_name):
+    """
+    Register an eager mode operation as a graph-based implementation using torch.compile().
+
+    Args:
+        op_name: Operator name (e.g., "aten::mul.Tensor")
+
+    Example:
+        torch.npu.eager_to_compile("aten::mul.Tensor")
+    """
+    def wrapper(*args, **kwargs):
+        @torch.compile(dynamic=False)
+        def dummy_graph(*args, **kwargs):
+            # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor
+            namespace, op_path = op_name.split("::", 1)
+            op_path_parts = op_path.split(".")
+            op = torch.ops
+            for part in [namespace] + op_path_parts:
+                op = getattr(op, part)
+            return op(*args, **kwargs)
+        return dummy_graph(*args, **kwargs)
+
+    torch.library.impl(op_name, "npu", wrapper)
+
+def register_eager_to_compile(ops):
+    """
+    Register multiple operators at once using eager_to_compile.
+
+    Args:
+        ops: List of operator names (e.g., ["aten::mul.Tensor", "aten::add.Tensor"])
+
+    Example:
+        torch.npu.register_eager_to_compile(["aten::mul.Tensor", "aten::add.Tensor"])
+    """
+    for op_name in ops:
+        eager_to_compile(op_name)
+
 __all__ = [
     "device",
     "device_count",
@@ -269,4 +306,6 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs):
     "synchronize",
     "get_tog_simulator",
     "set_tog_simulator",
+    "eager_to_compile",
+    "register_eager_to_compile",
 ]
diff --git a/tests/test_eager.py b/tests/test_eager.py
index 9255b681..b84cc6f6 100644
--- a/tests/test_eager.py
+++ b/tests/test_eager.py
@@ -1,8 +1,6 @@
 import torch
 
-@torch.library.impl("aten::mul.Tensor", "npu")
-def my_fallback(x, y):
-    raise NotImplementedError("Fallback called")
+torch.npu.register_eager_to_compile(["aten::mul.Tensor", "aten::add.Tensor"])
 
 if __name__ == "__main__":
     #torch.npu.register_fallback_op("aten::add.out", my_fallback)
@@ -10,4 +8,5 @@ def my_fallback(x, y):
     x = torch.ones(10, 10).to(device)
     y = torch.ones(10, 10).to(device)
     z = x * y
+    z = x + z
     print(z.cpu())
\ No newline at end of file

From a13f37b173050848cea423db71fc758c14e7cf4d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 6 Feb 2026 05:45:57 +0000
Subject: [PATCH 102/194] [Template] Conv warpper minor fix

---
 PyTorchSimFrontend/mlir/mlir_conv_mt_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_sb_template.py  | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py | 2 +-
 PyTorchSimFrontend/mlir/mlir_conv_template.py     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index 051d7a0e..da2bc829 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -104,7 +104,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     padded_shape = list(X.shape)
     padded_shape[2] += 2 * {{ PADDING_H }}
     padded_shape[3] += 2 * {{ PADDING_W }}
-    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding = torch.zeros(padded_shape).to(device=X.device)
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
     # Tanspose inputs
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
index c742b3b2..cc284522 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -105,7 +105,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     padded_shape = list(X.shape)
     padded_shape[2] += 2 * {{ PADDING_H }}
     padded_shape[3] += 2 * {{ PADDING_W }}
-    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding = torch.zeros(padded_shape).to(device=X.device)
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
     # Tanspose inputs
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index 07211bb4..6d768bf2 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -105,7 +105,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     padded_shape = list(X.shape)
     padded_shape[2] += 2 * {{ PADDING_H }}
     padded_shape[3] += 2 * {{ PADDING_W }}
-    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding = torch.zeros(padded_shape).to(device=X.device)
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
     # Tanspose inputs
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 46a7f9bf..e2cd61fd 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -109,7 +109,7 @@ def {{ FUNC_NAME }}{{kernel.def_wrapper()}}:
     padded_shape = list(X.shape)
     padded_shape[2] += 2 * {{ PADDING_H }}
     padded_shape[3] += 2 * {{ PADDING_W }}
-    X_padding = torch.zeros(padded_shape, device=X.device)
+    X_padding = torch.zeros(padded_shape).to(device=X.device)
     X_padding[:, :, {{ PADDING_H }}:X.shape[2] + {{ PADDING_H }}, {{ PADDING_W }}:X.shape[3] + {{ PADDING_W }}] = X
 
     # Tanspose inputs

From e840786efc58ee5771b0b270302ab76ff290eec8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 11 Feb 2026 08:27:45 +0000
Subject: [PATCH 103/194] [Fix] Index_expr ops codegen issue

---
 .../mlir/mlir_caller_codegen.py               |   2 +-
 .../mlir/mlir_codegen_backend.py              |   9 +-
 PyTorchSimFrontend/mlir/mlir_conv_common.py   |   4 +-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |   4 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |   2 +-
 tests/Yolov5/test_yolov5.py                   | 249 +++++++++++++++++-
 6 files changed, 245 insertions(+), 25 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index a539bdb9..06d41ea2 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -101,7 +101,7 @@ def generate_args_define(self):
                     bits = 8
                 else:
                     bits = torch.iinfo(arg_type).bits
-                buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) # Round up to 64 bytes
+                buffer_size = int(math.ceil(arg_size * bits // 8 / 64) * 64) * 2 # Round up to 64 bytes + Add some padding for safety
                 self.writeline(f'{DTYPE_TO_C[arg_type]}* c_{arg_name} = malloc({buffer_size}ULL){self.ending}')
                 name_set.add(arg_name)
         self.writeline(self.newline)
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index a60c706e..b52b36d0 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -470,7 +470,6 @@ def load(self, name: str, index: sympy.Expr):
         tile_numel_per_lane = local_tile_desc.get_numel_per_lane()
         tile_shape = local_tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = local_tile_desc.get_tile_stride()
-
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
         compute_vec_size = self.kernel_group.tile_desc.get_compute_vec_size()
@@ -697,7 +696,7 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
             self.reset("recompile")
             raise mlir_common.RecompileSignal(f"Index access (tile size {prior_tile_size} is not divisible by {prior_ranges})")
 
-        tile_size = tile_desc.get_tile_size_per_lane()
+        tile_size_per_lane = tile_desc.get_tile_size_per_lane()
         compute_vec_size = tile_desc.get_compute_vec_size()
         strides = tile_desc.get_tile_stride_per_lane()
 
@@ -707,13 +706,13 @@ def _index_expr(self, tile_desc, renamed_expression, index, base_vector_index):
 
         # Create tile_dim index
         dim_list = []
-        for idx in range(len(tile_size)):
+        for idx in range(len(tile_size_per_lane)):
             # Prepare initial values
             offset = tile_desc.vmap.vlane_stride #* strides[idx]
-            outer_sz = tile_size[idx] // tile_desc.vmap.vlane_stride
+            outer_sz = tile_desc.get_tile_size()[idx] // tile_desc.vmap.vlane_stride
             with self.override_buffer_cse(buffer=self.const_buffer, cse=self.const_cse):
                 div_coeff = self.get_const_cse(strides[idx], "index")
-                mod_coeff = self.get_const_cse(tile_size[idx], "index")
+                mod_coeff = self.get_const_cse(tile_size_per_lane[idx], "index")
                 vlane_stride_coeff = self.get_const_cse(tile_desc.vmap.vlane_stride, "index")
                 vlane_outer_coeff = self.get_const_cse(outer_sz, "index")
                 nr_vector_lane = self.get_const_cse(self.vector_lane, "index")
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index 1aa99d14..f8566b6d 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -85,7 +85,7 @@ def outer_func_render(self, kernel_name, input_args):
         options = dict(
             kernel=self.kernel,
             KERNEL_NAME=kernel_name,
-            FUNC_NAME=self.function_name + f"_{len(input_args)}",
+            FUNC_NAME="wrapper_" + kernel_name,
             INPUT=X,
             WEIGHT=W,
             BIAS=Bias,
@@ -96,7 +96,7 @@ def outer_func_render(self, kernel_name, input_args):
             input_reorder=self.input_reorder
         )
         code = self._template_from_string(self.WRAPPER_TEMPLATE).render(**options)
-        return code, self.function_name + f"_{len(input_args)}"
+        return code, "wrapper_" + kernel_name
 
     def get_arg_attributes(self):
         arg_attributes = []
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 5305cbb7..af960533 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -154,8 +154,8 @@ def can_fuse_horizontal(self, node1, node2):
             }
             # Buffers still required by the activation node (unmet) or read by it
             epilogue_unmet = { dep for dep in epilogue_node.unmet_dependencies }
-            has_depedency = bool(template_writes) and epilogue_unmet.issubset(template_writes)
-            if not has_depedency:
+            has_dependency = bool(template_writes) and epilogue_unmet.issubset(template_writes) and not bool(reads1 & writes2)
+            if not has_dependency:
                 return False
 
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b864e5f2..556f7e04 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -403,7 +403,7 @@ def call_kernel(self, kernel_name):
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
-            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
+            kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args)
 
     def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
         with self as kernel:
diff --git a/tests/Yolov5/test_yolov5.py b/tests/Yolov5/test_yolov5.py
index d9e6b261..1262dfb9 100644
--- a/tests/Yolov5/test_yolov5.py
+++ b/tests/Yolov5/test_yolov5.py
@@ -13,39 +13,230 @@
 import os
 import shutil
 
-
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
 
 def run_yolo(batch, config):
+    import copy
+
     device = torch.device("npu:0")
 
     torch._dynamo.config.recompile_limit = 64
     torch._dynamo.config.cache_size_limit = 128
-    
+
+    # Load model and prepare input
     model = torch.hub.load("ultralytics/yolov5", "yolov5s").cpu().eval()
     url = "https://ultralytics.com/images/zidane.jpg"
-    
+
     response = requests.get(url)
     img = Image.open(BytesIO(response.content)).convert("RGB")
-    
+
     imgsz = 64
     transform = transforms.Compose([
         transforms.Resize((imgsz, imgsz)),
         transforms.ToTensor(),
     ])
-    
+
     x = transform(img).unsqueeze(0)   # [1, 3, H, W]
-    x = x.to(device)
-    
-
-    model.to(device)
-    x = x.to(device)
-    
-    # Compile and run the model with PyTorchSim
-    compiled_model = torch.compile(dynamic=False)(model)
-    y = compiled_model(x)
+
+    # CPU version
+    model_cpu = copy.deepcopy(model).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    # NPU version
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    # Compare results
+    # YOLOv5 output is typically a list or tensor, handle both cases
+    if isinstance(y_cpu, (list, tuple)):
+        for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
+            test_result(f"YOLOv5 Output {i}", out_npu, out_cpu)
+    else:
+        test_result("YOLOv5 Output", y_npu, y_cpu)
+
     print("Yolo Simulation Done")
 
 
+def test_c3_module(device, batch=1, c1=64, c2=128, n=1, h=64, w=64):
+    import copy
+    import sys
+
+    # Import C3 module from YOLOv5
+    try:
+        # Load model first to ensure hub cache is populated
+        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
+
+        # Try to import from torch hub cache
+        hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
+        if os.path.exists(hub_path):
+            sys.path.insert(0, hub_path)
+        # Import C3 module
+        from models.common import C3  # noqa: F401
+    except Exception as e:
+        print(f"Warning: Could not import C3 module: {e}")
+        print("Skipping C3 module test")
+        return
+
+    torch.manual_seed(0)
+
+    # Create input tensor
+    x = torch.randn(batch, c1, h, w)
+
+    # CPU version
+    model_cpu = C3(c1, c2, n=n, shortcut=True, g=1, e=0.5).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    # NPU version
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    # Compare results
+    if isinstance(y_cpu, (list, tuple)):
+        for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
+            test_result(f"C3 Output {i}", out_npu, out_cpu)
+    else:
+        test_result("C3 Output", y_npu, y_cpu)
+    print("C3 Module Test Done")
+
+
+def test_bottleneck_module(device, batch=1, c1=64, c2=64, shortcut=True, g=1, e=0.5, h=16, w=16):
+    import copy
+    import sys
+
+    # Import Bottleneck module from YOLOv5
+    try:
+        # Load model first to ensure hub cache is populated
+        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
+
+        # Try to import from torch hub cache
+        hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
+        if os.path.exists(hub_path):
+            sys.path.insert(0, hub_path)
+        # Import Bottleneck module
+        from models.common import Bottleneck  # noqa: F401
+    except Exception as e:
+        print(f"Warning: Could not import Bottleneck module: {e}")
+        print("Skipping Bottleneck module test")
+        return
+
+    torch.manual_seed(0)
+
+    # Create input tensor
+    x = torch.randn(batch, c1, h, w)
+
+    # CPU version
+    model_cpu = Bottleneck(c1, c2, shortcut=shortcut, g=g, e=e).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    # NPU version
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    # Compare results
+    test_result("Bottleneck Module", y_npu, y_cpu)
+    print("Bottleneck Module Test Done")
+
+
+def test_conv_module(device, batch=1, c1=32, c2=64, k=3, s=1, p=None, g=1, d=1, act=True, h=16, w=16):
+    import copy
+    import sys
+
+    # Import Conv module from YOLOv5
+    try:
+        # Load model first to ensure hub cache is populated
+        _ = torch.hub.load("ultralytics/yolov5", "yolov5s", pretrained=False)
+
+        # Try to import from torch hub cache
+        hub_path = os.path.expanduser("~/.cache/torch/hub/ultralytics_yolov5_master")
+        if os.path.exists(hub_path):
+            sys.path.insert(0, hub_path)
+        # Import Conv module
+        from models.common import Conv  # noqa: F401
+    except Exception as e:
+        print(f"Warning: Could not import Conv module: {e}")
+        print("Skipping Conv module test")
+        return
+
+    torch.manual_seed(0)
+
+    # Create input tensor
+    x = torch.randn(batch, c1, h, w)
+
+    # CPU version
+    model_cpu = Conv(c1, c2, k=k, s=s, p=p, g=g, d=d, act=act).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    # NPU version
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    # Compare results
+    test_result("Conv Module", y_npu, y_cpu)
+    print("Conv Module Test Done")
+
+
+def test_concat_4d(device):
+    """
+    Test concatenating 3 tensors along dimension 4
+    Shapes: (1, 3, 4, 4, 2), (1, 3, 4, 4, 2), (1, 3, 4, 4, 81)
+    Result: (1, 3, 4, 4, 85)
+    """
+    import copy
+
+    torch.manual_seed(0)
+
+    # Create 3 input tensors
+    x1 = torch.ones(1, 3, 4, 4, 2)
+    x2 = torch.ones(1, 3, 4, 4, 2) * 2
+    x3 = torch.ones(1, 3, 4, 4, 81) * 3
+
+    # CPU version
+    x1_cpu = copy.deepcopy(x1).cpu()
+    x2_cpu = copy.deepcopy(x2).cpu()
+    x3_cpu = copy.deepcopy(x3).cpu()
+    y_cpu = torch.cat([x1_cpu, x2_cpu, x3_cpu], dim=4)
+
+    # NPU version
+    x1_npu = copy.deepcopy(x1).to(device)
+    x2_npu = copy.deepcopy(x2).to(device)
+    x3_npu = copy.deepcopy(x3).to(device)
+
+    def concat_fn(x1, x2, x3):
+        return torch.cat([x1, x2, x3], dim=4)
+
+    compiled_concat = torch.compile(dynamic=False)(concat_fn)
+    y_npu = compiled_concat(x1_npu, x2_npu, x3_npu)
+
+    # Compare results
+    test_result("Concat 4D", y_npu, y_cpu)
+    print(f"Output shape: {y_npu.shape}")
+    print("Concat 4D Test Done")
+
 if __name__ == "__main__":
 
     base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
@@ -59,4 +250,34 @@ def run_yolo(batch, config):
     args = args.parse_args()
     batch = args.batch
 
+    device = torch.device("npu:0")
+
+    # Test Concat 4D
+    # print("=" * 80)
+    # print("Testing Concat 4D")
+    # print("=" * 80)
+    # test_concat_4d(device)
+
+    # Test Conv module
+    # print("\n" + "=" * 80)
+    # print("Testing Conv Module")
+    # print("=" * 80)
+    # test_conv_module(device, batch=batch, c1=32, c2=32, k=1, s=1, p=None, g=1, d=1, act=False, h=16, w=16)
+
+    # Test Bottleneck module
+    # print("\n" + "=" * 80)
+    # print("Testing Bottleneck Module")
+    # print("=" * 80)
+    # test_bottleneck_module(device, batch=batch, c1=32, c2=32, shortcut=True, g=1, e=0.5, h=16, w=16)
+
+    # Test C3 module
+    # print("\n" + "=" * 80)
+    # print("Testing C3 Module")
+    # print("=" * 80)
+    # test_c3_module(device, batch=batch, c1=64, c2=64, n=1, h=16, w=16)
+
+    # Test full YOLOv5 model
+    print("\n" + "=" * 80)
+    print("Testing Full YOLOv5 Model")
+    print("=" * 80)
     run_yolo(batch, config)

From f60cbe5b766723c8f4eb1306d9464c7ecc7be85e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Feb 2026 06:25:11 +0000
Subject: [PATCH 104/194] [Codegen] Use ops instead of raw assembly

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index b52b36d0..2cff7815 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1432,10 +1432,10 @@ def convert_indirect_indexing(self, index :sympy.Expr):
                 self.spad_buffer_dict[target_dim] = [sram_var, local_tile_desc.get_tile_size(), tile_numel_per_lane, sram_index_var, tile_shape, vshape]
 
                 # Store the indirect index variable
-                opeartion = "affine.vector_store"
+                target_var = self.cse.varname_map[target_dim]
                 compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
-                line = f"{opeartion} %{target_dim}, %{sram_var}[{compute_index_var}] : {tile_shape}, {vshape}"
-                self.stores.writeline(line)
+                with self.override_buffer_cse(buffer=self.stores):
+                    ops._store(target_var, sram_var, compute_index_var, tile_shape)
             mlir_dtype = vshape.split("x")[1][:-1]
             with self.override_buffer_cse(buffer=target_dma_buffers):
                 out = ops._load(tile_numel_per_lane, mlir_dtype, sram_var, sram_index_var, tile_shape)

From 014cb116c126c87dd7594c8d065c5d41380b7e97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= <jamesgyun@gmail.com>
Date: Thu, 19 Feb 2026 15:58:07 +0900
Subject: [PATCH 105/194] [Test] Add DeepSeek v3 base test file and etc. (WIP)

---
 Dockerfile.base                               |   3 +
 .../torch_openreg/openreg/__init__.py         |  17 +-
 .../mlir/mlir_codegen_backend.py              |  32 +--
 PyTorchSimFrontend/mlir/mlir_common.py        |  11 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |   4 +
 tests/DeepSeek/test_deepseek_v3_base.py       | 220 ++++++++++++++++++
 6 files changed, 271 insertions(+), 16 deletions(-)
 create mode 100644 tests/DeepSeek/test_deepseek_v3_base.py

diff --git a/Dockerfile.base b/Dockerfile.base
index 0fd950d2..e8504bcf 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -45,6 +45,9 @@ RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2
 # Install torchsim dependency
 RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 && pip install "transformers<4.44" && pip install diffusers==0.34.0
 
+# FlashAttention
+RUN python -m pip install --no-build-isolation flash-attn
+
 # Extra Python deps for YOLO/vision tests
 RUN python -m pip install -U pip setuptools wheel && \
     python -m pip install --no-cache-dir --no-deps ultralytics && \
diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 8d62cee3..f5aabc18 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -80,8 +80,21 @@ def __init__(self, flags=0):
         self._stream = torch_openreg._C._stream_create()
 
     def __del__(self):
-        if hasattr(self, '_stream'):
-            torch_openreg._C._stream_destroy(self._stream)
+        # Interpreter shutdown can clear module globals before __del__ runs.
+        # Only destroy when both runtime handle and stream are still valid.
+        stream = getattr(self, "_stream", None)
+        backend = globals().get("torch_openreg", None)
+        c_api = getattr(backend, "_C", None) if backend is not None else None
+        if stream is None or c_api is None:
+            return
+        destroy = getattr(c_api, "_stream_destroy", None)
+        if destroy is None:
+            return
+        try:
+            destroy(stream)
+        except (AttributeError, TypeError):
+            # Ignore cleanup-time teardown ordering issues.
+            pass
 
     def launch_kernel(self, task):
         """Add a Python callable kernel to this stream.
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 2cff7815..62acd877 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -110,6 +110,7 @@ def write_header(self):
                 aten = torch.ops.aten
                 inductor_ops = torch.ops.inductor
                 assert_size_stride = torch._C._dynamo.guards.assert_size_stride
+                assert_alignment = torch._C._dynamo.guards.assert_alignment
                 alloc_from_pool = torch.ops.inductor._alloc_from_pool
                 reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
                 custom_async_compile = CustomAsyncCompile()
@@ -375,6 +376,10 @@ def _convert_sympy_to_mlir_expr(self, expr, sorted_args):
             indices.append(str(new_arg))
 
         expr_str = str(expr)
+        if "ModularIndexing" in expr_str:
+            def _replace_mod(m):
+                return f"({m.group(1)} floordiv {m.group(2)}) mod {m.group(3)}"
+            expr_str = re.sub(r"ModularIndexing\(([^,]+), ([^,]+), ([^)]+)\)", _replace_mod, expr_str)
         if "//" in expr_str:
             expr_str = expr_str.replace("//", " floordiv ")
         return expr_str, indices
@@ -1158,30 +1163,28 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                 for constraint in sorted_constraints[1:]:
                     index = index.replace(constraint.original_expr, 0)
 
-        # Calculate dram stride
+        # Calculate dram stride in local tile-dim order.
+        # This keeps dram/sram stride rank aligned with tile rank.
+        local_dim_to_axis = {dim: axis for axis, dim in enumerate(local_dims)}
         dram_stride = [0] * local_tile_desc.get_nr_dim()
         if index.is_Symbol:
             dim_idx = int(str(index)[5:])
-            dram_stride[dim_idx] = 1
+            if dim_idx in local_dim_to_axis:
+                dram_stride[local_dim_to_axis[dim_idx]] = 1
         elif index.is_Number:
             pass
         else:
-            dram_dict = defaultdict(list)
+            dram_dict = defaultdict(lambda: 0)
             # Assume that div will have high priority than mod
             for arg in index.as_ordered_terms():
                 coeff, dim = arg.as_coeff_mul()
                 if len(dim) == 0:
                     continue
                 real_dim = list(dim[0].free_symbols)[0]
-                dram_dict[str(real_dim)].append(coeff)
-            # Add missing dims if not added
-            max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1
-            for i in range(max_dim):
-                target_dim = f"index{i}"
-                if sympy.Symbol(target_dim) not in index.free_symbols:
-                    dram_dict[target_dim] = [0]
-            sorted_keys = sorted(dram_dict.keys())
-            dram_stride = sum((dram_dict[key] for key in sorted_keys), [])
+                real_dim_name = str(real_dim)
+                if real_dim_name.startswith("index"):
+                    dram_dict[int(real_dim_name[5:])] += int(coeff)
+            dram_stride = [dram_dict[dim] for dim in local_dims]
 
         # Support floordiv pattern
         # FIXME. How to integrate implicit dims and floordiv?
@@ -1193,6 +1196,9 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                     if not str(sub.args[0]).startswith("index"):
                         continue
                     dim_idx = int((str(sub.args[0])[5:]))
+                    if dim_idx not in local_dim_to_axis:
+                        continue
+                    local_dim_idx = local_dim_to_axis[dim_idx]
                     if int(self.kernel_group.tile_desc.get_tile_size()[dim_idx] % sub.args[1]) != 0:
                         # In this case, need to recompile
                         original_tile = self.kernel_group.tile_desc.get_tile_size()
@@ -1211,7 +1217,7 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
                         # Send recompile signal
                         self.reset("recompile")
                         raise mlir_common.RecompileSignal(f"Tile size {self.kernel_group.tile_desc.get_tile_size()[dim_idx]} is not divisible by {sub.args[1]}")
-                    dim_divisor[dim_idx] = sub.args[1]
+                    dim_divisor[local_dim_idx] = sub.args[1]
 
             # Update dram_stride, just insert 0 next to target dim
             offset = 0
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index f101b7cb..7eb8f7f1 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -504,7 +504,7 @@ def __init__(self, tile_size, vector_lane, vlane_split_axis=None, vlane_stride=N
             vlane_stride=vlane_stride
         )
 
-        self.implicit_dim_size = None
+        self.implicit_dim_size = {}
         self.nr_rdim = 0
         self.offset = sympy.Integer(0) # Dram offset
 
@@ -654,6 +654,11 @@ def reduction(self, dtype, src_dtype, reduction_type, value):
     def indirect_indexing(self, index_var, size, check, wrap_neg):
         raise NotImplementedError()
 
+    def check_bounds(self, expr, size, lower, upper):
+        # MLIR backend currently relies on masked paths for out-of-bounds handling.
+        # Keep this hook as a no-op to satisfy Inductor's check_bounds callback.
+        return
+    
     def codegen_global_init(self):
         raise NotImplementedError()
 
@@ -964,6 +969,10 @@ def store_reduction(name, index, value):
             def reduction(dtype, src_dtype, reduction_type, value):
                 return self.reduction(dtype, src_dtype, reduction_type, value)
 
+            @staticmethod
+            def check_bounds(index, size, lower, upper):
+                return self.check_bounds(index, size, lower, upper)
+
             @staticmethod
             def _index_expr(tile_size, buffer, renamed_expression, index):
                 return self._index_expr(tile_size, buffer, renamed_expression, index)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 556f7e04..b1c756ba 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -861,6 +861,8 @@ def load_epilogue(self, name: str, index: sympy.Expr):
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+        tile_rank = self.kernel_group.tile_desc.get_nr_dim()
+        dram_stride = dram_stride[:tile_rank] + [0] * max(tile_rank - len(dram_stride), 0)
 
         # Compute vector unit size
         vshape = self.kernel_group.tile_desc.get_mlir_vshape(mlir_dtype)
@@ -913,6 +915,8 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
         vlane_stride = self.kernel_group.tile_desc.vmap.vlane_stride
         tile_shape = self.kernel_group.tile_desc.get_mlir_shape(mlir_dtype)
         tile_stride = self.kernel_group.tile_desc.get_tile_stride()
+        tile_rank = self.kernel_group.tile_desc.get_nr_dim()
+        dram_stride = dram_stride[:tile_rank] + [0] * max(tile_rank - len(dram_stride), 0)
 
         if name not in self.buffer_names:
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
diff --git a/tests/DeepSeek/test_deepseek_v3_base.py b/tests/DeepSeek/test_deepseek_v3_base.py
new file mode 100644
index 00000000..b8402c8b
--- /dev/null
+++ b/tests/DeepSeek/test_deepseek_v3_base.py
@@ -0,0 +1,220 @@
+import os
+import sys
+import argparse
+import torch
+
+
+def _dtype_from_str(name: str) -> torch.dtype:
+    return {
+        "float32": torch.float32,
+        "float16": torch.float16,
+        "bfloat16": torch.bfloat16,
+    }.get(name, torch.float32)
+
+
+def _build_random_inputs(batch, seq_len, vocab_size, device):
+    g = torch.Generator().manual_seed(0)
+    input_ids = torch.randint(0, vocab_size, (batch, seq_len), generator=g, dtype=torch.int64)
+    return input_ids.to(device)
+
+
+def _safe_scaled_int(value, scale, min_value=1):
+    return max(min_value, int(round(float(value) * float(scale))))
+
+
+def _round_to_multiple(value, multiple, min_value=1):
+    if multiple is None or multiple <= 0:
+        return max(min_value, int(value))
+    v = max(min_value, int(value))
+    return max(min_value, ((v + multiple - 1) // multiple) * multiple)
+
+
+def _maybe_scale_config(config, scale=1.0, max_layers=None):
+    if scale == 1.0 and max_layers is None:
+        return config
+
+    if hasattr(config, "hidden_size"):
+        config.hidden_size = _safe_scaled_int(config.hidden_size, scale)
+    if hasattr(config, "intermediate_size"):
+        config.intermediate_size = _safe_scaled_int(config.intermediate_size, scale)
+    if hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = _safe_scaled_int(config.num_hidden_layers, scale)
+    if hasattr(config, "num_attention_heads"):
+        config.num_attention_heads = _safe_scaled_int(config.num_attention_heads, scale)
+    if hasattr(config, "num_key_value_heads"):
+        config.num_key_value_heads = min(
+            _safe_scaled_int(config.num_key_value_heads, scale),
+            config.num_attention_heads,
+        )
+
+    for name in [
+        "n_routed_experts",
+        "n_shared_experts",
+        "num_local_experts",
+        "num_experts",
+        "num_experts_per_tok",
+        "moe_intermediate_size",
+        "shared_expert_intermediate_size",
+    ]:
+        if hasattr(config, name):
+            setattr(config, name, _safe_scaled_int(getattr(config, name), scale))
+
+    # DeepSeek MoE gate expects n_routed_experts to be divisible by n_group.
+    if hasattr(config, "n_routed_experts") and hasattr(config, "n_group"):
+        config.n_routed_experts = _round_to_multiple(
+            config.n_routed_experts,
+            config.n_group,
+            min_value=max(1, int(config.n_group)),
+        )
+
+    if max_layers is not None and hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = max(1, min(int(max_layers), int(config.num_hidden_layers)))
+
+    if hasattr(config, "hidden_size") and hasattr(config, "num_attention_heads"):
+        config.hidden_size = max(
+            config.num_attention_heads,
+            (config.hidden_size // config.num_attention_heads) * config.num_attention_heads,
+        )
+
+    return config
+
+
+def _apply_preset(scale, max_layers, batch, seq_len, preset):
+    if preset == "tiny":
+        return 0.03, 4, 1, min(seq_len, 16)
+    if preset == "small":
+        return 0.07, 8, 1, min(seq_len, 32)
+    if preset == "medium":
+        return 0.10, 12, 1, min(seq_len, 48)
+    return scale, max_layers, batch, seq_len
+
+
+@torch.no_grad()
+def run_deep_seek_v3_base_test(
+    model_id,
+    device,
+    init_mode="config-random",
+    scale=1.0,
+    max_layers=None,
+    dtype="float16",
+    batch=1,
+    seq_len=32,
+    use_tokenizer=False,
+    prompt="Hello, DeepSeek V3",
+    trust_remote_code=False,
+    revision=None,
+    compile_model=False,
+):
+    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+    torch_dtype = _dtype_from_str(dtype)
+
+    # Load model config
+    config = AutoConfig.from_pretrained(
+        model_id,
+        trust_remote_code=trust_remote_code,
+        revision=revision,
+    )
+
+    # Some remote model codes expect quantization_config to stay object-like
+    # (call .to_dict()), so only disable it for pretrained loading path.
+    if init_mode == "pretrained" and getattr(config, "quantization_config", None) is not None:
+        config.quantization_config = None
+
+    config = _maybe_scale_config(config, scale=scale, max_layers=max_layers)
+
+    if init_mode == "config-random":
+        model = AutoModelForCausalLM.from_config(
+            config=config,
+            trust_remote_code=trust_remote_code,
+        ).eval()
+        model = model.to(dtype=torch_dtype)
+    elif init_mode == "pretrained":
+        # Load model(weights)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            config=config,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        ).eval()
+    else:
+        raise ValueError(f"Unsupported init mode: {init_mode}")
+
+    model = model.to(device)
+    model_params = sum(p.numel() for p in model.parameters())
+    print("init mode:", init_mode)
+    print("scaled hidden_size:", getattr(config, "hidden_size", "n/a"))
+    print("scaled num_hidden_layers:", getattr(config, "num_hidden_layers", "n/a"))
+    print("scaled num_attention_heads:", getattr(config, "num_attention_heads", "n/a"))
+    print("model params:", model_params)
+
+    # Load tokenizer
+    if use_tokenizer:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+        )
+        encoded = tokenizer(prompt, return_tensors="pt")
+        input_ids = encoded["input_ids"].to(device)
+    else:
+        vocab_size = getattr(config, "vocab_size", None)
+        if vocab_size is None:
+            raise ValueError("Config has no vocab_size; use --use-tokenizer or pass a model with vocab_size.")
+        input_ids = _build_random_inputs(batch, seq_len, vocab_size, device)
+
+    if compile_model:
+        model = torch.compile(model, dynamic=False)
+
+    out = model(input_ids)
+    logits = out.logits
+    
+    print("logits shape:", tuple(logits.shape))
+    print("logits dtype:", logits.dtype)
+    print("logits max:", logits.max().item())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="DeepSeek V3 download-based test")
+    parser.add_argument("--model-id", type=str, default=os.environ.get("DEEPSEEK_V3_MODEL_ID", "deepseek-ai/DeepSeek-V3-Base"))
+    parser.add_argument("--revision", type=str, default=None)
+    parser.add_argument("--trust-remote-code", action="store_true", default=True)
+    parser.add_argument("--init-mode", type=str, default="config-random", choices=["config-random", "pretrained"])
+    parser.add_argument("--preset", type=str, default="tiny", choices=["none", "tiny", "small", "medium"])
+    parser.add_argument("--scale", type=float, default=1.0)
+    parser.add_argument("--max-layers", type=int, default=None)
+    parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"])
+    parser.add_argument("--batch", type=int, default=1)
+    parser.add_argument("--seq-len", type=int, default=32)
+    parser.add_argument("--use-tokenizer", action="store_true")
+    parser.add_argument("--prompt", type=str, default="Hello, DeepSeek V3")
+    parser.add_argument("--compile", action="store_true", default=True)
+
+    args = parser.parse_args()
+
+    if not args.model_id:
+        print("Error: --model-id is required (or set DEEPSEEK_V3_MODEL_ID).", file=sys.stderr)
+        sys.exit(2)
+
+    args.scale, args.max_layers, args.batch, args.seq_len = _apply_preset(
+        args.scale, args.max_layers, args.batch, args.seq_len, args.preset
+    )
+
+    device = torch.device("npu:0")
+
+    run_deep_seek_v3_base_test(
+        model_id=args.model_id,
+        device=device,
+        init_mode=args.init_mode,
+        scale=args.scale,
+        max_layers=args.max_layers,
+        dtype=args.dtype,
+        batch=args.batch,
+        seq_len=args.seq_len,
+        use_tokenizer=args.use_tokenizer,
+        prompt=args.prompt,
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+        compile_model=args.compile,
+    )

From 9a27549ad72880de8046424f8f6102719a549513 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 25 Feb 2026 14:06:30 +0000
Subject: [PATCH 106/194] [Fix] Polish the error handling of dram_stride
 calculation

---
 .../_C.cpython-311-x86_64-linux-gnu.so        | Bin 15312 -> 0 bytes
 .../mlir/mlir_codegen_backend.py              |  76 +++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        |   4 +
 3 files changed, 68 insertions(+), 12 deletions(-)
 delete mode 100755 PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so

diff --git a/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so b/PyTorchSimDevice/torch_openreg/_C.cpython-311-x86_64-linux-gnu.so
deleted file mode 100755
index 04b3b4e1cb7232dbb845c2f33fe24d94c640b705..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 15312
zcmeHOU1%It6uz6Z8tqTk(rRqQj?!YQ?If)!+DdHFH0eZ}Hl`^e{*1G`lkCd=q`R}4
z^`Taaf0RN&d{bzB5Px0-1@%F#RHRBDT0sy6DPkd2(W)rbhmPmWoNqIo?urx<gnMD<
zyXW_wJ@@Y3JG*Bd8`wJ7l1M1DR&~3Yp_J+sL2?~5n0Kq1L)Rs0aiBYEMsz`M#q5v;
zRLPL$F#?g$-~-o75J2#m9gJs$epC$K5jHStcL~Wl%uu1@_Ve+qjHg<}fyBim+pQ40
z6=DZGD0aX$G3OZh^@{k5qak|0xZP2)i{mn}1M(Q$cAWAu7c<_7*v0b*dR6FN0WscP
zWXE~=rw!9QD&sYqhiS<tP{<&c?1R_LmTfzEnhtX;tr7tD7YfbtuYY;>r|frkcfRy^
z_nl9q2fuy)#V>nU74yk{IsPE;5**X5``+^jnzlib7!9PHpGJIk-H2}!e*wMXjpH)n
zOrm$vYk``pj4MLnWzezhi9)GpS3IZe*|xHW#)j>TTXrXM70)e?4fp3uMR&|e<=s%$
zSYoHA9D6)hbn>}JT{Ti0D(1*rzseDApLC0(?!<5@Qza+)T*@nz(^)%}D`s-ViHcb%
zsm{`**O@LAGfpSTH!RyeI<#eI8}|_=K5tm(NqHZJe4fBRE_2b=8(M-7`sl`x&vV7O
zLOUMe%SR^=eG%bft+3!^gfCxFp{w2yE+xQP4>|g(GoUk|GoUk|GoUk|GoUk|Gw}b<
zz_0Ds{%P&~q0QPi`$VTw){}?57XP@l_oKEW!JG5feM)S9`7ye-FYQ&VpJmDEZ+zb$
zKftuVd^btQ+m~)uf!tsIa-FvJ_q<AkwSV?Y4C~K&i)g3^xKBe}=AUd|x`*8z5hVRu
zpueunu=1Ss>>2CBA2(Vbo^7=fA6qBRc?-$GB5}~>pA5%^J;$@BXB~-E^`@QH-kxvx
z#@%}MlsDJf*K;NDr-vx;=?q;yo;{D~#QkJjAD`_{KSDx@C!lX2n!Ip7$W=W%#MDh^
zKxaT_KxaT_KxaT_KxaT_KxaT_KxaT_;D3;T#FEzA_`gg3ugf^&`xap@;UwV*;WNzt
z+4m0;a^wG4xg{PQRf(x&V#(Y~+YZnlam$Ez4ZV*4<ogDpnx7n;-*HF#oJZRBsEv!)
z-+1$~>zU2=^fQ4vN_z5FAF~7geT0W&eAe;kHAS)1|MMhTH=O~U0i6Mz0i6Mz0i6Mz
z0i6Mz0i6Mzfqy3hsLw<_CTcBriTifJrv&F>Sh=a2C-f?*^SoMU)PXJ$8uguvg+@In
z-%E0X{I_#{iRm*^+=-ga!&21A^`P83guXl^)gi$-f*pdW1)Y{UP}Gb<j{97$1LbyA
z2r4N9gP-LaXKIM^m5@}pSB1Yn{4jouD{B84pk6&bV3*7n>T9?3_ir#(>`YaQUe#E$
z#_Tq`R<EfB<eJ@U*P313=Gv7ai2I7tyk2IhpZRa|^BS*{7OG=@FnaS(y!paE5aVAb
ze5^wvgLVsD6Y5Hp><wwG&;Edag~B=trAj?S9Ud3!8vKU?{iMRW1pj&JG>X63`+6bp
z)2`k!;9&+E`FSnSU!dai@@BwaRDWOa@`3O%Pv9S;P7ANTSl&Jh6ez0(G($W^(4yuk
zd@dpVo;nHo6$YON@cGdY14IOq#BWrOzPO2gcN6|d;3wi(x~V4mDqZ#}UUhujOsknP
zciSu2X)biHDBI4I?1_9S<>c)QRjOBPr#hw5rNU(1_1uiv)mVdz-*dK8E;}{bEqdje
z8ZSEq*UnT6g&LVeD4UDo&r_v<QWTGE503T?57-0SHghHZ=6kmF4G;Abo2u}^v@?~G
zopp*CuE%FbB&O#T>;X%Rt<9q<2u9gkwr=0lx7FT0IJjeA%pU98v~_^oHY?Q+D*qc-
z_y5f^0T&kd#~vt9W~El}oD`u~4l#>fvE;etM6qg4mP?av*{dnD&Pr8t`ONBEMg<C4
zr;=4>rdA|pA$nzHmfgI=OzBPLJ!J+tF{x`#l!)X`#Z4>IbEk;pSyHB(mHEhK$P1{@
zHk_DH6MY)ODdflrxnz$Nf#W&KOjRn%q@_`y8NYA|UKEG-HQpPrpEP^!2hOo?faYqY
zsC~2Nf1oeh4@lKVz29lC!T8uOf&4oyH`v9;zmsIVSHd3qDG>V`Z-ev^t?zr-=XMZ}
z_c+*Ne+C{RS+uc_XuFpPjt%zMUx64u9zVu+#eS1G#eNNh9^3OCkM|EgA2K*5oG0qv
ztMdJda|@{i4RL_xeI{yeh+Q3_D2T6ZU^QmnD*_<&;082scg%i71VHFDvwu2f|DYHE
zKY^iO+~59~J@y$O&V_Ij&);)mkNH=IXbP?p5)ijPL;}t~?7NIm6ZlL>)~}d<FkYt)
zpG9y^!?^&@tHt#}1K%cFJ{!Xx=MaNG*wmmGQZqF8M@YbH5B4}8;Q7OY{N||1F|_Uv
z^H3uhtH2)5OC9ZGE2WI!I&;CeKt4}I?QtHM{)hIT$Lw)lY3mBd#5jl`PJcxL<{^HZ
z?~+aIB^_|5PEwC~qF(32es!G_c3>YxtevQ8uQ;3(A>Yq5`u^Z^KVpyCzp*x?c3~h9
V#z@7tOO5QW>kbW0iya_t{}-!tJ*fZy

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 62acd877..d6ddb025 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1,5 +1,6 @@
 import contextlib
 import sympy
+import sys
 import re
 import os
 from functools import reduce
@@ -375,13 +376,51 @@ def _convert_sympy_to_mlir_expr(self, expr, sorted_args):
             expr = expr.replace(target_arg, new_arg)
             indices.append(str(new_arg))
 
-        expr_str = str(expr)
-        if "ModularIndexing" in expr_str:
-            def _replace_mod(m):
-                return f"({m.group(1)} floordiv {m.group(2)}) mod {m.group(3)}"
-            expr_str = re.sub(r"ModularIndexing\(([^,]+), ([^,]+), ([^)]+)\)", _replace_mod, expr_str)
-        if "//" in expr_str:
-            expr_str = expr_str.replace("//", " floordiv ")
+        # Convert ModularIndexing and FloorDiv to sympy expressions
+        # ModularIndexing(x, y, z) means (x // y) % z -> Mod(FloorDiv(x, y), z)
+        # FloorDiv(x, y) means x // y -> will be converted to floordiv in string representation
+        # Use preorder_traversal to find all instances
+        replacements = {}
+        for sub in sympy.preorder_traversal(expr):
+            if isinstance(sub, ModularIndexing):
+                # Convert ModularIndexing to Mod(FloorDiv(...), ...)
+                if sub.args[1] != 1:
+                    floor_div = FloorDiv(sub.args[0], sub.args[1])
+                else:
+                    floor_div = sub.args[0]
+                mod_expr = sympy.Mod(floor_div, sub.args[2])
+                replacements[sub] = mod_expr
+            elif isinstance(sub, FloorDiv):
+                # Keep FloorDiv as is, will be handled in custom string conversion
+                # We need to mark it for special handling
+                pass
+
+        # Apply replacements
+        for old_expr, new_expr in replacements.items():
+            expr = expr.subs(old_expr, new_expr)
+
+        # Custom string conversion for MLIR affine expressions
+        def mlir_str(expr):
+            """Convert sympy expression to MLIR affine expression string"""
+            if isinstance(expr, FloorDiv):
+                return f"({mlir_str(expr.args[0])} floordiv {mlir_str(expr.args[1])})"
+            elif isinstance(expr, sympy.Mod):
+                return f"({mlir_str(expr.args[0])} mod {mlir_str(expr.args[1])})"
+            elif isinstance(expr, sympy.Add):
+                terms = [mlir_str(term) for term in expr.args]
+                return " + ".join(terms)
+            elif isinstance(expr, sympy.Mul):
+                factors = [mlir_str(factor) for factor in expr.args]
+                return " * ".join(factors)
+            elif isinstance(expr, sympy.Symbol):
+                return str(expr)
+            elif expr.is_number:
+                return str(expr)
+            else:
+                # Fallback to string representation
+                return str(expr)
+
+        expr_str = mlir_str(expr)
         return expr_str, indices
 
     def parse_indices(self, expr, comments="", indices=None, indirect_dims=[]) -> common.CSEVariable:
@@ -1174,17 +1213,30 @@ def get_dma_info(self, name, index, broadcast=True, store_reduction=False, buffe
         elif index.is_Number:
             pass
         else:
-            dram_dict = defaultdict(lambda: 0)
+
+            dram_dict = defaultdict(list)
+            implicit_dim_divisors = defaultdict(lambda: sys.maxsize)
             # Assume that div will have high priority than mod
             for arg in index.as_ordered_terms():
                 coeff, dim = arg.as_coeff_mul()
                 if len(dim) == 0:
                     continue
                 real_dim = list(dim[0].free_symbols)[0]
-                real_dim_name = str(real_dim)
-                if real_dim_name.startswith("index"):
-                    dram_dict[int(real_dim_name[5:])] += int(coeff)
-            dram_stride = [dram_dict[dim] for dim in local_dims]
+                if dim[0].has(ModularIndexing):
+                    if dim[0].args[1] < implicit_dim_divisors[str(real_dim)]:
+                        implicit_dim_divisors[str(real_dim)] = dim[0].args[1]
+                        dram_dict[str(real_dim)] = [coeff]
+                else:
+                    dram_dict[str(real_dim)].append(coeff)
+
+            # Add missing dims if not added
+            max_dim = len(self.ranges) if not store_reduction else len(self.ranges) - 1
+            for i in range(max_dim):
+                target_dim = f"index{i}"
+                if sympy.Symbol(target_dim) not in index.free_symbols:
+                    dram_dict[target_dim] = [0]
+            sorted_keys = sorted(dram_dict.keys())
+            dram_stride = sum((dram_dict[key] for key in sorted_keys), [])
 
         # Support floordiv pattern
         # FIXME. How to integrate implicit dims and floordiv?
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 7eb8f7f1..34b185b8 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -923,6 +923,10 @@ def indirect_indexing(index_var, size, check=True, wrap_neg=True):
                 # Skip CSE since this doesn't return an expression
                 return self.indirect_indexing(index_var, size, check, wrap_neg)
 
+            @staticmethod
+            def check_bounds(index, size, lower, upper):
+                return self.check_bounds(index, size, lower, upper)
+
             @staticmethod
             def load(name: str, index: sympy.Expr):
                 index = self.rename_indexing(index)

From 9b92f11f5aea7517093f748903c811564125b81b Mon Sep 17 00:00:00 2001
From: jung-min <wjdals020503@naver.com>
Date: Mon, 2 Mar 2026 07:58:59 +0000
Subject: [PATCH 107/194] [Frontend/template] add SDPA modules

---
 .../torch_openreg/openreg/__init__.py         |   7 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  25 +-
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 664 ++++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_template.py      | 101 ++-
 tests/test_sdpa.py                            |  84 +++
 5 files changed, 878 insertions(+), 3 deletions(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_sdpa_template.py
 create mode 100644 tests/test_sdpa.py

diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 8d62cee3..5a0de6c3 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -24,7 +24,7 @@ class device:
 
     def __init__(self, device):
         self.idx = torch.accelerator._get_device_index(device, optional=True)
-        self.prev_idx = -1
+        self.prev_idx = -1 
 
     def __enter__(self):
         self.prev_idx = torch_openreg._C._exchangeDevice(self.idx)
@@ -64,6 +64,11 @@ def _lazy_init():
     global _initialized, _tog_simulator
     if is_initialized():
         return
+
+    # Replace the global C++ binding with our custom dispatcher patch
+    from PyTorchSimFrontend.mlir.mlir_sdpa_template import patched_scaled_dot_product_attention
+    torch._C._nn.scaled_dot_product_attention = patched_scaled_dot_product_attention
+    
     torch_openreg._C._init()
     register_interface_for_device(custom_device(), ExtensionDeviceInterface)
     _initialized = True
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index ebf0c80e..e09dcf57 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -15,6 +15,7 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
+from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args
 from PyTorchSimFrontend import extension_config
 
 aten = torch.ops.aten
@@ -38,6 +39,26 @@ def tuned_bmm(mat1, mat2, *, layout=None):
 
     return mlir_template.generate().output_node()
 
+
+def tuned_flash_sdpa(
+        query             : TensorBox, 
+        key               : TensorBox, 
+        value             : TensorBox, 
+        scale             : float, 
+        dropout_p         : float = 0.0, 
+        is_causal         : bool = False, 
+        return_debug_mask : bool =False) -> tuple: 
+    
+    print("Enter tuned_flash_sdpa")
+
+    N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value)
+    mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale)
+
+    # _scaled_dot_product_flash_attention has to return a tuple which has 9 values
+    # since its backward(_scaled_dot_product_flash_attention_backward) needs that values.
+    # (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
+    return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None)
+
 def conv_layout(
     x: TensorBox,
     weight: TensorBox,
@@ -188,4 +209,6 @@ def custom_unsafe_index(x, indices):
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
 lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()})
 if extension_config.CONFIG_USE_TIMING_POOLING:
-    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
+    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
+
+lowerings.update({getattr(aten._scaled_dot_product_flash_attention, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_flash_attention.overloads()})
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
new file mode 100644
index 00000000..b3d88cc6
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -0,0 +1,664 @@
+import math # sqrt
+import sympy
+
+from typing import List, Optional
+
+import torch
+from torch import empty_strided
+from torch._inductor.ir import IRNode, TensorBox, FixedLayout
+from torch._inductor.virtualized import V
+from torch._inductor.select_algorithm import realize_inputs
+from torch.backends.cuda import flash_sdp_enabled, mem_efficient_sdp_enabled
+
+from PyTorchSimFrontend import extension_config
+from PyTorchSimFrontend.mlir import mlir_common
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
+
+
+def flash_sdpa_args(
+        query : TensorBox, 
+        key   : TensorBox, 
+        value : TensorBox) -> list:
+    """
+    Arg processing for flash SDPA.
+    Its logic is based on: 
+    mm_args() which is in torch._inductor.kernel.mm_common.py (142 line).
+    """
+
+    # Materialize input buffers for the codegen backend. 
+    query, key, value = realize_inputs(query, key, value)
+
+    # query : (n, hq, l, e)
+    # key   : (n, h, s, e)
+    # value : (n, h, s, ev)
+    # out   : (n, hq, l, ev)
+    # n: Batch size
+    # hq: query's head counts, h: key and value's head counts.
+    # l: target sequence lenght and s: source sequence length.
+    # e: embeding dimension of the query and key and ev: embeding dimension of the value.
+    nq, hq, l, eq  = query.get_size()
+    nk, hk, sk, ek = key.get_size()
+    nk, hv, sv, ev = value.get_size()
+
+    n = V.graph.sizevars.guard_equals(nq, nk)
+    n = V.graph.sizevars.guard_equals(nq, nk)
+    
+    h = V.graph.sizevars.guard_equals(hk, hv)
+    s = V.graph.sizevars.guard_equals(sk, sv)
+    e = V.graph.sizevars.guard_equals(eq, ek)
+
+    # While there are no theoretical requirements for e == ev, 
+    # this implementation enforces e == ev for simplicity. 
+    # Distinct notations are still maintained to ensure future compatibility and clarity.
+    if e != ev:
+        raise NotImplementedError("Flash SDPA does not support mismatched head dimensions between query and value.")
+   
+    # Flash attention does not split tiles along the head dimension (e or ev).
+    # Therefore, the head dimension size must be less than or equal to the number of vlanes.
+    vector_lane = extension_config.vpu_num_lanes 
+    if e > vector_lane or ev > vector_lane:
+        raise ValueError(f"The head dimension size must be less than or equal to the number of vlanes (e: {e}, ev: {ev}, vlanes: {vector_lane}).")
+    
+    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
+    # Instead, the Flash SDPA implementation infers GQA usage by checking if hq != hk.
+    # The Flash SDPA for GQA will be implemented after implementing its native version.
+    if hq != h :
+        raise NotImplementedError("Flash SDPA for GQA is not supported yet.")
+    
+    layout = FixedLayout(
+        query.get_device(),
+        query.get_dtype(),
+        [n, hq, l, ev]
+    )
+
+    return [n, hq, h, l, s, e, ev, layout, query, key, value]    
+    
+def validate_sdpa_input(
+        query       : torch.Tensor,
+        key         : torch.Tensor,
+        value       : torch.Tensor,
+        attn_mask   : torch.Tensor = None,
+        dropout_p   : float = 0.0, 
+        is_casual   : bool = False,
+        scale       : float = None,
+        enable_gqa  : bool = False) -> None:
+    """
+    Validates input tensors and parameters for Scaled Dot Product Attention (SDPA).
+    This function's logic can be found in:
+    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp(504 line)
+    https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+    """
+
+    # Tensor class, dtype, and device consistency
+    # Ensure all primary inputs are torch.Tensors
+    if not all(isinstance(t, torch.Tensor) for t in [query, key, value]):
+        raise TypeError(
+            f"Expected query, key and value to be Tensors, but got "
+            f"{type(query).__name__}, {type(key).__name__}, and {type(value).__name__}."
+        )
+
+    # Check for dtype mismatch
+    if query.dtype != key.dtype or query.dtype != value.dtype:
+        raise TypeError(
+            f"Expected query, key, and value to have the same dtype, "
+            f"but got {query.dtype}, {key.dtype}, and {value.dtype}."
+        )
+    
+    # Check for device mismatch (e.g., mixing CPU and NPU)
+    if query.device != key.device or query.device != value.device:
+        raise ValueError(
+            f"Expected query, key, and value to be on the same device, "
+            f"but got {query.device}, {key.device}, and {value.device}."
+        )
+
+    # Shape and dimension validation
+    # SDPA typically expects 4D (B, H, S, D), but we check for at least 2D here
+    if any(t.dim() < 2 for t in [query, key, value]):
+        raise ValueError(
+            f"Expected query, key, and value to be at least 2D, "
+            f"but got Q:{query.dim()}D, K:{key.dim()}D, V:{value.dim()}D."
+        )
+
+    # Attention mask validation
+    if attn_mask is not None:
+        if not isinstance(attn_mask, torch.Tensor):
+            raise TypeError(f"Expected attn_mask to be a Tensor, but got {type(attn_mask).__name__}.")
+        
+        # Dtype check: floating point masks must match query dtype; bool masks are also allowed
+        if attn_mask.dtype.is_floating_point:
+            if attn_mask.dtype != query.dtype:
+                raise TypeError(f"Floating point attn_mask must match query dtype ({query.dtype}), but got {attn_mask.dtype}.")
+        elif attn_mask.dtype != torch.bool:
+            raise TypeError(f"attn_mask must be floating point or bool, but got {attn_mask.dtype}.")
+
+        # Nested tensor limitation with explicit masking
+        if query.is_nested or key.is_nested:
+            raise ValueError("Nested tensors are not supported when an explicit attn_mask is set.")
+
+    # Dropout and causal flag validation (added)
+    # Dropout probability must be in the range [0, 1)
+    if not (0.0 <= dropout_p < 1.0):
+        raise ValueError(f"Expected dropout_p to be in [0, 1), but got {dropout_p}.")
+
+    # Mutual exclusivity: cannot use both explicit mask and causal flag (added)
+    if is_casual and attn_mask is not None:
+        raise ValueError("Both attn_mask and is_casual cannot be set at the same time.")
+
+    # Scaling factor validation (added)
+    if scale is not None and scale <= 0.0:
+        raise ValueError(f"Expected scale to be a positive number, but got {scale}.")
+    
+    # GQA (Grouped Query Attention) constraints (added)
+    n_head_q = query.size(1)
+    n_head_k = key.size(1)
+    n_head_v = value.size(1)
+    
+    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
+    # Instead, the Flash SDPA implementation infers GQA usage by checking if n_head_q != n_head_k.
+    if not enable_gqa and n_head_q != n_head_k:
+        raise ValueError(f"Query and Key must have the same number of heads when enable_gqa is false (Q:{n_head_q} vs K:{n_head_k}).")
+
+    if enable_gqa:
+        if n_head_q == n_head_k:
+            raise ValueError(f"enable_gqa Query and Key ")
+
+        if n_head_k != n_head_v:
+            raise ValueError(f"Key and Value must have the same number of heads (K:{n_head_k} vs V:{n_head_v}).")
+        
+        # Query heads must be an integer multiple of key heads for grouping
+        if n_head_q % n_head_k != 0:
+            raise ValueError(
+                f"Number of query heads ({n_head_q}) must be divisible by "
+                f"number of key heads ({n_head_k}) for GQA."
+            )
+
+def convert_boolean_attn_mask(attn_mask: torch.Tensor, target_dtype: torch.dtype) -> float:
+    """
+    Equivalent to the C++ 'convert_boolean_attn_mask' function.
+    Converts a boolean mask to a floating-point mask for SDPA.
+    """
+
+    if attn_mask is not None and attn_mask.dtype == torch.bool:
+      
+        new_mask = torch.zeros_like(attn_mask, dtype=target_dtype)
+        minus_inf = torch.finfo(target_dtype).min
+        new_mask.masked_fill_(attn_mask.logical_not(), minus_inf)
+        
+        return new_mask
+
+    return attn_mask
+
+def calculate_scale(query: torch.Tensor, scale: float) -> float:
+    """
+    Calculate the scaling factor based on the head dimension if scale is None
+    Otherwise, use the provided scale.
+    """
+    if scale is None:
+        return 1.0 / math.sqrt(query.size(-1))
+    else:
+        return scale
+
+def patched_scaled_dot_product_attention(
+        query_      : torch.Tensor,
+        key         : torch.Tensor, 
+        value       : torch.Tensor, 
+        dropout_p   : float = 0.0, 
+        is_casual   : bool = False, 
+        attn_mask_  : torch.Tensor = None,
+        scale_       : float = None, 
+        enable_gqa  : bool = None,
+        orig_fn     = torch._C._nn.scaled_dot_product_attention) -> torch.Tensor :
+    """
+    Custom patch for Scaled Dot Product Attention (SDPA) to intercept high-level calls.
+    For NPU devices, it redirects execution to specific ATen kernels based on global flags.
+    For all devices, it maintains parity with the original dispatcher logic found in:
+    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp
+
+    This function acts as a custom override that replaces the default PyTorch SDPA implementation, 
+    invoked via 'PyTorchSim/PyTorchSimDevice/torch_openreg/openreg/__init__.py'.
+    """
+
+    # Device-specific Dispatching: redirect to specialized kernels if on NPU
+    if "npu" in str(query_.device):
+        
+        validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_casual, scale_, enable_gqa)
+        attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype)
+        
+        # Kernel selection logic: emulate C++ dispatcher priority
+        # Selection priority(can be changed): flash attention > memory efficient > math (cuDNN is not supported)
+        aten = torch.ops.aten 
+        scale = calculate_scale(query_, scale_)
+
+        if flash_sdp_enabled(): 
+            # Skip padding query, key and value for alignment.
+            dispatch_kwargs = {
+                "dropout_p" : dropout_p,
+                "is_causal" : is_casual,
+                "return_debug_mask" : False,
+                "scale" : scale
+            }
+            
+            out_lse_softmax = aten._scaled_dot_product_flash_attention(
+                query_, key, value, **dispatch_kwargs 
+            )
+
+            return out_lse_softmax[0]
+        elif mem_efficient_sdp_enabled():
+            # out_and_lse = aten._scaled_dot_product_efficient_attention(...)
+            # return out_and_lse[0]
+            raise NotImplementedError("Memory efficient SDPA is not implemented yet.")
+        else:
+            dispatch_kwargs = {
+                "attn_mask" : attn_mask,
+                "dropout_p" : dropout_p,
+                "is_causal" : is_casual,
+                "dropout_mask" : None,
+                "scale": scale,
+                "enable_gqa" : enable_gqa
+            }
+
+            out_lse_softmax = aten._scaled_dot_product_attention_math(
+                query_,
+                key,
+                value, 
+                **dispatch_kwargs)
+            
+            return out_lse_softmax[0]
+    else: 
+        # Fallback: Delegate to the original C++ Dispatcher for other devices 
+        return orig_fn(query_, key, value)
+
+FLASH_SDPA_TEMPLATE = r"""
+// SDPA kernel
+// b = {{ b }}
+// l = {{ l }}
+// s = {{ s }}
+// e = {{ e }}
+// tile_l = {{ tile_l }}
+// tile_s = {{ tile_s }}
+// tile_e = {{ tile_e }}
+// subtile_l = {{ subtile_l }}
+// subtile_s = {{ subtile_s }}
+// subtile_e = {{ subtile_e }}
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} {
+  // Inputs
+  {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
+  
+  // Output
+  {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }}
+
+  // Intermediate buffers
+  {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
+  
+  // Constants
+  %c0 = arith.constant 0.0 : {{ data_stype }}
+  %c1 = arith.constant 1.0 : {{ data_stype }}
+  %c_scale = arith.constant {{ scale }} : {{ data_stype }}
+  %c_neg_inf = arith.constant -1.0e+30 : {{ data_stype }}
+
+  %v0_c = arith.constant dense<0.0> : vector<{{ chunk_size }}x{{ data_stype }}>
+  %v0_l = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}>
+  %v0_s = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}>
+  %v0_2x = arith.constant dense<0.0> : vector<2x{{ data_stype }}>
+
+  %v_neg_inf_c = arith.constant dense<-1.0e+30> : vector<{{ chunk_size }}x{{ data_stype }}>
+  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ data_stype }}>
+
+  %v_scale = vector.broadcast %c_scale : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}>
+  
+  {{ kernel.def_local_vars(indent_size=2) }}  
+  
+  affine.for %index0 = 0 to {{ b }} {
+    affine.for %index3 = 0 to 1 step 1 {
+      affine.for %index1 = 0 to {{ l }} step {{ tile_l }} {
+        {{ kernel.def_dma_op("MVIN", "query", q_idx, q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8) }}  
+        
+        affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}>
+        affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> 
+        affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+              
+        %qt_buffer2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ q_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>
+        %ot_buffer2D = memref.reinterpret_cast %out_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ out_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>
+
+        affine.for %index2 = 0 to {{ s }} step {{ tile_s }} {
+          {{ kernel.def_dma_op("MVIN", "key", k_idx, k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }} 
+          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }}
+
+          affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}>        
+
+          %k_buffer2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>
+          %vt_buffer2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>
+
+          
+          // key @ query.t and scaling.
+          linalg.matmul 
+            ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
+            outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }})
+
+          %raw_mul_vec = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
+          %scaled_mul_vec = arith.mulf %raw_mul_vec, %v_scale :  vector<{{ tile_s }}x{{ data_stype }}>
+          affine.vector_store %scaled_mul_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
+
+          
+          // Find new max.
+          %old_max = affine.vector_load %max_buffer[0,0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+
+          %chunk_max_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_max=%v_neg_inf_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) {
+            %chunk_val = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}>
+            %local_max = arith.maximumf %chunk_val, %iter_max : vector<{{ chunk_size }}x{{ data_stype }}>
+            affine.yield %local_max : vector<{{ chunk_size }}x{{ data_stype }}>
+          }
+
+          %max_cast = vector.shape_cast %chunk_max_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}>
+          %max_reduced_1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}>
+          %max_shuffled = vector.shuffle %max_reduced_1, %max_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}>
+          %max_reduced_2 = arith.maximumf %max_reduced_1, %max_shuffled : vector<2x{{ data_stype }}>
+          
+          %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}> 
+          affine.vector_store %new_max, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+          
+
+          // Compute rescale factors: exp(old_max - new_max)
+          %max_diff = arith.subf %old_max, %new_max : vector<2x{{ data_stype }}>
+          %max_diff_scalar = vector.extract %max_diff[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
+          
+          %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> 
+          %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> 
+
+          %rescale_bcast_2 = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<2x{{ data_stype }}>
+          %exp_rescale_2 = math.exp %rescale_bcast_2 : vector<2x{{ data_stype }}>
+
+          
+          // Rescale previous out and sum accumulators
+          %old_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
+          %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ data_stype }}>
+          affine.vector_store %rescaled_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
+
+          %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+          %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ data_stype }}>
+
+          
+          // Shift scores and apply exp: exp(x - new_max)
+          %scaled_scores_reload = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
+          %new_max_scalar = vector.extract %new_max[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
+          %new_max_bcast = vector.broadcast %new_max_scalar : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}>
+          
+          %shifted_scores = arith.subf %scaled_scores_reload, %new_max_bcast : vector<{{ tile_s }}x{{ data_stype }}>
+          %exp_scores = math.exp %shifted_scores :  vector<{{ tile_s }}x{{ data_stype }}>
+          affine.vector_store %exp_scores, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
+          
+
+          // accumulate current sum
+          %chunk_sum_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_sum=%v0_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) {
+            %chunk_exp = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}>
+            %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}>
+            affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}>
+          }
+          
+          %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}>
+          %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}>
+          %sum_reduced_1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}>
+          %sum_shuffled = vector.shuffle %sum_reduced_1, %sum_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}>
+          %sum_reduced_2 = arith.addf %sum_reduced_1, %sum_shuffled : vector<2x{{ data_stype }}>
+          
+          %new_sum = arith.addf %sum_reduced_2, %rescaled_sum :  vector<2x{{ data_stype }}>
+          affine.vector_store %new_sum, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+
+          
+          // value.t @ mul
+          linalg.matmul 
+            { idx_map = array<i32: 2, 1, -1> }
+            ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }})
+            outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
+        }
+
+        // out @ row_sum^(-1)
+        %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
+        %one_2x = vector.broadcast %c1 : {{ data_stype }} to vector<2x{{ data_stype }}>
+        
+        %reciprocal_row_sum_2x = arith.divf %one_2x, %final_row_sum : vector<2x{{ data_stype }}>
+        %reciprocal_scalar = vector.extract %reciprocal_row_sum_2x[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
+        %reciprocal_bcast_e = vector.broadcast %reciprocal_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}>
+        
+        %accumulated_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
+        %stable_final_out = arith.mulf %accumulated_out, %reciprocal_bcast_e : vector<{{ tile_e }}x{{ data_stype }}>
+        affine.vector_store %stable_final_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
+
+        {{ kernel.store_output(indent_size=8) }}
+      } { accumulation_loop=true } 
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return 
+}
+"""
+
+class MLIRFlashSDPATemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, scale, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.scale = scale
+
+    def render(self,
+               kernel: MLIRTemplateKernel,
+               template_buffer_node = None,
+               epilogue_nodes: Optional[List[IRNode]] = None,
+               prologue_nodes: Optional[List[IRNode]] = None,
+               tile_info = None,
+               **kwargs):
+        
+        # Except for kernel, other arguments are usually None.
+        query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
+       
+        if tile_info is None:
+            tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = self.select_tile(kernel, l, s, e, n_extra_node, 0, n_prologue_node)[0]
+        else:
+            tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = tile_info
+
+        TOG_latency = l if tile_l > l else tile_l
+        kernel.loop_size = [TOG_latency, tile_s, tile_e]
+
+        # Select template code
+        # Other templates will be added according to situations.
+        nr_reduction_nodes = [node for node in epilogue_nodes if node.is_reduction()] if epilogue_nodes is not None else []
+        if nr_reduction_nodes:
+            raise NotImplementedError("FLASH_SDPA_REDUCTION_TEMPLATE is not implemented yet.")
+        elif prologue_nodes:
+            raise NotImplementedError("FLASH_SDPA_PROLOGUE_TEMPLATE is not implemented yet.")
+        else:
+            template = FLASH_SDPA_TEMPLATE
+            epilogue_dim_aliasing = {"index0":"index0", "index1":"index1", "index2": "index2", "index3": "index3"}
+            nr_rdim = 0
+
+        # Prepare tile descriptors for input and output tensors.
+        # Intermediate buffers (transient data) do not require DRAM settings(dram stride and dram indices)
+        # as they are not synchronized with external DRAM. 
+        # DRAM and SRAM tile shapes must match.
+        vlane_stride = 1
+        
+        # (n, l, s, e, ev)
+        loop_dim = [sympy.Symbol("index0"), sympy.Symbol("index1"), sympy.Symbol("index2"), sympy.Symbol("index3")]
+
+
+        # Hardware constraint: The tile split axis is restricted.
+        # To accommodate this, we compute (key @ query.t) instead of (query @ key.t).
+        # SRAM settings
+        vlane_split_axis = 1
+        q_tile_size = [1, tile_l, tile_e]
+        q_tile_stride = [0, tile_e, 1]
+        q_tile_desc = mlir_common.MLIRMultiDimTile(q_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        q_tile_desc.set_tile_size_stride(q_tile_size, q_tile_stride)
+        q_tile_desc.set_name("q_buffer")
+        q_tile_desc.offset = query.get_layout().offset
+        # DRAM settings 
+        q_stride = q_tensor.stride()
+        q_idx = [loop_dim[0]*q_stride[0], loop_dim[1]*q_stride[1], loop_dim[3]*q_stride[2]] # To keep index arguemnt order, we used index_list
+
+        # Since we use a weight-stationary approach in the Systolic Array (SA), 
+        # the split axis of the first operand differs from a standard linear algebra matmul.
+        # The first operand (key) must be split along the column axis.
+        # This logic aligns with the relationship between the dot product's summation direction and the hardware's accumulation direction in the SA.
+        # SRAM settings
+        vlane_split_axis = 2
+        k_tile_size = [1, tile_s, tile_e]
+        k_tile_stride = [0, 1, tile_s]
+        k_tile_desc = mlir_common.MLIRMultiDimTile(k_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        k_tile_desc.set_tile_size_stride(k_tile_size, k_tile_stride)
+        k_tile_desc.set_name("k_buffer")
+        k_tile_desc.offset = key.get_layout().offset
+        # DRAM settings
+        k_stride = k_tensor.stride()
+        k_idx = [loop_dim[0]*k_stride[0], loop_dim[2]*k_stride[1], loop_dim[3]*k_stride[2]]
+
+        # Since we compute mul = key @ query.t, we perform out.t = (value.t @ Softmax(mul).t).t,
+        # which simplifies to (value.t @ Softmax(mul))
+        # SRAM settings
+        vlane_split_axis = 1
+        v_tile_size = [1, tile_s, tile_e]
+        v_tile_stride = [0, tile_e, 1]
+        v_tile_desc = mlir_common.MLIRMultiDimTile(v_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        v_tile_desc.set_tile_size_stride(v_tile_size, v_tile_stride)
+        v_tile_desc.set_name("v_buffer")
+        v_tile_desc.offset = value.get_layout().offset
+        # DRAM settings
+        v_stride = v_tensor.stride()
+        v_idx = [loop_dim[0]*v_stride[0], loop_dim[2]*v_stride[1], loop_dim[3]*v_stride[2]] # To keep index arguemnt order, we used index_list
+
+        # Output is also stored in transposed format to match the value.t @ Softmax(mul) operation.
+        # SRAM settings
+        vlane_split_axis = 1
+        out_tile_size = [1, tile_l, tile_e] 
+        out_tile_stride=[0, tile_e, 1] 
+        out_tile_desc = mlir_common.MLIRMultiDimTile(out_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        out_tile_desc.set_tile_size_stride(out_tile_size, out_tile_stride)
+        out_tile_desc.set_name("out_buffer")
+        # DRAM settings
+        out_stride = out.get_layout().stride[1:]
+        out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]]
+
+        # Intermediate buffers
+
+        # For mul = key @ query.t
+        vlane_split_axis = 1
+        mul_tile_size = [tile_s, tile_l]
+        mul_tile_stride = [tile_l, 1]
+        mul_tile_desc = mlir_common.MLIRMultiDimTile(mul_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        mul_tile_desc.set_tile_size_stride(mul_tile_size, mul_tile_stride)
+        mul_tile_desc.set_name("mul_buffer")
+        #FIXME. What is the offset? -> It doesn't matter at this time.
+
+        # For storing maximum values per row
+        vlane_split_axis = 0
+        max_size = [tile_l, 2]
+        max_stride = [2, 1]
+        max_desc = mlir_common.MLIRMultiDimTile(max_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        max_desc.set_tile_size_stride(max_size, max_stride)
+        max_desc.set_name("max_buffer")
+
+        # For storing summation per row
+        vlane_split_axis = 0
+        sum_size = [tile_l, 2]
+        sum_stride = [2, 1]
+        sum_desc = mlir_common.MLIRMultiDimTile(sum_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        sum_desc.set_tile_size_stride(sum_size, sum_stride)
+        sum_desc.set_name("sum_buffer")
+
+        # For reduction
+        chunk_size = 16
+
+        kernel.render_options = dict(
+            KERNEL_NAME = self.name,
+            kernel = kernel,
+            b = b, 
+            l = l, 
+            s = s, 
+            e = e,                             # Input sizes (dram)
+            tile_l = tile_l, 
+            tile_s = tile_s, 
+            tile_e = tile_e,                   # Tile sizes (sram)
+            subtile_l = subtile_l, 
+            subtile_s = subtile_s, 
+            subtile_e = subtile_e,             # Subtile sizes (sram)  
+            data_stype="f32",
+            query = query, 
+            key = key,
+            value = value, 
+            out = out,                         # Inputs and output (dram)
+            q_idx = q_idx,
+            k_idx = k_idx,
+            v_idx = v_idx,
+            out_idx = out_idx,                 # Strides (dram)       
+            q_tile_desc = q_tile_desc,
+            k_tile_desc = k_tile_desc,
+            v_tile_desc = v_tile_desc,
+            mul_tile_desc = mul_tile_desc,
+            out_tile_desc = out_tile_desc,     # Tile descriptions (sram)
+            max_desc = max_desc,
+            sum_desc = sum_desc,               # Intermediate buffer descriptions (sram)
+            scale = self.scale,
+            chunk_size = chunk_size,        
+            input_reorder = self.input_reorder # ETC 
+        )
+
+        kernel.epilogue_info = dict(
+            output_node = self.output_node.name,
+            sram_var = "out_buffer",
+            dram_var = "out",
+            dram_idx = out_idx,
+            dram_tile_desc = out_tile_desc,
+            nr_rdim = nr_rdim,
+            r_dim_size = 0,
+            dim_aliasing = epilogue_dim_aliasing
+        )
+
+        code = self._template_from_string(template).render(**kernel.render_options)
+        kernel.add_loop_info([kernel.render_options["l"], kernel.render_options["s"], kernel.render_options["e"]], [kernel.render_options["tile_l"], kernel.render_options["tile_s"], kernel.render_options["tile_e"]])
+        return code
+
+    def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        
+        query = self.input_nodes[0]
+        key = self.input_nodes[1]
+        value = self.input_nodes[2]
+        out = self.output_node
+
+        q_tensor = empty_strided(query.layout.size, query.layout.stride)
+        k_tensor = empty_strided(key.layout.size, key.layout.stride)
+        v_tensor = empty_strided(value.layout.size, value.layout.stride)
+        out_tensor = empty_strided(out.layout.size, out.layout.stride)
+
+        # Flatten batch and head dimensions (n, h) into a single dimension (b = n*h)
+        q_tensor = q_tensor.view([-1, q_tensor.shape[-2], q_tensor.shape[-1]])
+        k_tensor = k_tensor.view([-1, k_tensor.shape[-2], k_tensor.shape[-1]])
+        v_tensor = v_tensor.view([-1, v_tensor.shape[-2], v_tensor.shape[-1]])
+        out_tensor = out_tensor.view([-1, out_tensor.shape[-2], out_tensor.shape[-1]])
+
+        b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), k_tensor.size(2), v_tensor.size(2) 
+
+        n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
+        n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
+
+        return query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node
+
+    # Reuse the existing function in MLIRBMMTemplate.
+    def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_node):
+
+        # FIXME: Update the method for getting tile candidates once TestDmaFineGrained oass works correctly with Flash Attention.
+        # tile_candidates = kernel.flash_sdpa_mapping(l, s, e, n_extra_node=n_extra_node)
+        tile_candidates = [[kernel.vector_lane, kernel.vector_lane, e]]
+
+        for idx, (tile_l, tile_s, tile_e) in enumerate(tile_candidates):
+            subtile_l = tile_l if (tile_l < kernel.vector_lane) or n_prologue_node else kernel.vector_lane
+            subtile_s = tile_s # if (tile_s < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
+            subtile_e = tile_e # if (tile_e < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
+
+            tile_candidates[idx] = tile_l,tile_s,tile_e,subtile_l,subtile_s,subtile_e
+
+        return tile_candidates
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b864e5f2..23f5e3dc 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -387,6 +387,100 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True)
         tile_candidates = [v for _, v in tile_candidates]
         return tile_candidates
+    
+    # Flash Attention requires more SRAM compared to standard GEMM.
+    # Total buffers needed: query, key, value, out, mul, max, sum
+    # Tensor Shapes:
+    #   query (tile_l, tile_e), key (tile_s, tile_e), value (tile_s, tile_e), mul (tile_s, tile_l), out(tile_l, tile_e)
+    #   max, sum : (tile_l, 2) 
+    def flash_sdpa_mapping(self, l, s, e, n_extra_node=0, n_prologue_node=0, pad_e=True, min_tile=False, is_conv=False):
+        tile_candidates = []
+        
+        spad_size_per_lane = self.spad_info["spad_size"]
+        spad_size = spad_size_per_lane * self.vector_lane
+        
+        # Double buffering
+        max_spad_per_lane = spad_size_per_lane // 2
+        max_spad_size = spad_size // 2 
+
+        # Padding for utilization        
+        minimum_tile_size = 8 
+        minimum_n_tile = self.num_cores if min_tile else 1
+        l_pad_factor = self.vector_lane if l > self.vector_lane else minimum_tile_size
+        s_pad_factor = self.vector_lane if s > self.vector_lane else minimum_tile_size
+
+        pad = lambda x, factor: ((x + factor - 1) // factor) * factor
+        l_padded = pad(l, l_pad_factor)
+        s_padded = pad(s, s_pad_factor)
+
+        # Calculate the total number of vector-sized blocks
+        l_idx = l_padded // self.vector_lane
+        s_idx = s_padded // self.vector_lane
+
+        # Generate candidates for the number of blocks per tile
+        l_tile_range = sympy.divisors(l_idx) if l > self.vector_lane else [1]
+        s_tile_range = sympy.divisors(s_idx) if s > self.vector_lane else [1]
+        
+        # Convert block count to actual tile size
+        maximize_i_j = 1
+        max_used_spad_size = 0
+    
+        # Flash Attention does not tile along the head dimension (e or ev).
+        tile_e = e
+
+        for i in l_tile_range:
+            tile_l = i * self.vector_lane if l > self.vector_lane else l_padded
+            for j in s_tile_range:
+                tile_s = j * self.vector_lane if s > self.vector_lane else s_padded
+                
+                # Calculate used spad size
+                used_spad_size = (
+                    tile_l * tile_e * (1 + n_prologue_node) # query
+                    + tile_s * tile_e                       # key
+                    + tile_s * tile_e                       # value
+                    + tile_s * tile_l                       # mul
+                    + tile_l * tile_e * (1 + n_extra_node)  # out
+                    + (tile_l * 2) * 2                      # max, sum
+                ) * self.precision
+                
+                # Calculate used spad size per lane.
+                query_per_lane = tile_e * (1+n_prologue_node)
+                key_per_lane = tile_s
+                value_per_lane = tile_e
+                mul_per_lane = tile_s
+                out_per_lane = tile_e * (1 + n_extra_node)
+                vec_per_lane = 2 * 2
+
+                used_spad_per_lane = (
+                    query_per_lane
+                    + key_per_lane
+                    + value_per_lane
+                    + mul_per_lane
+                    + out_per_lane
+                    + vec_per_lane
+                ) * self.precision
+                
+                # Add the validated candidate to the list if it passes all hardware constraints.
+                n_tile = math.ceil(l / max(tile_l, 128)) * math.ceil(s / max(tile_s, 128))
+                check_spad_size = (used_spad_size < max_spad_size and used_spad_per_lane < max_spad_per_lane)
+
+                if (check_spad_size 
+                    and max_used_spad_size < used_spad_size             # SRAM utilization
+                    and maximize_i_j <= tile_l * tile_s                 # Larger tile
+                    and n_tile >= minimum_n_tile                        # Pallelism
+                    and max(tile_s, 128) // max(tile_l, 128) < 10):     # Balanced Shape
+                    max_used_spad_size = used_spad_size
+                    maximize_i_j = tile_l * tile_s
+                
+                if check_spad_size:
+                    tile_candidates.append((used_spad_size, (tile_l, tile_s, tile_e)))
+
+        # Sort by used_spad_size.
+        # tile_candidates[0] is the best solution we have.
+        tile_candidates = sorted(tile_candidates, key=lambda x: x[0], reverse=True)
+        tile_candidates = [v for _, v in tile_candidates]
+
+        return tile_candidates
 
     def meta_kernel(self):
         kernel_arg_attributes = self.kernel_arg_attributes
@@ -827,7 +921,12 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
     def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0):
         # Prepare code block
         with self:
-            dtype = self.named_nodes[dram_name].get_layout().dtype
+            try:
+                dtype = self.named_nodes[dram_name].get_layout().dtype
+            except (KeyError, AttributeError, TypeError):
+                import torch
+                dtype = torch.float32
+            
             tile_shape = tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[dtype])
             buffer_name = self.allocate_sram_buffer(dtype, dram_name, tile_desc, id, forced_name=dram_name)
             code = f"%{tile_desc.name} = memref.get_global @{buffer_name} : {tile_shape}"
diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py
new file mode 100644
index 00000000..9c921eb4
--- /dev/null
+++ b/tests/test_sdpa.py
@@ -0,0 +1,84 @@
+import sys
+import math
+import torch
+import inspect
+from typing import List
+import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel 
+from torch.fx.passes.graph_drawer import FxGraphDrawer
+from torch._inductor.decomposition import decompositions
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    message = f"|{name} Test Passed|"
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_scaled_dot_product_attention(device, backends="flash"):
+    torch.manual_seed(0)
+    n_batch_list = [1, 4, 8, 16]
+    n_head_list = [1, 4, 8, 12]
+    n_token_list = [128, 256, 512, 1024]
+    head_dim_list = [32, 64, 128]
+
+    for n_batch in n_batch_list:
+        for n_head in n_head_list:
+            for n_token in n_token_list:
+                for head_dim in head_dim_list:
+                    # Inputs
+                    query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
+                    key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
+                    value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
+
+                    query = query.to(device=device)
+                    key = key.to(device=device)
+                    value = value.to(device=device)
+
+                    # With NPU
+                    if backends == "flash":
+                        backends = [SDPBackend.FLASH_ATTENTION]
+                    elif backends == "math":
+                        backends = [SDPBackend.MATH]
+                    elif backends == "memory_efficient":
+                        backends = [SDPBackend.EFFICIENT_ATTENTION]
+                    else:
+                        backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]
+
+                    with sdpa_kernel(backends=backends):
+                        opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
+                        out = opt_fn(query, key, value)
+                    
+                    out = out.to(device)
+
+                    # With CPU
+                    device = torch.device('cpu')
+                    query = query.to(device=device)
+                    key = key.to(device=device)
+                    value = value.to(device=device)
+                    cpu_out = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
+
+                    name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})"
+                    test_result(name, out, cpu_out)
+    
+    print("All tests passed!")
+
+def clear_caches():
+    import os
+    from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
+    from torch._inductor.codecache import FxGraphCache
+    AOTAutogradCache.clear()
+    torch._dynamo.reset()
+    os.environ["TORCHINDUCTOR_CACHE"] = "0"
+    FxGraphCache.clear()
+
+if __name__ == "__main__":
+    clear_caches()
+    
+    device = torch.device('npu:0')
+    test_scaled_dot_product_attention(device, backends="flash")
+    
\ No newline at end of file

From 88e79e06cf329e756862a616a70d37752d74fc21 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 3 Mar 2026 12:24:53 +0900
Subject: [PATCH 108/194] [CI] Update for torch 2.8 based image

---
 .github/workflows/docker-base-image.yml | 72 -------------------------
 .github/workflows/docker-image-2-8.yml  |  4 +-
 .github/workflows/docker-image.yml      | 70 ------------------------
 3 files changed, 2 insertions(+), 144 deletions(-)
 delete mode 100644 .github/workflows/docker-base-image.yml
 delete mode 100644 .github/workflows/docker-image.yml

diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml
deleted file mode 100644
index 2c29a11b..00000000
--- a/.github/workflows/docker-base-image.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: Docker Base Image CI
-
-on:
-  push:
-    branches: [ "base" ]
-  repository_dispatch:
-    types: [ build_base ]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      # Step 1: Checkout the repository
-      - name: Checkout Code
-        uses: actions/checkout@v4
-
-      # Step 2: Log in to GitHub Container Registry
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      # Step 2: Set environemnt
-      - name: Set environment
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          if [ -n "${{ github.event.pull_request.head.sha }}" ]; then
-            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
-            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}"
-          else
-            echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV
-            echo "GITHUB_SHA=${{ github.sha }}"
-          fi
-
-          gem5_response_file=/tmp/releases-gem5-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file}
-          GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file})
-          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
-          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
-
-          llvm_response_file=/tmp/releases-gem5-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file}
-          LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file})
-          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
-          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
-
-          spike_response_file=/tmp/releases-spike-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file}
-          SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file})
-          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID"
-          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV
-
-      # Step 3: Build and Push Docker Image
-      - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: ./Dockerfile.base
-          push: true
-          build-args: |
-            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
-            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}
-          tags: ghcr.io/psal-postech/torchsim_base:latest
diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml
index 4d511a1a..f1e915d6 100644
--- a/.github/workflows/docker-image-2-8.yml
+++ b/.github/workflows/docker-image-2-8.yml
@@ -1,8 +1,8 @@
 name: Docker image CI (PyTorch 2.8)
 
 on:
-  push:
-    branches: [ "torch_v2.8" ]
+  pull_request:
+    branches: [ "master", "develop" ]
   workflow_dispatch:
 
 jobs:
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
deleted file mode 100644
index eba48da2..00000000
--- a/.github/workflows/docker-image.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-name: Docker image CI
-
-on:
-  pull_request:
-    branches: [ "master", "develop" ]
-
-jobs:
-  build-and-test:
-    runs-on: self-hosted
-
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      # Step 1: Checkout the repository
-      - name: Checkout Code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-          submodules: recursive
-
-      # Step 2: Log in to GitHub Container Registry
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      # Step 3: Build and Push Docker Image
-      - name: Build and Push Docker Image
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: ./Dockerfile
-          push: true
-          no-cache: true
-          tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }}
-
-      # Step 4: Wait for GHCR propagation
-      - name: Wait for GHCR propagation
-        run: |
-          for i in {1..30}; do
-            echo "Checking if image exists in GHCR (attempt $i)..."
-            if docker manifest inspect ghcr.io/psal-postech/torchsim-test:${GITHUB_SHA} > /dev/null 2>&1; then
-              echo "Image is now available in GHCR."
-              exit 0
-            fi
-            echo "Image not yet available, retrying in 30 seconds..."
-            sleep 20
-          done
-          echo "Image did not become available in GHCR within expected time."
-          exit 1
-
-  test-pytorchsim-wrapper:
-    needs: build-and-test
-    uses: ./.github/workflows/pytorchsim_test.yml
-    with:
-      image_name: ghcr.io/psal-postech/torchsim-test:${{ github.sha }}
-      vector_lane: 128
-      spad_size: 128
-
-#  call-test2:
-#    needs: build-and-test
-#    uses: ./.github/workflows/pytorchsim_test.yml
-#    with:
-#      image_name: ghcr.io/psal-postech/${GITHUB_SHA}
-#      vector_lane: 8
-#      spad_size: 32
\ No newline at end of file

From fc247be17221f2b6aa8c52228a2e86b7315ef78d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=EC=9D=B4=EC=9E=AC=EA=B7=A0?= <jamesgyun@gmail.com>
Date: Mon, 2 Mar 2026 00:28:31 +0900
Subject: [PATCH 109/194] [Template] Add cat & sort template + Multi-output
 (WIP)

---
 .../torch_openreg/openreg/__init__.py         |  49 +++
 PyTorchSimFrontend/mlir/mlir_cat_template.py  | 167 +++++++++++
 PyTorchSimFrontend/mlir/mlir_common.py        |   6 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      | 281 +++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_sort_template.py | 253 ++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_template.py      |  30 +-
 tests/DeepSeek/test_deepseek_v3_base.py       | 170 +++++++++--
 tests/test_cat.py                             |  89 ++++++
 tests/test_sort.py                            | 112 +++++++
 9 files changed, 1121 insertions(+), 36 deletions(-)
 create mode 100644 PyTorchSimFrontend/mlir/mlir_cat_template.py
 create mode 100644 PyTorchSimFrontend/mlir/mlir_sort_template.py
 create mode 100644 tests/test_cat.py
 create mode 100644 tests/test_sort.py

diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index f5aabc18..5603a4f7 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -256,6 +256,52 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs):
 from .random import *  # noqa: F403
 from .amp import *
 
+def _precheck_cat_out_args(args, kwargs):
+    tensors = args[0] if len(args) > 0 else kwargs.get("tensors")
+    dim = args[1] if len(args) > 1 else kwargs.get("dim", 0)
+    out = kwargs.get("out", args[2] if len(args) > 2 else None)
+
+    if out is None:
+        return
+    if not isinstance(tensors, (list, tuple)) or len(tensors) == 0:
+        raise RuntimeError("aten::cat.out requires non-empty tensor list")
+    if not all(isinstance(t, torch.Tensor) for t in tensors):
+        raise RuntimeError("aten::cat.out tensors must be Tensor values")
+    if not isinstance(out, torch.Tensor):
+        raise RuntimeError("aten::cat.out out must be a Tensor")
+
+    rank = tensors[0].dim()
+    if rank == 0:
+        raise RuntimeError("aten::cat.out does not support scalar inputs")
+    if dim < 0:
+        dim += rank
+    if dim < 0 or dim >= rank:
+        raise RuntimeError(f"aten::cat.out dim out of range: dim={dim}, rank={rank}")
+    if any(t.dim() != rank for t in tensors):
+        raise RuntimeError("aten::cat.out inputs must have the same rank")
+    if any(t.dtype != tensors[0].dtype for t in tensors):
+        raise RuntimeError("aten::cat.out inputs must have the same dtype")
+    if out.dim() != rank:
+        raise RuntimeError("aten::cat.out out rank mismatch")
+
+    for d in range(rank):
+        if d == dim:
+            continue
+        base = tensors[0].shape[d]
+        if any(t.shape[d] != base for t in tensors[1:]):
+            raise RuntimeError(
+                f"aten::cat.out non-concatenated dimension mismatch at dim={d}"
+            )
+        if out.shape[d] != base:
+            raise RuntimeError(f"aten::cat.out out shape mismatch at dim={d}")
+
+    expected = sum(t.shape[dim] for t in tensors)
+    if out.shape[dim] != expected:
+        raise RuntimeError(
+            f"aten::cat.out out concatenated dimension mismatch at dim={dim}: "
+            f"expected {expected}, got {out.shape[dim]}"
+        )
+
 def eager_to_compile(op_name):
     """
     Register an eager mode operation as a graph-based implementation using torch.compile().
@@ -267,6 +313,9 @@ def eager_to_compile(op_name):
         torch.npu.eager_to_compile("aten::mul.Tensor")
     """
     def wrapper(*args, **kwargs):
+        if op_name == "aten::cat.out":
+            _precheck_cat_out_args(args, kwargs)
+
         @torch.compile(dynamic=False)
         def dummy_graph(*args, **kwargs):
             # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor
diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
new file mode 100644
index 00000000..996af1de
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -0,0 +1,167 @@
+from typing import List, Optional, cast
+
+import sympy
+from torch._inductor.ir import Buffer, IRNode
+from torch._inductor.virtualized import V
+
+from PyTorchSimFrontend.mlir import mlir_common
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel
+
+
+TEMPLATE = r"""
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X0, X1], outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X0", X0_TILE_DESC, id=0, indent_size=2) }}
+  {{ kernel.def_sram_buffer("X1", X1_TILE_DESC, id=1, indent_size=2) }}
+  {{ kernel.def_sram_buffer(OUT_DVAR, Y_TILE_DESC, id=2, indent_size=2) }}
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %cat_block = 0 to 1 step 1 {
+{% if DIM == 0 %}
+    affine.for %index0 = 0 to {{ X0_ROWS }} step 1 {
+      affine.for %index1 = 0 to {{ COLS }} step 1 {
+        {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }}
+      }
+    }
+
+    affine.for %index2 = 0 to {{ X1_ROWS }} step 1 {
+      affine.for %index3 = 0 to {{ COLS }} step 1 {
+        {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }}
+      }
+    }
+{% else %}
+    affine.for %index0 = 0 to {{ ROWS }} step 1 {
+      affine.for %index1 = 0 to {{ X0_COLS }} step 1 {
+        {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }}
+      }
+      affine.for %index3 = 0 to {{ X1_COLS }} step 1 {
+        {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }}
+      }
+    }
+{% endif %}
+  } { outer_loop=true }
+  return
+}
+"""
+
+
+class MLIRCatTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, dim, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.dim = dim
+
+    def render(
+        self,
+        kernel: MLIRTemplateKernel,
+        template_buffer_node=None,
+        epilogue_nodes: Optional[List[IRNode]] = None,
+        tile_info=None,
+        **kwargs,
+    ):
+        is_out_variant = template_buffer_node is not None
+        if is_out_variant:
+            self.output_node = template_buffer_node
+        # cat template currently emits a single output buffer and does not
+        # support epilogue output remapping.
+
+        def _unwrap_node(n):
+            return n.node if hasattr(n, "node") else n
+
+        x0 = _unwrap_node(self.input_nodes[0])
+        x1 = _unwrap_node(self.input_nodes[1])
+        y = _unwrap_node(self.output_node)
+
+        def _as_int(v):
+            try:
+                return int(v)
+            except Exception:
+                return int(V.graph.sizevars.size_hint(v))
+
+        x0_rows = _as_int(x0.get_size()[0])
+        x1_rows = _as_int(x1.get_size()[0])
+        x0_cols = _as_int(x0.get_size()[1])
+        x1_cols = _as_int(x1.get_size()[1])
+        y_cols = _as_int(y.get_size()[1])
+        kernel.loop_size = None
+
+        # 2D cat template with contiguous layout.
+        x0_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        x0_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        x0_tile_desc.set_name("x0_cat_tile")
+        x1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        x1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        x1_tile_desc.set_name("x1_cat_tile")
+        y_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        y_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        y_tile_desc.set_name("y_cat_tile")
+
+        if self.dim == 0:
+            # Flattened offsets for dim=0 cat.
+            x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")]
+            x1_idx = [sympy.Symbol("index2") * x1_cols, sympy.Symbol("index3")]
+            y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")]
+            y1_idx = [(sympy.Symbol("index2") + x0_rows) * y_cols, sympy.Symbol("index3")]
+        else:
+            # Flattened offsets for dim=1 cat.
+            x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")]
+            x1_idx = [sympy.Symbol("index0") * x1_cols, sympy.Symbol("index3")]
+            y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")]
+            y1_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index3") + x0_cols]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            X0=x0,
+            X1=x1,
+            Y=y,
+            OUT_DVAR="out_ptr1" if is_out_variant else "Y",
+            NAMES_STR="X0, X1, out_ptr1" if is_out_variant else "X0, X1, Y",
+            DIM=self.dim,
+            X0_ROWS=x0_rows,
+            X1_ROWS=x1_rows,
+            ROWS=x0_rows,
+            X0_COLS=x0_cols,
+            X1_COLS=x1_cols,
+            COLS=x0_cols,
+            X0_TILE_DESC=x0_tile_desc,
+            X1_TILE_DESC=x1_tile_desc,
+            Y_TILE_DESC=y_tile_desc,
+            X0_IDX=x0_idx,
+            X1_IDX=x1_idx,
+            Y0_IDX=y0_idx,
+            Y1_IDX=y1_idx,
+            input_reorder=self.input_reorder,
+        )
+        # Needed when epilogue fusion requests set_ranges().
+        kernel.dim_aliasing = {"index0": "index0", "index1": "index1"}
+
+        if hasattr(self.output_node, "node") and hasattr(self.output_node.node, "get_name"):
+            output_node_name = self.output_node.node.get_name()
+        elif hasattr(self.output_node, "get_name"):
+            output_node_name = self.output_node.get_name()
+        else:
+            output_node_name = self.output_node.name
+
+        if hasattr(y, "get_numel"):
+            y_numel = y.get_numel()
+        elif hasattr(y, "node") and hasattr(y.node, "get_numel"):
+            y_numel = y.node.get_numel()
+        else:
+            y_numel = None
+
+        kernel.epilogue_info = dict(
+            output_node=output_node_name,
+            sram_var="y_cat_tile",
+            dram_var=kernel.render_options["OUT_DVAR"],
+            dram_tile_desc=y_tile_desc,
+        )
+        if y_numel is not None:
+            kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": y_numel}
+
+        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
+        return code
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 34b185b8..256d7101 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -173,7 +173,11 @@ def get_mlir_shape(info):
     def mlir_argdefs(self, extra_node=dict()):
         buffer_types = {}
         for x in V.graph.buffers:
-            if not isinstance(x.layout, MultiOutputLayout): # FIXME: MultiOutputLayout should be handled
+            if isinstance(x.layout, MultiOutputLayout):
+                # MultiOutput kernel containers own concrete output nodes in `outputs`.
+                for out in getattr(x, "outputs", []):
+                    buffer_types[out.get_name()] = [out.get_dtype(), out.get_numel(), out.get_size(), out.get_stride()]
+            else:
                 buffer_types[x.get_name()] = [x.get_dtype(), x.get_numel(), x.get_size(), x.get_stride()]
         for name, val in V.graph.graph_inputs.items():
             if isinstance(val, sympy.Expr):
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index ebf0c80e..0f28f03b 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -15,10 +15,15 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
+from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate
+from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate
 from PyTorchSimFrontend import extension_config
 
 aten = torch.ops.aten
 aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
+_orig_cat_default_lowering = lowerings.get(aten.cat.default)
+_orig_cat_out_lowering = lowerings.get(aten.cat.out)
+_orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable)
 
 def tuned_mm(mat1, mat2, * ,layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
@@ -181,11 +186,285 @@ def custom_unsafe_index(x, indices):
         x.realize()
     return index_impl(x, indices, check=False)
 
+
+def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout:
+    with V.graph.fake_mode:
+        output = torch.ops.aten.cat(
+            [ir.ir_node_to_tensor(t, guard_shape=True) for t in tensors],
+            dim,
+        )
+        sizes = ir.convert_shape_to_inductor(output.size())
+        stride = ir.convert_shape_to_inductor(output.stride())
+    return ir.FixedLayout(
+        tensors[0].get_device(),
+        tensors[0].get_dtype(),
+        sizes,
+        stride,
+    )
+
+
+def _can_use_cat_template(tensors: Sequence[TensorBox], dim: int) -> bool:
+    # Current template specialization: 2 inputs, rank-2, dim in {0, 1}.
+    if len(tensors) != 2:
+        return False
+    if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors):
+        return False
+    if tensors[0].get_dtype() != tensors[1].get_dtype():
+        return False
+    rank0 = len(tensors[0].get_size())
+    rank1 = len(tensors[1].get_size())
+    if rank0 != 2 or rank1 != 2:
+        return False
+    if dim < 0:
+        dim += rank0
+    if dim not in (0, 1):
+        return False
+
+    if dim == 0:
+        cols0 = tensors[0].get_size()[1]
+        cols1 = tensors[1].get_size()[1]
+        return V.graph.sizevars.statically_known_equals(cols0, cols1)
+
+    rows0 = tensors[0].get_size()[0]
+    rows1 = tensors[1].get_size()[0]
+    return V.graph.sizevars.statically_known_equals(rows0, rows1)
+
+
+def _cat_fallback(reason: str, tensors: Sequence[TensorBox], dim: int):
+    # Non-template cases delegate to the original lowering path.
+    return _orig_cat_default_lowering(tensors, dim)
+
+
+def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0):
+    if _orig_cat_default_lowering is None:
+        raise RuntimeError("Original aten.cat.default lowering is missing")
+    if len(tensors) > 0:
+        rank = len(tensors[0].get_size())
+        if dim < 0:
+            dim += rank
+    if not _can_use_cat_template(tensors, dim):
+        return _cat_fallback("default-path", tensors, dim)
+
+    for t in tensors:
+        t.realize()
+    layout = _cat_layout(tensors, dim)
+    mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim)
+    return mlir_template.generate().output_node()
+
+
+def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
+    return _custom_cat_impl(tensors, dim)
+
+
+def custom_cat_out(tensors: Sequence[TensorBox], dim: int = 0, out: Optional[TensorBox] = None):
+    if _orig_cat_out_lowering is None:
+        raise RuntimeError("Original aten.cat.out lowering is missing")
+    if out is None:
+        return _orig_cat_out_lowering(tensors, dim, out)
+
+    copy_default_lowering = lowerings.get(aten.copy_.default)
+    slice_tensor_lowering = lowerings.get(aten.slice.Tensor)
+    if copy_default_lowering is None or slice_tensor_lowering is None:
+        raise RuntimeError("cat.out lowering requires aten.copy_.default and aten.slice.Tensor lowerings")
+
+    # Lower cat.out as a sequence of slice+copy ops so each piece still runs
+    # through the existing compiled/simulated kernel path.
+    if len(tensors) == 0:
+        raise RuntimeError("cat.out requires at least one input tensor")
+    if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors):
+        raise RuntimeError("cat.out inputs must be tensor-like values")
+    rank = len(tensors[0].get_size())
+    if rank == 0:
+        raise RuntimeError("cat.out does not support scalar inputs")
+    if dim < 0:
+        dim = dim + rank
+    if dim < 0 or dim >= rank:
+        raise RuntimeError(f"cat.out dim out of range: dim={dim}, rank={rank}")
+    if any(len(t.get_size()) != rank for t in tensors):
+        raise RuntimeError("cat.out inputs must have the same rank")
+    if any(t.get_dtype() != tensors[0].get_dtype() for t in tensors):
+        raise RuntimeError("cat.out inputs must have the same dtype")
+    # cat semantics: all non-cat dimensions must be equal.
+    for i in range(rank):
+        if i == dim:
+            continue
+        base = tensors[0].get_size()[i]
+        if any(not V.graph.sizevars.statically_known_equals(base, t.get_size()[i]) for t in tensors[1:]):
+            raise RuntimeError(f"cat.out non-concatenated dimension mismatch at dim={i}")
+
+    # Output shape must match concatenated shape.
+    if not hasattr(out, "get_size"):
+        raise RuntimeError("cat.out output must be tensor-like")
+    out_sizes = list(out.get_size())
+    if len(out_sizes) != rank:
+        raise RuntimeError("cat.out output rank mismatch")
+    for i in range(rank):
+        if i == dim:
+            continue
+        if not V.graph.sizevars.statically_known_equals(out_sizes[i], tensors[0].get_size()[i]):
+            raise RuntimeError(f"cat.out output shape mismatch at dim={i}")
+    expected_cat = sum(t.get_size()[dim] for t in tensors)
+    if not V.graph.sizevars.statically_known_equals(out_sizes[dim], expected_cat):
+        raise RuntimeError(f"cat.out output concatenated dimension mismatch at dim={dim}")
+
+    if isinstance(out, TensorBox):
+        out.realize()
+
+    offset = 0
+    for src in tensors:
+        src.realize()
+        end = offset + src.get_size()[dim]
+        dst_view = slice_tensor_lowering(out, dim, offset, end, 1)
+        copy_default_lowering(dst_view, src)
+        offset = end
+    return out
+
+
+def _custom_sort_values_impl(
+    self: TensorBox,
+    dim: int = -1,
+    descending: bool = False,
+    values: Optional[TensorBox] = None,
+    indices: Optional[TensorBox] = None,
+    stable: Optional[bool] = None,
+):
+    if values is None or indices is None:
+        raise RuntimeError("sort.values* lowering requires both out tensors: values, indices")
+
+    def _normalize_dim(rank: int, d: int) -> int:
+        return d + rank if d < 0 else d
+
+    if not hasattr(self, "get_size"):
+        raise RuntimeError("sort.values* lowering requires TensorBox input")
+
+    rank = len(self.get_size())
+    norm_dim = _normalize_dim(rank, dim)
+    if norm_dim < 0 or norm_dim >= rank:
+        raise RuntimeError(f"sort.values* dim out of range: dim={dim}, rank={rank}")
+    if rank != 2:
+        raise RuntimeError(f"sort.values* lowering currently supports rank-2 only, got rank={rank}")
+    if norm_dim not in (0, 1):
+        raise RuntimeError(f"sort.values* lowering currently supports dim in {{0,1}} only, got dim={norm_dim}")
+
+    self.realize()
+    if isinstance(values, TensorBox):
+        values.realize()
+    if isinstance(indices, TensorBox):
+        indices.realize()
+
+    value_layout, _ = _sort_layouts(self, norm_dim, descending)
+    mlir_template = MLIRSortTemplate(
+        [self],
+        value_layout,
+        dim=norm_dim,
+        descending=descending,
+        stable=True if stable is None else stable,
+        indices_node=indices,
+    )
+    sorted_values = mlir_template.generate(template_buffer_node=values, epilogue_nodes=[indices]).output_node()
+    return sorted_values, indices
+
+
+def _sort_layouts(x: TensorBox, dim: int, descending: bool):
+    with V.graph.fake_mode:
+        v, i = torch.ops.aten.sort(
+            ir.ir_node_to_tensor(x, guard_shape=True),
+            dim,
+            descending,
+        )
+        v_sizes = ir.convert_shape_to_inductor(v.size())
+        v_stride = ir.convert_shape_to_inductor(v.stride())
+        i_sizes = ir.convert_shape_to_inductor(i.size())
+        i_stride = ir.convert_shape_to_inductor(i.stride())
+
+    value_layout = ir.FixedLayout(x.get_device(), x.get_dtype(), v_sizes, v_stride)
+    index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride)
+    return value_layout, index_layout
+
+
+def custom_sort_stable(
+    self: TensorBox,
+    *,
+    stable: Optional[bool] = None,
+    dim: int = -1,
+    descending: bool = False,
+):
+    empty_strided_lowering = lowerings.get(aten.empty_strided.default)
+    if empty_strided_lowering is None:
+        if _orig_sort_values_stable_lowering is None:
+            raise RuntimeError("sort.stable lowering requires aten.empty_strided.default")
+        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True)
+
+    rank = len(self.get_size()) if hasattr(self, "get_size") else 0
+    norm_dim = dim + rank if dim < 0 else dim
+    if rank > 0 and (norm_dim < 0 or norm_dim >= rank):
+        raise RuntimeError(f"sort.stable dim out of range: dim={dim}, rank={rank}")
+
+    # Template specialization supports rank-2 and dim in {0,1}.
+    if rank == 2 and norm_dim not in (0, 1):
+        if _orig_sort_values_stable_lowering is None:
+            raise RuntimeError("Original aten.sort.values_stable lowering is missing")
+        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True)
+
+    try:
+        value_layout, index_layout = _sort_layouts(self, norm_dim, descending)
+        values = empty_strided_lowering(
+            list(value_layout.size),
+            list(value_layout.stride),
+            dtype=value_layout.dtype,
+            device=self.get_device(),
+        )
+        indices = empty_strided_lowering(
+            list(index_layout.size),
+            list(index_layout.stride),
+            dtype=index_layout.dtype,
+            device=self.get_device(),
+        )
+        return _custom_sort_values_impl(
+            self=self,
+            dim=dim,
+            descending=descending,
+            values=values,
+            indices=indices,
+            stable=True if stable is None else stable,
+        )
+    except Exception:
+        if _orig_sort_values_stable_lowering is None:
+            raise
+        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=stable)
+
+
+def custom_sort_values_stable(
+    self: TensorBox,
+    *,
+    stable: Optional[bool] = None,
+    dim: int = -1,
+    descending: bool = False,
+    values: Optional[TensorBox] = None,
+    indices: Optional[TensorBox] = None,
+):
+    return _custom_sort_values_impl(
+        self=self,
+        dim=dim,
+        descending=descending,
+        values=values,
+        indices=indices,
+        stable=stable,
+    )
+
+
 lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()})
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
 lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()})
+
+lowerings.update({aten.cat.default: custom_cat_default})
+lowerings.update({aten.cat.out: custom_cat_out})
+
+lowerings.update({aten.sort.stable: custom_sort_stable})
+lowerings.update({aten.sort.values_stable: custom_sort_values_stable})
+    
 if extension_config.CONFIG_USE_TIMING_POOLING:
-    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
\ No newline at end of file
+    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
diff --git a/PyTorchSimFrontend/mlir/mlir_sort_template.py b/PyTorchSimFrontend/mlir/mlir_sort_template.py
new file mode 100644
index 00000000..d12c7570
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/mlir_sort_template.py
@@ -0,0 +1,253 @@
+from typing import List, Optional
+
+import sympy
+from torch._inductor.ir import IRNode
+from torch._inductor.virtualized import V
+
+from PyTorchSimFrontend.mlir import mlir_common
+from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel
+
+
+TEMPLATE = r"""
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, YI], outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("YI", YI_TILE_DESC, id=1, indent_size=2) }}
+  {{ kernel.def_sram_buffer(OUT_DVAR, YV_TILE_DESC, id=2, indent_size=2) }}
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  %c0 = arith.constant 0 : index
+  %c_cols = arith.constant {{ COLS }} : index
+
+  affine.for %sort_block = 0 to 1 step 1 {
+    // Initialize output value/index buffers.
+    affine.for %row = 0 to {{ ROWS }} step 1 {
+      affine.for %col = 0 to {{ COLS }} step 1 {
+        {{ kernel.def_dma_op("MVIN", "X", INIT_X_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, INIT_YV_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
+{% if DIM == 1 %}
+        %idx_i64 = arith.index_cast %col : index to {{ YI_ELEM_TYPE }}
+{% else %}
+        %idx_i64 = arith.index_cast %row : index to {{ YI_ELEM_TYPE }}
+{% endif %}
+        memref.store %idx_i64, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+        {{ kernel.def_dma_op("MVOUT", "YI", INIT_YI_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
+      }
+    }
+
+{% if DIM == 1 %}
+    // Stable bubble sort on each row (dim=1).
+    affine.for %row = 0 to {{ ROWS }} step 1 {
+      affine.for %pass = 0 to {{ COLS }} step 1 {
+        affine.for %j = 0 to {{ COLS_MINUS1 }} step 1 {
+          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
+          %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+
+          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
+          %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+
+{% if DESCENDING %}
+          %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
+{% else %}
+          %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
+{% endif %}
+          scf.if %need_swap {
+            memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            {{ kernel.def_dma_op("MVIN", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+            %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+
+            {{ kernel.def_dma_op("MVIN", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+            %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+
+            memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+          }
+        }
+      }
+    }
+{% else %}
+    // Stable bubble sort on each column (dim=0).
+    affine.for %col = 0 to {{ COLS }} step 1 {
+      affine.for %pass = 0 to {{ ROWS }} step 1 {
+        affine.for %i = 0 to {{ ROWS_MINUS1 }} step 1 {
+          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
+          %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+
+          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
+          %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+
+{% if DESCENDING %}
+          %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
+{% else %}
+          %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
+{% endif %}
+          scf.if %need_swap {
+            memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            {{ kernel.def_dma_op("MVIN", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+            %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+
+            {{ kernel.def_dma_op("MVIN", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+            %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+
+            memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+
+            memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
+            {{ kernel.def_dma_op("MVOUT", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
+          }
+        }
+      }
+    }
+{% endif %}
+  } { outer_loop=true }
+  return
+}
+"""
+
+
+class MLIRSortTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, dim, descending=False, stable=False, indices_node=None, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.dim = dim
+        self.descending = descending
+        self.stable = stable
+        self.indices_node = indices_node
+
+    def render(
+        self,
+        kernel: MLIRTemplateKernel,
+        template_buffer_node=None,
+        epilogue_nodes: Optional[List[IRNode]] = None,
+        tile_info=None,
+        **kwargs,
+    ):
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+        if self.indices_node is None:
+            raise RuntimeError("MLIRSortTemplate requires indices output node")
+
+        x = self.input_nodes[0]
+        yv = self.output_node
+        yi = self.indices_node
+
+        def _as_int(v):
+            try:
+                return int(v)
+            except Exception:
+                return int(V.graph.sizevars.size_hint(v))
+
+        x_size = x.get_size()
+        if len(x_size) != 2:
+            raise RuntimeError("MLIRSortTemplate currently supports rank-2 input only")
+        if self.dim not in (0, 1):
+            raise RuntimeError(f"MLIRSortTemplate currently supports dim in {{0,1}} only, got dim={self.dim}")
+
+        rows = _as_int(x_size[0])
+        cols = _as_int(x_size[1])
+        cols_minus1 = max(0, cols - 1)
+        rows_minus1 = max(0, rows - 1)
+
+        x_dtype = x.get_dtype()
+        yv_dtype = yv.get_dtype()
+        yi_dtype = yi.get_dtype()
+        if x_dtype != yv_dtype:
+            raise RuntimeError("sort template requires input/value dtype match")
+
+        yi_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        yi_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        yi_tile_desc.set_name("yi_sort_tile")
+        yv_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        yv_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        yv_tile_desc.set_name("yv_sort_tile")
+        # Neighbor element descriptors use DRAM offset to preserve affine stride metadata.
+        yv_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        yv_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        yv_s1_tile_desc.set_name("yv_sort_tile")
+        yi_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
+        yi_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
+        yi_s1_tile_desc.set_name("yi_sort_tile")
+        if int(self.dim) == 1:
+            yv_s1_tile_desc.offset = sympy.Integer(1)
+            yi_s1_tile_desc.offset = sympy.Integer(1)
+        else:
+            yv_s1_tile_desc.offset = sympy.Integer(cols)
+            yi_s1_tile_desc.offset = sympy.Integer(cols)
+
+        row = sympy.Symbol("row")
+        col = sympy.Symbol("col")
+        i = sympy.Symbol("i")
+        j = sympy.Symbol("j")
+
+        init_x_idx = [row * cols, col]
+        init_yv_idx = [row * cols, col]
+        init_yi_idx = [row * cols, col]
+
+        d1_s0_idx = [row * cols, j]
+        d1_s1_idx = [row * cols, j]
+
+        d0_s0_idx = [i * cols, col]
+        d0_s1_idx = [i * cols, col]
+
+        kernel.loop_size = None
+        numel = rows * cols
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            X=x,
+            YV=yv,
+            YI=yi,
+            OUT_DVAR="YV",
+            NAMES_STR="X, YI, YV",
+            ROWS=rows,
+            COLS=cols,
+            COLS_MINUS1=cols_minus1,
+            ROWS_MINUS1=rows_minus1,
+            DIM=int(self.dim),
+            DESCENDING=bool(self.descending),
+            YI_TILE_DESC=yi_tile_desc,
+            YV_TILE_DESC=yv_tile_desc,
+            YI_S1_TILE_DESC=yi_s1_tile_desc,
+            YV_S1_TILE_DESC=yv_s1_tile_desc,
+            INIT_X_IDX=init_x_idx,
+            INIT_YV_IDX=init_yv_idx,
+            INIT_YI_IDX=init_yi_idx,
+            D1_S0_IDX=d1_s0_idx,
+            D1_S1_IDX=d1_s1_idx,
+            D0_S0_IDX=d0_s0_idx,
+            D0_S1_IDX=d0_s1_idx,
+            YV_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yv_dtype],
+            YI_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yi_dtype],
+            X_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[x_dtype]}>",
+            YV_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yv_dtype]}>",
+            YI_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yi_dtype]}>",
+            YV_TILE_MEMREF_TYPE=yv_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yv_dtype]),
+            YI_TILE_MEMREF_TYPE=yi_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yi_dtype]),
+            X_TILE_DESC=yv_tile_desc,
+            input_reorder=self.input_reorder,
+        )
+
+        output_node_name = yv.get_name() if hasattr(yv, "get_name") else yv.name
+        kernel.epilogue_info = dict(
+            output_node=output_node_name,
+            sram_var="yv_sort_tile",
+            dram_var=kernel.render_options["OUT_DVAR"],
+            dram_tile_desc=yv_tile_desc,
+        )
+        kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": yv.get_numel()}
+        kernel.exception_nodes["YI"] = {"numel": yi.get_numel()}
+
+        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
+        return code
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b1c756ba..76b0ef71 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -403,7 +403,7 @@ def call_kernel(self, kernel_name):
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
-            kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args)
+            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
 
     def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
         with self as kernel:
@@ -628,8 +628,26 @@ def def_kernel(
                 self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']
 
         def hook():
-            arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
-            return f"({', '.join(arg_defs)})"
+            arg_defs, call_args, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
+            output_names = names[len(inputs) : len(inputs) + len(outputs)]
+            out_ptr_idx = 0
+            renamed_arg_defs = []
+            for outer, arg_def in zip(call_args, arg_defs):
+                raw_symbol = arg_def.split(":", 1)[0].strip().lstrip("%")
+                if outer in self.kernel_group.args.input_buffers:
+                    symbol = self.kernel_group.args.input_buffers[outer]
+                elif outer in self.kernel_group.args.output_buffers:
+                    symbol = self.kernel_group.args.output_buffers[outer]
+                elif raw_symbol.startswith("out_ptr") and out_ptr_idx < len(output_names):
+                    symbol = output_names[out_ptr_idx]
+                    out_ptr_idx += 1
+                elif outer in self.kernel_group.args.sizevars:
+                    symbol = self.kernel_group.args.sizevars[outer]
+                else:
+                    symbol = raw_symbol
+                _, arg_type = arg_def.split(":", 1)
+                renamed_arg_defs.append(f"%{symbol}:{arg_type}")
+            return f"({', '.join(renamed_arg_defs)})"
 
         assert "<DEF_KERNEL>" not in self.render_hooks
         self.render_hooks["<DEF_KERNEL>"] = hook
@@ -1151,6 +1169,8 @@ def __init__(self, name, input_nodes, layout, input_reorder = None):
         super().__init__(name)
         self.input_nodes = [node for node in input_nodes if node is not None]
         self.output_node: Buffer = Buffer(name="buf_out", layout=layout)
+        # Multi-output templates can override this with explicit output buffers.
+        self.output_nodes = [self.output_node]
         self.input_reorder = input_reorder
         self.layout = layout
 
@@ -1166,10 +1186,12 @@ def generate(self, **kwargs) -> ChoiceCaller:
         kernel_hash_name = f"mlir_{self.name}_{next(self.index_counter)}"
         extra_args = []
         # create the BenchmarkRequest
+        output_nodes = getattr(self, "output_nodes", None) or [self.output_node]
+
         bmreq = MLIRBenchmarkRequest(
             kernel_name=kernel_name,
             input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes),
-            output_tensor_meta=TensorMeta.from_irnodes(self.output_node),
+            output_tensor_meta=TensorMeta.from_irnodes(output_nodes),
             extra_args=extra_args,
             source_code=code,
         )
diff --git a/tests/DeepSeek/test_deepseek_v3_base.py b/tests/DeepSeek/test_deepseek_v3_base.py
index b8402c8b..ade787c5 100644
--- a/tests/DeepSeek/test_deepseek_v3_base.py
+++ b/tests/DeepSeek/test_deepseek_v3_base.py
@@ -1,8 +1,55 @@
 import os
 import sys
 import argparse
+import copy
+from pathlib import Path
 import torch
 
+# recursive compile for some ops that are caused by graph break
+torch.npu.register_eager_to_compile([
+    "aten::zero_",
+    "aten::sum.IntList_out",
+    "aten::mul.out",
+    "aten::floor_divide",
+    "aten::floor_divide.Tensor",
+    "aten::floor_divide.Scalar",
+    "aten::cat.out",
+    "aten::sort.values_stable",
+])
+
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    out_cpu = out.cpu()
+    max_diff = (out_cpu - cpu_out).abs().max().item()
+    mean_diff = (out_cpu - cpu_out).abs().mean().item()
+    if torch.allclose(out_cpu, cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print(f"Max absolute difference: {max_diff:.6f}")
+        print(f"Mean absolute difference: {mean_diff:.6f}")
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("NPU out: ", out_cpu)
+        print("CPU out: ", cpu_out)
+        print(f"Max absolute difference: {max_diff:.6f}")
+        print(f"Mean absolute difference: {mean_diff:.6f}")
+        exit(1)
+
+
+def _extract_logits(output):
+    if isinstance(output, torch.Tensor):
+        return output
+    if hasattr(output, "logits"):
+        return output.logits
+    if isinstance(output, (list, tuple)) and len(output) > 0 and isinstance(output[0], torch.Tensor):
+        return output[0]
+    raise TypeError(f"Unsupported output type for comparison: {type(output)}")
+
 
 def _dtype_from_str(name: str) -> torch.dtype:
     return {
@@ -81,7 +128,7 @@ def _maybe_scale_config(config, scale=1.0, max_layers=None):
 
 def _apply_preset(scale, max_layers, batch, seq_len, preset):
     if preset == "tiny":
-        return 0.03, 4, 1, min(seq_len, 16)
+        return 0.03, 1, 1, min(seq_len, 16)
     if preset == "small":
         return 0.07, 8, 1, min(seq_len, 32)
     if preset == "medium":
@@ -89,8 +136,58 @@ def _apply_preset(scale, max_layers, batch, seq_len, preset):
     return scale, max_layers, batch, seq_len
 
 
+def _togsim_log_count() -> int:
+    log_dir = Path("togsim_results")
+    if not log_dir.exists():
+        return 0
+    return len(list(log_dir.glob("*.log")))
+
+
+def _assert_simulation_happened(before_count: int, case_name: str):
+    after_count = _togsim_log_count()
+    if after_count <= before_count:
+        raise RuntimeError(
+            f"{case_name}: TOGSim log count did not increase "
+            f"(before={before_count}, after={after_count})"
+        )
+    print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})")
+
+
+def test_cat_default(device):
+    def cat_default_fn(a, b):
+        return torch.cat([a, b], dim=0)
+
+    x = torch.randn(8, 16, device=device)
+    y = torch.randn(6, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_default_fn)
+
+    before = _togsim_log_count()
+    out = opt_fn(x, y)
+    _assert_simulation_happened(before, "cat.default")
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
+    test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_out(device):
+    def cat_out_fn(a, b, out):
+        return torch.ops.aten.cat.out([a, b], 0, out=out)
+
+    x = torch.randn(8, 16, device=device)
+    y = torch.randn(6, 16, device=device)
+    out_buf = torch.empty(14, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_out_fn)
+
+    before = _togsim_log_count()
+    out = opt_fn(x, y, out_buf)
+    _assert_simulation_happened(before, "cat.out")
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
+    test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4)
+    
+    
 @torch.no_grad()
-def run_deep_seek_v3_base_test(
+def run_deepseek_v3_base(
     model_id,
     device,
     init_mode="config-random",
@@ -120,7 +217,6 @@ def run_deep_seek_v3_base_test(
     # (call .to_dict()), so only disable it for pretrained loading path.
     if init_mode == "pretrained" and getattr(config, "quantization_config", None) is not None:
         config.quantization_config = None
-
     config = _maybe_scale_config(config, scale=scale, max_layers=max_layers)
 
     if init_mode == "config-random":
@@ -141,7 +237,6 @@ def run_deep_seek_v3_base_test(
     else:
         raise ValueError(f"Unsupported init mode: {init_mode}")
 
-    model = model.to(device)
     model_params = sum(p.numel() for p in model.parameters())
     print("init mode:", init_mode)
     print("scaled hidden_size:", getattr(config, "hidden_size", "n/a"))
@@ -157,23 +252,33 @@ def run_deep_seek_v3_base_test(
             revision=revision,
         )
         encoded = tokenizer(prompt, return_tensors="pt")
-        input_ids = encoded["input_ids"].to(device)
+        cpu_input_ids = encoded["input_ids"].cpu()
     else:
         vocab_size = getattr(config, "vocab_size", None)
         if vocab_size is None:
             raise ValueError("Config has no vocab_size; use --use-tokenizer or pass a model with vocab_size.")
-        input_ids = _build_random_inputs(batch, seq_len, vocab_size, device)
+        cpu_input_ids = _build_random_inputs(batch, seq_len, vocab_size, torch.device("cpu"))
+    input_ids = cpu_input_ids.to(device)
 
-    if compile_model:
-        model = torch.compile(model, dynamic=False)
+    # CPU version
+    model_cpu = copy.deepcopy(model).cpu().eval()
+    cpu_out = _extract_logits(model_cpu(cpu_input_ids))
 
-    out = model(input_ids)
-    logits = out.logits
+    # NPU version
+    model_npu = copy.deepcopy(model_cpu).to(device).eval()
+    if compile_model:
+        model_npu = torch.compile(model_npu, dynamic=False)
+    npu_out = _extract_logits(model_npu(input_ids))
+
+    # Campare results
+    test_result(
+        "DeepSeek V3 Base",
+        npu_out,
+        cpu_out,
+        rtol=3e-1,
+        atol=2e-1,
+    )
     
-    print("logits shape:", tuple(logits.shape))
-    print("logits dtype:", logits.dtype)
-    print("logits max:", logits.max().item())
-
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="DeepSeek V3 download-based test")
@@ -181,7 +286,7 @@ def run_deep_seek_v3_base_test(
     parser.add_argument("--revision", type=str, default=None)
     parser.add_argument("--trust-remote-code", action="store_true", default=True)
     parser.add_argument("--init-mode", type=str, default="config-random", choices=["config-random", "pretrained"])
-    parser.add_argument("--preset", type=str, default="tiny", choices=["none", "tiny", "small", "medium"])
+    parser.add_argument("--preset", type=str, default="small", choices=["none", "tiny", "small", "medium"])
     parser.add_argument("--scale", type=float, default=1.0)
     parser.add_argument("--max-layers", type=int, default=None)
     parser.add_argument("--dtype", type=str, default="float32", choices=["float32", "float16", "bfloat16"])
@@ -190,6 +295,7 @@ def run_deep_seek_v3_base_test(
     parser.add_argument("--use-tokenizer", action="store_true")
     parser.add_argument("--prompt", type=str, default="Hello, DeepSeek V3")
     parser.add_argument("--compile", action="store_true", default=True)
+    parser.add_argument("--test", type=str, default="e2e", choices=["all", "e2e", "cat"])
 
     args = parser.parse_args()
 
@@ -203,18 +309,22 @@ def run_deep_seek_v3_base_test(
 
     device = torch.device("npu:0")
 
-    run_deep_seek_v3_base_test(
-        model_id=args.model_id,
-        device=device,
-        init_mode=args.init_mode,
-        scale=args.scale,
-        max_layers=args.max_layers,
-        dtype=args.dtype,
-        batch=args.batch,
-        seq_len=args.seq_len,
-        use_tokenizer=args.use_tokenizer,
-        prompt=args.prompt,
-        trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
-        compile_model=args.compile,
-    )
+    if args.test in ("all", "cat"):
+        test_cat_default(device)
+        test_cat_out(device)
+    if args.test in ("all", "e2e"):
+        run_deepseek_v3_base(
+            model_id=args.model_id,
+            device=device,
+            init_mode=args.init_mode,
+            scale=args.scale,
+            max_layers=args.max_layers,
+            dtype=args.dtype,
+            batch=args.batch,
+            seq_len=args.seq_len,
+            use_tokenizer=args.use_tokenizer,
+            prompt=args.prompt,
+            trust_remote_code=args.trust_remote_code,
+            revision=args.revision,
+            compile_model=args.compile,
+        )
diff --git a/tests/test_cat.py b/tests/test_cat.py
new file mode 100644
index 00000000..32573a05
--- /dev/null
+++ b/tests/test_cat.py
@@ -0,0 +1,89 @@
+import argparse
+from pathlib import Path
+
+import torch
+
+
+def _test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        return
+
+    message = f"|{name} Test Failed|"
+    print("-" * len(message))
+    print(message)
+    print("-" * len(message))
+    print("custom out: ", out.cpu())
+    print("cpu out: ", cpu_out)
+    raise RuntimeError(f"{name} mismatch")
+
+
+def _togsim_log_count() -> int:
+    log_dir = Path("togsim_results")
+    if not log_dir.exists():
+        return 0
+    return len(list(log_dir.glob("*.log")))
+
+
+def _assert_simulation_happened(before_count: int, case_name: str):
+    after_count = _togsim_log_count()
+    if after_count <= before_count:
+        raise RuntimeError(
+            f"{case_name}: TOGSim log count did not increase "
+            f"(before={before_count}, after={after_count})"
+        )
+    print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})")
+
+
+def test_cat_default(device):
+    def cat_default_fn(a, b):
+        return torch.cat([a, b], dim=0)
+
+    x = torch.randn(8, 16, device=device)
+    y = torch.randn(6, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_default_fn)
+
+    before = _togsim_log_count()
+    out = opt_fn(x, y)
+    _assert_simulation_happened(before, "cat.default")
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
+    _test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_out(device):
+    def cat_out_fn(a, b, out):
+        return torch.ops.aten.cat.out([a, b], 0, out=out)
+
+    x = torch.randn(8, 16, device=device)
+    y = torch.randn(6, 16, device=device)
+    out_buf = torch.empty(14, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_out_fn)
+
+    before = _togsim_log_count()
+    out = opt_fn(x, y, out_buf)
+    _assert_simulation_happened(before, "cat.out")
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
+    _test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run cat simulation tests")
+    parser.add_argument(
+        "--case",
+        choices=["default", "out", "all"],
+        default="all",
+        help="Which cat case to run",
+    )
+    args = parser.parse_args()
+
+    device = torch.device("npu:0")
+
+    if args.case in ("default", "all"):
+        test_cat_default(device)
+    if args.case in ("out", "all"):
+        test_cat_out(device)
diff --git a/tests/test_sort.py b/tests/test_sort.py
new file mode 100644
index 00000000..2b070223
--- /dev/null
+++ b/tests/test_sort.py
@@ -0,0 +1,112 @@
+import argparse
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out:", out.cpu())
+        print("cpu out:", cpu_out)
+        raise SystemExit(1)
+
+
+def test_equal(name, out, cpu_out):
+    if torch.equal(out.cpu(), cpu_out):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out:", out.cpu())
+        print("cpu out:", cpu_out)
+        raise SystemExit(1)
+
+
+def _normalize_dim(dim: int, rank: int) -> int:
+    d = dim if dim >= 0 else rank + dim
+    if d < 0 or d >= rank:
+        raise ValueError(f"dim out of range: dim={dim}, rank={rank}")
+    return d
+
+
+def test_sort_stable(device, size=(128, 128), dim=-1, descending=False):
+    _normalize_dim(dim, len(size))
+
+    def sort_stable_fn(x):
+        return torch.sort(x, stable=True, dim=dim, descending=descending)
+
+    x = torch.randn(size, dtype=torch.float32)
+    x_npu = x.to(device=device)
+
+    opt_sort = torch.compile(dynamic=False)(sort_stable_fn)
+    out_values, out_indices = opt_sort(x_npu)
+
+    ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending)
+
+    test_result("Sort.stable/values", out_values, ref_values)
+    test_equal("Sort.stable/indices", out_indices, ref_indices)
+
+
+def test_sort_values_stable(device, size=(128, 128), dim=-1, descending=False):
+    _normalize_dim(dim, len(size))
+
+    def sort_out_fn(x):
+        out_values = torch.empty_like(x, device=x.device)
+        out_indices = torch.empty_like(x, dtype=torch.int64, device=x.device)
+        return torch.sort(x, stable=True, dim=dim, descending=descending, out=(out_values, out_indices))
+
+    x = torch.randn(size, dtype=torch.float32)
+    x_npu = x.to(device=device)
+
+    opt_sort = sort_out_fn# torch.compile(dynamic=False)(sort_out_fn)
+    out_values, out_indices = opt_sort(x_npu)
+
+    ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending)
+
+    test_result("Sort.values_stable/values", out_values, ref_values)
+    test_equal("Sort.values_stable/indices", out_indices, ref_indices)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run sort tests")
+    parser.add_argument("--shape", type=str, default="(128,128)")
+    parser.add_argument("--dim", type=int, default=0)
+    parser.add_argument("--descending", action="store_true")
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="all",
+        choices=["all", "default", "values"],
+    )
+    args = parser.parse_args()
+
+    shape = tuple(map(int, args.shape.strip("()").split(",")))
+
+    from Scheduler.scheduler import PyTorchSimRunner
+
+    module = PyTorchSimRunner.setup_device()
+    device = module.custom_device()
+
+    # Register recursive-compile bridge only when values_stable path is explicitly tested.
+    if args.mode in ("all", "values"):
+        torch.npu.register_eager_to_compile([
+            "aten::sort.values_stable",
+        ])
+
+    if args.mode in ("all", "default"):
+        test_sort_stable(device, size=shape, dim=args.dim, descending=args.descending)
+    if args.mode in ("all", "values"):
+        test_sort_values_stable(device, size=shape, dim=args.dim, descending=args.descending)

From f615178ae581236a1b4d1018f9b458b2c552179f Mon Sep 17 00:00:00 2001
From: jung-min <wjdals020503@naver.com>
Date: Wed, 4 Mar 2026 07:57:47 +0000
Subject: [PATCH 110/194] [Fix] Prevent fallback to eager mode after reaching
 compilation limit (7)

---
 tests/test_sdpa.py | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py
index 9c921eb4..6ffd6f2e 100644
--- a/tests/test_sdpa.py
+++ b/tests/test_sdpa.py
@@ -14,6 +14,7 @@ def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
         print("-" * len(message))
         print(message)
         print("-" * len(message))
+        pass
     else:
         print("custom out: ", out.cpu())
         print("cpu out: ", cpu_out)
@@ -31,35 +32,25 @@ def test_scaled_dot_product_attention(device, backends="flash"):
             for n_token in n_token_list:
                 for head_dim in head_dim_list:
                     # Inputs
+                    clear_caches()
                     query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
                     key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
                     value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
 
+                    # With NPU
                     query = query.to(device=device)
                     key = key.to(device=device)
                     value = value.to(device=device)
 
-                    # With NPU
-                    if backends == "flash":
-                        backends = [SDPBackend.FLASH_ATTENTION]
-                    elif backends == "math":
-                        backends = [SDPBackend.MATH]
-                    elif backends == "memory_efficient":
-                        backends = [SDPBackend.EFFICIENT_ATTENTION]
-                    else:
-                        backends = [SDPBackend.FLASH_ATTENTION, SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]
-
-                    with sdpa_kernel(backends=backends):
-                        opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
-                        out = opt_fn(query, key, value)
-                    
+                    opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
+                    out = opt_fn(query, key, value)
                     out = out.to(device)
 
                     # With CPU
-                    device = torch.device('cpu')
-                    query = query.to(device=device)
-                    key = key.to(device=device)
-                    value = value.to(device=device)
+                    cpu_device = torch.device('cpu')
+                    query = query.to(device=cpu_device)
+                    key = key.to(device=cpu_device)
+                    value = value.to(device=cpu_device)
                     cpu_out = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
 
                     name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})"
@@ -76,9 +67,7 @@ def clear_caches():
     os.environ["TORCHINDUCTOR_CACHE"] = "0"
     FxGraphCache.clear()
 
-if __name__ == "__main__":
-    clear_caches()
-    
+if __name__ == "__main__":    
     device = torch.device('npu:0')
     test_scaled_dot_product_attention(device, backends="flash")
     
\ No newline at end of file

From 8ca5d02d599d06725b90963ee44701cb50e8f444 Mon Sep 17 00:00:00 2001
From: jung-min <wjdals020503@naver.com>
Date: Wed, 4 Mar 2026 08:09:28 +0000
Subject: [PATCH 111/194] [FIX] Add idx_map to the first matmul for logical
 consistency

---
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index b3d88cc6..49c6c6bb 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -339,6 +339,7 @@ def patched_scaled_dot_product_attention(
           
           // key @ query.t and scaling.
           linalg.matmul 
+            { idx_map = array<i32: 1, 0, -1> }
             ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
             outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }})
 
@@ -451,7 +452,7 @@ def render(self,
                prologue_nodes: Optional[List[IRNode]] = None,
                tile_info = None,
                **kwargs):
-        
+    
         # Except for kernel, other arguments are usually None.
         query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
        

From 41288bc2d300305d91559ae49a67f11984f789c0 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 3 Mar 2026 16:40:57 +0900
Subject: [PATCH 112/194] [Template] Polish template kernel of cat operation

---
 .../torch_openreg/openreg/__init__.py         |  49 ---
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  |   3 +
 PyTorchSimFrontend/mlir/mlir_cat_template.py  | 369 ++++++++++++------
 PyTorchSimFrontend/mlir/mlir_conv_common.py   |   3 +
 PyTorchSimFrontend/mlir/mlir_gemm_template.py |   3 +
 PyTorchSimFrontend/mlir/mlir_lowering.py      | 118 +-----
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  22 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |  43 +-
 tests/test_cat.py                             | 143 +++++--
 9 files changed, 424 insertions(+), 329 deletions(-)

diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 5603a4f7..f5aabc18 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -256,52 +256,6 @@ def launch_model(model, *args, stream_index=0, timestamp=0, **kwargs):
 from .random import *  # noqa: F403
 from .amp import *
 
-def _precheck_cat_out_args(args, kwargs):
-    tensors = args[0] if len(args) > 0 else kwargs.get("tensors")
-    dim = args[1] if len(args) > 1 else kwargs.get("dim", 0)
-    out = kwargs.get("out", args[2] if len(args) > 2 else None)
-
-    if out is None:
-        return
-    if not isinstance(tensors, (list, tuple)) or len(tensors) == 0:
-        raise RuntimeError("aten::cat.out requires non-empty tensor list")
-    if not all(isinstance(t, torch.Tensor) for t in tensors):
-        raise RuntimeError("aten::cat.out tensors must be Tensor values")
-    if not isinstance(out, torch.Tensor):
-        raise RuntimeError("aten::cat.out out must be a Tensor")
-
-    rank = tensors[0].dim()
-    if rank == 0:
-        raise RuntimeError("aten::cat.out does not support scalar inputs")
-    if dim < 0:
-        dim += rank
-    if dim < 0 or dim >= rank:
-        raise RuntimeError(f"aten::cat.out dim out of range: dim={dim}, rank={rank}")
-    if any(t.dim() != rank for t in tensors):
-        raise RuntimeError("aten::cat.out inputs must have the same rank")
-    if any(t.dtype != tensors[0].dtype for t in tensors):
-        raise RuntimeError("aten::cat.out inputs must have the same dtype")
-    if out.dim() != rank:
-        raise RuntimeError("aten::cat.out out rank mismatch")
-
-    for d in range(rank):
-        if d == dim:
-            continue
-        base = tensors[0].shape[d]
-        if any(t.shape[d] != base for t in tensors[1:]):
-            raise RuntimeError(
-                f"aten::cat.out non-concatenated dimension mismatch at dim={d}"
-            )
-        if out.shape[d] != base:
-            raise RuntimeError(f"aten::cat.out out shape mismatch at dim={d}")
-
-    expected = sum(t.shape[dim] for t in tensors)
-    if out.shape[dim] != expected:
-        raise RuntimeError(
-            f"aten::cat.out out concatenated dimension mismatch at dim={dim}: "
-            f"expected {expected}, got {out.shape[dim]}"
-        )
-
 def eager_to_compile(op_name):
     """
     Register an eager mode operation as a graph-based implementation using torch.compile().
@@ -313,9 +267,6 @@ def eager_to_compile(op_name):
         torch.npu.eager_to_compile("aten::mul.Tensor")
     """
     def wrapper(*args, **kwargs):
-        if op_name == "aten::cat.out":
-            _precheck_cat_out_args(args, kwargs)
-
         @torch.compile(dynamic=False)
         def dummy_graph(*args, **kwargs):
             # Convert "aten::mul.Tensor" -> torch.ops.aten.mul.Tensor
diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 178ea987..9398f90c 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -154,6 +154,9 @@
 class MLIRBMMTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.support_epilogue_fusion = True
+        self.support_prologue_fusion = True
+        self.support_reduction_fusion = True
 
     def render(self,
                kernel: MLIRTemplateKernel,
diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 996af1de..d68af7d4 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -1,8 +1,9 @@
-from typing import List, Optional, cast
+from typing import List, Optional
+import math
+import itertools
 
 import sympy
-from torch._inductor.ir import Buffer, IRNode
-from torch._inductor.virtualized import V
+from torch._inductor.ir import IRNode
 
 from PyTorchSimFrontend.mlir import mlir_common
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel
@@ -10,40 +11,28 @@
 
 TEMPLATE = r"""
 {{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X0, X1], outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} {
-  {{ kernel.def_sram_buffer("X0", X0_TILE_DESC, id=0, indent_size=2) }}
-  {{ kernel.def_sram_buffer("X1", X1_TILE_DESC, id=1, indent_size=2) }}
-  {{ kernel.def_sram_buffer(OUT_DVAR, Y_TILE_DESC, id=2, indent_size=2) }}
+func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=INPUT_NAMES, outputs=[Y], names_str=NAMES_STR, input_reorder=input_reorder)}} {
+{%- for buffer_name, tile_desc in UNIQUE_BUFFER_TILE_DESCS.items() %}
+  {{ kernel.def_sram_buffer(buffer_name, tile_desc, indent_size=2) }}
+{%- endfor %}
   {{ kernel.def_local_vars(indent_size=2) }}
 
   affine.for %cat_block = 0 to 1 step 1 {
-{% if DIM == 0 %}
-    affine.for %index0 = 0 to {{ X0_ROWS }} step 1 {
-      affine.for %index1 = 0 to {{ COLS }} step 1 {
-        {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }}
-      }
-    }
-
-    affine.for %index2 = 0 to {{ X1_ROWS }} step 1 {
-      affine.for %index3 = 0 to {{ COLS }} step 1 {
-        {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }}
-      }
-    }
-{% else %}
-    affine.for %index0 = 0 to {{ ROWS }} step 1 {
-      affine.for %index1 = 0 to {{ X0_COLS }} step 1 {
-        {{ kernel.def_dma_op("MVIN", "X0", X0_IDX, X0_TILE_DESC, indent_size=8) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y0_IDX, X0_TILE_DESC, indent_size=8) }}
-      }
-      affine.for %index3 = 0 to {{ X1_COLS }} step 1 {
-        {{ kernel.def_dma_op("MVIN", "X1", X1_IDX, X1_TILE_DESC, indent_size=8) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, Y1_IDX, X1_TILE_DESC, indent_size=8) }}
-      }
-    }
-{% endif %}
+{%- for d in range(RANK-1) %}
+    affine.for %index{{ OUTPUT_DIM[d] }} = 0 to {{ OUTPUT_SIZES[d] }} step {{ TILE_SIZES[d] }} {
+{%- endfor %}
+{%- for i in range(NUM_INPUTS) %}
+      // Input tensor{{ i }}
+      affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} {
+        %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }})
+        {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
+      } { inner_loop=true }
+{%- endfor %}
+
+{%- for d in range(RANK-1) %}
+    } { outer_loop=true }
+{%- endfor %}
   } { outer_loop=true }
   return
 }
@@ -51,8 +40,8 @@
 
 
 class MLIRCatTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, dim, input_reorder=None):
-        super().__init__("kernel", input_nodes, layout, input_reorder)
+    def __init__(self, input_nodes, layout, dim):
+        super().__init__("kernel", input_nodes, layout)
         self.dim = dim
 
     def render(
@@ -66,87 +55,248 @@ def render(
         is_out_variant = template_buffer_node is not None
         if is_out_variant:
             self.output_node = template_buffer_node
-        # cat template currently emits a single output buffer and does not
-        # support epilogue output remapping.
-
-        def _unwrap_node(n):
-            return n.node if hasattr(n, "node") else n
-
-        x0 = _unwrap_node(self.input_nodes[0])
-        x1 = _unwrap_node(self.input_nodes[1])
-        y = _unwrap_node(self.output_node)
-
-        def _as_int(v):
-            try:
-                return int(v)
-            except Exception:
-                return int(V.graph.sizevars.size_hint(v))
-
-        x0_rows = _as_int(x0.get_size()[0])
-        x1_rows = _as_int(x1.get_size()[0])
-        x0_cols = _as_int(x0.get_size()[1])
-        x1_cols = _as_int(x1.get_size()[1])
-        y_cols = _as_int(y.get_size()[1])
-        kernel.loop_size = None
-
-        # 2D cat template with contiguous layout.
-        x0_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        x0_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        x0_tile_desc.set_name("x0_cat_tile")
-        x1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        x1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        x1_tile_desc.set_name("x1_cat_tile")
-        y_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        y_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        y_tile_desc.set_name("y_cat_tile")
 
-        if self.dim == 0:
-            # Flattened offsets for dim=0 cat.
-            x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")]
-            x1_idx = [sympy.Symbol("index2") * x1_cols, sympy.Symbol("index3")]
-            y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")]
-            y1_idx = [(sympy.Symbol("index2") + x0_rows) * y_cols, sympy.Symbol("index3")]
-        else:
-            # Flattened offsets for dim=1 cat.
-            x0_idx = [sympy.Symbol("index0") * x0_cols, sympy.Symbol("index1")]
-            x1_idx = [sympy.Symbol("index0") * x1_cols, sympy.Symbol("index3")]
-            y0_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index1")]
-            y1_idx = [sympy.Symbol("index0") * y_cols, sympy.Symbol("index3") + x0_cols]
+        # Extract info
+        input_nodes = self.input_nodes
+        y = self.output_node
+        num_inputs = len(self.input_nodes)
+        rank = len(y.get_size())
+
+        input_sizes = [x.get_size() for x in input_nodes]
+        output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim]
+        output_dim = [dim for dim, sz in enumerate(y.get_size()) if dim != self.dim]
+        tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes)
+        output_strides = y.get_layout().stride
+
+        # Calculate input tile sizes
+        input_tile_sizes_dim = self._calculate_input_tile_sizes(
+            kernel, input_sizes, tile_sizes, num_inputs, rank
+        )
+        buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes)
+        input_tile_descs, unique_tile_descs = self._build_tile_descriptors(
+            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names
+        )
+        y_tile_desc = self._build_output_tile_desc(
+            kernel, input_tile_sizes_dim, tile_sizes, rank
+        )
+
+        input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions(
+            input_nodes, input_sizes, output_strides, rank, num_inputs
+        )
+
+        # Map unique buffer names to their tile descriptors for template
+        unique_buffer_tile_descs = {}
+        for actual_name, template_name in buffer_name_to_template_name.items():
+            if actual_name in unique_tile_descs:
+                unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name]
+
+        names_str = ", ".join(input_buffer_names + ["out_ptr1" if is_out_variant else "Y"])
+        indent_size = 2 + (rank - 1) * 2 + 4
 
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
-            X0=x0,
-            X1=x1,
             Y=y,
             OUT_DVAR="out_ptr1" if is_out_variant else "Y",
-            NAMES_STR="X0, X1, out_ptr1" if is_out_variant else "X0, X1, Y",
+            NAMES_STR=names_str,
+            INPUT_NAMES=input_nodes,
+            INPUT_BUFFER_NAMES=input_buffer_names,
+            NUM_INPUTS=num_inputs,
+            RANK=rank,
             DIM=self.dim,
-            X0_ROWS=x0_rows,
-            X1_ROWS=x1_rows,
-            ROWS=x0_rows,
-            X0_COLS=x0_cols,
-            X1_COLS=x1_cols,
-            COLS=x0_cols,
-            X0_TILE_DESC=x0_tile_desc,
-            X1_TILE_DESC=x1_tile_desc,
-            Y_TILE_DESC=y_tile_desc,
-            X0_IDX=x0_idx,
-            X1_IDX=x1_idx,
-            Y0_IDX=y0_idx,
-            Y1_IDX=y1_idx,
+            INPUT_SIZES=input_sizes,
+            OUTPUT_SIZES=output_sizes,
+            OUTPUT_DIM=output_dim,
+            TILE_SIZES=tile_sizes,
+            INPUT_TILE_SIZES_DIM=input_tile_sizes_dim,
+            INPUT_TILE_DESCS=input_tile_descs,
+            UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs,
+            INPUT_IDXS=input_idxs,
+            OUTPUT_IDXS=output_idxs,
+            CUMULATIVE_OFFSETS=cumulative_offsets,
+            INDENT_SIZE=indent_size,
             input_reorder=self.input_reorder,
         )
-        # Needed when epilogue fusion requests set_ranges().
-        kernel.dim_aliasing = {"index0": "index0", "index1": "index1"}
 
-        if hasattr(self.output_node, "node") and hasattr(self.output_node.node, "get_name"):
-            output_node_name = self.output_node.node.get_name()
-        elif hasattr(self.output_node, "get_name"):
-            output_node_name = self.output_node.get_name()
-        else:
-            output_node_name = self.output_node.name
+        self._setup_epilogue_info(kernel, y)
+        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
+        return code
+
+    def get_tile_candidates(
+        self,
+        kernel: MLIRTemplateKernel,
+        template_buffer_node=None,
+        epilogue_nodes: Optional[List[IRNode]] = None,
+        **kwargs,
+    ):
+        """Generate tile candidates for cat operation. Concat dimension always has tile size 1."""
+        if template_buffer_node is not None:
+            self.output_node = template_buffer_node
+
+        y = self.output_node
+        num_inputs = len(self.input_nodes)
+        output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim]
+        num_non_dim_dims = len(output_sizes)
+
+        if num_non_dim_dims == 0:
+            return [[1]]
+
+        tile_candidates = []
+        dim_tile_candidates = []
+
+        for dim_size in output_sizes:
+            dim_candidates = []
+            max_tile = min(dim_size, kernel.spad_info["spad_size"] // (kernel.vector_lane * kernel.precision * 2 * num_inputs))
+
+            for mult in range(1, max_tile // kernel.vector_lane + 1):
+                tile = mult * kernel.vector_lane
+                if tile <= dim_size:
+                    dim_candidates.append(tile)
 
+            if max_tile > 0:
+                for exp in range(int(math.log2(max_tile)) + 1):
+                    tile = 2 ** exp
+                    if tile <= dim_size and tile not in dim_candidates:
+                        dim_candidates.append(tile)
+
+            if dim_size not in dim_candidates:
+                dim_candidates.append(dim_size)
+
+            dim_tile_candidates.append(sorted(set(dim_candidates))[:5])
+
+        for tile_combo in itertools.product(*dim_tile_candidates):
+            total_elements = math.prod(tile_combo)
+            total_spad_needed = total_elements * (num_inputs + 1) * kernel.precision
+
+            if total_spad_needed <= kernel.spad_info["spad_size"] * kernel.vector_lane:
+                tile_candidates.append(list(tile_combo))
+
+        if not tile_candidates:
+            tile_candidates = [[1] * num_non_dim_dims]
+
+        tile_candidates.sort(key=lambda x: -math.prod(x))
+        return tile_candidates[:4]
+
+    def _calculate_input_tile_sizes(
+        self, kernel, input_sizes, tile_sizes, num_inputs, rank
+    ):
+        """Calculate tile sizes for concat dimension for each input."""
+        non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1
+        non_dim_tile_spad = non_dim_tile_elements * kernel.precision
+        max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2
+        extra_concat_input = math.ceil(max_spad_per_input / non_dim_tile_spad) - num_inputs
+
+        input_tile_sizes_dim = []
+        for i in range(num_inputs):
+            input_dim_size = input_sizes[i][self.dim]
+            if extra_concat_input > 0 and non_dim_tile_elements > 0:
+                max_tile_dim = min(input_dim_size, extra_concat_input)
+                extra_concat_input -= max_tile_dim
+            else:
+                max_tile_dim = 1
+            input_tile_sizes_dim.append(max_tile_dim)
+        return input_tile_sizes_dim
+
+    def _build_buffer_mapping(self, input_nodes):
+        """Map actual buffer names to template buffer names """
+        buffer_name_to_template_name = {}
+        input_buffer_names = []
+        for x in input_nodes:
+            actual_name = x.get_name()
+            template_name = buffer_name_to_template_name.setdefault(
+                actual_name, f"X{len(buffer_name_to_template_name)}"
+            )
+            input_buffer_names.append(template_name)
+        return buffer_name_to_template_name, input_buffer_names
+
+    def _build_tile_descriptors(
+        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names
+    ):
+        """Build tile descriptors for each input."""
+        input_tile_descs = []
+        unique_tile_descs = {}
+
+        for i, x in enumerate(input_nodes):
+            # Build full tile size list for this input
+            full_tile_sizes = []
+            tile_size_idx = 0
+            for d in range(rank):
+                if d != self.dim:
+                    full_tile_sizes.append(tile_sizes[tile_size_idx])
+                    tile_size_idx += 1
+                else:
+                    full_tile_sizes.append(input_tile_sizes_dim[i])
+
+            tile_desc = mlir_common.MLIRMultiDimTile(
+                full_tile_sizes,
+                kernel.vector_lane,
+                vlane_split_axis=rank - 1,
+                vlane_stride=1
+            )
+            tile_desc.set_tile_size(full_tile_sizes)
+            template_buffer_name = input_buffer_names[i]
+            tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
+            input_tile_descs.append(tile_desc)
+
+            # Store unique tile desc by actual buffer name
+            actual_name = x.get_name()
+            if actual_name not in unique_tile_descs:
+                unique_tile_descs[actual_name] = tile_desc
+
+        return input_tile_descs, unique_tile_descs
+
+    def _build_index_expressions(
+        self, input_nodes, input_sizes, output_strides, rank, num_inputs
+    ):
+        """Build index expressions for input and output."""
+        input_idxs = []
+        output_idxs = []
+        cumulative_offsets = [0]
+        for i in range(num_inputs - 1):
+            cumulative_offsets.append(cumulative_offsets[-1] + input_sizes[i][self.dim])
+
+        for i, x in enumerate(input_nodes):
+            x_stride = x.get_layout().stride
+            input_idx = []
+            output_idx = []
+            for d in range(rank):
+                if d != self.dim:
+                    input_idx_symbol = sympy.Symbol(f"index{d}")
+                    output_idx_symbol = sympy.Symbol(f"index{d}")
+                else:
+                    input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}")
+                    output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}")
+                input_idx.append(input_idx_symbol * x_stride[d])
+                output_idx.append(output_idx_symbol * output_strides[d])
+            input_idxs.append(input_idx)
+            output_idxs.append(output_idx)
+
+        return input_idxs, output_idxs, cumulative_offsets
+
+    def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank):
+        """Build output tile descriptor."""
+        max_output_tile_dim = max(input_tile_sizes_dim) if input_tile_sizes_dim else 1
+        output_full_tile_sizes = []
+        tile_size_idx = 0
+        for d in range(rank):
+            if d != self.dim:
+                output_full_tile_sizes.append(tile_sizes[tile_size_idx])
+                tile_size_idx += 1
+            else:
+                output_full_tile_sizes.append(max_output_tile_dim)
+
+        y_tile_desc = mlir_common.MLIRMultiDimTile(
+            output_full_tile_sizes,
+            kernel.vector_lane,
+            vlane_split_axis=rank - 1,
+            vlane_stride=1
+        )
+        y_tile_desc.set_tile_size(output_full_tile_sizes)
+        y_tile_desc.set_name("y_cat_tile")
+        return y_tile_desc
+
+    def _setup_epilogue_info(self, kernel, y):
+        """Setup epilogue information."""
         if hasattr(y, "get_numel"):
             y_numel = y.get_numel()
         elif hasattr(y, "node") and hasattr(y.node, "get_numel"):
@@ -154,14 +304,5 @@ def _as_int(v):
         else:
             y_numel = None
 
-        kernel.epilogue_info = dict(
-            output_node=output_node_name,
-            sram_var="y_cat_tile",
-            dram_var=kernel.render_options["OUT_DVAR"],
-            dram_tile_desc=y_tile_desc,
-        )
         if y_numel is not None:
             kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": y_numel}
-
-        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
-        return code
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index f8566b6d..f72a7663 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -12,6 +12,9 @@ class MLIRConvCommonTemplate(MLIRTemplate):
     WRAPPER_TEMPLATE = None
     def __init__(self, input_nodes, layout, input_reorder=None, **kwargs):
         super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.support_epilogue_fusion = True
+        self.support_prologue_fusion = False
+        self.support_reduction_fusion = False
         self.stride = kwargs["stride"]
         self.padding = kwargs["padding"]
         self.dilation = kwargs["dilation"]
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 0158caa6..5b116807 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -105,6 +105,9 @@
 class MLIRGemmTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.support_epilogue_fusion = True
+        self.support_prologue_fusion = True
+        self.support_reduction_fusion = True
 
     def render(self,
                kernel: MLIRTemplateKernel,
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 0f28f03b..d7aee715 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -202,48 +202,9 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout:
         stride,
     )
 
-
-def _can_use_cat_template(tensors: Sequence[TensorBox], dim: int) -> bool:
-    # Current template specialization: 2 inputs, rank-2, dim in {0, 1}.
-    if len(tensors) != 2:
-        return False
-    if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors):
-        return False
-    if tensors[0].get_dtype() != tensors[1].get_dtype():
-        return False
-    rank0 = len(tensors[0].get_size())
-    rank1 = len(tensors[1].get_size())
-    if rank0 != 2 or rank1 != 2:
-        return False
-    if dim < 0:
-        dim += rank0
-    if dim not in (0, 1):
-        return False
-
-    if dim == 0:
-        cols0 = tensors[0].get_size()[1]
-        cols1 = tensors[1].get_size()[1]
-        return V.graph.sizevars.statically_known_equals(cols0, cols1)
-
-    rows0 = tensors[0].get_size()[0]
-    rows1 = tensors[1].get_size()[0]
-    return V.graph.sizevars.statically_known_equals(rows0, rows1)
-
-
-def _cat_fallback(reason: str, tensors: Sequence[TensorBox], dim: int):
-    # Non-template cases delegate to the original lowering path.
-    return _orig_cat_default_lowering(tensors, dim)
-
-
-def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0):
-    if _orig_cat_default_lowering is None:
-        raise RuntimeError("Original aten.cat.default lowering is missing")
-    if len(tensors) > 0:
-        rank = len(tensors[0].get_size())
-        if dim < 0:
-            dim += rank
-    if not _can_use_cat_template(tensors, dim):
-        return _cat_fallback("default-path", tensors, dim)
+def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
+    if tensors and dim < 0:
+        dim += len(tensors[0].get_size())
 
     for t in tensors:
         t.realize()
@@ -251,75 +212,6 @@ def _custom_cat_impl(tensors: Sequence[TensorBox], dim: int = 0):
     mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim)
     return mlir_template.generate().output_node()
 
-
-def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
-    return _custom_cat_impl(tensors, dim)
-
-
-def custom_cat_out(tensors: Sequence[TensorBox], dim: int = 0, out: Optional[TensorBox] = None):
-    if _orig_cat_out_lowering is None:
-        raise RuntimeError("Original aten.cat.out lowering is missing")
-    if out is None:
-        return _orig_cat_out_lowering(tensors, dim, out)
-
-    copy_default_lowering = lowerings.get(aten.copy_.default)
-    slice_tensor_lowering = lowerings.get(aten.slice.Tensor)
-    if copy_default_lowering is None or slice_tensor_lowering is None:
-        raise RuntimeError("cat.out lowering requires aten.copy_.default and aten.slice.Tensor lowerings")
-
-    # Lower cat.out as a sequence of slice+copy ops so each piece still runs
-    # through the existing compiled/simulated kernel path.
-    if len(tensors) == 0:
-        raise RuntimeError("cat.out requires at least one input tensor")
-    if not all(hasattr(t, "get_size") and hasattr(t, "get_dtype") and hasattr(t, "realize") for t in tensors):
-        raise RuntimeError("cat.out inputs must be tensor-like values")
-    rank = len(tensors[0].get_size())
-    if rank == 0:
-        raise RuntimeError("cat.out does not support scalar inputs")
-    if dim < 0:
-        dim = dim + rank
-    if dim < 0 or dim >= rank:
-        raise RuntimeError(f"cat.out dim out of range: dim={dim}, rank={rank}")
-    if any(len(t.get_size()) != rank for t in tensors):
-        raise RuntimeError("cat.out inputs must have the same rank")
-    if any(t.get_dtype() != tensors[0].get_dtype() for t in tensors):
-        raise RuntimeError("cat.out inputs must have the same dtype")
-    # cat semantics: all non-cat dimensions must be equal.
-    for i in range(rank):
-        if i == dim:
-            continue
-        base = tensors[0].get_size()[i]
-        if any(not V.graph.sizevars.statically_known_equals(base, t.get_size()[i]) for t in tensors[1:]):
-            raise RuntimeError(f"cat.out non-concatenated dimension mismatch at dim={i}")
-
-    # Output shape must match concatenated shape.
-    if not hasattr(out, "get_size"):
-        raise RuntimeError("cat.out output must be tensor-like")
-    out_sizes = list(out.get_size())
-    if len(out_sizes) != rank:
-        raise RuntimeError("cat.out output rank mismatch")
-    for i in range(rank):
-        if i == dim:
-            continue
-        if not V.graph.sizevars.statically_known_equals(out_sizes[i], tensors[0].get_size()[i]):
-            raise RuntimeError(f"cat.out output shape mismatch at dim={i}")
-    expected_cat = sum(t.get_size()[dim] for t in tensors)
-    if not V.graph.sizevars.statically_known_equals(out_sizes[dim], expected_cat):
-        raise RuntimeError(f"cat.out output concatenated dimension mismatch at dim={dim}")
-
-    if isinstance(out, TensorBox):
-        out.realize()
-
-    offset = 0
-    for src in tensors:
-        src.realize()
-        end = offset + src.get_size()[dim]
-        dst_view = slice_tensor_lowering(out, dim, offset, end, 1)
-        copy_default_lowering(dst_view, src)
-        offset = end
-    return out
-
-
 def _custom_sort_values_impl(
     self: TensorBox,
     dim: int = -1,
@@ -459,9 +351,7 @@ def custom_sort_values_stable(
 lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
 lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()})
-
-lowerings.update({aten.cat.default: custom_cat_default})
-lowerings.update({aten.cat.out: custom_cat_out})
+lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()})
 
 lowerings.update({aten.sort.stable: custom_sort_stable})
 lowerings.update({aten.sort.values_stable: custom_sort_values_stable})
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index af960533..2f9c9704 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -44,12 +44,10 @@ def can_fuse_with_exceptions(self, node1: BaseSchedulerNode, node2: BaseSchedule
 
         # Case 3: Prologue(Pointwise) + Tempalte
         if len(base_template_node1) == 0 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and not node1.is_reduction() and len(base_template_node2) == 1 and extension_config.CONFIG_FUSION_PROLOGUE:
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-
             target_node = base_template_node2[0].node
-            # Currently only BMM, MM support prologue fusion
-            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+
+            # Check if template supports prologue fusion
+            if not getattr(target_node.template, 'support_prologue_fusion', False):
                 return False
 
             if len(node1.read_writes.writes) != 1:
@@ -129,12 +127,14 @@ def can_fuse_horizontal(self, node1, node2):
         if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and not node2.is_reduction():
             # Don't fuse maxpool template code
             from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
 
             template_node = base_template_node1[0]
             epilogue_node = node2
 
+            # Check if template supports epilogue fusion
+            if not getattr(template_node.node.template, 'support_epilogue_fusion', False):
+                return False
+
             if isinstance(template_node.node.template, MLIRMaxPoolTemplate):
                 return False
 
@@ -161,7 +161,7 @@ def can_fuse_horizontal(self, node1, node2):
             # Revert act_node.group : simplify_and_reorder() modified _body, _size, group
             if template_node.group != epilogue_node.group:
                 # We don't fuse this case...
-                if (isinstance(template_node.node.template, MLIRBMMTemplate) or isinstance(template_node.node.template, MLIRGemmTemplate)) and template_node.group[1][0][0] == 1:
+                if getattr(template_node.node.template, 'support_prologue_fusion', False) and template_node.group[1][0][0] == 1:
                     return False
 
                 if list(template_node.group[1][0]) != list(epilogue_node.get_nodes()[0].node.data.get_size()):
@@ -171,10 +171,10 @@ def can_fuse_horizontal(self, node1, node2):
 
         # Case 2: Tempalte + Reduction fusion
         if len(base_template_node1) == 1 and len(node1.get_nodes())==1 and len(node2.get_nodes())==1 and len(base_template_node2) == 0 and node2.is_reduction() and extension_config.CONFIG_FUSION_REDUCTION_EPILOGUE:
-            from PyTorchSimFrontend.mlir.mlir_gemm_template import MLIRGemmTemplate
-            from PyTorchSimFrontend.mlir.mlir_bmm_template import MLIRBMMTemplate
             target_node = base_template_node1[0].node
-            if not isinstance(target_node.template, (MLIRBMMTemplate, MLIRGemmTemplate)):
+
+            # Check if template supports reduction fusion
+            if not getattr(target_node.template, 'support_reduction_fusion', False):
                 return False
 
             size_match = node1.get_nodes()[0].node.get_numel() == reduce(operator.mul, node2.get_nodes()[0].node.get_size(), 1) * reduce(operator.mul, node2.get_nodes()[0].node.get_reduction_size(), 1)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 76b0ef71..04d327f8 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -14,7 +14,7 @@
 from unittest.mock import patch
 
 from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine
-from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller
+from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller, ir_node_to_tensor
 from torch._inductor.select_algorithm import PartialRender
 from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller
 from torch._inductor.autotune_process import TensorMeta
@@ -124,6 +124,7 @@ def __init__(self,
         self.epilogue_buffer_group = IndentedBufferGroup(self, prefix="epilogue_")
         self.global_vars = IndentedBuffer()
         self.exception_nodes = {}
+        self.epilogue_info = {}
         # Reduction data structure
         self.reduction_epilogue_suffix = IndentedBuffer()
         self.reduction_fusion = False
@@ -403,7 +404,7 @@ def call_kernel(self, kernel_name):
         _, call_args, _, _ = self.kernel_group.args.mlir_argdefs()
         # generate the code to call this
         wrapper.generate_kernel_call(
-            kernel_name if self.outer_func_name is None else self.outer_func_name + f"_{len(call_args)}", call_args)
+            kernel_name if self.outer_func_name is None else "wrapper_" + kernel_name, call_args)
 
     def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_nodes, tile_info):
         with self as kernel:
@@ -460,11 +461,11 @@ def codegen_template_code(self, render, template_node, prologue_nodes, epilogue_
                         }
                         node.codegen((vars, reduction_vars))
 
-            # Codegen epilogue nodes
-            tile_desc = kernel.set_tile_size(kernel.epilogue_info)
-            kernel.kernel_group.set_tile_info(tile_desc)
-            kernel.call_ranges = None
             if epilogue_nodes:
+                # Codegen epilogue nodes
+                tile_desc = kernel.set_tile_size(kernel.epilogue_info)
+                kernel.kernel_group.set_tile_info(tile_desc)
+                kernel.call_ranges = None
                 with kernel.epilogue_buffer_group.as_local():
                     _, (group, reduction_group) = max(
                         epilogue_nodes, key=lambda x: int(x.is_reduction())
@@ -625,7 +626,9 @@ def def_kernel(
                     extra_node[node.get_name()] = node.node
                 else:
                     extra_node[node.get_name()] = node
-                self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']
+
+                if 'sram_var' in self.epilogue_info:
+                    self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']
 
         def hook():
             arg_defs, call_args, *_ = self.kernel_group.args.mlir_argdefs(extra_node=extra_node)
@@ -688,7 +691,8 @@ def def_conv_kernel(
                 self.kernel_group.args.output_buffers[node.get_name()] = name
                 self.store_buffer_names.add(node.get_name())    #TODO: Is this enough not calling store() in mlir_common.py?
                 self.extra_node[node.get_name()] = node
-                self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']   #TODO: Buffer name fixed
+                if 'sram_var' in self.epilogue_info:
+                    self.buffer_names[node.get_name()] = self.epilogue_info['sram_var']   #TODO: Buffer name fixed
 
         def kernel_hook():
             arg_defs, *_ = self.kernel_group.args.mlir_argdefs(extra_node=self.extra_node)
@@ -1146,6 +1150,15 @@ def set_tile_size(self, template_fusion_info, prologue=False):
         return tile_desc
 
 class MLIRTemplateCaller(CUDATemplateCaller):
+    def __init__(self, name, category, input_nodes, layout, make_kernel_render, supports_epilogue_fusion, template, info_kwargs, description):
+        bmreq = MLIRBenchmarkRequest(
+            kernel_name=name,
+            input_tensor_meta=list(),
+            output_tensor_meta=list(),
+            extra_args=[],
+            source_code="",
+        )
+        super().__init__(name, category, input_nodes, layout, make_kernel_render, bmreq, supports_epilogue_fusion, template, info_kwargs, description)
     def __str__(self):
         return f"MLIRTemplateCaller(source_file={self.bmreq.source_file})"
 
@@ -1173,6 +1186,10 @@ def __init__(self, name, input_nodes, layout, input_reorder = None):
         self.output_nodes = [self.output_node]
         self.input_reorder = input_reorder
         self.layout = layout
+        # Fusion support flags (default to False)
+        self.support_epilogue_fusion = False
+        self.support_prologue_fusion = False
+        self.support_reduction_fusion = False
 
     def generate(self, **kwargs) -> ChoiceCaller:
         kernel_name = f"mlir_{self.name}"
@@ -1184,18 +1201,9 @@ def generate(self, **kwargs) -> ChoiceCaller:
             code = self.render(kernel=kernel, **kwargs)
 
         kernel_hash_name = f"mlir_{self.name}_{next(self.index_counter)}"
-        extra_args = []
         # create the BenchmarkRequest
         output_nodes = getattr(self, "output_nodes", None) or [self.output_node]
 
-        bmreq = MLIRBenchmarkRequest(
-            kernel_name=kernel_name,
-            input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes),
-            output_tensor_meta=TensorMeta.from_irnodes(output_nodes),
-            extra_args=extra_args,
-            source_code=code,
-        )
-
         def make_kernel_render(
             template_node: TemplateBuffer,
             prologue_nodes: Optional[List[IRNode]] = None,
@@ -1236,7 +1244,6 @@ def make_kernel_render(
             self.input_nodes,
             self.output_node.get_layout(),
             make_kernel_render,
-            bmreq,
             False,  # supports_epilogue_fusion
             self,
             kwargs,
diff --git a/tests/test_cat.py b/tests/test_cat.py
index 32573a05..62de6759 100644
--- a/tests/test_cat.py
+++ b/tests/test_cat.py
@@ -20,24 +20,6 @@ def _test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     print("cpu out: ", cpu_out)
     raise RuntimeError(f"{name} mismatch")
 
-
-def _togsim_log_count() -> int:
-    log_dir = Path("togsim_results")
-    if not log_dir.exists():
-        return 0
-    return len(list(log_dir.glob("*.log")))
-
-
-def _assert_simulation_happened(before_count: int, case_name: str):
-    after_count = _togsim_log_count()
-    if after_count <= before_count:
-        raise RuntimeError(
-            f"{case_name}: TOGSim log count did not increase "
-            f"(before={before_count}, after={after_count})"
-        )
-    print(f"{case_name}: TOGSim logs increased ({before_count} -> {after_count})")
-
-
 def test_cat_default(device):
     def cat_default_fn(a, b):
         return torch.cat([a, b], dim=0)
@@ -46,9 +28,7 @@ def cat_default_fn(a, b):
     y = torch.randn(6, 16, device=device)
     opt_fn = torch.compile(dynamic=False)(cat_default_fn)
 
-    before = _togsim_log_count()
     out = opt_fn(x, y)
-    _assert_simulation_happened(before, "cat.default")
 
     cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
     _test_result("cat.default", out, cpu_out, rtol=1e-4, atol=1e-4)
@@ -63,19 +43,122 @@ def cat_out_fn(a, b, out):
     out_buf = torch.empty(14, 16, device=device)
     opt_fn = torch.compile(dynamic=False)(cat_out_fn)
 
-    before = _togsim_log_count()
     out = opt_fn(x, y, out_buf)
-    _assert_simulation_happened(before, "cat.out")
 
     cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
     _test_result("cat.out", out, cpu_out, rtol=1e-4, atol=1e-4)
 
 
+def test_cat_4d_dim0(device):
+    def cat_4d_dim0_fn(a, b):
+        return torch.cat([a, b], dim=0)
+
+    x = torch.randn(2, 3, 4, 5, device=device)
+    y = torch.randn(3, 3, 4, 5, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_4d_dim0_fn)
+
+    out = opt_fn(x, y)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=0)
+    _test_result("cat.4d.dim0", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_4d_dim1(device):
+    def cat_4d_dim1_fn(a, b):
+        return torch.cat([a, b], dim=1)
+
+    x = torch.randn(2, 3, 4, 5, device=device)
+    y = torch.randn(2, 5, 4, 5, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_4d_dim1_fn)
+
+    out = opt_fn(x, y)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=1)
+    _test_result("cat.4d.dim1", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_4d_dim2(device):
+    def cat_4d_dim2_fn(a, b):
+        return torch.cat([a, b], dim=2)
+
+    x = torch.randn(2, 3, 4, 5, device=device)
+    y = torch.randn(2, 3, 6, 5, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_4d_dim2_fn)
+
+    out = opt_fn(x, y)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=2)
+    _test_result("cat.4d.dim2", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_4d_dim3(device):
+    def cat_4d_dim3_fn(a, b):
+        return torch.cat([a, b], dim=3)
+
+    x = torch.randn(2, 3, 4, 5, device=device)
+    y = torch.randn(2, 3, 4, 7, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_4d_dim3_fn)
+
+    out = opt_fn(x, y)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=3)
+    _test_result("cat.4d.dim3", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_three_inputs(device):
+    def cat_three_inputs_fn(a, b, c):
+        return torch.cat([a, b, c], dim=0)
+
+    x = torch.randn(4, 16, device=device)
+    y = torch.randn(5, 16, device=device)
+    z = torch.randn(3, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_three_inputs_fn)
+
+    out = opt_fn(x, y, z)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=0)
+    _test_result("cat.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_four_inputs(device):
+    def cat_four_inputs_fn(a, b, c, d):
+        return torch.cat([a, b, c, d], dim=0)
+
+    x = torch.randn(3, 16, device=device)
+    y = torch.randn(4, 16, device=device)
+    z = torch.randn(5, 16, device=device)
+    w = torch.randn(2, 16, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_four_inputs_fn)
+
+    out = opt_fn(x, y, z, w)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu(), w.cpu()], dim=0)
+    _test_result("cat.four_inputs", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
+def test_cat_4d_three_inputs(device):
+    def cat_4d_three_inputs_fn(a, b, c):
+        return torch.cat([a, b, c], dim=1)
+
+    x = torch.randn(2, 3, 4, 5, device=device)
+    y = torch.randn(2, 4, 4, 5, device=device)
+    z = torch.randn(2, 5, 4, 5, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_4d_three_inputs_fn)
+
+    out = opt_fn(x, y, z)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=1)
+    _test_result("cat.4d.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run cat simulation tests")
     parser.add_argument(
         "--case",
-        choices=["default", "out", "all"],
+        choices=[
+            "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3",
+            "three_inputs", "four_inputs", "4d_three_inputs", "all"
+        ],
         default="all",
         help="Which cat case to run",
     )
@@ -87,3 +170,17 @@ def cat_out_fn(a, b, out):
         test_cat_default(device)
     if args.case in ("out", "all"):
         test_cat_out(device)
+    if args.case in ("4d_dim0", "all"):
+        test_cat_4d_dim0(device)
+    if args.case in ("4d_dim1", "all"):
+        test_cat_4d_dim1(device)
+    if args.case in ("4d_dim2", "all"):
+        test_cat_4d_dim2(device)
+    if args.case in ("4d_dim3", "all"):
+        test_cat_4d_dim3(device)
+    if args.case in ("three_inputs", "all"):
+        test_cat_three_inputs(device)
+    if args.case in ("four_inputs", "all"):
+        test_cat_four_inputs(device)
+    if args.case in ("4d_three_inputs", "all"):
+        test_cat_4d_three_inputs(device)

From 434bbb10793a68172e49e107bc3b639fd3b86264 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 4 Mar 2026 20:02:14 +0900
Subject: [PATCH 113/194] [WIP]

---
 PyTorchSimFrontend/mlir/mlir_cat_template.py | 13 -------------
 PyTorchSimFrontend/mlir/mlir_template.py     |  2 +-
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index d68af7d4..5062e629 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -118,7 +118,6 @@ def render(
             input_reorder=self.input_reorder,
         )
 
-        self._setup_epilogue_info(kernel, y)
         code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
         return code
 
@@ -294,15 +293,3 @@ def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank
         y_tile_desc.set_tile_size(output_full_tile_sizes)
         y_tile_desc.set_name("y_cat_tile")
         return y_tile_desc
-
-    def _setup_epilogue_info(self, kernel, y):
-        """Setup epilogue information."""
-        if hasattr(y, "get_numel"):
-            y_numel = y.get_numel()
-        elif hasattr(y, "node") and hasattr(y.node, "get_numel"):
-            y_numel = y.node.get_numel()
-        else:
-            y_numel = None
-
-        if y_numel is not None:
-            kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": y_numel}
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 04d327f8..59610228 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -813,7 +813,7 @@ def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_com
             if dram_var in self.exception_nodes:
                 numel = self.exception_nodes[dram_var]["numel"]
             else:
-                numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
+                numel = self.named_nodes[dram_var].get_numel()
             mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
             dram_shape = f"memref<{numel}x{mlir_dtype}>"
             dram_stride = []

From 5295dfb5a16e21fda57b12d73906c1bd290c4f94 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 4 Mar 2026 22:13:26 +0900
Subject: [PATCH 114/194] [Template] Delay def_dma_op codegen

def_dma_op find data node using dram_var. But it can't locate the
proper node when output buffer has not been created.
---
 PyTorchSimFrontend/mlir/mlir_template.py | 146 +++++++++++++----------
 1 file changed, 81 insertions(+), 65 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 59610228..7c52bfe6 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -112,7 +112,8 @@ def __init__(self,
         self.outer_func_name = outer_func_name
         self.outer_func_render = outer_func_render
         self.kernel_arg_attributes = kernel_arg_attributes
-        self.render_hooks = OrderedDict()
+        self.render_hooks = OrderedDict()  # Stores {key: (priority, hook)}
+        self.dma_op_counter = itertools.count()  # Add counter for unique DMA op keys
         self.buffer_names = dict()
         self.render_options = dict()
         self.tile_size = []
@@ -555,7 +556,7 @@ def template_store():
             dram_var = self.epilogue_info["dram_var"]
             index_list = self.epilogue_info["dram_idx"]
             tile_desc = self.epilogue_info["dram_tile_desc"]
-            code = self.def_dma_op("MVOUT", dram_var, index_list, tile_desc)
+            code = self.def_dma_op("MVOUT", dram_var, index_list, tile_desc, lazy_mode=False)
             self.cse.generate(self.dma_stores, code, assignment = False)
 
         body = IndentedBuffer()
@@ -653,7 +654,7 @@ def hook():
             return f"({', '.join(renamed_arg_defs)})"
 
         assert "<DEF_KERNEL>" not in self.render_hooks
-        self.render_hooks["<DEF_KERNEL>"] = hook
+        self.render_hooks["<DEF_KERNEL>"] = (5, hook)  # Default priority 5
         return "<DEF_KERNEL>"
 
     # This function is a temporal function for convolution because currently convolution kernel is not considering padding.
@@ -700,7 +701,7 @@ def kernel_hook():
             return f"({', '.join(arg_defs)})"
 
         assert "<DEF_CONV_KERNEL>" not in self.render_hooks
-        self.render_hooks["<DEF_CONV_KERNEL>"] = kernel_hook
+        self.render_hooks["<DEF_CONV_KERNEL>"] = (5, kernel_hook)  # Default priority 5
         return "<DEF_CONV_KERNEL>"
 
     # This function is for convolution wrapper function finalizing.
@@ -711,7 +712,7 @@ def wrapper_hook():
             return f"({', '.join(wrapper_arg_defs)})"
 
         if "<DEF_CONV_WRAPPER>" not in self.render_hooks:
-            self.render_hooks["<DEF_CONV_WRAPPER>"] = wrapper_hook
+            self.render_hooks["<DEF_CONV_WRAPPER>"] = (5, wrapper_hook)  # Default priority 5
         return "<DEF_CONV_WRAPPER>"
 
     def get_conv_inputs(self):
@@ -720,15 +721,15 @@ def get_conv_inputs(self):
     def get_conv_outputs(self):
         return {k: v for k, v in self.kernel_group.args.output_buffers.items() if v != 'REMOVED'}
 
-    def load_input(self, indent_size: int = 0):
+    def load_input(self, indent_size: int = 0, priority: int = 1):
         def hook():
             code = IndentedBuffer()
             prologue_code = self.codegen_prologue_body()
             if prologue_code.getvalue():
                 input_dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"],
-                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False)
+                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False, lazy_mode=False)
                 weight_dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], self.prologue_info["weight_idx"],
-                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False)
+                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False, lazy_mode=False)
                 if (self.prologue_info["is_input_fused"]):
                     code.splice(input_dma_code)
                     code.splice(prologue_code)
@@ -739,58 +740,63 @@ def hook():
                     code.splice(input_dma_code)
             else:
                 dma_code = self.def_dma_op("MVIN", self.prologue_info["input_dram_var"], self.prologue_info["input_idx"],
-                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False)
+                                self.prologue_info["input_tile_desc"], subtile_size=self.prologue_info["input_subtile_size"], async_type=False, lazy_mode=False)
                 code.splice(dma_code)
                 dma_code = self.def_dma_op("MVIN", self.prologue_info["weight_dram_var"], self.prologue_info["weight_idx"],
-                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False)
+                                self.prologue_info["weight_tile_desc"], subtile_size=self.prologue_info["weight_subtile_size"], async_type=False, lazy_mode=False)
                 code.splice(dma_code)
             code = textwrap.indent(code.getvalue(), " "*indent_size).strip()
             return code
 
         assert "<PREPARE_INPUT>" not in self.render_hooks
-        self.render_hooks["<PREPARE_INPUT>"] = hook
-        self.render_hooks.move_to_end("<PREPARE_INPUT>", last=False) # Force order to be triggered first
+        self.render_hooks["<PREPARE_INPUT>"] = (priority, hook)
         return "<PREPARE_INPUT>"
 
-    def store_output(self, indent_size: int = 0):
+    def store_output(self, indent_size: int = 0, priority: int = 1):
         def hook():
             epilogue_code = self.codegen_epilogue_body()
             return textwrap.indent(epilogue_code.getvalue(), " "*indent_size).strip()
 
         assert "<STORE_OUTPUT>" not in self.render_hooks
-        self.render_hooks["<STORE_OUTPUT>"] = hook
-        self.render_hooks.move_to_end("<STORE_OUTPUT>", last=False) # Force order to be triggered first
+        self.render_hooks["<STORE_OUTPUT>"] = (priority, hook)
         return "<STORE_OUTPUT>"
 
-    def reduction_output(self, indent_size: int = 0):
+    def reduction_output(self, indent_size: int = 0, priority: int = 5):
         def hook():
             return textwrap.indent(self.reductions_suffix.getvalue(), " "*indent_size).strip()
 
         assert "<REDUCTION_OUTPUT>" not in self.render_hooks
-        self.render_hooks["<REDUCTION_OUTPUT>"] = hook
+        self.render_hooks["<REDUCTION_OUTPUT>"] = (priority, hook)
         return "<REDUCTION_OUTPUT>"
 
+    def _sort_hooks_by_priority(self):
+        """Sort hooks by priority (lower priority executes first)."""
+        sorted_hooks = OrderedDict()
+        for key, (priority, hook) in sorted(self.render_hooks.items(), key=lambda x: x[1][0]):
+            sorted_hooks[key] = hook
+        return sorted_hooks
+
     def def_function(self):
         _, call_args, _, _ = self.kernel_group.args.python_argdefs()
         if self.outer_func_render is not None:
             partial_code, function_name = self.outer_func_render(input_args=call_args)
+
             return PartialRender(
                 partial_code,
-                self.render_hooks,
+                self._sort_hooks_by_priority(),
             ), function_name
         else:
             return None, None
 
-    def def_global_vars(self):
+    def def_global_vars(self, priority: int = 10):
         key = "<GLOBAL_VARS>"
         def hook():
             return textwrap.indent(self.global_vars.getvalue(), "").strip()
 
-        assert key not in self.render_hooks
-        self.render_hooks[key] = hook
+        self.render_hooks[key] = (priority, hook)
         return key
 
-    def def_local_vars(self, indent_size=0):
+    def def_local_vars(self, indent_size=0, priority: int = 10):
         key = "<LOCAL_VARS>"
         def hook():
             code = IndentedBuffer()
@@ -799,52 +805,62 @@ def hook():
             code.splice(self.alloc_buffer)
             return textwrap.indent(code.getvalue(), " "*indent_size).strip()
 
-        assert key not in self.render_hooks
-        self.render_hooks[key] = hook
+        self.render_hooks[key] = (priority, hook)
         return key
 
     def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile,
-                   subtile_size:list=[], async_type=None, indent_size=0):
-        # Prepare code block
-        local_code = IndentedBuffer()
-        with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse):
-            index_var = self.parse_index_list(index_list, offset=tile_desc.offset)
-            node_layout = self.named_nodes[dram_var].get_layout()
-            if dram_var in self.exception_nodes:
-                numel = self.exception_nodes[dram_var]["numel"]
-            else:
-                numel = self.named_nodes[dram_var].get_numel()
-            mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
-            dram_shape = f"memref<{numel}x{mlir_dtype}>"
-            dram_stride = []
-            for idx in index_list:
-                if idx.is_Mul:
-                    dram_stride.append(int(idx.args[0]))
-                elif idx == sympy.Symbol("c0"):
-                    dram_stride.append(0)
-                elif not idx.is_Number:
-                    dram_stride.append(1)
+                   subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True):
+        def generate_dma_code():
+            """Internal method to generate DMA code directly."""
+            local_code = IndentedBuffer()
+            with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse):
+                index_var = self.parse_index_list(index_list, offset=tile_desc.offset)
+                node_layout = self.named_nodes[dram_var].get_layout()
+                if dram_var in self.exception_nodes:
+                    numel = self.exception_nodes[dram_var]["numel"]
                 else:
-                    dram_stride.append(0)
-
-            sram_var = tile_desc.get_name()
-            tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
-            tile_stride = tile_desc.get_tile_stride()
-            vlane_split_axis = tile_desc.vmap.vlane_split_axis
-            vlane_stride = tile_desc.vmap.vlane_stride
-
-            zero_cse = self.get_const_cse(0, "index")
-            sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
-
-            attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"]
-            if subtile_size:
-                attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
-            attribute = "  {" + ", ".join(attribute_parts) + "}"
-            code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                    dram_shape, tile_shape, "")
-            local_code.writeline(code)
-            local_code.writeline(attribute)
-        return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
+                    numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
+                mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
+                dram_shape = f"memref<{numel}x{mlir_dtype}>"
+                dram_stride = []
+                for idx in index_list:
+                    if idx.is_Mul:
+                        dram_stride.append(int(idx.args[0]))
+                    elif idx == sympy.Symbol("c0"):
+                        dram_stride.append(0)
+                    elif not idx.is_Number:
+                        dram_stride.append(1)
+                    else:
+                        dram_stride.append(0)
+
+                    sram_var = tile_desc.get_name()
+                    tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
+                    tile_stride = tile_desc.get_tile_stride()
+                    vlane_split_axis = tile_desc.vmap.vlane_split_axis
+                    vlane_stride = tile_desc.vmap.vlane_stride
+
+                zero_cse = self.get_const_cse(0, "index")
+                sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
+
+                attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"]
+                if subtile_size:
+                    attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
+                attribute = "  {" + ", ".join(attribute_parts) + "}"
+                code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
+                                        dram_shape, tile_shape, "")
+                local_code.writeline(code)
+                local_code.writeline(attribute)
+            return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
+
+        if not lazy_mode:
+            # Immediate mode: generate code directly and return it
+            return generate_dma_code()
+
+        # Lazy mode: register hook and return key
+        dma_op_id = next(self.dma_op_counter)
+        key = f"<DMA_OP_{dma_op_id}>"
+        self.render_hooks[key] = (priority, generate_dma_code)
+        return key
 
     def def_sram_buffer(self, dram_name, tile_desc, id=0, indent_size=0):
         # Prepare code block
@@ -862,7 +878,7 @@ def render(self, template, kwargs, define_function=None):
 
         return PartialRender(
             code,
-            self.render_hooks,
+            self._sort_hooks_by_priority(),
         )
 
     def get_spad_size_per_lane(self, tile_m, tile_n):

From 61caebd5708ca21a88950d4d5073445891ea32f1 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Mar 2026 00:12:49 +0900
Subject: [PATCH 115/194] [Template/Cat] Fix apply offset setting

---
 PyTorchSimFrontend/mlir/mlir_cat_template.py | 80 +++++++++-----------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 5062e629..5aaf3e71 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -26,7 +26,7 @@
       affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} {
         %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }})
         {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
+        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], OUTPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
       } { inner_loop=true }
 {%- endfor %}
 
@@ -52,10 +52,6 @@ def render(
         tile_info=None,
         **kwargs,
     ):
-        is_out_variant = template_buffer_node is not None
-        if is_out_variant:
-            self.output_node = template_buffer_node
-
         # Extract info
         input_nodes = self.input_nodes
         y = self.output_node
@@ -73,11 +69,8 @@ def render(
             kernel, input_sizes, tile_sizes, num_inputs, rank
         )
         buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes)
-        input_tile_descs, unique_tile_descs = self._build_tile_descriptors(
-            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names
-        )
-        y_tile_desc = self._build_output_tile_desc(
-            kernel, input_tile_sizes_dim, tile_sizes, rank
+        input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors(
+            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y
         )
 
         input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions(
@@ -90,14 +83,14 @@ def render(
             if actual_name in unique_tile_descs:
                 unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name]
 
-        names_str = ", ".join(input_buffer_names + ["out_ptr1" if is_out_variant else "Y"])
+        names_str = ", ".join(input_buffer_names + ["Y"])
         indent_size = 2 + (rank - 1) * 2 + 4
 
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
             Y=y,
-            OUT_DVAR="out_ptr1" if is_out_variant else "Y",
+            OUT_DVAR="Y",
             NAMES_STR=names_str,
             INPUT_NAMES=input_nodes,
             INPUT_BUFFER_NAMES=input_buffer_names,
@@ -110,6 +103,7 @@ def render(
             TILE_SIZES=tile_sizes,
             INPUT_TILE_SIZES_DIM=input_tile_sizes_dim,
             INPUT_TILE_DESCS=input_tile_descs,
+            OUTPUT_TILE_DESCS=output_tile_descs,
             UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs,
             INPUT_IDXS=input_idxs,
             OUTPUT_IDXS=output_idxs,
@@ -209,14 +203,16 @@ def _build_buffer_mapping(self, input_nodes):
         return buffer_name_to_template_name, input_buffer_names
 
     def _build_tile_descriptors(
-        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names
+        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node
     ):
-        """Build tile descriptors for each input."""
+        """Build tile descriptors for each input and output."""
         input_tile_descs = []
+        output_tile_descs = []
         unique_tile_descs = {}
+        output_offset = output_node.get_layout().offset
 
         for i, x in enumerate(input_nodes):
-            # Build full tile size list for this input
+            x_offset = x.get_layout().offset
             full_tile_sizes = []
             tile_size_idx = 0
             for d in range(rank):
@@ -226,23 +222,37 @@ def _build_tile_descriptors(
                 else:
                     full_tile_sizes.append(input_tile_sizes_dim[i])
 
-            tile_desc = mlir_common.MLIRMultiDimTile(
+            # Input tile descriptor
+            input_tile_desc = mlir_common.MLIRMultiDimTile(
                 full_tile_sizes,
                 kernel.vector_lane,
                 vlane_split_axis=rank - 1,
                 vlane_stride=1
             )
-            tile_desc.set_tile_size(full_tile_sizes)
+            input_tile_desc.set_tile_size(full_tile_sizes)
             template_buffer_name = input_buffer_names[i]
-            tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
-            input_tile_descs.append(tile_desc)
+            input_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
+            input_tile_desc.offset = x_offset
+            input_tile_descs.append(input_tile_desc)
+
+            # Output tile descriptor (same as input but with output offset)
+            output_tile_desc = mlir_common.MLIRMultiDimTile(
+                full_tile_sizes,
+                kernel.vector_lane,
+                vlane_split_axis=rank - 1,
+                vlane_stride=1
+            )
+            output_tile_desc.set_tile_size(full_tile_sizes)
+            output_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
+            output_tile_desc.offset = output_offset
+            output_tile_descs.append(output_tile_desc)
 
             # Store unique tile desc by actual buffer name
             actual_name = x.get_name()
             if actual_name not in unique_tile_descs:
-                unique_tile_descs[actual_name] = tile_desc
+                unique_tile_descs[actual_name] = input_tile_desc
 
-        return input_tile_descs, unique_tile_descs
+        return input_tile_descs, output_tile_descs, unique_tile_descs
 
     def _build_index_expressions(
         self, input_nodes, input_sizes, output_strides, rank, num_inputs
@@ -256,6 +266,12 @@ def _build_index_expressions(
 
         for i, x in enumerate(input_nodes):
             x_stride = x.get_layout().stride
+            x_offset = x.get_layout().offset
+            if hasattr(x, 'data') and hasattr(x.data, 'dims'):
+                # In case of PermuteView, the stride is permuted
+                perm_dims = x.data.dims
+                x_stride = [x_stride[perm_dims[d]] for d in range(rank)]
+
             input_idx = []
             output_idx = []
             for d in range(rank):
@@ -271,25 +287,3 @@ def _build_index_expressions(
             output_idxs.append(output_idx)
 
         return input_idxs, output_idxs, cumulative_offsets
-
-    def _build_output_tile_desc(self, kernel, input_tile_sizes_dim, tile_sizes, rank):
-        """Build output tile descriptor."""
-        max_output_tile_dim = max(input_tile_sizes_dim) if input_tile_sizes_dim else 1
-        output_full_tile_sizes = []
-        tile_size_idx = 0
-        for d in range(rank):
-            if d != self.dim:
-                output_full_tile_sizes.append(tile_sizes[tile_size_idx])
-                tile_size_idx += 1
-            else:
-                output_full_tile_sizes.append(max_output_tile_dim)
-
-        y_tile_desc = mlir_common.MLIRMultiDimTile(
-            output_full_tile_sizes,
-            kernel.vector_lane,
-            vlane_split_axis=rank - 1,
-            vlane_stride=1
-        )
-        y_tile_desc.set_tile_size(output_full_tile_sizes)
-        y_tile_desc.set_name("y_cat_tile")
-        return y_tile_desc

From 47684a75942bf9d35e19a7a79a1862418c5649a6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Mar 2026 17:44:32 +0900
Subject: [PATCH 116/194] [TOGSim] Add help print

---
 TOGSim/src/DMA.cc                      |  2 +-
 TOGSim/src/helper/CommandLineParser.cc |  6 +++++-
 TOGSim/src/helper/CommandLineParser.h  |  8 +++++++-
 TOGSim/src/main.cc                     | 13 +++++++++----
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc
index f8f21025..fefee6d2 100644
--- a/TOGSim/src/DMA.cc
+++ b/TOGSim/src/DMA.cc
@@ -12,7 +12,7 @@ void DMA::issue_tile(std::shared_ptr<Instruction> inst) {
   _current_inst = std::move(inst);
   std::vector<size_t>& tile_size = _current_inst->get_tile_size();
   if (tile_size.size() <= 0 || tile_size.size() > get_max_dim()) {
-    spdlog::error("[DMA {}] issued tile is not supported format..", _id);
+    spdlog::error("[DMA {}] issued tile is not supported format.. tile.size: {}, tile_size: [{}]", _id, tile_size.size(), fmt::join(tile_size, ", "));
     exit(EXIT_FAILURE);
   }
   _finished = false;
diff --git a/TOGSim/src/helper/CommandLineParser.cc b/TOGSim/src/helper/CommandLineParser.cc
index 66aebbe1..9cd177ac 100644
--- a/TOGSim/src/helper/CommandLineParser.cc
+++ b/TOGSim/src/helper/CommandLineParser.cc
@@ -12,9 +12,13 @@ void CommandLineParser::parse(int argc, char **argv) noexcept(false) {
     po::notify(variables_map);
 }
 
+void CommandLineParser::print_help_message() const noexcept {
+    std::cout << options_description << std::endl;
+}
+
 void CommandLineParser::print_help_message_if_required() const noexcept {
     if (variables_map.count("help") > 0) {
-        std::cout << options_description << std::endl;
+        print_help_message();
         exit(0);
     }
 }
diff --git a/TOGSim/src/helper/CommandLineParser.h b/TOGSim/src/helper/CommandLineParser.h
index 39174d5d..b41eabf3 100644
--- a/TOGSim/src/helper/CommandLineParser.h
+++ b/TOGSim/src/helper/CommandLineParser.h
@@ -19,7 +19,7 @@ class CommandLineParser {
      * Command Line Parser constructor
      */
     CommandLineParser() noexcept {
-        options_description.add_options()("help", "Prints help message");
+        options_description.add_options()("help,h", "Prints help message");
     }
 
     /**
@@ -38,6 +38,12 @@ class CommandLineParser {
      */
     void print_help_message_if_required() const noexcept;
 
+    /**
+     * Prints the help message.
+     * (Can be called to show help for invalid options)
+     */
+    void print_help_message() const noexcept;
+
     /**
      * Add a new command line argument option.
      * (Should be called before `parse` method is called)
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 7c596af5..cda8f986 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -96,19 +96,24 @@ int main(int argc, char** argv) {
   // parse command line argumnet
   CommandLineParser cmd_parser = CommandLineParser();
   cmd_parser.add_command_line_option<std::string>(
-      "config", "Path for hardware configuration file");
+      "config", "Path for hardware configuration file (.yml)");
   cmd_parser.add_command_line_option<std::string>(
-      "models_list", "Path for the models list file (can be FIFO or regular file)");
+      "models_list", "Path for the trace file (.trace)");
   cmd_parser.add_command_line_option<std::string>(
       "log_level", "Set for log level [trace, debug, info], default = info");
   try {
     cmd_parser.parse(argc, argv);
   } catch (const CommandLineParser::ParsingError& e) {
     spdlog::error(
-        "Command line argument parrsing error captured. Error message: {}",
+        "Command line argument parsing error captured. Error message: {}",
         e.what());
-    throw(e);
+    std::cerr << std::endl;
+    cmd_parser.print_help_message();
+    exit(1);
   }
+  
+  // Check if help was requested
+  cmd_parser.print_help_message_if_required();
 
   std::string level = "info";
   cmd_parser.set_if_defined("log_level", &level);

From a24f1f1081a4ce7e5e09a59f61763850d11d994f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Mar 2026 17:45:00 +0900
Subject: [PATCH 117/194] [Template/Cat] Limit maximum rank of tile

---
 PyTorchSimFrontend/mlir/mlir_cat_template.py | 52 +++++++++++++++-----
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 5aaf3e71..2a00ce95 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -64,17 +64,30 @@ def render(
         tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes)
         output_strides = y.get_layout().stride
 
+        excluded_dims = list()
+        max_tiled_dims = 4 - 1
+        if len(tile_sizes) > max_tiled_dims:
+            # Create index:tile_size dictionary and sort by tile_size
+            dim_tile_dict = {idx: sz for idx, sz in enumerate(tile_sizes)}
+            sorted_dims = sorted(dim_tile_dict.items(), key=lambda x: x[1], reverse=True)
+            # Keep top 4 dimensions, exclude the rest
+            excluded_dims = [idx for idx, _ in sorted_dims[max_tiled_dims:]]
+            for idx in excluded_dims:
+                tile_sizes[idx] = 1
+
         # Calculate input tile sizes
         input_tile_sizes_dim = self._calculate_input_tile_sizes(
             kernel, input_sizes, tile_sizes, num_inputs, rank
         )
         buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes)
         input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors(
-            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y
+            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y,
+            excluded_dims=excluded_dims
         )
 
         input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions(
-            input_nodes, input_sizes, output_strides, rank, num_inputs
+            input_nodes, input_sizes, output_strides, rank, num_inputs,
+            excluded_dims=excluded_dims
         )
 
         # Map unique buffer names to their tile descriptors for template
@@ -203,9 +216,12 @@ def _build_buffer_mapping(self, input_nodes):
         return buffer_name_to_template_name, input_buffer_names
 
     def _build_tile_descriptors(
-        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node
+        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node, excluded_dims=None
     ):
         """Build tile descriptors for each input and output."""
+        if excluded_dims is None:
+            excluded_dims = set()
+
         input_tile_descs = []
         output_tile_descs = []
         unique_tile_descs = {}
@@ -217,16 +233,21 @@ def _build_tile_descriptors(
             tile_size_idx = 0
             for d in range(rank):
                 if d != self.dim:
-                    full_tile_sizes.append(tile_sizes[tile_size_idx])
+                    # Skip excluded dimensions
+                    if tile_size_idx not in excluded_dims:
+                        full_tile_sizes.append(tile_sizes[tile_size_idx])
                     tile_size_idx += 1
                 else:
                     full_tile_sizes.append(input_tile_sizes_dim[i])
 
+            # Calculate vlane_split_axis for reduced dimensions
+            vlane_split_axis = len(full_tile_sizes) - 1
+
             # Input tile descriptor
             input_tile_desc = mlir_common.MLIRMultiDimTile(
                 full_tile_sizes,
                 kernel.vector_lane,
-                vlane_split_axis=rank - 1,
+                vlane_split_axis=vlane_split_axis,
                 vlane_stride=1
             )
             input_tile_desc.set_tile_size(full_tile_sizes)
@@ -239,7 +260,7 @@ def _build_tile_descriptors(
             output_tile_desc = mlir_common.MLIRMultiDimTile(
                 full_tile_sizes,
                 kernel.vector_lane,
-                vlane_split_axis=rank - 1,
+                vlane_split_axis=vlane_split_axis,
                 vlane_stride=1
             )
             output_tile_desc.set_tile_size(full_tile_sizes)
@@ -255,9 +276,12 @@ def _build_tile_descriptors(
         return input_tile_descs, output_tile_descs, unique_tile_descs
 
     def _build_index_expressions(
-        self, input_nodes, input_sizes, output_strides, rank, num_inputs
+        self, input_nodes, input_sizes, output_strides, rank, num_inputs, excluded_dims=None
     ):
         """Build index expressions for input and output."""
+        if excluded_dims is None:
+            excluded_dims = set()
+
         input_idxs = []
         output_idxs = []
         cumulative_offsets = [0]
@@ -274,15 +298,21 @@ def _build_index_expressions(
 
             input_idx = []
             output_idx = []
+            tile_size_idx = 0
             for d in range(rank):
                 if d != self.dim:
-                    input_idx_symbol = sympy.Symbol(f"index{d}")
-                    output_idx_symbol = sympy.Symbol(f"index{d}")
+                    # Skip excluded dimensions
+                    if tile_size_idx not in excluded_dims:
+                        input_idx_symbol = sympy.Symbol(f"index{d}")
+                        output_idx_symbol = sympy.Symbol(f"index{d}")
+                        input_idx.append(input_idx_symbol * x_stride[d])
+                        output_idx.append(output_idx_symbol * output_strides[d])
+                    tile_size_idx += 1
                 else:
                     input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}")
                     output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}")
-                input_idx.append(input_idx_symbol * x_stride[d])
-                output_idx.append(output_idx_symbol * output_strides[d])
+                    input_idx.append(input_idx_symbol * x_stride[d])
+                    output_idx.append(output_idx_symbol * output_strides[d])
             input_idxs.append(input_idx)
             output_idxs.append(output_idx)
 

From 4e4300e2cda61dcc5eeec103c91fe5ef13ff3a73 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Mar 2026 20:22:10 +0900
Subject: [PATCH 118/194] [Template/Cat] Refactor cat + Support explicit
 dram+stride in def_dma_op

---
 .github/workflows/pytorchsim_test.yml        |  21 +
 PyTorchSimFrontend/mlir/mlir_cat_template.py | 401 ++++++++++---------
 PyTorchSimFrontend/mlir/mlir_template.py     |  48 ++-
 tests/test_cat.py                            |  16 +-
 4 files changed, 288 insertions(+), 198 deletions(-)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 9589384b..eaaa7e50 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -163,6 +163,27 @@ jobs:
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py
 
+  test_cat:
+    name: Run test_cat.py
+    runs-on: self-hosted
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run test_cat.py
+        run: |
+          echo "Running test_cat.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
+            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cat.py
+
   test_matmul:
     name: Run test_matmul.py
     runs-on: self-hosted
diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 2a00ce95..6eb60198 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Set
 import math
 import itertools
 
@@ -23,10 +23,12 @@
 {%- endfor %}
 {%- for i in range(NUM_INPUTS) %}
       // Input tensor{{ i }}
-      affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUT_SIZES[i][DIM] }} step {{ INPUT_TILE_SIZES_DIM[i] }} {
-        %index{{ DIM }}_{{i}} = affine.apply affine_map<(d0) -> (d0 + {{ CUMULATIVE_OFFSETS[i] }})> (%index_local{{ DIM }}_{{ i }})
-        {{ kernel.def_dma_op("MVIN", INPUT_BUFFER_NAMES[i], INPUT_IDXS[i], INPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, OUTPUT_IDXS[i], OUTPUT_TILE_DESCS[i], indent_size=INDENT_SIZE) }}
+      affine.for %index_local{{ DIM }}_{{ i }} = 0 to {{ INPUTS[i].sizes[DIM] }} step {{ INPUTS[i].tile_size_dim }} {
+        %index{{ DIM }}_{{ i }} = affine.apply affine_map<(d0) -> (d0 + {{ INPUTS[i].cum_offset }})> (%index_local{{ DIM }}_{{ i }})
+        %input_dram_offset_{{ i }} = affine.apply {{ INPUTS[i].offset_map }}({{ INPUTS[i].offset_vars }})
+        %output_dram_offset_{{ i }} = affine.apply {{ OUTPUTS[i].offset_map }}({{ OUTPUTS[i].offset_vars }})
+        {{ kernel.def_dma_op("MVIN", INPUTS[i].dram_name, [], INPUTS[i].tile_desc, indent_size=INDENT_SIZE, dram_stride=INPUTS[i].dram_strides, dram_offset="input_dram_offset_" ~ i) }}
+        {{ kernel.def_dma_op("MVOUT", "Y", [], OUTPUTS[i].tile_desc, indent_size=INDENT_SIZE, dram_stride=OUTPUTS[i].dram_strides, dram_offset="output_dram_offset_" ~ i) }}
       } { inner_loop=true }
 {%- endfor %}
 
@@ -52,81 +54,84 @@ def render(
         tile_info=None,
         **kwargs,
     ):
-        # Extract info
         input_nodes = self.input_nodes
         y = self.output_node
-        num_inputs = len(self.input_nodes)
+        num_inputs = len(input_nodes)
         rank = len(y.get_size())
 
         input_sizes = [x.get_size() for x in input_nodes]
-        output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim]
-        output_dim = [dim for dim, sz in enumerate(y.get_size()) if dim != self.dim]
-        tile_sizes = tile_info if tile_info is not None else [1] * len(output_sizes)
+        output_sizes = [sz for d, sz in enumerate(y.get_size()) if d != self.dim]
+        output_dim   = [d  for d, _ in enumerate(y.get_size()) if d != self.dim]
         output_strides = y.get_layout().stride
 
-        excluded_dims = list()
-        max_tiled_dims = 4 - 1
-        if len(tile_sizes) > max_tiled_dims:
-            # Create index:tile_size dictionary and sort by tile_size
-            dim_tile_dict = {idx: sz for idx, sz in enumerate(tile_sizes)}
-            sorted_dims = sorted(dim_tile_dict.items(), key=lambda x: x[1], reverse=True)
-            # Keep top 4 dimensions, exclude the rest
-            excluded_dims = [idx for idx, _ in sorted_dims[max_tiled_dims:]]
-            for idx in excluded_dims:
-                tile_sizes[idx] = 1
-
-        # Calculate input tile sizes
+        tile_sizes = list(tile_info) if tile_info is not None else [1] * len(output_sizes)
+        excluded_dims = self._compute_excluded_dims(tile_sizes)
+
         input_tile_sizes_dim = self._calculate_input_tile_sizes(
             kernel, input_sizes, tile_sizes, num_inputs, rank
         )
-        buffer_name_to_template_name, input_buffer_names = self._build_buffer_mapping(input_nodes)
+        buffer_name_to_template_name, input_dram_names = self._build_buffer_mapping(input_nodes)
         input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors(
-            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, y,
-            excluded_dims=excluded_dims
+            kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank,
+            input_dram_names, y, excluded_dims=excluded_dims
         )
-
-        input_idxs, output_idxs, cumulative_offsets = self._build_index_expressions(
-            input_nodes, input_sizes, output_strides, rank, num_inputs,
-            excluded_dims=excluded_dims
+        (input_offset_maps, input_offset_var_strs, input_dram_strides,
+         output_offset_maps, output_offset_var_strs, output_dram_strides,
+         cumulative_offsets) = self._build_dma_info(
+            input_nodes, input_sizes, output_strides, input_tile_descs, output_tile_descs,
+            rank, num_inputs, excluded_dims=excluded_dims
         )
 
-        # Map unique buffer names to their tile descriptors for template
-        unique_buffer_tile_descs = {}
-        for actual_name, template_name in buffer_name_to_template_name.items():
-            if actual_name in unique_tile_descs:
-                unique_buffer_tile_descs[template_name] = unique_tile_descs[actual_name]
-
-        names_str = ", ".join(input_buffer_names + ["Y"])
+        unique_buffer_tile_descs = {
+            buffer_name_to_template_name[name]: desc
+            for name, desc in unique_tile_descs.items()
+        }
+        names_str = ", ".join(input_dram_names + ["Y"])
         indent_size = 2 + (rank - 1) * 2 + 4
 
+        inputs_info = [
+            dict(
+                dram_name    = input_dram_names[i],
+                sizes        = input_sizes[i],
+                tile_size_dim= input_tile_sizes_dim[i],
+                tile_desc    = input_tile_descs[i],
+                offset_map   = input_offset_maps[i],
+                offset_vars  = input_offset_var_strs[i],
+                dram_strides = input_dram_strides[i],
+                cum_offset   = cumulative_offsets[i],
+            )
+            for i in range(num_inputs)
+        ]
+        outputs_info = [
+            dict(
+                tile_desc    = output_tile_descs[i],
+                offset_map   = output_offset_maps[i],
+                offset_vars  = output_offset_var_strs[i],
+                dram_strides = output_dram_strides[i],
+            )
+            for i in range(num_inputs)
+        ]
+
         kernel.render_options = dict(
-            KERNEL_NAME=self.name,
-            kernel=kernel,
-            Y=y,
-            OUT_DVAR="Y",
-            NAMES_STR=names_str,
-            INPUT_NAMES=input_nodes,
-            INPUT_BUFFER_NAMES=input_buffer_names,
-            NUM_INPUTS=num_inputs,
-            RANK=rank,
-            DIM=self.dim,
-            INPUT_SIZES=input_sizes,
-            OUTPUT_SIZES=output_sizes,
-            OUTPUT_DIM=output_dim,
-            TILE_SIZES=tile_sizes,
-            INPUT_TILE_SIZES_DIM=input_tile_sizes_dim,
-            INPUT_TILE_DESCS=input_tile_descs,
-            OUTPUT_TILE_DESCS=output_tile_descs,
-            UNIQUE_BUFFER_TILE_DESCS=unique_buffer_tile_descs,
-            INPUT_IDXS=input_idxs,
-            OUTPUT_IDXS=output_idxs,
-            CUMULATIVE_OFFSETS=cumulative_offsets,
-            INDENT_SIZE=indent_size,
-            input_reorder=self.input_reorder,
+            KERNEL_NAME           = self.name,
+            kernel                = kernel,
+            NUM_INPUTS            = num_inputs,
+            NAMES_STR             = names_str,
+            Y                     = y,
+            INPUT_NAMES           = input_nodes,
+            RANK                  = rank,
+            DIM                   = self.dim,
+            OUTPUT_SIZES          = output_sizes,
+            OUTPUT_DIM            = output_dim,
+            TILE_SIZES            = tile_sizes,
+            UNIQUE_BUFFER_TILE_DESCS = unique_buffer_tile_descs,
+            INPUTS                = inputs_info,
+            OUTPUTS               = outputs_info,
+            INDENT_SIZE           = indent_size,
+            input_reorder         = self.input_reorder,
         )
 
-        code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
-        return code
+        return self._template_from_string(TEMPLATE).render(**kernel.render_options)
 
     def get_tile_candidates(
         self,
@@ -141,179 +146,217 @@ def get_tile_candidates(
 
         y = self.output_node
         num_inputs = len(self.input_nodes)
-        output_sizes = [sz for dim, sz in enumerate(y.get_size()) if dim != self.dim]
-        num_non_dim_dims = len(output_sizes)
+        output_sizes = [sz for d, sz in enumerate(y.get_size()) if d != self.dim]
 
-        if num_non_dim_dims == 0:
+        if not output_sizes:
             return [[1]]
 
-        tile_candidates = []
-        dim_tile_candidates = []
+        max_tile_total = kernel.spad_info["spad_size"] // (
+            kernel.vector_lane * kernel.precision * 2 * num_inputs
+        )
 
+        dim_tile_candidates = []
         for dim_size in output_sizes:
-            dim_candidates = []
-            max_tile = min(dim_size, kernel.spad_info["spad_size"] // (kernel.vector_lane * kernel.precision * 2 * num_inputs))
-
+            max_tile = min(dim_size, max_tile_total)
+            candidates = set()
             for mult in range(1, max_tile // kernel.vector_lane + 1):
-                tile = mult * kernel.vector_lane
-                if tile <= dim_size:
-                    dim_candidates.append(tile)
-
+                t = mult * kernel.vector_lane
+                if t <= dim_size:
+                    candidates.add(t)
             if max_tile > 0:
                 for exp in range(int(math.log2(max_tile)) + 1):
-                    tile = 2 ** exp
-                    if tile <= dim_size and tile not in dim_candidates:
-                        dim_candidates.append(tile)
-
-            if dim_size not in dim_candidates:
-                dim_candidates.append(dim_size)
-
-            dim_tile_candidates.append(sorted(set(dim_candidates))[:5])
-
-        for tile_combo in itertools.product(*dim_tile_candidates):
-            total_elements = math.prod(tile_combo)
-            total_spad_needed = total_elements * (num_inputs + 1) * kernel.precision
-
-            if total_spad_needed <= kernel.spad_info["spad_size"] * kernel.vector_lane:
-                tile_candidates.append(list(tile_combo))
+                    t = 2 ** exp
+                    if t <= dim_size:
+                        candidates.add(t)
+            candidates.add(dim_size)
+            dim_tile_candidates.append(sorted(candidates)[:5])
+
+        tile_candidates = [
+            list(combo)
+            for combo in itertools.product(*dim_tile_candidates)
+            if math.prod(combo) * (num_inputs + 1) * kernel.precision
+               <= kernel.spad_info["spad_size"] * kernel.vector_lane
+        ]
 
         if not tile_candidates:
-            tile_candidates = [[1] * num_non_dim_dims]
+            tile_candidates = [[1] * len(output_sizes)]
 
         tile_candidates.sort(key=lambda x: -math.prod(x))
         return tile_candidates[:4]
 
-    def _calculate_input_tile_sizes(
-        self, kernel, input_sizes, tile_sizes, num_inputs, rank
-    ):
-        """Calculate tile sizes for concat dimension for each input."""
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _compute_excluded_dims(self, tile_sizes: list) -> list:
+        """Return non-tiled dimension indices when rank exceeds the 4-dim limit."""
+        max_tiled = 3
+        if len(tile_sizes) <= max_tiled:
+            return []
+        sorted_dims = sorted(enumerate(tile_sizes), key=lambda x: x[1], reverse=True)
+        excluded = [idx for idx, _ in sorted_dims[max_tiled:]]
+        for idx in excluded:
+            tile_sizes[idx] = 1
+        return excluded
+
+    def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank):
+        """Calculate tile sizes along the concat dimension for each input."""
         non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1
-        non_dim_tile_spad = non_dim_tile_elements * kernel.precision
         max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2
-        extra_concat_input = math.ceil(max_spad_per_input / non_dim_tile_spad) - num_inputs
+        extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * kernel.precision)) - num_inputs
 
         input_tile_sizes_dim = []
         for i in range(num_inputs):
-            input_dim_size = input_sizes[i][self.dim]
-            if extra_concat_input > 0 and non_dim_tile_elements > 0:
-                max_tile_dim = min(input_dim_size, extra_concat_input)
-                extra_concat_input -= max_tile_dim
+            if extra_concat > 0 and non_dim_tile_elements > 0:
+                tile_dim = min(input_sizes[i][self.dim], extra_concat)
+                extra_concat -= tile_dim
             else:
-                max_tile_dim = 1
-            input_tile_sizes_dim.append(max_tile_dim)
+                tile_dim = 1
+            input_tile_sizes_dim.append(tile_dim)
         return input_tile_sizes_dim
 
     def _build_buffer_mapping(self, input_nodes):
-        """Map actual buffer names to template buffer names """
-        buffer_name_to_template_name = {}
-        input_buffer_names = []
+        """Map actual buffer names to short template names (X0, X1, ...)."""
+        name_map = {}
+        template_names = []
         for x in input_nodes:
-            actual_name = x.get_name()
-            template_name = buffer_name_to_template_name.setdefault(
-                actual_name, f"X{len(buffer_name_to_template_name)}"
-            )
-            input_buffer_names.append(template_name)
-        return buffer_name_to_template_name, input_buffer_names
+            actual = x.get_name()
+            template = name_map.setdefault(actual, f"X{len(name_map)}")
+            template_names.append(template)
+        return name_map, template_names
 
     def _build_tile_descriptors(
-        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank, input_buffer_names, output_node, excluded_dims=None
+        self, kernel, input_nodes, input_sizes, input_tile_sizes_dim, tile_sizes, rank,
+        input_buffer_names, output_node, excluded_dims=None
     ):
-        """Build tile descriptors for each input and output."""
+        """Build tile descriptors for every input (and its paired output)."""
         if excluded_dims is None:
             excluded_dims = set()
 
-        input_tile_descs = []
-        output_tile_descs = []
-        unique_tile_descs = {}
+        def make_tile_desc(tile_sz, vector_lane, name, offset):
+            desc = mlir_common.MLIRMultiDimTile(
+                tile_sz, vector_lane,
+                vlane_split_axis=len(tile_sz) - 1,
+                vlane_stride=1
+            )
+            desc.set_tile_size(tile_sz)
+            desc.set_name(name)
+            desc.offset = offset
+            return desc
+
         output_offset = output_node.get_layout().offset
+        input_tile_descs, output_tile_descs, unique_tile_descs = [], [], {}
 
         for i, x in enumerate(input_nodes):
-            x_offset = x.get_layout().offset
-            full_tile_sizes = []
-            tile_size_idx = 0
+            # Collect tile sizes for tiled dimensions only (skip excluded non-concat dims)
+            tile_sz = []
+            tile_idx = 0
             for d in range(rank):
                 if d != self.dim:
-                    # Skip excluded dimensions
-                    if tile_size_idx not in excluded_dims:
-                        full_tile_sizes.append(tile_sizes[tile_size_idx])
-                    tile_size_idx += 1
+                    if tile_idx not in excluded_dims:
+                        tile_sz.append(tile_sizes[tile_idx])
+                    tile_idx += 1
                 else:
-                    full_tile_sizes.append(input_tile_sizes_dim[i])
+                    tile_sz.append(input_tile_sizes_dim[i])
 
-            # Calculate vlane_split_axis for reduced dimensions
-            vlane_split_axis = len(full_tile_sizes) - 1
+            sram_name = f"{input_buffer_names[i].lower()}_cat_tile"
+            input_tile_descs.append(make_tile_desc(tile_sz, kernel.vector_lane, sram_name, x.get_layout().offset))
+            output_tile_descs.append(make_tile_desc(tile_sz, kernel.vector_lane, sram_name, output_offset))
 
-            # Input tile descriptor
-            input_tile_desc = mlir_common.MLIRMultiDimTile(
-                full_tile_sizes,
-                kernel.vector_lane,
-                vlane_split_axis=vlane_split_axis,
-                vlane_stride=1
-            )
-            input_tile_desc.set_tile_size(full_tile_sizes)
-            template_buffer_name = input_buffer_names[i]
-            input_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
-            input_tile_desc.offset = x_offset
-            input_tile_descs.append(input_tile_desc)
-
-            # Output tile descriptor (same as input but with output offset)
-            output_tile_desc = mlir_common.MLIRMultiDimTile(
-                full_tile_sizes,
-                kernel.vector_lane,
-                vlane_split_axis=vlane_split_axis,
-                vlane_stride=1
-            )
-            output_tile_desc.set_tile_size(full_tile_sizes)
-            output_tile_desc.set_name(f"{template_buffer_name.lower()}_cat_tile")
-            output_tile_desc.offset = output_offset
-            output_tile_descs.append(output_tile_desc)
-
-            # Store unique tile desc by actual buffer name
             actual_name = x.get_name()
             if actual_name not in unique_tile_descs:
-                unique_tile_descs[actual_name] = input_tile_desc
+                unique_tile_descs[actual_name] = input_tile_descs[-1]
 
         return input_tile_descs, output_tile_descs, unique_tile_descs
 
-    def _build_index_expressions(
-        self, input_nodes, input_sizes, output_strides, rank, num_inputs, excluded_dims=None
+    def _build_dma_info(
+        self, input_nodes, input_sizes, output_strides,
+        input_tile_descs, output_tile_descs,
+        rank, num_inputs, excluded_dims=None
     ):
-        """Build index expressions for input and output."""
+        """Build per-input DRAM offset affine maps and tile strides.
+
+        Three stride concepts are maintained:
+
+        * layout_strides (internal) - raw DRAM buffer strides for every rank
+          dimension, used to compute the flat base-address affine map.
+          These reflect how the tensor is physically laid out in DRAM.
+        * dram_strides (returned,  ``def_dma_op dram_stride=``) - stride in
+          DRAM per *tiled* dimension (excluded dims removed). The DMA engine
+          uses these to walk DRAM when loading/storing a tile.
+        * sram_strides (inside ``def_dma_op``, from tile_desc) - stride in
+          SRAM per tiled dimension. The DMA engine uses these to place data
+          into the SRAM tile buffer.
+
+        Returns:
+            input_offset_maps, input_offset_var_strs, input_dram_strides,
+            output_offset_maps, output_offset_var_strs, output_dram_strides,
+            cumulative_offsets
+        """
         if excluded_dims is None:
             excluded_dims = set()
 
-        input_idxs = []
-        output_idxs = []
+        def make_affine_map(idx_syms, strides, layout_offset):
+            terms = []
+            for j, s in enumerate(strides):
+                s = int(s)
+                if s == 1:
+                    terms.append(f"d{j}")
+                elif s != 0:
+                    terms.append(f"d{j} * {s}")
+            try:
+                off = int(layout_offset)
+            except (TypeError, ValueError):
+                off = 0
+            if off:
+                terms.append(str(off))
+            dim_str = ", ".join(f"d{j}" for j in range(len(idx_syms)))
+            return f"affine_map<({dim_str}) -> ({' + '.join(terms) if terms else '0'})>"
+
         cumulative_offsets = [0]
         for i in range(num_inputs - 1):
             cumulative_offsets.append(cumulative_offsets[-1] + input_sizes[i][self.dim])
 
+        input_offset_maps, input_offset_var_strs, input_dram_strides = [], [], []
+        output_offset_maps, output_offset_var_strs, output_dram_strides = [], [], []
+
         for i, x in enumerate(input_nodes):
             x_stride = x.get_layout().stride
-            x_offset = x.get_layout().offset
             if hasattr(x, 'data') and hasattr(x.data, 'dims'):
-                # In case of PermuteView, the stride is permuted
-                perm_dims = x.data.dims
-                x_stride = [x_stride[perm_dims[d]] for d in range(rank)]
+                # PermuteView: re-order strides according to the permutation
+                perm = x.data.dims
+                x_stride = [x_stride[perm[d]] for d in range(rank)]
+
+            in_syms, in_layout_strides, in_dram_strides = [], [], []
+            out_syms, out_layout_strides, out_dram_strides = [], [], []
+            tile_idx = 0
 
-            input_idx = []
-            output_idx = []
-            tile_size_idx = 0
             for d in range(rank):
                 if d != self.dim:
-                    # Skip excluded dimensions
-                    if tile_size_idx not in excluded_dims:
-                        input_idx_symbol = sympy.Symbol(f"index{d}")
-                        output_idx_symbol = sympy.Symbol(f"index{d}")
-                        input_idx.append(input_idx_symbol * x_stride[d])
-                        output_idx.append(output_idx_symbol * output_strides[d])
-                    tile_size_idx += 1
+                    in_syms.append(sympy.Symbol(f"index{d}"))
+                    in_layout_strides.append(int(x_stride[d]))
+                    out_syms.append(sympy.Symbol(f"index{d}"))
+                    out_layout_strides.append(int(output_strides[d]))
+                    if tile_idx not in excluded_dims:
+                        in_dram_strides.append(int(x_stride[d]))
+                        out_dram_strides.append(int(output_strides[d]))
+                    tile_idx += 1
                 else:
-                    input_idx_symbol = sympy.Symbol(f"index_local{self.dim}_{i}")
-                    output_idx_symbol = sympy.Symbol(f"index{self.dim}_{i}")
-                    input_idx.append(input_idx_symbol * x_stride[d])
-                    output_idx.append(output_idx_symbol * output_strides[d])
-            input_idxs.append(input_idx)
-            output_idxs.append(output_idx)
-
-        return input_idxs, output_idxs, cumulative_offsets
+                    in_syms.append(sympy.Symbol(f"index_local{self.dim}_{i}"))
+                    in_layout_strides.append(int(x_stride[d]))
+                    out_syms.append(sympy.Symbol(f"index{self.dim}_{i}"))
+                    out_layout_strides.append(int(output_strides[d]))
+                    in_dram_strides.append(int(x_stride[d]))
+                    out_dram_strides.append(int(output_strides[d]))
+
+            input_offset_maps.append(make_affine_map(in_syms, in_layout_strides, input_tile_descs[i].offset))
+            input_offset_var_strs.append(", ".join(f"%{s}" for s in in_syms))
+            input_dram_strides.append(in_dram_strides)
+
+            output_offset_maps.append(make_affine_map(out_syms, out_layout_strides, output_tile_descs[i].offset))
+            output_offset_var_strs.append(", ".join(f"%{s}" for s in out_syms))
+            output_dram_strides.append(out_dram_strides)
+
+        return (input_offset_maps, input_offset_var_strs, input_dram_strides,
+                output_offset_maps, output_offset_var_strs, output_dram_strides,
+                cumulative_offsets)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 7c52bfe6..9cc79e0a 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -809,12 +809,18 @@ def hook():
         return key
 
     def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile,
-                   subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True):
+                   subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True,
+                   dram_stride:list=None, dram_offset=None):
+        # Todo. Remove legacy behavior (i.e., index_list parsing)
         def generate_dma_code():
             """Internal method to generate DMA code directly."""
             local_code = IndentedBuffer()
             with self, self.override_buffer_cse(buffer=local_code, cse=self.apply_cse):
-                index_var = self.parse_index_list(index_list, offset=tile_desc.offset)
+                if dram_offset is not None:
+                    # Use explicitly provided offset (pre-computed MLIR SSA variable name)
+                    index_var = dram_offset
+                else:
+                    index_var = self.parse_index_list(index_list, offset=tile_desc.offset)
                 node_layout = self.named_nodes[dram_var].get_layout()
                 if dram_var in self.exception_nodes:
                     numel = self.exception_nodes[dram_var]["numel"]
@@ -822,27 +828,33 @@ def generate_dma_code():
                     numel = self.get_arg_info(self.named_nodes[dram_var].get_name()).get_numel()
                 mlir_dtype = mlir_common.DTYPE_TO_MLIR[node_layout.dtype]
                 dram_shape = f"memref<{numel}x{mlir_dtype}>"
-                dram_stride = []
-                for idx in index_list:
-                    if idx.is_Mul:
-                        dram_stride.append(int(idx.args[0]))
-                    elif idx == sympy.Symbol("c0"):
-                        dram_stride.append(0)
-                    elif not idx.is_Number:
-                        dram_stride.append(1)
-                    else:
-                        dram_stride.append(0)
 
-                    sram_var = tile_desc.get_name()
-                    tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
-                    tile_stride = tile_desc.get_tile_stride()
-                    vlane_split_axis = tile_desc.vmap.vlane_split_axis
-                    vlane_stride = tile_desc.vmap.vlane_stride
+                if dram_stride is not None:
+                    # Use explicitly provided dram_stride
+                    _dram_stride = dram_stride
+                else:
+                    # Extract dram_stride from index_list (legacy behavior)
+                    _dram_stride = []
+                    for idx in index_list:
+                        if idx.is_Mul:
+                            _dram_stride.append(int(idx.args[0]))
+                        elif idx == sympy.Symbol("c0"):
+                            _dram_stride.append(0)
+                        elif not idx.is_Number:
+                            _dram_stride.append(1)
+                        else:
+                            _dram_stride.append(0)
+
+                sram_var = tile_desc.get_name()
+                tile_shape = tile_desc.get_mlir_shape(mlir_dtype)
+                sram_strides = tile_desc.get_tile_stride()
+                vlane_split_axis = tile_desc.vmap.vlane_split_axis
+                vlane_stride = tile_desc.vmap.vlane_stride
 
                 zero_cse = self.get_const_cse(0, "index")
                 sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
 
-                attribute_parts = [f"dram_stride={dram_stride}", f"sram_stride={tile_stride}", "padding=0"]
+                attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", "padding=0"]
                 if subtile_size:
                     attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
                 attribute = "  {" + ", ".join(attribute_parts) + "}"
diff --git a/tests/test_cat.py b/tests/test_cat.py
index 62de6759..97fcc754 100644
--- a/tests/test_cat.py
+++ b/tests/test_cat.py
@@ -150,13 +150,25 @@ def cat_4d_three_inputs_fn(a, b, c):
     cpu_out = torch.cat([x.cpu(), y.cpu(), z.cpu()], dim=1)
     _test_result("cat.4d.three_inputs", out, cpu_out, rtol=1e-4, atol=1e-4)
 
+def test_cat_5d(device, dim=0):
+    def cat_5d_fn(a, b):
+        return torch.cat([a, b], dim=dim)
+
+    x = torch.randn(2, 3, 4, 5, 6, device=device)
+    y = torch.randn(3, 3, 4, 5, 6, device=device)
+    opt_fn = torch.compile(dynamic=False)(cat_5d_fn)
+
+    out = opt_fn(x, y)
+
+    cpu_out = torch.cat([x.cpu(), y.cpu()], dim=dim)
+    _test_result("cat.5d.dim0", out, cpu_out, rtol=1e-4, atol=1e-4)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run cat simulation tests")
     parser.add_argument(
         "--case",
         choices=[
-            "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3",
+            "default", "out", "4d_dim0", "4d_dim1", "4d_dim2", "4d_dim3", "5d"
             "three_inputs", "four_inputs", "4d_three_inputs", "all"
         ],
         default="all",
@@ -184,3 +196,5 @@ def cat_4d_three_inputs_fn(a, b, c):
         test_cat_four_inputs(device)
     if args.case in ("4d_three_inputs", "all"):
         test_cat_4d_three_inputs(device)
+    if args.case in ("5d", "all"):
+        test_cat_5d(device)

From 3d9cb387b2ba27853efb983241fa4450c3174d9d Mon Sep 17 00:00:00 2001
From: jung-min <wjdals020503@naver.com>
Date: Thu, 5 Mar 2026 11:45:36 +0000
Subject: [PATCH 119/194] [Frontend/template] Connect SDPA template to NPU
 using Torch OpenReg

---
 PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp   |  34 +---
 PyTorchSimDevice/csrc/aten/native/Extra.cpp   |  51 +----
 .../torch_openreg/openreg/__init__.py         |   4 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  14 +-
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 186 +-----------------
 5 files changed, 14 insertions(+), 275 deletions(-)

diff --git a/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp
index 04ba6d48..f048f878 100644
--- a/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp
+++ b/PyTorchSimDevice/csrc/aten/OpenRegExtra.cpp
@@ -2,6 +2,7 @@
 
 #include <ATen/native/CPUFallback.h>
 #include <ATen/native/DispatchStub.h>
+#include <ATen/native/transformers/attention.h>
 
 #include <torch/csrc/autograd/autograd_not_implemented_fallback.h>
 #include <torch/library.h>
@@ -40,36 +41,6 @@ void wrapper_quantize_tensor_per_tensor_affine_stub(
       rtensor, qtensor, scale, zero_point);
 }
 
-std::tuple<
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    c10::SymInt,
-    c10::SymInt,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor>
-wrapper__scaled_dot_product_fused_attention_overrideable(
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const std::optional<at::Tensor>& attn_bias,
-    double dropout_p,
-    bool is_causal,
-    bool return_debug_mask,
-    std::optional<double> scale) {
-  return at::native::openreg::_scaled_dot_product_fused_attention_overrideable(
-      query,
-      key,
-      value,
-      attn_bias,
-      dropout_p,
-      is_causal,
-      return_debug_mask,
-      scale);
-}
-
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 wrapper_scaled_dot_product_fused_attention_overrideable_backward(
     const at::Tensor& grad_out,
@@ -172,9 +143,6 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("abs.out", &wrapper_abs_out);
   m.impl("quantize_per_tensor", &wrapper_quantize_per_tensor);
   m.impl("_fused_sdp_choice", &wrapper__fused_sdp_choice);
-  m.impl(
-      "_scaled_dot_product_fused_attention_overrideable",
-      &wrapper__scaled_dot_product_fused_attention_overrideable);
   m.impl(
       "_scaled_dot_product_fused_attention_overrideable_backward",
       &wrapper_scaled_dot_product_fused_attention_overrideable_backward);
diff --git a/PyTorchSimDevice/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp
index 711d114c..aaf28e1a 100644
--- a/PyTorchSimDevice/csrc/aten/native/Extra.cpp
+++ b/PyTorchSimDevice/csrc/aten/native/Extra.cpp
@@ -19,7 +19,8 @@ int64_t _fused_sdp_choice(
     bool is_causal,
     std::optional<double> scale,
     bool enable_gqa) {
-  auto backend = sdp::SDPBackend::math;
+
+  auto backend = sdp::SDPBackend::overrideable;
   return static_cast<int64_t>(backend);
 }
 
@@ -29,54 +30,6 @@ void quantize_tensor_per_tensor_affine_stub(
     double scale,
     int64_t zero_point) {}
 
-std::tuple<
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    c10::SymInt,
-    c10::SymInt,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor>
-_scaled_dot_product_fused_attention_overrideable(
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const std::optional<at::Tensor>& attn_bias,
-    double dropout_p,
-    bool is_causal,
-    bool return_debug_mask,
-    std::optional<double> scale) {
-  const int64_t batch_size = query.size(0);
-  const int64_t num_heads = query.size(1);
-  const int64_t head_dim_v = value.size(3);
-  const int64_t max_seqlen_q = query.size(2);
-  const int64_t max_seqlen_kv = key.size(2);
-
-  auto opts = query.options();
-  auto output =
-      at::empty({batch_size, num_heads, max_seqlen_q, head_dim_v}, opts);
-  auto logsumexp =
-      at::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat));
-  auto debug_attn_mask = at::empty(
-      {batch_size, num_heads, max_seqlen_q, max_seqlen_kv},
-      opts.dtype(at::kFloat));
-  auto philox_seed = at::empty({}, at::dtype(at::kLong));
-  auto philox_offset = at::empty({}, at::dtype(at::kLong));
-
-  return std::make_tuple(
-      output,
-      logsumexp,
-      at::Tensor(),
-      at::Tensor(),
-      max_seqlen_q,
-      max_seqlen_kv,
-      philox_seed,
-      philox_offset,
-      debug_attn_mask);
-}
-
 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
 _scaled_dot_product_fused_attention_overrideable_backward(
     const at::Tensor& grad_out,
diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index 5a0de6c3..9d10f90e 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -66,8 +66,8 @@ def _lazy_init():
         return
 
     # Replace the global C++ binding with our custom dispatcher patch
-    from PyTorchSimFrontend.mlir.mlir_sdpa_template import patched_scaled_dot_product_attention
-    torch._C._nn.scaled_dot_product_attention = patched_scaled_dot_product_attention
+    # from PyTorchSimFrontend.mlir.mlir_sdpa_template import patched_scaled_dot_product_attention
+    # torch._C._nn.scaled_dot_product_attention = patched_scaled_dot_product_attention
     
     torch_openreg._C._init()
     register_interface_for_device(custom_device(), ExtensionDeviceInterface)
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index e09dcf57..a6b2478c 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -15,7 +15,7 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
-from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args
+from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args, calculate_scale
 from PyTorchSimFrontend import extension_config
 
 aten = torch.ops.aten
@@ -44,14 +44,16 @@ def tuned_flash_sdpa(
         query             : TensorBox, 
         key               : TensorBox, 
         value             : TensorBox, 
-        scale             : float, 
+        attn_bias         : Optional[TensorBox] = None,
         dropout_p         : float = 0.0, 
         is_causal         : bool = False, 
-        return_debug_mask : bool =False) -> tuple: 
+        return_debug_mask : bool = False,
+        scale             : Optional[float] = None) -> tuple: 
     
-    print("Enter tuned_flash_sdpa")
-
+    
+    scale = calculate_scale(query, scale)
     N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value)
+    
     mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale)
 
     # _scaled_dot_product_flash_attention has to return a tuple which has 9 values
@@ -211,4 +213,4 @@ def custom_unsafe_index(x, indices):
 if extension_config.CONFIG_USE_TIMING_POOLING:
     lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
 
-lowerings.update({getattr(aten._scaled_dot_product_flash_attention, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_flash_attention.overloads()})
\ No newline at end of file
+lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()})
\ No newline at end of file
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index 49c6c6bb..05030f27 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -73,121 +73,6 @@ def flash_sdpa_args(
     )
 
     return [n, hq, h, l, s, e, ev, layout, query, key, value]    
-    
-def validate_sdpa_input(
-        query       : torch.Tensor,
-        key         : torch.Tensor,
-        value       : torch.Tensor,
-        attn_mask   : torch.Tensor = None,
-        dropout_p   : float = 0.0, 
-        is_casual   : bool = False,
-        scale       : float = None,
-        enable_gqa  : bool = False) -> None:
-    """
-    Validates input tensors and parameters for Scaled Dot Product Attention (SDPA).
-    This function's logic can be found in:
-    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp(504 line)
-    https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-    """
-
-    # Tensor class, dtype, and device consistency
-    # Ensure all primary inputs are torch.Tensors
-    if not all(isinstance(t, torch.Tensor) for t in [query, key, value]):
-        raise TypeError(
-            f"Expected query, key and value to be Tensors, but got "
-            f"{type(query).__name__}, {type(key).__name__}, and {type(value).__name__}."
-        )
-
-    # Check for dtype mismatch
-    if query.dtype != key.dtype or query.dtype != value.dtype:
-        raise TypeError(
-            f"Expected query, key, and value to have the same dtype, "
-            f"but got {query.dtype}, {key.dtype}, and {value.dtype}."
-        )
-    
-    # Check for device mismatch (e.g., mixing CPU and NPU)
-    if query.device != key.device or query.device != value.device:
-        raise ValueError(
-            f"Expected query, key, and value to be on the same device, "
-            f"but got {query.device}, {key.device}, and {value.device}."
-        )
-
-    # Shape and dimension validation
-    # SDPA typically expects 4D (B, H, S, D), but we check for at least 2D here
-    if any(t.dim() < 2 for t in [query, key, value]):
-        raise ValueError(
-            f"Expected query, key, and value to be at least 2D, "
-            f"but got Q:{query.dim()}D, K:{key.dim()}D, V:{value.dim()}D."
-        )
-
-    # Attention mask validation
-    if attn_mask is not None:
-        if not isinstance(attn_mask, torch.Tensor):
-            raise TypeError(f"Expected attn_mask to be a Tensor, but got {type(attn_mask).__name__}.")
-        
-        # Dtype check: floating point masks must match query dtype; bool masks are also allowed
-        if attn_mask.dtype.is_floating_point:
-            if attn_mask.dtype != query.dtype:
-                raise TypeError(f"Floating point attn_mask must match query dtype ({query.dtype}), but got {attn_mask.dtype}.")
-        elif attn_mask.dtype != torch.bool:
-            raise TypeError(f"attn_mask must be floating point or bool, but got {attn_mask.dtype}.")
-
-        # Nested tensor limitation with explicit masking
-        if query.is_nested or key.is_nested:
-            raise ValueError("Nested tensors are not supported when an explicit attn_mask is set.")
-
-    # Dropout and causal flag validation (added)
-    # Dropout probability must be in the range [0, 1)
-    if not (0.0 <= dropout_p < 1.0):
-        raise ValueError(f"Expected dropout_p to be in [0, 1), but got {dropout_p}.")
-
-    # Mutual exclusivity: cannot use both explicit mask and causal flag (added)
-    if is_casual and attn_mask is not None:
-        raise ValueError("Both attn_mask and is_casual cannot be set at the same time.")
-
-    # Scaling factor validation (added)
-    if scale is not None and scale <= 0.0:
-        raise ValueError(f"Expected scale to be a positive number, but got {scale}.")
-    
-    # GQA (Grouped Query Attention) constraints (added)
-    n_head_q = query.size(1)
-    n_head_k = key.size(1)
-    n_head_v = value.size(1)
-    
-    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
-    # Instead, the Flash SDPA implementation infers GQA usage by checking if n_head_q != n_head_k.
-    if not enable_gqa and n_head_q != n_head_k:
-        raise ValueError(f"Query and Key must have the same number of heads when enable_gqa is false (Q:{n_head_q} vs K:{n_head_k}).")
-
-    if enable_gqa:
-        if n_head_q == n_head_k:
-            raise ValueError(f"enable_gqa Query and Key ")
-
-        if n_head_k != n_head_v:
-            raise ValueError(f"Key and Value must have the same number of heads (K:{n_head_k} vs V:{n_head_v}).")
-        
-        # Query heads must be an integer multiple of key heads for grouping
-        if n_head_q % n_head_k != 0:
-            raise ValueError(
-                f"Number of query heads ({n_head_q}) must be divisible by "
-                f"number of key heads ({n_head_k}) for GQA."
-            )
-
-def convert_boolean_attn_mask(attn_mask: torch.Tensor, target_dtype: torch.dtype) -> float:
-    """
-    Equivalent to the C++ 'convert_boolean_attn_mask' function.
-    Converts a boolean mask to a floating-point mask for SDPA.
-    """
-
-    if attn_mask is not None and attn_mask.dtype == torch.bool:
-      
-        new_mask = torch.zeros_like(attn_mask, dtype=target_dtype)
-        minus_inf = torch.finfo(target_dtype).min
-        new_mask.masked_fill_(attn_mask.logical_not(), minus_inf)
-        
-        return new_mask
-
-    return attn_mask
 
 def calculate_scale(query: torch.Tensor, scale: float) -> float:
     """
@@ -195,79 +80,10 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
     Otherwise, use the provided scale.
     """
     if scale is None:
-        return 1.0 / math.sqrt(query.size(-1))
+        return 1.0 / math.sqrt(query.layout.size[-1])
     else:
         return scale
 
-def patched_scaled_dot_product_attention(
-        query_      : torch.Tensor,
-        key         : torch.Tensor, 
-        value       : torch.Tensor, 
-        dropout_p   : float = 0.0, 
-        is_casual   : bool = False, 
-        attn_mask_  : torch.Tensor = None,
-        scale_       : float = None, 
-        enable_gqa  : bool = None,
-        orig_fn     = torch._C._nn.scaled_dot_product_attention) -> torch.Tensor :
-    """
-    Custom patch for Scaled Dot Product Attention (SDPA) to intercept high-level calls.
-    For NPU devices, it redirects execution to specific ATen kernels based on global flags.
-    For all devices, it maintains parity with the original dispatcher logic found in:
-    https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/transformers/attention.cpp
-
-    This function acts as a custom override that replaces the default PyTorch SDPA implementation, 
-    invoked via 'PyTorchSim/PyTorchSimDevice/torch_openreg/openreg/__init__.py'.
-    """
-
-    # Device-specific Dispatching: redirect to specialized kernels if on NPU
-    if "npu" in str(query_.device):
-        
-        validate_sdpa_input(query_, key, value, attn_mask_, dropout_p, is_casual, scale_, enable_gqa)
-        attn_mask = convert_boolean_attn_mask(attn_mask_, query_.dtype)
-        
-        # Kernel selection logic: emulate C++ dispatcher priority
-        # Selection priority(can be changed): flash attention > memory efficient > math (cuDNN is not supported)
-        aten = torch.ops.aten 
-        scale = calculate_scale(query_, scale_)
-
-        if flash_sdp_enabled(): 
-            # Skip padding query, key and value for alignment.
-            dispatch_kwargs = {
-                "dropout_p" : dropout_p,
-                "is_causal" : is_casual,
-                "return_debug_mask" : False,
-                "scale" : scale
-            }
-            
-            out_lse_softmax = aten._scaled_dot_product_flash_attention(
-                query_, key, value, **dispatch_kwargs 
-            )
-
-            return out_lse_softmax[0]
-        elif mem_efficient_sdp_enabled():
-            # out_and_lse = aten._scaled_dot_product_efficient_attention(...)
-            # return out_and_lse[0]
-            raise NotImplementedError("Memory efficient SDPA is not implemented yet.")
-        else:
-            dispatch_kwargs = {
-                "attn_mask" : attn_mask,
-                "dropout_p" : dropout_p,
-                "is_causal" : is_casual,
-                "dropout_mask" : None,
-                "scale": scale,
-                "enable_gqa" : enable_gqa
-            }
-
-            out_lse_softmax = aten._scaled_dot_product_attention_math(
-                query_,
-                key,
-                value, 
-                **dispatch_kwargs)
-            
-            return out_lse_softmax[0]
-    else: 
-        # Fallback: Delegate to the original C++ Dispatcher for other devices 
-        return orig_fn(query_, key, value)
 
 FLASH_SDPA_TEMPLATE = r"""
 // SDPA kernel

From 591e8a98cdb7a734f58c3e2afff6b252f5b86bee Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 5 Mar 2026 23:16:40 +0900
Subject: [PATCH 120/194] [Templte/Cat] Apply copy operation when node has view

---
 PyTorchSimFrontend/mlir/mlir_cat_template.py | 11 +++-------
 PyTorchSimFrontend/mlir/mlir_lowering.py     | 23 +++++++++++++++++---
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 6eb60198..7bee54ac 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -161,14 +161,14 @@ def get_tile_candidates(
             candidates = set()
             for mult in range(1, max_tile // kernel.vector_lane + 1):
                 t = mult * kernel.vector_lane
-                if t <= dim_size:
+                if t <= dim_size and dim_size % t == 0:
                     candidates.add(t)
             if max_tile > 0:
                 for exp in range(int(math.log2(max_tile)) + 1):
                     t = 2 ** exp
-                    if t <= dim_size:
+                    if t <= dim_size and dim_size % t == 0:
                         candidates.add(t)
-            candidates.add(dim_size)
+            candidates.add(dim_size)  # dim_size always divides itself
             dim_tile_candidates.append(sorted(candidates)[:5])
 
         tile_candidates = [
@@ -322,11 +322,6 @@ def make_affine_map(idx_syms, strides, layout_offset):
 
         for i, x in enumerate(input_nodes):
             x_stride = x.get_layout().stride
-            if hasattr(x, 'data') and hasattr(x.data, 'dims'):
-                # PermuteView: re-order strides according to the permutation
-                perm = x.data.dims
-                x_stride = [x_stride[perm[d]] for d in range(rank)]
-
             in_syms, in_layout_strides, in_dram_strides = [], [], []
             out_syms, out_layout_strides, out_dram_strides = [], [], []
             tile_idx = 0
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index d7aee715..e5df4b78 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Optional, Sequence
 
 import torch
@@ -205,11 +206,27 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout:
 def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
     if tensors and dim < 0:
         dim += len(tensors[0].get_size())
-
+    copy_default_lowering = lowerings.get(aten.copy_.default)
+    empty_strided_lowering = lowerings.get(aten.empty_strided.default)
+    new_tensors = []
     for t in tensors:
         t.realize()
-    layout = _cat_layout(tensors, dim)
-    mlir_template = MLIRCatTemplate(list(tensors), layout, dim=dim)
+        # If the tensor is backed by a view (ReinterpretView, PermuteView, etc.),
+        # materialise it into a fresh contiguous FixedLayout buffer so the cat
+        # kernel always receives plain, dense strides.
+        if isinstance(t.data, ir.BaseView):
+            sizes = list(t.get_size())
+            strides = [math.prod(sizes[i + 1:]) for i in range(len(sizes))]
+            new_buf = empty_strided_lowering(
+                sizes, strides, dtype=t.get_dtype(), device=t.get_device()
+            )
+            tt = copy_default_lowering(new_buf, t)
+        else:
+            tt = t
+        new_tensors.append(tt)
+
+    layout = _cat_layout(new_tensors, dim)
+    mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim)
     return mlir_template.generate().output_node()
 
 def _custom_sort_values_impl(

From dab34954d61d5558658684dcb1415fa75c3c6935 Mon Sep 17 00:00:00 2001
From: jung-min <wjdals020503@naver.com>
Date: Sat, 7 Mar 2026 10:11:57 +0000
Subject: [PATCH 121/194] [Refactor] Refactored TopK test code for the OpenReg
 device

---
 tests/test_topk.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/tests/test_topk.py b/tests/test_topk.py
index c8565310..caf56779 100644
--- a/tests/test_topk.py
+++ b/tests/test_topk.py
@@ -31,21 +31,11 @@ def topk_fn(a):
 
     opt_topk = torch.compile(dynamic=False)(topk_fn)
     res_values, res_indices = opt_topk(x)
-
     ref_values, ref_indices = torch.topk(x.cpu(), k, dim=dim, largest=largest, sorted=sorted)
 
     test_result("TopK/values", res_values, ref_values)
     test_result("TopK/indices", res_indices, ref_indices)
 
 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Run LayerNorm test with dynamic shape")
-    parser.add_argument('--shape', type=str, default="(512,768)")
-    args = parser.parse_args()
-    shape = tuple(map(int, args.shape.strip('()').split(',')))
-
-    from Scheduler.scheduler import ExecutionEngine
-    module = ExecutionEngine.setup_device()
-    device = module.custom_device()
+    device = torch.device('npu:0') 
     test_topk(device, (128, 128), k=2, dim=-1)
\ No newline at end of file

From a15f5d2128429c5fa9580e8eb2b1f625a55f054d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 11 Mar 2026 11:03:29 +0900
Subject: [PATCH 122/194] [Template/Sort] Add template code for Bitonic sort

---
 PyTorchSimFrontend/mlir/mlir_lowering.py      | 133 +---
 PyTorchSimFrontend/mlir/mlir_ops.py           |  76 ++-
 PyTorchSimFrontend/mlir/mlir_sort_template.py | 627 ++++++++++++------
 tests/test_sort.py                            | 128 ++--
 4 files changed, 591 insertions(+), 373 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index e5df4b78..36e9955b 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -17,13 +17,11 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
 from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate
-from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate
+from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate
 from PyTorchSimFrontend import extension_config
 
 aten = torch.ops.aten
 aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
-_orig_cat_default_lowering = lowerings.get(aten.cat.default)
-_orig_cat_out_lowering = lowerings.get(aten.cat.out)
 _orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable)
 
 def tuned_mm(mat1, mat2, * ,layout=None):
@@ -229,48 +227,35 @@ def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
     mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim)
     return mlir_template.generate().output_node()
 
-def _custom_sort_values_impl(
-    self: TensorBox,
+def custom_sort_default(
+    value: TensorBox,
     dim: int = -1,
     descending: bool = False,
-    values: Optional[TensorBox] = None,
-    indices: Optional[TensorBox] = None,
     stable: Optional[bool] = None,
 ):
-    if values is None or indices is None:
-        raise RuntimeError("sort.values* lowering requires both out tensors: values, indices")
+    if dim < 0:
+        dim += len(value.get_size())
 
-    def _normalize_dim(rank: int, d: int) -> int:
-        return d + rank if d < 0 else d
+    value.realize()
 
-    if not hasattr(self, "get_size"):
-        raise RuntimeError("sort.values* lowering requires TensorBox input")
-
-    rank = len(self.get_size())
-    norm_dim = _normalize_dim(rank, dim)
-    if norm_dim < 0 or norm_dim >= rank:
-        raise RuntimeError(f"sort.values* dim out of range: dim={dim}, rank={rank}")
-    if rank != 2:
-        raise RuntimeError(f"sort.values* lowering currently supports rank-2 only, got rank={rank}")
-    if norm_dim not in (0, 1):
-        raise RuntimeError(f"sort.values* lowering currently supports dim in {{0,1}} only, got dim={norm_dim}")
-
-    self.realize()
-    if isinstance(values, TensorBox):
-        values.realize()
-    if isinstance(indices, TensorBox):
-        indices.realize()
-
-    value_layout, _ = _sort_layouts(self, norm_dim, descending)
-    mlir_template = MLIRSortTemplate(
-        [self],
+    value_layout, index_layout = _sort_layouts(value, dim, descending)
+    empty_strided_lowering = lowerings.get(aten.empty_strided.default)
+    indices = empty_strided_lowering(
+        value.get_size(),
+        index_layout.stride,
+        dtype=torch.int64,
+        device=value.get_device(),
+    )
+    stable_required = True if stable is None else stable
+    sort_template_cls = MLIRStableSortTemplate if stable_required else MLIRSortTemplate
+    mlir_template = sort_template_cls(
+        [value, indices],
         value_layout,
-        dim=norm_dim,
+        dim=dim,
         descending=descending,
-        stable=True if stable is None else stable,
-        indices_node=indices,
+        stable=stable_required,
     )
-    sorted_values = mlir_template.generate(template_buffer_node=values, epilogue_nodes=[indices]).output_node()
+    sorted_values = mlir_template.generate(template_buffer_node=value).output_node()
     return sorted_values, indices
 
 
@@ -290,78 +275,6 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool):
     index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride)
     return value_layout, index_layout
 
-
-def custom_sort_stable(
-    self: TensorBox,
-    *,
-    stable: Optional[bool] = None,
-    dim: int = -1,
-    descending: bool = False,
-):
-    empty_strided_lowering = lowerings.get(aten.empty_strided.default)
-    if empty_strided_lowering is None:
-        if _orig_sort_values_stable_lowering is None:
-            raise RuntimeError("sort.stable lowering requires aten.empty_strided.default")
-        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True)
-
-    rank = len(self.get_size()) if hasattr(self, "get_size") else 0
-    norm_dim = dim + rank if dim < 0 else dim
-    if rank > 0 and (norm_dim < 0 or norm_dim >= rank):
-        raise RuntimeError(f"sort.stable dim out of range: dim={dim}, rank={rank}")
-
-    # Template specialization supports rank-2 and dim in {0,1}.
-    if rank == 2 and norm_dim not in (0, 1):
-        if _orig_sort_values_stable_lowering is None:
-            raise RuntimeError("Original aten.sort.values_stable lowering is missing")
-        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=True)
-
-    try:
-        value_layout, index_layout = _sort_layouts(self, norm_dim, descending)
-        values = empty_strided_lowering(
-            list(value_layout.size),
-            list(value_layout.stride),
-            dtype=value_layout.dtype,
-            device=self.get_device(),
-        )
-        indices = empty_strided_lowering(
-            list(index_layout.size),
-            list(index_layout.stride),
-            dtype=index_layout.dtype,
-            device=self.get_device(),
-        )
-        return _custom_sort_values_impl(
-            self=self,
-            dim=dim,
-            descending=descending,
-            values=values,
-            indices=indices,
-            stable=True if stable is None else stable,
-        )
-    except Exception:
-        if _orig_sort_values_stable_lowering is None:
-            raise
-        return _orig_sort_values_stable_lowering(self, dim=dim, descending=descending, stable=stable)
-
-
-def custom_sort_values_stable(
-    self: TensorBox,
-    *,
-    stable: Optional[bool] = None,
-    dim: int = -1,
-    descending: bool = False,
-    values: Optional[TensorBox] = None,
-    indices: Optional[TensorBox] = None,
-):
-    return _custom_sort_values_impl(
-        self=self,
-        dim=dim,
-        descending=descending,
-        values=values,
-        indices=indices,
-        stable=stable,
-    )
-
-
 lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()})
 lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
 lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
@@ -369,9 +282,7 @@ def custom_sort_values_stable(
 lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
 lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()})
 lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()})
-
-lowerings.update({aten.sort.stable: custom_sort_stable})
-lowerings.update({aten.sort.values_stable: custom_sort_values_stable})
+lowerings.update({getattr(aten.sort, overload): custom_sort_default for overload in aten.sort.overloads()})
     
 if extension_config.CONFIG_USE_TIMING_POOLING:
     lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 9edd2e44..ace4f9ea 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -182,7 +182,7 @@ def to_dtype(operand, dst_mlir_dtype, *args, **kwargs):
 
         # Case A: Integer -> Float
         if src_type_char == "i" and dst_type_char == "f":
-            op_str = f"arith.sitofp %{operand} : {src_shape} to {shape}"
+            op_str = f"arith.uitofp %{operand} : {src_shape} to {shape}"
         # Case B: Float -> Integer
         elif src_type_char == "f" and dst_type_char == "i":
             op_str = f"arith.fptosi %{operand} : {src_shape} to {shape}"
@@ -1142,6 +1142,80 @@ def multi_reduction(acc, init, vec_size, red_size, red_shape, red_type, type_nam
             line = reduction_combine_vec(red_type, value, init, axis=0, shape=new_vshape, reduced_shape=final_reduced_shape)
         return line, [red_size, type_name]
 
+    @staticmethod
+    def vector_shuffle(operand, indices, operand2=None, *args, **kwargs):
+        tile_size1, dtype1 = V.kernel.var_info[operand]
+        if operand2 is None:
+            operand2 = operand
+        tile_size2, dtype2 = V.kernel.var_info[operand2]
+        if dtype1 != dtype2:
+            raise ValueError(
+                f"vector_shuffle expects same element type, got {dtype1} and {dtype2}"
+            )
+        total_size = tile_size1 + tile_size2
+        for idx in indices:
+            if idx < -1 or idx >= total_size:
+                raise ValueError(
+                    f"vector_shuffle index out of range: {idx}, expected in [-1, {total_size - 1}]"
+                )
+        vt1 = f"vector<{tile_size1}x{dtype1}>"
+        vt2 = f"vector<{tile_size2}x{dtype1}>"
+        idx_str = ", ".join(str(i) for i in indices)
+        op_str = f"vector.shuffle %{operand}, %{operand2} [{idx_str}]"
+        return format_mlir_op(op_str, f"{vt1}, {vt2}", **kwargs), [len(indices), dtype1]
+
+    @staticmethod
+    def constant_mask(select_min, N, *args, **kwargs):
+        vals = ", ".join("true" if x else "false" for x in select_min)
+        op_str = f"arith.constant dense<[{vals}]>"
+        return format_mlir_op(op_str, f"vector<{N}xi1>", **kwargs), [N, "i1"]
+
+    @staticmethod
+    def bitonic_sort(operand, descending=False, *args, **kwargs):
+        def _compute_bitonic_stages(N: int, descending: bool):
+            assert N >= 2 and (N & (N - 1)) == 0, "N must be power-of-2 >= 2"
+            stages = []
+            size = 2
+            while size <= N:
+                stride = size // 2
+                while stride >= 1:
+                    merged_shuffle = list(range(N))
+                    merged_mask = [None] * N
+
+                    for start in range(0, N, size):
+                        blk_dir = "ASCENDING" if (start // size) % 2 == 0 else "DESCENDING"
+                        for i in range(start, start + size - stride, stride * 2):
+                            for j in range(stride):
+                                a, b = i + j, i + j + stride
+                                merged_shuffle[a] = b
+                                merged_shuffle[b] = a
+                                if blk_dir == "ASCENDING":
+                                    merged_mask[a] = True   # a = min
+                                    merged_mask[b] = False  # b = max
+                                else:
+                                    merged_mask[a] = False  # a = max
+                                    merged_mask[b] = True   # b = min
+                    select_min = [bool(x) if x is not None else False for x in merged_mask]
+                    if descending:
+                        select_min = [not x for x in select_min]
+                    stages.append({
+                        "shuffle": merged_shuffle,
+                        "select_min": select_min,
+                    })
+                    stride //= 2
+                size *= 2
+            return stages
+
+        tile_size, _ = V.kernel.var_info[operand]
+        cur = operand
+        for stage in _compute_bitonic_stages(tile_size, descending):
+            mask     = ops.constant_mask(stage["select_min"], tile_size)
+            shuffled = ops.vector_shuffle(cur, stage["shuffle"])
+            vmin     = ops.minimum(cur, shuffled)
+            vmax     = ops.maximum(cur, shuffled)
+            cur      = ops.where(mask, vmin, vmax)
+        return cur, V.kernel.var_info[cur]
+
     @staticmethod
     def _load(compute_vec_size, mlir_dtype, buffer, indices, buffer_shape, *args, **kwargs):
         if compute_vec_size == 1:
diff --git a/PyTorchSimFrontend/mlir/mlir_sort_template.py b/PyTorchSimFrontend/mlir/mlir_sort_template.py
index d12c7570..24b3a460 100644
--- a/PyTorchSimFrontend/mlir/mlir_sort_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sort_template.py
@@ -1,130 +1,189 @@
 from typing import List, Optional
+import contextlib
 
-import sympy
-from torch._inductor.ir import IRNode
-from torch._inductor.virtualized import V
+from torch._inductor.ir import Buffer, IRNode
+from torch._inductor.virtualized import _ops as ops
+from torch._inductor.codegen import common
 
 from PyTorchSimFrontend.mlir import mlir_common
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate, MLIRTemplateKernel
+from PyTorchSimFrontend.mlir.mlir_common import LoopLevel
+
+VECTOR_SIZE = 16
 
 
 TEMPLATE = r"""
 {{kernel.def_global_vars()}}
+// chunk index -> element index
+#map_chunk_to_elem = affine_map<(d0) -> (d0 * {{ VECTOR_SIZE }})>
 
-func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, YI], outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} {
-  {{ kernel.def_sram_buffer("YI", YI_TILE_DESC, id=1, indent_size=2) }}
-  {{ kernel.def_sram_buffer(OUT_DVAR, YV_TILE_DESC, id=2, indent_size=2) }}
+func.func @{{ KERNEL_NAME }} {{kernel.def_kernel(inputs=[X, XI], outputs=[YV], names_str=NAMES_STR, input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("X",  X_TILE_DESC,  id=0, indent_size=2) }}
+  {{ kernel.def_sram_buffer("XI", XI_TILE_DESC, id=1, indent_size=2) }}
+  {{ kernel.def_sram_buffer("YV", YV_TILE_DESC, id=2, indent_size=2) }}
   {{ kernel.def_local_vars(indent_size=2) }}
 
-  %c0 = arith.constant 0 : index
-  %c_cols = arith.constant {{ COLS }} : index
 
   affine.for %sort_block = 0 to 1 step 1 {
-    // Initialize output value/index buffers.
-    affine.for %row = 0 to {{ ROWS }} step 1 {
-      affine.for %col = 0 to {{ COLS }} step 1 {
-        {{ kernel.def_dma_op("MVIN", "X", INIT_X_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
-        {{ kernel.def_dma_op("MVOUT", OUT_DVAR, INIT_YV_IDX, X_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
-{% if DIM == 1 %}
-        %idx_i64 = arith.index_cast %col : index to {{ YI_ELEM_TYPE }}
-{% else %}
-        %idx_i64 = arith.index_cast %row : index to {{ YI_ELEM_TYPE }}
-{% endif %}
-        memref.store %idx_i64, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-        {{ kernel.def_dma_op("MVOUT", "YI", INIT_YI_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=8) }}
-      }
-    }
-
-{% if DIM == 1 %}
-    // Stable bubble sort on each row (dim=1).
-    affine.for %row = 0 to {{ ROWS }} step 1 {
-      affine.for %pass = 0 to {{ COLS }} step 1 {
-        affine.for %j = 0 to {{ COLS_MINUS1 }} step 1 {
-          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
-          %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-
-          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
-          %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-
-{% if DESCENDING %}
-          %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
-{% else %}
-          %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
-{% endif %}
-          scf.if %need_swap {
-            memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D1_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            {{ kernel.def_dma_op("MVIN", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-            %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-
-            {{ kernel.def_dma_op("MVIN", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-            %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-
-            memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", "YI", D1_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", "YI", D1_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-          }
-        }
-      }
-    }
-{% else %}
-    // Stable bubble sort on each column (dim=0).
-    affine.for %col = 0 to {{ COLS }} step 1 {
-      affine.for %pass = 0 to {{ ROWS }} step 1 {
-        affine.for %i = 0 to {{ ROWS_MINUS1 }} step 1 {
-          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
-          %lhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-
-          {{ kernel.def_dma_op("MVIN", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=10) }}
-          %rhs = memref.load %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-
-{% if DESCENDING %}
-          %need_swap = arith.cmpf olt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
-{% else %}
-          %need_swap = arith.cmpf ogt, %lhs, %rhs : {{ YV_ELEM_TYPE }}
-{% endif %}
-          scf.if %need_swap {
-            memref.store %rhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S0_IDX, YV_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            memref.store %lhs, %yv_sort_tile[%c0, %c0] : {{ YV_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", OUT_DVAR, D0_S1_IDX, YV_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            {{ kernel.def_dma_op("MVIN", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-            %li = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-
-            {{ kernel.def_dma_op("MVIN", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-            %ri = memref.load %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-
-            memref.store %ri, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", "YI", D0_S0_IDX, YI_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-
-            memref.store %li, %yi_sort_tile[%c0, %c0] : {{ YI_TILE_MEMREF_TYPE }}
-            {{ kernel.def_dma_op("MVOUT", "YI", D0_S1_IDX, YI_S1_TILE_DESC, subtile_size=[1, 1], async_type=0, indent_size=12) }}
-          }
-        }
-      }
-    }
-{% endif %}
+  {%- for d in range(RANK-1) %}
+    affine.for %index{{ OUTPUT_DIM[d] }} = 0 to {{ OUTPUT_SIZES[d] }} step {{ STEP_SIZES[d] }} {
+  {%- endfor %}
+
+    %x_dram_offset = affine.apply {{ X_OFFSET_MAP }}({{ OUTER_VARS }})
+    %xi_dram_offset = affine.apply {{ XI_OFFSET_MAP }}({{ OUTER_VARS }})
+    %yv_dram_offset = affine.apply {{ YV_OFFSET_MAP }}({{ OUTER_VARS }})
+    {{ kernel.def_dma_op("MVIN", "X", [], X_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=X_DRAM_STRIDE, dram_offset="x_dram_offset") }}
+
+    // SIMD local sort + loop-based chunk merge.
+{{ BITONIC_BODY }}
+
+    {{ kernel.def_dma_op("MVOUT", "XI", [], XI_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=XI_DRAM_STRIDE, dram_offset="xi_dram_offset") }}
+    {{ kernel.def_dma_op("MVOUT", "YV", [], YV_TILE_DESC, indent_size=INDENT_SIZE, dram_stride=YV_DRAM_STRIDE, dram_offset="yv_dram_offset") }}
+  {%- for d in range(RANK-1) %}
+    } { outer_loop=true }
+  {%- endfor %}
   } { outer_loop=true }
   return
 }
 """
 
 
+def _make_offset_map(outer_dims, all_strides, layout_offset):
+    """Build an affine_map over outer-dim loop variables that computes the flat DRAM offset."""
+    terms = []
+    for j, d in enumerate(outer_dims):
+        s = int(all_strides[d])
+        if s == 1:
+            terms.append(f"d{j}")
+        elif s != 0:
+            terms.append(f"d{j} * {s}")
+    try:
+        off = int(layout_offset)
+    except (TypeError, ValueError):
+        off = 0
+    if off:
+        terms.append(str(off))
+    nd = len(outer_dims)
+    dim_str = ", ".join(f"d{j}" for j in range(nd))
+    expr = " + ".join(terms) if terms else "0"
+    return f"affine_map<({dim_str}) -> ({expr})>"
+
+
+def _compute_bitonic_stages(n: int, descending: bool):
+    stages = []
+    size = 2
+    while size <= n:
+        stride = size // 2
+        while stride >= 1:
+            merged_shuffle = list(range(n))
+            merged_mask = [None] * n
+            for start in range(0, n, size):
+                blk_dir = "ASCENDING" if (start // size) % 2 == 0 else "DESCENDING"
+                for i in range(start, start + size - stride, stride * 2):
+                    for j2 in range(stride):
+                        a, b = i + j2, i + j2 + stride
+                        merged_shuffle[a] = b
+                        merged_shuffle[b] = a
+                        if blk_dir == "ASCENDING":
+                            merged_mask[a] = True
+                            merged_mask[b] = False
+                        else:
+                            merged_mask[a] = False
+                            merged_mask[b] = True
+            select_min = [bool(x) if x is not None else False for x in merged_mask]
+            if descending:
+                select_min = [not x for x in select_min]
+            stages.append({"shuffle": merged_shuffle, "select_min": select_min})
+            stride //= 2
+        size *= 2
+    return stages
+
+
+def _pair_less_equal(left_v, right_v, left_i, right_i):
+    cmp_val = ops.lt(left_v, right_v)
+    cmp_eq = ops.eq(left_v, right_v)
+    cmp_idx = ops.le(left_i, right_i)
+    return ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx))
+
+
+def _pair_greater_equal(left_v, right_v, left_i, right_i):
+    cmp_val = ops.gt(left_v, right_v)
+    cmp_eq = ops.eq(left_v, right_v)
+    cmp_idx = ops.le(left_i, right_i)
+    return ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx))
+
+
+def _bitonic_sort_pair(values, indices, vector_size: int, descending: bool, stable_sort: bool):
+    cur_v = values
+    cur_i = indices
+    for stage_desc in _compute_bitonic_stages(vector_size, descending):
+        mask = ops.constant_mask(stage_desc["select_min"], vector_size)
+        shuf_v = ops.vector_shuffle(cur_v, stage_desc["shuffle"])
+        shuf_i = ops.vector_shuffle(cur_i, stage_desc["shuffle"])
+        if stable_sort:
+            # `cmp` drives the "min side" selection in the bitonic network.
+            # For descending stable sort, tie elements with smaller original index
+            # must stay earlier, so the min side should treat larger index as smaller.
+            if descending:
+                cmp_val = ops.lt(cur_v, shuf_v)
+                cmp_eq = ops.eq(cur_v, shuf_v)
+                cmp_idx = ops.ge(cur_i, shuf_i)
+                cmp = ops.or_(cmp_val, ops.and_(cmp_eq, cmp_idx))
+            else:
+                cmp = _pair_less_equal(cur_v, shuf_v, cur_i, shuf_i)
+        else:
+            cmp = ops.le(cur_v, shuf_v)
+        min_v = ops.where(cmp, cur_v, shuf_v)
+        min_i = ops.where(cmp, cur_i, shuf_i)
+        max_v = ops.where(cmp, shuf_v, cur_v)
+        max_i = ops.where(cmp, shuf_i, cur_i)
+        cur_v = ops.where(mask, min_v, max_v)
+        cur_i = ops.where(mask, min_i, max_i)
+    return cur_v, cur_i
+
+
+def _merge_sorted_pair_vectors(
+    left_norm,
+    left_idx_norm,
+    right_norm,
+    right_idx_norm,
+    ascending: bool,
+    stable_sort: bool,
+    vector_size: int,
+    rev_indices,
+):
+    right_pair = ops.vector_shuffle(right_norm, rev_indices, right_norm)
+    right_idx_pair = ops.vector_shuffle(right_idx_norm, rev_indices, right_idx_norm)
+    if ascending:
+        cmp = (
+            _pair_less_equal(left_norm, right_pair, left_idx_norm, right_idx_pair)
+            if stable_sort
+            else ops.le(left_norm, right_pair)
+        )
+    else:
+        cmp = (
+            _pair_greater_equal(left_norm, right_pair, left_idx_norm, right_idx_pair)
+            if stable_sort
+            else ops.ge(left_norm, right_pair)
+        )
+    left_merge = ops.where(cmp, left_norm, right_pair)
+    left_idx_merge = ops.where(cmp, left_idx_norm, right_idx_pair)
+    right_merge = ops.where(cmp, right_pair, left_norm)
+    right_idx_merge = ops.where(cmp, right_idx_pair, left_idx_norm)
+    return left_merge, left_idx_merge, right_merge, right_idx_merge
+
+
 class MLIRSortTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, dim, descending=False, stable=False, indices_node=None, input_reorder=None):
+    def __init__(self, input_nodes, layout, dim, descending=False, stable=False, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
         self.dim = dim
         self.descending = descending
         self.stable = stable
-        self.indices_node = indices_node
+        self.use_stable_sort = False
+        self.output_nodes = [
+            Buffer(name="buf_out_values", layout=layout),
+        ]
+        self.output_node = self.output_nodes[0]
 
     def render(
         self,
@@ -135,119 +194,281 @@ def render(
         **kwargs,
     ):
         if template_buffer_node is not None:
+            self.output_nodes[0] = template_buffer_node
             self.output_node = template_buffer_node
-        if self.indices_node is None:
-            raise RuntimeError("MLIRSortTemplate requires indices output node")
 
         x = self.input_nodes[0]
-        yv = self.output_node
-        yi = self.indices_node
-
-        def _as_int(v):
-            try:
-                return int(v)
-            except Exception:
-                return int(V.graph.sizevars.size_hint(v))
-
-        x_size = x.get_size()
-        if len(x_size) != 2:
-            raise RuntimeError("MLIRSortTemplate currently supports rank-2 input only")
-        if self.dim not in (0, 1):
-            raise RuntimeError(f"MLIRSortTemplate currently supports dim in {{0,1}} only, got dim={self.dim}")
-
-        rows = _as_int(x_size[0])
-        cols = _as_int(x_size[1])
-        cols_minus1 = max(0, cols - 1)
-        rows_minus1 = max(0, rows - 1)
-
-        x_dtype = x.get_dtype()
-        yv_dtype = yv.get_dtype()
-        yi_dtype = yi.get_dtype()
-        if x_dtype != yv_dtype:
-            raise RuntimeError("sort template requires input/value dtype match")
-
-        yi_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        yi_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        yi_tile_desc.set_name("yi_sort_tile")
-        yv_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        yv_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        yv_tile_desc.set_name("yv_sort_tile")
-        # Neighbor element descriptors use DRAM offset to preserve affine stride metadata.
-        yv_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        yv_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        yv_s1_tile_desc.set_name("yv_sort_tile")
-        yi_s1_tile_desc = mlir_common.MLIRMultiDimTile([1, 1], kernel.vector_lane, vlane_split_axis=1, vlane_stride=1)
-        yi_s1_tile_desc.set_tile_size_stride([1, 1], [1, 1])
-        yi_s1_tile_desc.set_name("yi_sort_tile")
-        if int(self.dim) == 1:
-            yv_s1_tile_desc.offset = sympy.Integer(1)
-            yi_s1_tile_desc.offset = sympy.Integer(1)
+        xi = self.input_nodes[1]
+        yv = self.output_nodes[0]
+        # XI is updated in-place by the sort kernel, so mark it as an inout arg.
+        kernel.kernel_group.args.make_inplace(xi.get_name(), xi.get_name())
+        sort_size = int(x.get_size()[self.dim])
+        vector_size = VECTOR_SIZE
+        if sort_size <= 0:
+            raise NotImplementedError("Sort size must be > 0")
+        if sort_size < vector_size or sort_size % vector_size != 0:
+            raise NotImplementedError(
+                f"Sort size must be a multiple of vector size (sort_size={sort_size}, vector_size={vector_size})"
+            )
+        num_chunks = sort_size // vector_size
+        if num_chunks & (num_chunks - 1):
+            raise NotImplementedError(
+                f"Loop-based bitonic chunk merge requires power-of-two chunk count (num_chunks={num_chunks})"
+            )
+
+        # --- N-D generalization: outer loops over all non-sort dims ---
+        rank = len(x.get_size())
+        sort_dim = self.dim if self.dim >= 0 else self.dim + rank
+        if sort_dim < 0 or sort_dim >= rank:
+            raise NotImplementedError(f"Invalid sort dim for rank-{rank} tensor (dim={self.dim})")
+        x_layout = x.get_layout()
+        xi_layout = xi.get_layout()
+        yv_layout = yv.get_layout()
+
+        if rank == 1:
+            # Edge case for 1D tensor
+            output_sizes = [1]
+            output_dim = [0]
+            step_sizes = [1]
+            tile_sizes = [1, sort_size]
+            x_dram_stride = [int(x_layout.stride[sort_dim]), int(x_layout.stride[sort_dim])]
+            xi_dram_stride = [int(xi_layout.stride[sort_dim]), int(xi_layout.stride[sort_dim])]
+            yv_dram_stride = [int(yv_layout.stride[sort_dim]), int(yv_layout.stride[sort_dim])]
+            template_rank = 2
         else:
-            yv_s1_tile_desc.offset = sympy.Integer(cols)
-            yi_s1_tile_desc.offset = sympy.Integer(cols)
-
-        row = sympy.Symbol("row")
-        col = sympy.Symbol("col")
-        i = sympy.Symbol("i")
-        j = sympy.Symbol("j")
-
-        init_x_idx = [row * cols, col]
-        init_yv_idx = [row * cols, col]
-        init_yi_idx = [row * cols, col]
+            output_sizes = [sz for d, sz in enumerate(yv.get_size()) if d != sort_dim]
+            output_dim = [d for d, _ in enumerate(yv.get_size()) if d != sort_dim]
+            step_sizes = [1] * len(output_sizes)
+
+            tile_dim = max(output_dim, key=lambda d: int(yv.get_size()[d]))
+            tile_sizes = [min(kernel.vector_lane, int(yv.get_size()[tile_dim])), sort_size]
+            step_sizes[output_dim.index(tile_dim)] = tile_sizes[0]
+
+            x_dram_stride = [int(x_layout.stride[tile_dim]), int(x_layout.stride[sort_dim])]
+            xi_dram_stride = [int(xi_layout.stride[tile_dim]), int(xi_layout.stride[sort_dim])]
+            yv_dram_stride = [int(yv_layout.stride[tile_dim]), int(yv_layout.stride[sort_dim])]
+            template_rank = rank
+
+        x_offset_map  = _make_offset_map(output_dim, x_layout.stride,  x_layout.offset)
+        xi_offset_map = _make_offset_map(output_dim, xi_layout.stride, xi_layout.offset)
+        yv_offset_map = _make_offset_map(output_dim, yv_layout.stride, yv_layout.offset)
+        outer_vars = ", ".join(f"%index{d}" for d in output_dim)
+
+        # indent for DMA ops = 2 (inside func) + 2 per outer loop
+        indent_size = 2 + len(output_dim) * 2 + 4
+
+        vlane_stride = 1
+        vlane_split_axis = 0
+        x_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        x_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1])
+        x_tile_desc.set_name("X_buffer")
+        x_tile_desc.offset = x_layout.offset
+
+        xi_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        xi_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1])
+        xi_tile_desc.set_name("XI_buffer")
+        xi_tile_desc.offset = xi_layout.offset
+
+        yv_tile_desc = mlir_common.MLIRMultiDimTile(tile_sizes, kernel.vector_lane, vlane_split_axis, vlane_stride)
+        yv_tile_desc.set_tile_size_stride(tile_sizes, [sort_size, 1])
+        yv_tile_desc.set_name("YV_buffer")
+        yv_tile_desc.offset = yv_layout.offset
+
+        data_stype = mlir_common.DTYPE_TO_MLIR[x.get_dtype()]
+        idx_stype = mlir_common.DTYPE_TO_MLIR[xi.get_dtype()]
+
+        elem_memref_t = f"memref<1x{sort_size}x{data_stype}, 1>"
+        rev_indices = list(range(vector_size - 1, -1, -1))
+
+        bitonic_body = mlir_common.ParallelLoopBuffer(initial_indent=2)
+        bitonic_body.tabwidth = 2
+        # 1) Local SIMD sort per chunk.
+        init_cse = common.CSE(kernel.newvar_prefix, kernel.suffix, name_prefix="sort_init")
+        with kernel, kernel.override_buffer_cse(buffer=bitonic_body, cse=init_cse):
+            bitonic_body.writelines(LoopLevel("chunk", num_chunks).lines())
+            with bitonic_body.indent(attribute="{inner_loop=true}"):
+                bitonic_body.writeline("%elem = affine.apply #map_chunk_to_elem(%chunk)")
+                x_chunk = ops._load(
+                    vector_size,
+                    data_stype,
+                    "X_buffer",
+                    "%t_const0, %elem",
+                    x_tile_desc.get_mlir_shape(data_stype),
+                )
+                idx_step_index = kernel.register_var_cse("idx_step_index", vector_size, "index")
+                bitonic_body.writeline(f"%{idx_step_index} = vector.step : vector<{vector_size}xindex>")
+                idx_step = ops.index_cast(idx_step_index, idx_stype)
+                idx_base = kernel.register_var_cse("idx_base", 1, idx_stype)
+                bitonic_body.writeline(f"%{idx_base} = arith.index_cast %elem : index to {idx_stype}")
+                idx_base_vec = ops.broadcast(idx_base, vector_size)
+                idx_chunk = ops.add(idx_base_vec, idx_step)
+                yv_chunk, yi_chunk = _bitonic_sort_pair(
+                    x_chunk, idx_chunk, vector_size, descending=self.descending, stable_sort=self.use_stable_sort
+                )
+                ops._store(
+                    yv_chunk,
+                    "YV_buffer",
+                    "%t_const0, %elem",
+                    yv_tile_desc.get_mlir_shape(data_stype),
+                )
+                ops._store(
+                    yi_chunk,
+                    "XI_buffer",
+                    "%t_const0, %elem",
+                    xi_tile_desc.get_mlir_shape(idx_stype),
+                )
+
+        # 2) Chunk-level bitonic merge (loop form).
+        stage = 0
+        k = 2
+        while k <= num_chunks:
+            j = k // 2
+            while j >= 1:
+                for block_start, is_even_block in ((0, True), (k, False)):
+                    if block_start >= num_chunks:
+                        continue
+                    asc_dir = is_even_block if not self.descending else (not is_even_block)
+                    stage_cse = common.CSE(kernel.newvar_prefix, kernel.suffix, name_prefix=f"sort_stage_{stage}")
+                    with kernel, kernel.override_buffer_cse(buffer=bitonic_body, cse=stage_cse):
+                        stage_loops = [
+                            LoopLevel("base", num_chunks, start=block_start, step=2 * k),
+                            LoopLevel("p", k, step=2 * j),
+                            LoopLevel("q", j),
+                        ]
+                        with contextlib.ExitStack() as stack:
+                            for loop in stage_loops:
+                                bitonic_body.writelines(loop.lines())
+                                stack.enter_context(bitonic_body.indent(attribute="{inner_loop=true}"))
+
+                            bitonic_body.writeline(
+                                f"%left_elem = affine.apply affine_map<(d0, d1, d2) -> ((d0 + d1 + d2) * {vector_size})>(%base, %p, %q)"
+                            )
+                            bitonic_body.writeline(
+                                f"%right_elem = affine.apply affine_map<(d0, d1, d2) -> ((d0 + d1 + d2 + {j}) * {vector_size})>(%base, %p, %q)"
+                            )
+
+                            left_vec = ops._load(
+                                vector_size,
+                                data_stype,
+                                "YV_buffer",
+                                "%t_const0, %left_elem",
+                                yv_tile_desc.get_mlir_shape(data_stype),
+                            )
+                            right_vec = ops._load(
+                                vector_size,
+                                data_stype,
+                                "YV_buffer",
+                                "%t_const0, %right_elem",
+                                yv_tile_desc.get_mlir_shape(data_stype),
+                            )
+                            left_idx = ops._load(
+                                vector_size,
+                                idx_stype,
+                                "XI_buffer",
+                                "%t_const0, %left_elem",
+                                xi_tile_desc.get_mlir_shape(idx_stype),
+                            )
+                            right_idx = ops._load(
+                                vector_size,
+                                idx_stype,
+                                "XI_buffer",
+                                "%t_const0, %right_elem",
+                                xi_tile_desc.get_mlir_shape(idx_stype),
+                            )
+                            norm_desc = not asc_dir
+                            left_norm, left_idx_norm = _bitonic_sort_pair(
+                                left_vec, left_idx, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort
+                            )
+                            right_norm, right_idx_norm = _bitonic_sort_pair(
+                                right_vec, right_idx, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort
+                            )
+                            left_merge, left_idx_merge, right_merge, right_idx_merge = _merge_sorted_pair_vectors(
+                                left_norm,
+                                left_idx_norm,
+                                right_norm,
+                                right_idx_norm,
+                                ascending=asc_dir,
+                                stable_sort=self.use_stable_sort,
+                                vector_size=vector_size,
+                                rev_indices=rev_indices,
+                            )
+                            left_new, left_idx_new = _bitonic_sort_pair(
+                                left_merge, left_idx_merge, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort
+                            )
+                            right_new, right_idx_new = _bitonic_sort_pair(
+                                right_merge, right_idx_merge, vector_size, descending=norm_desc, stable_sort=self.use_stable_sort
+                            )
+                            ops._store(
+                                left_new,
+                                "YV_buffer",
+                                "%t_const0, %left_elem",
+                                yv_tile_desc.get_mlir_shape(data_stype),
+                            )
+                            ops._store(
+                                right_new,
+                                "YV_buffer",
+                                "%t_const0, %right_elem",
+                                yv_tile_desc.get_mlir_shape(data_stype),
+                            )
+                            ops._store(
+                                left_idx_new,
+                                "XI_buffer",
+                                "%t_const0, %left_elem",
+                                xi_tile_desc.get_mlir_shape(idx_stype),
+                            )
+                            ops._store(
+                                right_idx_new,
+                                "XI_buffer",
+                                "%t_const0, %right_elem",
+                                xi_tile_desc.get_mlir_shape(idx_stype),
+                            )
+                    stage += 1
+                j //= 2
+            k *= 2
 
-        d1_s0_idx = [row * cols, j]
-        d1_s1_idx = [row * cols, j]
-
-        d0_s0_idx = [i * cols, col]
-        d0_s1_idx = [i * cols, col]
-
-        kernel.loop_size = None
-        numel = rows * cols
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
+            NAMES_STR="X, XI, YV",
             kernel=kernel,
             X=x,
+            XI=xi,
             YV=yv,
-            YI=yi,
-            OUT_DVAR="YV",
-            NAMES_STR="X, YI, YV",
-            ROWS=rows,
-            COLS=cols,
-            COLS_MINUS1=cols_minus1,
-            ROWS_MINUS1=rows_minus1,
-            DIM=int(self.dim),
-            DESCENDING=bool(self.descending),
-            YI_TILE_DESC=yi_tile_desc,
+            X_TILE_DESC=x_tile_desc,
+            XI_TILE_DESC=xi_tile_desc,
             YV_TILE_DESC=yv_tile_desc,
-            YI_S1_TILE_DESC=yi_s1_tile_desc,
-            YV_S1_TILE_DESC=yv_s1_tile_desc,
-            INIT_X_IDX=init_x_idx,
-            INIT_YV_IDX=init_yv_idx,
-            INIT_YI_IDX=init_yi_idx,
-            D1_S0_IDX=d1_s0_idx,
-            D1_S1_IDX=d1_s1_idx,
-            D0_S0_IDX=d0_s0_idx,
-            D0_S1_IDX=d0_s1_idx,
-            YV_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yv_dtype],
-            YI_ELEM_TYPE=mlir_common.DTYPE_TO_MLIR[yi_dtype],
-            X_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[x_dtype]}>",
-            YV_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yv_dtype]}>",
-            YI_MEMREF_TYPE=f"memref<{numel}x{mlir_common.DTYPE_TO_MLIR[yi_dtype]}>",
-            YV_TILE_MEMREF_TYPE=yv_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yv_dtype]),
-            YI_TILE_MEMREF_TYPE=yi_tile_desc.get_mlir_shape(mlir_common.DTYPE_TO_MLIR[yi_dtype]),
-            X_TILE_DESC=yv_tile_desc,
+            SORT_SIZE=sort_size,
+            VECTOR_SIZE=vector_size,
+            DATA_STYPE=data_stype,
+            IDX_STYPE=idx_stype,
+            ELEM_MEMREF_T=elem_memref_t,
+            BITONIC_BODY=bitonic_body.getvalue().rstrip(),
             input_reorder=self.input_reorder,
+            # N-D generalization
+            RANK                  = template_rank,
+            OUTPUT_SIZES          = output_sizes,
+            OUTPUT_DIM            = output_dim,
+            STEP_SIZES            = step_sizes,
+            OUTER_VARS            = outer_vars,
+            X_OFFSET_MAP          = x_offset_map,
+            XI_OFFSET_MAP         = xi_offset_map,
+            YV_OFFSET_MAP         = yv_offset_map,
+            X_DRAM_STRIDE         = x_dram_stride,
+            XI_DRAM_STRIDE        = xi_dram_stride,
+            YV_DRAM_STRIDE        = yv_dram_stride,
+            INDENT_SIZE           = indent_size,
         )
-
-        output_node_name = yv.get_name() if hasattr(yv, "get_name") else yv.name
-        kernel.epilogue_info = dict(
-            output_node=output_node_name,
-            sram_var="yv_sort_tile",
-            dram_var=kernel.render_options["OUT_DVAR"],
-            dram_tile_desc=yv_tile_desc,
-        )
-        kernel.exception_nodes[kernel.render_options["OUT_DVAR"]] = {"numel": yv.get_numel()}
-        kernel.exception_nodes["YI"] = {"numel": yi.get_numel()}
-
         code = self._template_from_string(TEMPLATE).render(**kernel.render_options)
         return code
+
+
+class MLIRStableSortTemplate(MLIRSortTemplate):
+    def __init__(self, input_nodes, layout, dim, descending=False, stable=True, input_reorder=None):
+        super().__init__(
+            input_nodes=input_nodes,
+            layout=layout,
+            dim=dim,
+            descending=descending,
+            stable=stable,
+            input_reorder=input_reorder,
+        )
+        self.use_stable_sort = True
diff --git a/tests/test_sort.py b/tests/test_sort.py
index 2b070223..05afe92b 100644
--- a/tests/test_sort.py
+++ b/tests/test_sort.py
@@ -1,7 +1,5 @@
 import argparse
 import torch
-import torch._dynamo
-import torch.utils.cpp_extension
 
 def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
@@ -34,63 +32,85 @@ def test_equal(name, out, cpu_out):
         print("cpu out:", cpu_out)
         raise SystemExit(1)
 
-
-def _normalize_dim(dim: int, rank: int) -> int:
-    d = dim if dim >= 0 else rank + dim
-    if d < 0 or d >= rank:
-        raise ValueError(f"dim out of range: dim={dim}, rank={rank}")
-    return d
-
-
-def test_sort_stable(device, size=(128, 128), dim=-1, descending=False):
-    _normalize_dim(dim, len(size))
-
-    def sort_stable_fn(x):
-        return torch.sort(x, stable=True, dim=dim, descending=descending)
-
-    x = torch.randn(size, dtype=torch.float32)
-    x_npu = x.to(device=device)
-
-    opt_sort = torch.compile(dynamic=False)(sort_stable_fn)
-    out_values, out_indices = opt_sort(x_npu)
-
-    ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending)
-
-    test_result("Sort.stable/values", out_values, ref_values)
-    test_equal("Sort.stable/indices", out_indices, ref_indices)
-
-
-def test_sort_values_stable(device, size=(128, 128), dim=-1, descending=False):
-    _normalize_dim(dim, len(size))
-
-    def sort_out_fn(x):
-        out_values = torch.empty_like(x, device=x.device)
-        out_indices = torch.empty_like(x, dtype=torch.int64, device=x.device)
-        return torch.sort(x, stable=True, dim=dim, descending=descending, out=(out_values, out_indices))
+def test_sort(device, size=(128, 128), dim=-1, descending=False, stable=True):
+    def sort_test(x):
+        return torch.sort(x, dim=dim, descending=descending, stable=stable)
 
     x = torch.randn(size, dtype=torch.float32)
     x_npu = x.to(device=device)
 
-    opt_sort = sort_out_fn# torch.compile(dynamic=False)(sort_out_fn)
+    opt_sort = torch.compile(dynamic=False)(sort_test)
     out_values, out_indices = opt_sort(x_npu)
+    ref_values, ref_indices = torch.sort(x, stable=stable, dim=dim, descending=descending)
 
-    ref_values, ref_indices = torch.sort(x, stable=True, dim=dim, descending=descending)
-
-    test_result("Sort.values_stable/values", out_values, ref_values)
-    test_equal("Sort.values_stable/indices", out_indices, ref_indices)
-
+    prefix = "Sort.stable" if stable else "Sort.unstable"
+    test_result(f"{prefix}/values size={size}, dim={dim}, desc={descending}", out_values, ref_values)
+    if stable:
+        test_result(f"{prefix}/indices size={size}, dim={dim}, desc={descending}", out_indices, ref_indices)
+    else:
+        # Unstable sort does not guarantee tie ordering; validate index-value consistency instead.
+        gathered = torch.gather(x, dim, out_indices.cpu())
+        test_result(f"{prefix}/indices_gather size={size}, dim={dim}, desc={descending}", gathered, out_values.cpu())
+
+
+def test_sort_stable_suite(device):
+    # Keep sort-axis sizes compatible with backend constraints (vector-size multiple).
+    cases = [
+        {"size": (64,), "dim": 0, "descending": False},          # 1D
+        {"size": (4, 64), "dim": 1, "descending": True},         # 2D, last dim
+        {"size": (2, 8, 32), "dim": 2, "descending": False},     # 3D, last dim
+        {"size": (2, 16, 4), "dim": 1, "descending": True},      # 3D, middle dim
+        {"size": (2, 4, 8, 32), "dim": 3, "descending": False},  # 4D, last dim
+        {"size": (4, 2, 32, 8), "dim": 2, "descending": True},   # 4D, inner dim
+    ]
+    for case in cases:
+        test_sort(
+            device=device,
+            size=case["size"],
+            dim=case["dim"],
+            descending=case["descending"],
+            stable=True,
+        )
+
+
+def test_sort_duplicate_cases(device):
+    duplicate_cases = [
+        {"size": (64,), "dim": 0, "descending": False},
+        {"size": (4, 64), "dim": 1, "descending": True},
+        {"size": (2, 8, 32), "dim": 2, "descending": False},
+    ]
+    for case in duplicate_cases:
+        base = torch.arange(case["size"][case["dim"]], dtype=torch.int64) % 7
+        view_shape = [1] * len(case["size"])
+        view_shape[case["dim"]] = case["size"][case["dim"]]
+        x = base.view(view_shape).expand(case["size"]).to(torch.float32)
+        noise = torch.randn(case["size"], dtype=torch.float32) * 0.0
+        x = x + noise
+
+        def sort_test(inp):
+            return torch.sort(inp, dim=case["dim"], descending=case["descending"], stable=True)
+
+        out_values, out_indices = torch.compile(dynamic=False)(sort_test)(x.to(device=device))
+        ref_values, ref_indices = torch.sort(
+            x, dim=case["dim"], descending=case["descending"], stable=True
+        )
+        test_result(f"Sort.dup/stable_values {case}", out_values, ref_values)
+        test_equal(f"Sort.dup/stable_indices {case}", out_indices, ref_indices)
+
+        def sort_test_unstable(inp):
+            return torch.sort(inp, dim=case["dim"], descending=case["descending"], stable=False)
+
+        out_values_u, out_indices_u = torch.compile(dynamic=False)(sort_test_unstable)(x.to(device=device))
+        ref_values_u, _ = torch.sort(x, dim=case["dim"], descending=case["descending"], stable=False)
+        test_result(f"Sort.dup/unstable_values {case}", out_values_u, ref_values_u)
+        gathered_u = torch.gather(x, case["dim"], out_indices_u.cpu())
+        test_result(f"Sort.dup/unstable_gather {case}", gathered_u, out_values_u.cpu())
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run sort tests")
-    parser.add_argument("--shape", type=str, default="(128,128)")
+    parser.add_argument("--shape", type=str, default="(64, 32, 16)")
     parser.add_argument("--dim", type=int, default=0)
     parser.add_argument("--descending", action="store_true")
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="all",
-        choices=["all", "default", "values"],
-    )
     args = parser.parse_args()
 
     shape = tuple(map(int, args.shape.strip("()").split(",")))
@@ -100,13 +120,5 @@ def sort_out_fn(x):
     module = PyTorchSimRunner.setup_device()
     device = module.custom_device()
 
-    # Register recursive-compile bridge only when values_stable path is explicitly tested.
-    if args.mode in ("all", "values"):
-        torch.npu.register_eager_to_compile([
-            "aten::sort.values_stable",
-        ])
-
-    if args.mode in ("all", "default"):
-        test_sort_stable(device, size=shape, dim=args.dim, descending=args.descending)
-    if args.mode in ("all", "values"):
-        test_sort_values_stable(device, size=shape, dim=args.dim, descending=args.descending)
+    test_sort_stable_suite(device)
+    test_sort_duplicate_cases(device)
\ No newline at end of file

From 752cbb834df7705fe12ec18da281d5b76032034e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 11 Mar 2026 11:55:53 +0900
Subject: [PATCH 123/194] [Template] Use buffer type instead of hard-coded type

---
 PyTorchSimFrontend/extension_codecache.py     | 21 +++-------
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 39 +++++++++++--------
 .../mlir/mlir_caller_codegen.py               |  4 --
 PyTorchSimFrontend/mlir/mlir_common.py        |  2 +-
 PyTorchSimFrontend/mlir/mlir_conv_common.py   |  6 +++
 .../mlir/mlir_conv_mt_template.py             | 18 +++++----
 .../mlir/mlir_conv_sb_template.py             | 18 +++++----
 .../mlir/mlir_conv_sbs_template.py            | 18 +++++----
 PyTorchSimFrontend/mlir/mlir_conv_template.py | 18 +++++----
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 20 +++++++---
 Simulator/simulator.py                        |  3 +-
 11 files changed, 92 insertions(+), 75 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index d6b47123..8454dee6 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -67,9 +67,10 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
                 -relocation-model=pic -march=riscv64 -O3 --stack-size-section \
-                -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
+                -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \
+                -filetype=obj \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
-                -O2 {filename}.ll -o {filename}.s
+                -O2 {filename}.ll -o {filename}.o
         """,
     ).strip()]
 
@@ -109,9 +110,10 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
                 -relocation-model=pic -march=riscv64 -O3 --stack-size-section \
-                -mattr=+m,+f,+d,+a,+c,+v,+xsfvcp,zvl{vlen}b \
+                -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \
+                -filetype=obj \
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
-                -O2 {sample_filename}.ll -o {sample_filename}.s
+                -O2 {sample_filename}.ll -o {sample_filename}.o
         """,
     ).strip()]
 
@@ -180,17 +182,6 @@ def load(cls, source_code,
                 val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
                                                    validation_binary_name, new_link_option)
-
-                stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
-                spad_size =  val_llvm_caller.get_spad_size(validation_binary_path)
-                spad_usage = stack_size + spad_size # Spad usage per lane
-                if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
-                    logger.debug(
-                        f"Scratchpad size exceeded: required {spad_usage} bytes, "
-                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available."
-                    )
-                    raise SpadOverflowError()
-
         # Skip if TOG file already exists
         if os.path.isfile(tog_path):
             return key
diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 9398f90c..417d97cd 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -26,20 +26,20 @@
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   {% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   {% endif %}
   %c0 = arith.constant 0 : index
   {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index0 = 0 to {{ B }} {
     affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
       affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
-        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{ DATA_STYPE }}, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{ DATA_STYPE }}, 1>
         {% if Bias -%}
         {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }}
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
         {% endif %}
 
         affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
@@ -74,20 +74,20 @@
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   {% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   {% endif %}
   %c0 = arith.constant 0 : index
   {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index0 = 0 to {{ B }} {
     affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
       affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
-        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1>
         {% if Bias -%}
         {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }}
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
         {% endif %}
         affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
           {{kernel.load_input(indent_size=10)}}
@@ -120,21 +120,21 @@
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   {% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   {% endif %}
   %c0 = arith.constant 0 : index
   {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index0=0 to {{ B }} {
     affine.for %index2 = 0 to {{ N }} step {{ TILE_N }} {
       affine.for %index1 = 0 to {{ M }} step {{ TILE_M }} {
-        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_K }}xf32, 1>
-        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, 1>
-        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1> to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+        %X_buffer2D = memref.reinterpret_cast %X_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : memref<1x{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, 1>
+        %W_buffer2D = memref.reinterpret_cast %W_buffer to offset: [0], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1>
+        %Y_buffer2D = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<1x{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1>
 
         {% if Bias -%}
         {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Y_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_N], indent_size=8) }} // Why not N,M? Currently, dma-fine-grained pass assume M->N order...
         {%- else -%}
-        affine.vector_store %v0, %Y_buffer[0, 0, 0] : memref<1x{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+        affine.vector_store %v0, %Y_buffer[0, 0, 0] : memref<1x{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
         {% endif %}
         affine.for %index3 = 0 to {{ K }} step {{ TILE_K }} {
           {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[1, SUB_TILE_M, SUB_TILE_K], indent_size=10) }}
@@ -237,6 +237,7 @@ def render(self,
         else:
           Bias_idx = None
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -245,7 +246,7 @@ def render(self,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
             SUB_TILE_K=SUB_TILE_K,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             X = X, W = W,Y = Y, Bias = Bias,
             X_idx = X_idx,
             W_idx = W_idx,
@@ -319,6 +320,12 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+        dtype_infos = [("X", X.get_dtype()), ("W", W.get_dtype()), ("Y", Y.get_dtype())]
+        if Bias is not None:
+            dtype_infos.append(("Bias", Bias.get_dtype()))
+        if len({dtype for _, dtype in dtype_infos}) != 1:
+            dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
+            raise NotImplementedError(f"Mixed dtype BMM is not implemented yet ({dtype_desc})")
 
         W_tensor =  empty_strided(W.layout.size, W.layout.stride)
         X_tensor =  empty_strided(X.layout.size, X.layout.stride)
diff --git a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
index 06d41ea2..7c842272 100644
--- a/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
+++ b/PyTorchSimFrontend/mlir/mlir_caller_codegen.py
@@ -182,22 +182,18 @@ def add_extention(self, name, extension):
     def compile_wih_kernel(self, write_path, llvm_name, wrapper_name, binary_name, link_option=""):
         main_path = os.path.join(write_path, self.add_extention(wrapper_name, 'c'))
         main_obj_path = os.path.join(write_path, self.add_extention(wrapper_name, 'o'))
-        kernel_path = os.path.join(write_path, self.add_extention(llvm_name, 's'))
         kernel_obj_path = os.path.join(write_path, self.add_extention(llvm_name, 'o'))
 
         main_compile = f'riscv64-unknown-elf-gcc -march=rv64gcv -c {main_path} -o {main_obj_path}'
-        kernel_compile = f'clang -c --target="riscv64" -march=rv64gcv -O2 -nostdlib {kernel_path} -o {kernel_obj_path}'
 
         target = os.path.join(write_path, binary_name)
         link = f'riscv64-unknown-elf-gcc -march=rv64gcv {main_obj_path} {kernel_obj_path} -o {target} -lm {link_option}'
 
         main_compile_cmd = shlex.split(main_compile)
-        kernel_compile_cmd = shlex.split(kernel_compile)
         link_cmd = shlex.split(link)
 
         try:
             subprocess.check_call(main_compile_cmd)
-            subprocess.check_call(kernel_compile_cmd)
             subprocess.check_call(link_cmd)
         except subprocess.CalledProcessError as e:
             print("Command failed with exit code", e.returncode)
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 256d7101..3c408681 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -67,7 +67,7 @@
 DTYPE_TO_C = {
     torch.float32: "float",
     torch.float64: "double",
-    torch.float16: "half",
+    torch.float16: "uint16_t",
     torch.int64: "int64_t",
     torch.int32: "int32_t",
     torch.int16: "int16_t",
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index f72a7663..91e200a8 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -52,6 +52,12 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes):
         X, W = self.input_nodes[0], self.input_nodes[1]
         Y = self.output_node
         Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2]
+        dtype_infos = [("X", X.get_dtype()), ("W", W.get_dtype()), ("Y", Y.get_dtype())]
+        if Bias is not None:
+            dtype_infos.append(("Bias", Bias.get_dtype()))
+        if len({dtype for _, dtype in dtype_infos}) != 1:
+            dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
+            raise NotImplementedError(f"Mixed dtype Conv is not implemented yet ({dtype_desc})")
 
         if epilogue_nodes is not None:
             extra_node_rw = {
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index da2bc829..e91014fa 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -47,7 +47,7 @@
   {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars(indent_size=2) }}
 
@@ -59,7 +59,7 @@
           {%- if BIAS %}
           {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
           {%- else %}
-          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
             affine.for %tile_k = 0 to {{ I_C * K_W }} step {{ TILE_K }} {
@@ -71,16 +71,16 @@
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to 1 {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                   affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                     affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                       %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_o_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                     } { inner_loop=true }
                   } { inner_loop=true }
                 } { inner_loop=true }
@@ -179,6 +179,8 @@ def render(self,
         if Bias is not None:
           Bias_tile_desc.offset = Bias.get_layout().offset
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -220,7 +222,7 @@ def render(self,
             X_idx = X_idx,
             W_idx = W_idx,
             Bias_idx = Bias_idx,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             input_reorder=self.input_reorder
         )
 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
index cc284522..db2c64db 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -48,7 +48,7 @@
   {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars(indent_size=2) }}
   affine.for %tile_n = 0 to {{ O_C }} step {{ TILE_N }} {
@@ -58,7 +58,7 @@
         {%- if BIAS %}
         {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }}
         {%- else %}
-        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
         {%- endif %}
         affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
           affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
@@ -72,16 +72,16 @@
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                   affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                     affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
                       %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                     } { inner_loop=true }
                   } { inner_loop=true }
                 } { inner_loop=true }
@@ -178,6 +178,8 @@ def render(self,
         if Bias is not None:
           Bias_tile_desc.offset = Bias.get_layout().offset
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -219,7 +221,7 @@ def render(self,
             X_idx = X_idx,
             W_idx = W_idx,
             Bias_idx = Bias_idx,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             input_reorder=self.input_reorder
         )
 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index 6d768bf2..95db53c3 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -48,7 +48,7 @@
   {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   %c0 = arith.constant 0 : index
   {{- kernel.def_local_vars(indent_size=2) }}
 
@@ -59,7 +59,7 @@
         {%- if BIAS %}
         {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[1, SUB_TILE_N, TILE_O_H, SUB_TILE_M], indent_size=8) }}
         {%- else %}
-        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+        affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
         {%- endif %}
         affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
           affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
@@ -72,16 +72,16 @@
               affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                 affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                   %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}xf32, 1> to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                  %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : memref<{{ TILE_K_H }}x{{ TILE_K_W }}x{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, 1> to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                   affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                     affine.for %tile_o_w = 0 to {{ 1 }} { // TILE_O_W
                       %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                       %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_k_w)
                       %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                      %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                      %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                      linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                            outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                     } { inner_loop=true }
                   } { inner_loop=true }
                 } { inner_loop=true }
@@ -179,6 +179,8 @@ def render(self,
         if Bias is not None:
           Bias_tile_desc.offset = Bias.get_layout().offset
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -220,7 +222,7 @@ def render(self,
             X_idx = X_idx,
             W_idx = W_idx,
             Bias_idx = Bias_idx,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             input_reorder=self.input_reorder
         )
 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index e2cd61fd..3666b3c9 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -48,7 +48,7 @@
   {{ kernel.def_sram_buffer("X", X_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   %c0 = arith.constant 0 : index
   {{ kernel.def_local_vars(indent_size=2) }}
 
@@ -60,7 +60,7 @@
           {%- if BIAS %}
           {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N, TILE_O_H, TILE_O_W], indent_size=10) }}
           {%- else %}
-          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}xf32>
+          affine.vector_store %v0, %output_buffer[%c0, %c0, %c0, %c0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_O_H * TILE_M, TILE_N) }}x{{DATA_STYPE}}>
           {%- endif %}
           affine.for %k_h = 0 to {{ K_H }} step {{ TILE_K_H }} {
             affine.for %k_w = 0 to {{ K_W }} step {{ TILE_K_W }} {
@@ -74,17 +74,17 @@
                 affine.for %tile_k_h = 0 to {{ TILE_K_H }} { // loop order should be fixed for timing simulation. Do not change this order.
                   affine.for %tile_k_w = 0 to {{ TILE_K_W }} {
                     %offset_w = affine.apply #offset_w_map(%tile_k_h, %tile_k_w)
-                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                    %W_buffer = memref.reinterpret_cast %weight_buffer to offset: [%offset_w], sizes: [{{ TILE_K }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ W_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
                     affine.for %tile_o_h = 0 to {{ TILE_O_H }} {
                       affine.for %tile_o_w = 0 to {{ TILE_O_W }} {
                         %tile_i_h = affine.apply #map_I_H(%tile_o_h, %tile_k_h)
                         %tile_i_w = affine.apply #map_I_W(%tile_o_w, %tile_k_w)
                         %offset_x = affine.apply #offset_x_map(%tile_i_h, %tile_i_w)
                         %offset_y = affine.apply #offset_y_map(%tile_o_h, %tile_o_w)
-                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
-                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
-                        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}xf32, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
-                              outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}xf32, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                        %X_buffer = memref.reinterpret_cast %input_buffer to offset: [%offset_x], sizes: [{{ TILE_M }}, {{ TILE_K }}], strides: [{{ TILE_K }}, 1] : {{ X_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>
+                        %Y_buffer = memref.reinterpret_cast %output_buffer to offset: [%offset_y], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>
+                        linalg.matmul ins(%X_buffer, %W_buffer : memref<{{ TILE_M }}x{{ TILE_K }}x{{DATA_STYPE}}, strided<[{{ TILE_K }}, 1], offset: ?>, 1>, memref<{{ TILE_K }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
+                              outs(%Y_buffer : memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, strided<[{{ TILE_N }}, 1], offset: ?>, 1>)
                       } { inner_loop=true }
                     } { inner_loop=true }
                   } { inner_loop=true }
@@ -183,6 +183,8 @@ def render(self,
         if Bias is not None:
           Bias_tile_desc.offset = Bias.get_layout().offset
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -224,7 +226,7 @@ def render(self,
             X_idx = X_idx,
             W_idx = W_idx,
             Bias_idx = Bias_idx,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             input_reorder=self.input_reorder
         )
 
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 5b116807..eb391dba 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -27,14 +27,14 @@
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   {% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>{% endif %}
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>{% endif %}
   {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} {
     affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} {
       {%- if Bias %}
       {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }}
       {%- else %}
-      affine.vector_store %v0, %Y_buffer[0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+      affine.vector_store %v0, %Y_buffer[0, 0] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }}, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
       {%- endif %}
       affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} {
         {% if prologue_nodes -%}
@@ -77,16 +77,16 @@
   {{ kernel.def_sram_buffer("W", W_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("Y", Y_tile_desc, indent_size=2) }}
   {% if not Bias %}
-  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+  %v0 = arith.constant dense<0.0> : vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
   {% endif %}
   {{ kernel.def_local_vars(indent_size=2) }}
   affine.for %index1 = 0 to {{ N }} step {{ TILE_N }} {
     affine.for %index0 = 0 to {{ M }} step {{ TILE_M }} {
-      %Y_bufferT = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}xf32, 1>
+      %Y_bufferT = memref.reinterpret_cast %Y_buffer to offset: [0], sizes: [{{ TILE_M }}, {{ TILE_N }}], strides: [{{ TILE_N }}, 1] : {{ Y_tile_desc.get_mlir_shape(DATA_STYPE) }} to memref<{{ TILE_M }}x{{ TILE_N }}x{{DATA_STYPE}}, 1>
       {%- if Bias %}
       {{ kernel.def_dma_op("MVIN", "Bias", Bias_idx, Bias_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_N], indent_size=6) }}
       {%- else %}
-      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_N }}x{{ TILE_M }}xf32, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}xf32>
+      affine.vector_store %v0, %Y_buffer[0, 0] : memref<{{ TILE_N }}x{{ TILE_M }}x{{DATA_STYPE}}, 1>, vector<{{ kernel.get_spad_size_per_lane(TILE_M, TILE_N) }}x{{DATA_STYPE}}>
       {%- endif %}
       affine.for %index2 = 0 to {{ K }} step {{ TILE_K }} {
         {{ kernel.def_dma_op("MVIN", "X", X_idx, X_tile_desc, subtile_size=[SUB_TILE_M, SUB_TILE_K], indent_size=8) }}
@@ -187,6 +187,8 @@ def render(self,
         else:
           Bias_idx = None
 
+        data_stype = mlir_common.DTYPE_TO_MLIR[X.get_dtype()]
+
         kernel.render_options = dict(
             KERNEL_NAME=self.name,
             kernel=kernel,
@@ -197,7 +199,7 @@ def render(self,
             SUB_TILE_M=SUB_TILE_M,
             SUB_TILE_N=SUB_TILE_N,
             SUB_TILE_K=SUB_TILE_K,
-            DATA_STYPE="f32",
+            DATA_STYPE=data_stype,
             X = X, W = W, Y = Y,
             Bias = Bias,
             X_idx = X_idx,
@@ -280,6 +282,12 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
 
         # Extract input arguments info
         X, W, Y = self.input_nodes[0], self.input_nodes[1], self.output_node
+        dtype_infos = [("X", X.get_dtype()), ("W", W.get_dtype()), ("Y", Y.get_dtype())]
+        if len(self.input_nodes) > 2:
+            dtype_infos.append(("Bias", self.input_nodes[2].get_dtype()))
+        if len({dtype for _, dtype in dtype_infos}) != 1:
+            dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
+            raise NotImplementedError(f"Mixed dtype GEMM is not implemented yet ({dtype_desc})")
         X_tensor = empty_strided(X.layout.size, X.layout.stride)
         W_tensor = empty_strided(W.layout.size, W.layout.stride)
         if len(W_tensor.size()) > 2 or len(X_tensor.size()) > 2:
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 13f2b4f0..f24835ba 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -68,6 +68,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     torch.uint8: np.uint8,
     torch.bool: np.uint8,
     torch.bfloat16: np.float16,
+    torch.float16: np.float16,
 }
 
 class FunctionalSimulator():
@@ -143,7 +144,7 @@ def run_spike(self, args, arg_attributes, runtime_path, binary, vectorlane_size=
         base_path= f"--base-path={runtime_path}"
         os.makedirs(os.path.join(runtime_path, "indirect_access"), exist_ok=True)
         os.makedirs(os.path.join(runtime_path, "dma_access"), exist_ok=True)
-        run = f'spike --isa rv64gcv --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
+        run = f'spike --isa rv64gcv_zfh --varch=vlen:256,elen:64 {vectorlane_option} {spad_option} {kernel_address} {base_path} /workspace/riscv-pk/build/pk {target_binary} {file_path_str}'
         if not silent_mode:
             logger.debug(f"[Spike] cmd> {run}")
             logger.info("[Spike] Running Spike simulator")

From 7af91dedeca74703c35ec9446ec167fcb8e4ec88 Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Thu, 12 Mar 2026 10:09:40 +0900
Subject: [PATCH 124/194] [Frontend] Fix incorrect constant key usage and
 boolean scientific-notation edge case

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 10 +++++-----
 PyTorchSimFrontend/mlir/mlir_ops.py             |  4 ++++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index d6ddb025..43cb65a4 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1423,11 +1423,11 @@ def get_const_cse(self, value, dtype="index") -> common.CSEVariable:
             value = float(value)
         else:
             value = int(value)
-
-        if value not in self.consts:
-            self.consts[str(value)+dtype] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}")
-            self.register_var_info(self.consts[str(value)+dtype], [1, dtype])
-        return self.consts[str(value)+dtype]
+        key = str(value)+dtype
+        if key not in self.consts:
+            self.consts[key] = self.const_cse.generate(self.const_buffer, f"arith.constant {value} : {dtype}")
+            self.register_var_info(self.consts[key], [1, dtype])
+        return self.consts[key]
 
     def get_tag_cse(self, value=None, shape="memref<1xi32>"):
         if value is None:
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index ace4f9ea..76a0e273 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -59,6 +59,10 @@ def constant(value, src_type, *args, **kwargs):
         str_val = str(value)
         if "inf" == str_val or "-inf" == str_val or "nan" == str_val:
             value = f"0x{mlir_common.MLIR_INF[str_val][src_type]:x}"
+        elif isinstance(value, bool):
+            value = 1 if value else 0
+            if src_type[0] == "f":
+                value = format(float(value), ".20f")
         # scientific notation check
         elif "e" in str_val:
             value = format(float(value), ".20f")

From 7bad17ae337873511a8b4e584d73767da56145bb Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 11 Mar 2026 19:51:41 +0900
Subject: [PATCH 125/194] [Fix] Refactor MLIR precision handling to be
 dtype-driven

---
 PyTorchSimFrontend/extension_config.py        |  4 +-
 PyTorchSimFrontend/mlir/mlir_bmm_template.py  | 10 +++--
 PyTorchSimFrontend/mlir/mlir_cat_template.py  | 20 +++++++---
 PyTorchSimFrontend/mlir/mlir_common.py        |  7 +++-
 PyTorchSimFrontend/mlir/mlir_conv_common.py   | 11 +++---
 .../mlir/mlir_conv_mt_template.py             | 10 ++---
 .../mlir/mlir_conv_sb_template.py             |  8 ++--
 .../mlir/mlir_conv_sbs_template.py            |  8 ++--
 PyTorchSimFrontend/mlir/mlir_conv_template.py |  8 ++--
 PyTorchSimFrontend/mlir/mlir_gemm_template.py | 10 +++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 38 +++++++++----------
 README.md                                     |  1 -
 12 files changed, 76 insertions(+), 59 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index eff6f573..fe8cc380 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -31,8 +31,6 @@ def __getattr__(name):
           "spad_size" : config_yaml["vpu_spad_size_kb_per_lane"] << 10 # Note: spad size per lane
         }
 
-    if name == "CONFIG_PRECISION":
-        return 4 # 32bit
     if name == "CONFIG_NUM_CORES":
         return config_yaml["num_cores"]
     if name == "vpu_vector_length_bits":
@@ -132,7 +130,7 @@ def load_plan_from_module(module_path):
 
 CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0))
 
-CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0))
+CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=1))
 
 
 def setup_logger(name=None, level=None):
diff --git a/PyTorchSimFrontend/mlir/mlir_bmm_template.py b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
index 417d97cd..c5fd902f 100644
--- a/PyTorchSimFrontend/mlir/mlir_bmm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_bmm_template.py
@@ -166,8 +166,9 @@ def render(self,
                tile_info = None,
                **kwargs):
         X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
+        precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype())
         if tile_info is None:
-            TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)[0]
+            TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node, precision_bytes)[0]
         else:
             TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
 
@@ -350,10 +351,11 @@ def get_tile_candidates(self,
                prologue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
         X, W, Y, Bias, W_tensor, X_tensor, B, M, N, K, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
-        return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node)
+        precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype())
+        return self.select_tile(kernel, M, N, K, n_extra_node, 0, n_prologue_node, precision_bytes)
 
-    def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node):
-        tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node)
+    def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node, precision_bytes):
+        tile_candidates = kernel.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, precision_bytes=precision_bytes)
         for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane) or n_prologue_node else kernel.vector_lane
             SUB_TILE_N = TILE_N # if (TILE_N < kernel.vector_lane) or prologue_nodes else kernel.vector_lane
diff --git a/PyTorchSimFrontend/mlir/mlir_cat_template.py b/PyTorchSimFrontend/mlir/mlir_cat_template.py
index 7bee54ac..7abdfee6 100644
--- a/PyTorchSimFrontend/mlir/mlir_cat_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_cat_template.py
@@ -56,6 +56,11 @@ def render(
     ):
         input_nodes = self.input_nodes
         y = self.output_node
+        dtype_infos = [("Y", y.get_dtype())] + [(f"X{i}", x.get_dtype()) for i, x in enumerate(input_nodes)]
+        if len({dtype for _, dtype in dtype_infos}) != 1:
+            dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
+            raise NotImplementedError(f"Mixed dtype Cat is not implemented yet ({dtype_desc})")
+        precision_bytes = mlir_common.get_dtype_nbytes(y.get_dtype())
         num_inputs = len(input_nodes)
         rank = len(y.get_size())
 
@@ -68,7 +73,7 @@ def render(
         excluded_dims = self._compute_excluded_dims(tile_sizes)
 
         input_tile_sizes_dim = self._calculate_input_tile_sizes(
-            kernel, input_sizes, tile_sizes, num_inputs, rank
+            kernel, input_sizes, tile_sizes, num_inputs, rank, precision_bytes
         )
         buffer_name_to_template_name, input_dram_names = self._build_buffer_mapping(input_nodes)
         input_tile_descs, output_tile_descs, unique_tile_descs = self._build_tile_descriptors(
@@ -145,6 +150,11 @@ def get_tile_candidates(
             self.output_node = template_buffer_node
 
         y = self.output_node
+        dtype_infos = [("Y", y.get_dtype())] + [(f"X{i}", x.get_dtype()) for i, x in enumerate(self.input_nodes)]
+        if len({dtype for _, dtype in dtype_infos}) != 1:
+            dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
+            raise NotImplementedError(f"Mixed dtype Cat is not implemented yet ({dtype_desc})")
+        precision_bytes = mlir_common.get_dtype_nbytes(y.get_dtype())
         num_inputs = len(self.input_nodes)
         output_sizes = [sz for d, sz in enumerate(y.get_size()) if d != self.dim]
 
@@ -152,7 +162,7 @@ def get_tile_candidates(
             return [[1]]
 
         max_tile_total = kernel.spad_info["spad_size"] // (
-            kernel.vector_lane * kernel.precision * 2 * num_inputs
+            kernel.vector_lane * precision_bytes * 2 * num_inputs
         )
 
         dim_tile_candidates = []
@@ -174,7 +184,7 @@ def get_tile_candidates(
         tile_candidates = [
             list(combo)
             for combo in itertools.product(*dim_tile_candidates)
-            if math.prod(combo) * (num_inputs + 1) * kernel.precision
+            if math.prod(combo) * (num_inputs + 1) * precision_bytes
                <= kernel.spad_info["spad_size"] * kernel.vector_lane
         ]
 
@@ -199,11 +209,11 @@ def _compute_excluded_dims(self, tile_sizes: list) -> list:
             tile_sizes[idx] = 1
         return excluded
 
-    def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank):
+    def _calculate_input_tile_sizes(self, kernel, input_sizes, tile_sizes, num_inputs, rank, precision_bytes):
         """Calculate tile sizes along the concat dimension for each input."""
         non_dim_tile_elements = math.prod(tile_sizes) if tile_sizes else 1
         max_spad_per_input = kernel.spad_info["spad_size"] * kernel.vector_lane // 2
-        extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * kernel.precision)) - num_inputs
+        extra_concat = math.ceil(max_spad_per_input / (non_dim_tile_elements * precision_bytes)) - num_inputs
 
         input_tile_sizes_dim = []
         for i in range(num_inputs):
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 3c408681..9f5dc6ab 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -90,6 +90,12 @@
     "index": 64
 }
 
+def get_dtype_nbytes(dtype):
+    mlir_dtype = DTYPE_TO_MLIR.get(dtype)
+    if mlir_dtype is None or mlir_dtype not in MLIR_TO_BIT:
+        raise NotImplementedError(f"Unsupported dtype for precision calculation: {dtype}")
+    return MLIR_TO_BIT[mlir_dtype] // 8
+
 DTYPE_LOWP_FP = [
     torch.bfloat16,
     torch.float16,
@@ -579,7 +585,6 @@ def __init__(self):
         # Default HW setting
         self.vector_lane = extension_config.vpu_num_lanes
         self.spad_info = extension_config.CONFIG_SPAD_INFO
-        self.precision = extension_config.CONFIG_PRECISION
         self.num_cores = extension_config.CONFIG_NUM_CORES
         self.vlen = extension_config.vpu_vector_length_bits
 
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index 91e200a8..386e9bd5 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -2,7 +2,7 @@
 import math
 from typing import List, Optional
 
-from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs
+from PyTorchSimFrontend.mlir.mlir_common import MLIRKernelArgs, get_dtype_nbytes
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplate
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
 from torch._inductor.ir import IRNode
@@ -40,7 +40,7 @@ def render(self,
                **kwargs):
         raise NotImplementedError()
 
-    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes):
         raise NotImplementedError()
 
     def extract_info(self, kernel, template_buffer_node, epilogue_nodes):
@@ -58,6 +58,7 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes):
         if len({dtype for _, dtype in dtype_infos}) != 1:
             dtype_desc = ", ".join(f"{name}={dtype}" for name, dtype in dtype_infos)
             raise NotImplementedError(f"Mixed dtype Conv is not implemented yet ({dtype_desc})")
+        precision_bytes = get_dtype_nbytes(X.get_dtype())
 
         if epilogue_nodes is not None:
             extra_node_rw = {
@@ -75,7 +76,7 @@ def extract_info(self, kernel, template_buffer_node, epilogue_nodes):
         PADDING_W=self.padding[1]
         STRIDE_H=self.stride[0]
         STRIDE_W=self.stride[1]
-        return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W
+        return X,W,Y,Bias,n_extra_node,BATCH,I_C,I_H,I_W,O_C,K_H,K_W,O_H,O_W,PADDING_H,PADDING_W,STRIDE_H,STRIDE_W,precision_bytes
 
     def get_tile_candidates(self,
                kernel: MLIRTemplateKernel,
@@ -83,8 +84,8 @@ def get_tile_candidates(self,
                epilogue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
         # Extract input arguments info
-        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
-        return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)
+        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
+        return self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)
 
     def outer_func_render(self, kernel_name, input_args):
         X, W = self.input_nodes[0], self.input_nodes[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
index e91014fa..8b8288a8 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_mt_template.py
@@ -131,12 +131,12 @@ def render(self,
                tile_info = None,
                **kwargs):
         # Extract input arguments info
-        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
+        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
 
         # Select tile size adn template
         conv_template = CONV_TEMPLATE
         if tile_info is None:
-            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0]
+            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0]
         else:
             TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
         SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N
@@ -170,7 +170,7 @@ def render(self,
         Y_tile_desc.set_name("output_buffer")
         Y_dim = [Symbol("tile_m"), Symbol("tile_n"), Symbol("o_h"), Symbol("o_w")]
         Y_idx = [Y_dim[0]*O_C*O_H*O_W, Y_dim[1]*O_H*O_W, Y_dim[2]*O_W, Y_dim[3]]
-        
+
         # Extract Bias info
         Bias_idx = [Number(0), Symbol("tile_n"), Number(0), Number(0)]
         Bias_tile_desc = mlir_common.MLIRMultiDimTile(Y_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
@@ -239,8 +239,8 @@ def render(self,
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         return code
 
-    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W): 
-        tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes):
+        tile_candidates = kernel.conv_multi_tile_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes)
         for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1]
             TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
index db2c64db..92efff66 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sb_template.py
@@ -132,12 +132,12 @@ def render(self,
                tile_info = None,
                **kwargs):
         # Extract input arguments info
-        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
+        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
 
         # Select tile size adn template
         conv_template = CONV_TEMPLATE
         if tile_info is None:
-            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0]
+            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0]
         else:
             TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
         SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N
@@ -238,8 +238,8 @@ def render(self,
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         return code
 
-    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
-        tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes):
+        tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, 1, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) # TODO: implement K_W
         for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
             TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
index 95db53c3..dfd418d9 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_sbs_template.py
@@ -132,12 +132,12 @@ def render(self,
                tile_info = None,
                **kwargs):
         # Extract input arguments info
-        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
+        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
 
         # Select tile size adn template
         conv_template = CONV_TEMPLATE
         if tile_info is None:
-            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0]
+            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0]
         else:
             TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
         SUB_TILE_N = TILE_N if TILE_N > 512 else SUB_TILE_N
@@ -239,8 +239,8 @@ def render(self,
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         return code
 
-    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
-        tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node) # TODO: implement K_W
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes):
+        tile_candidates = kernel.conv_single_batch_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes) # TODO: implement K_W
         for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
             TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_template.py b/PyTorchSimFrontend/mlir/mlir_conv_template.py
index 3666b3c9..178ba7c6 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_template.py
@@ -136,12 +136,12 @@ def render(self,
                tile_info = None,
                **kwargs):
         # Extract input arguments info
-        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
+        X, W, Y, Bias, n_extra_node, BATCH, I_C, I_H, I_W, O_C, K_H, K_W, O_H, O_W, PADDING_H, PADDING_W, STRIDE_H, STRIDE_W, precision_bytes = self.extract_info(kernel, template_buffer_node, epilogue_nodes)
 
         # Select tile size adn template
         conv_template = CONV_TEMPLATE
         if tile_info is None:
-            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W)[0]
+            TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes)[0]
         else:
             TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K, TILE_I_H, TILE_I_W, SUB_TILE_I_H, SUB_TILE_I_W, SUB_TILE_K_H, SUB_TILE_K_W, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
         TOG_latency = BATCH if TILE_M > BATCH else TILE_M
@@ -243,8 +243,8 @@ def render(self,
         kernel.add_loop_info([kernel.render_options["K_H"], kernel.render_options["K_W"], kernel.render_options["O_H"], kernel.render_options["O_W"], kernel.render_options["BATCH"], kernel.render_options["O_C"], kernel.render_options["I_C"]], [kernel.render_options["TILE_M"], kernel.render_options["TILE_N"], kernel.render_options["TILE_K"]])
         return code
 
-    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W):
-        tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node)
+    def select_tile(self, kernel, n_extra_node, BATCH, I_C, O_C, K_H, K_W, O_H, O_W, precision_bytes):
+        tile_candidates = kernel.conv_combination_mapping(BATCH, O_C, I_C, K_H, K_W, O_H, O_W, self.stride, self.dilation, n_extra_node, precision_bytes=precision_bytes)
         for idx, (TILE_K_H, TILE_K_W, TILE_O_H, TILE_O_W, TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             TILE_I_H = 1 + (TILE_O_H - 1) * self.stride[0] + (TILE_K_H - 1) * self.dilation[0]
             TILE_I_W = 1 + (TILE_O_W - 1) * self.stride[1] + (TILE_K_W - 1) * self.dilation[1]
diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index eb391dba..9c61c3d9 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -117,8 +117,9 @@ def render(self,
                tile_info = None,
                **kwargs):
         X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
+        precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype())
         if tile_info is None:
-            TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)[0]
+            TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node, precision_bytes)[0]
         else:
             TILE_M, TILE_N, TILE_K, SUB_TILE_M, SUB_TILE_N, SUB_TILE_K = tile_info
 
@@ -274,7 +275,8 @@ def get_tile_candidates(self,
                prologue_nodes: Optional[List[IRNode]] = None,
                **kwargs):
         X, W, Y, M, N, K, n_epilogue_node, n_prologue_node, n_extra_read = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
-        return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node)
+        precision_bytes = mlir_common.get_dtype_nbytes(X.get_dtype())
+        return self.select_tile(kernel, M, N, K, n_epilogue_node, n_extra_read, n_prologue_node, precision_bytes)
 
     def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
         if template_buffer_node is not None:
@@ -307,7 +309,7 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
         M, N, K = X_tensor.size()[0], W_tensor.size()[1], X_tensor.size()[1]
         return X,W,Y,M,N,K,n_epilogue_node,n_prologue_node,len(n_extra_read)
 
-    def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node):
+    def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_node, precision_bytes):
         data = {}
         gemm_shape = f"{M}_{N}_{K}"
         if "external" in extension_config.codegen_mapping_strategy:
@@ -327,7 +329,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no
         else:
             # case 2: use heuristic mapping
             min_tile = (n_extra_node + n_prologue_node) == 0
-            tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True)
+            tile_candidates = kernel.gemm_combination_mapping(M, N, K, max(n_extra_read-2, 0), n_prologue_node, min_tile=True, precision_bytes=precision_bytes)
 
         # Edge case
         if (M == 0) or (N == 0) or (K == 0):
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 9cc79e0a..81b3d606 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -150,10 +150,10 @@ def add_loop_info(self, mat_size, tile_size):
         for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)):
             self.loop_info[f"index{idx}"] = [0, loop_size, stride]
 
-    def gemmini_gemm_mapping(self, M, N, K):
+    def gemmini_gemm_mapping(self, M, N, K, precision_bytes=4):
         spad_size = self.spad_info["spad_size"] * self.vector_lane
         num_cores = self.num_cores
-        precision = self.precision
+        precision = precision_bytes
         dim_I, dim_J, dim_K = M, N, K
         dim = self.vector_lane
 
@@ -205,7 +205,7 @@ def gemmini_gemm_mapping(self, M, N, K):
 
         return inner_I, inner_J, inner_K
 
-    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False):
+    def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, pad_k=True, min_tile=False, is_conv=False, precision_bytes=4):
         tile_candidates = []
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
@@ -233,11 +233,11 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
                 tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
                 for j in tile_N_range:
                     tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
-                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
+                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * precision_bytes
                     weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
                     input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K)
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
-                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                     if check_spad_size:
                         dir_path = f"{extension_config.CONFIG_TORCHSIM_DIR}/validation/gemm_candidates"
@@ -259,11 +259,11 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
                 tile_M = i * self.vector_lane if M > self.vector_lane else M_padded
                 for j in tile_N_range:
                     tile_N = j * self.vector_lane if N > self.vector_lane else N_padded
-                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * self.precision
+                    used_spad_size = (tile_M * tile_K * (1 + n_prologue_node) + tile_K * tile_N + tile_M * tile_N * (1 + n_extra_node)) * precision_bytes
                     weight_size_per_lane = self.get_spad_size_per_lane(tile_K, tile_N)
                     input_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_prologue_node), tile_K)
                     output_size_per_lane = self.get_spad_size_per_lane(tile_M * (1 + n_extra_node), tile_N)
-                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes
                     n_tile = math.ceil(M / max(tile_M, 128)) * math.ceil(N / max(tile_N, 128))
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                     if check_spad_size and max_used_spad_size < used_spad_size and maximize_i_j <= tile_M * tile_N and n_tile >= minimum_n_tile and max(tile_N, 128) // max(tile_M, 128) < 10:
@@ -277,7 +277,7 @@ def gemm_combination_mapping(self, M, N, K, n_extra_node=0, n_prologue_node=0, p
         tile_candidates = [v for _, v in tile_candidates]
         return tile_candidates
 
-    def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
+    def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, precision_bytes=4):
         tile_candidates = []
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
@@ -285,7 +285,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         max_spad_per_lane = spad_size_per_lane // 2 # double buffer
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0]
+        M, N, K = self.gemm_combination_mapping(M, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0]
         max_k_h_w = 1 # maximize kernel size
         max_o_h_w = 1 # maximize output size
         K = min(K, self.vector_lane)
@@ -298,11 +298,11 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
                         weight_size = k_w * k_h * K * N
                         input_size = i_w * i_h * M * K
                         output_size = o_w * o_h * M * N
-                        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+                        used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes
                         weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
                         input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
                         output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
-                        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                        used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes
                         check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                         if check_spad_size:
                             tile_candidates.append((used_spad_size, (k_h, k_w, o_h, o_w, M, N, K)))
@@ -318,7 +318,7 @@ def conv_combination_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation
         tile_candidates = [v for _, v in tile_candidates]
         return tile_candidates
 
-    def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
+    def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, precision_bytes=4):
         tile_candidates = []
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
@@ -326,7 +326,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation,
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0]
+        M, N, K = self.gemm_combination_mapping(M, N, K * K_W, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0]
         max_k_h_w = K_W
         for o_h in sympy.divisors(O_H):
             for o_w in sympy.divisors(O_W):
@@ -336,11 +336,11 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation,
                     weight_size = 1 * k_h * K * N
                     input_size = i_w * i_h * M * K
                     output_size = o_w * o_h * M * N
-                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes
                     weight_size_per_lane = self.get_spad_size_per_lane(1 * k_h * K, N)
                     input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * M, K)
                     output_size_per_lane = self.get_spad_size_per_lane(o_w * o_h * M  * (1 + n_extra_node), N)
-                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                     if check_spad_size:
                         tile_candidates.append((used_spad_size, (k_h, K_W, o_h, o_w, M, N, K)))
@@ -354,7 +354,7 @@ def conv_multi_tile_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation,
         tile_candidates = [v for _, v in tile_candidates]
         return tile_candidates
 
-    def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0):
+    def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilation, n_extra_node=0, precision_bytes=4):
         tile_candidates = []
         spad_size_per_lane = self.spad_info["spad_size"]
         spad_size = spad_size_per_lane * self.vector_lane
@@ -362,7 +362,7 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
         max_spad_per_lane = spad_size_per_lane // 2
 
         max_used_spad_size = 0
-        M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True)[0]
+        M, N, K = self.gemm_combination_mapping(O_W, N, K, n_extra_node=n_extra_node, pad_k=False, is_conv=True, precision_bytes=precision_bytes)[0]
         max_k_h_w = 1
         for o_h in sympy.divisors(O_H):
             for k_h in sympy.divisors(K_H):
@@ -372,11 +372,11 @@ def conv_single_batch_mapping(self, M, N, K, K_H, K_W, O_H, O_W, stride, dilatio
                     weight_size = k_w * k_h * K * N
                     input_size = i_w * i_h * k_w * K
                     output_size = M * o_h * N
-                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * self.precision
+                    used_spad_size = (weight_size + input_size + output_size * (1 + n_extra_node)) * precision_bytes
                     weight_size_per_lane = self.get_spad_size_per_lane(k_w * k_h * K, N)
                     input_size_per_lane = self.get_spad_size_per_lane(i_w * i_h * k_w, K)
                     output_size_per_lane = self.get_spad_size_per_lane(M * o_h  * (1 + n_extra_node), N)
-                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * self.precision
+                    used_spad_size_per_lane = (weight_size_per_lane + input_size_per_lane + output_size_per_lane) * precision_bytes
                     check_spad_size = (used_spad_size < max_spad_size and used_spad_size_per_lane < max_spad_per_lane)
                     if check_spad_size:
                         tile_candidates.append((used_spad_size, (k_h, k_w, o_h, M, M, N, K)))
diff --git a/README.md b/README.md
index 4a3ef145..f55995c9 100644
--- a/README.md
+++ b/README.md
@@ -396,7 +396,6 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
   "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core
   "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path
 
-  "precision" : 4,                   // Element's precision in tensor (Byte)
   "scheduler" : "simple",            // Scheduler type (Now, only support simple scheduler)
   "num_partition" : 2,               // Multi-core Partitioning
   "partition": {                     // allocate request queue index

From fadba78ef71f69992b321c9318a23a1377506121 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Mar 2026 14:42:37 +0900
Subject: [PATCH 126/194] [Fix] malloc size align + fix origin info

---
 AsmParser/tog_generator.py                    |  4 ++--
 PyTorchSimFrontend/extension_codecache.py     | 21 +++++++++++++++++++
 PyTorchSimFrontend/mlir/mlir_autotune.py      |  2 +-
 .../mlir/mlir_codegen_backend.py              |  3 ++-
 PyTorchSimFrontend/mlir/mlir_scheduling.py    |  6 ++++--
 5 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index 5f586d99..a12460e3 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -37,7 +37,7 @@ class tog_generator:
     StonneTraceCompute= 6
     StonneTraceLoad = 7
     StonneTraceStore = 8
-    def __init__(self, origins="Unknown") -> None:
+    def __init__(self, origins={"Unknown"}) -> None:
         self.module_name = "tile_operation_graph"
         self.module = None
         self.raw_graph = {}
@@ -226,7 +226,7 @@ def generate_tile_graph(self, name="tile_graph", cycle_list=list, x_offset=int,
                         offset = w_offset if is_preload else x_offset
                         iter_node.torchsim_overlapping_cycle = max(iter_node.torchsim_cycle - offset, 0)
 
-        origin_info = "_".join(map(str, self.origins))
+        origin_info = self.origins if isinstance(self.origins, str) else "_".join(map(str, self.origins))
         onnx_node_list = [node.to_onnx() for node in node_list] # Exclude root node
         dump_onnx_graph(name, onnx_node_list, vector_lane, origin_info, stonneGraph=stonneGraph)
 
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 8454dee6..b1c457d3 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -72,6 +72,14 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
                 {'--print-after-all' if extension_config.CONFIG_TORCHSIM_DUMP_LLVM_IR else ''} \
                 -O2 {filename}.ll -o {filename}.o
         """,
+    ).strip(),
+            re.sub(r"[ \n]+", " ",
+        f"""
+            {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/llc \
+                -relocation-model=pic -march=riscv64 -O3 --stack-size-section \
+                -mattr=+m,+f,+d,+a,+c,+v,+zvfh,+xsfvcp,zvl{vlen}b \
+                -O2 {filename}.ll -o {filename}.s
+        """,
     ).strip()]
 
 def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_size, vlen=256):
@@ -168,11 +176,13 @@ def load(cls, source_code,
             opt_cmd = shlex.split(cmds[0])
             translate_cmd = shlex.split(cmds[1])
             llc_cmd = shlex.split(cmds[2])
+            llc_asm_cmd = shlex.split(cmds[3])
             with lock:
                 try:
                     subprocess.check_call(opt_cmd)
                     subprocess.check_call(translate_cmd)
                     subprocess.check_call(llc_cmd)
+                    subprocess.check_call(llc_asm_cmd)
                 except subprocess.CalledProcessError as e:
                     logger.error(f"Command failed with exit code {e.returncode}")
                     logger.error(f"Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")
@@ -182,6 +192,17 @@ def load(cls, source_code,
                 val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name)
                 val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name,
                                                    validation_binary_name, new_link_option)
+
+                stack_size = val_llvm_caller.parse_stack_sizes(f"{write_path}/{key}.s", vlenb=vlenb)
+                spad_size =  val_llvm_caller.get_spad_size(validation_binary_path)
+                spad_usage = stack_size + spad_size # Spad usage per lane
+                if extension_config.CONFIG_SPAD_INFO["spad_size"] < spad_usage:
+                    logger.debug(
+                        f"Scratchpad size exceeded: required {spad_usage} bytes, "
+                        f"but only {extension_config.CONFIG_SPAD_INFO['spad_size']} bytes available."
+                    )
+                    raise SpadOverflowError()
+
         # Skip if TOG file already exists
         if os.path.isfile(tog_path):
             return key
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index 4503584c..caf4d6da 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -85,7 +85,7 @@ def cached_run_fn(*args, **kwargs):
             self.source_code, vectorlane_size=self.extra_args["vector_lane"],
             loop_size=None, spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
-            origins="Unknown", silent_mode=True,
+            origins=self.extra_args["origins"], silent_mode=True,
             autotune=self.extra_args['autotune'])
 
         args = [
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 43cb65a4..24d6636a 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -285,7 +285,7 @@ def __init__(self, kernel_group, reason=None):
         self.gem5_header = IndentedBuffer()
         self.header.writeline("#include <unistd.h>")
         self.header.writeline("#include <stdlib.h>")
-        self.header.writeline("void* __wrap_malloc(size_t size) { return sbrk(size); }")
+        self.header.writeline("void* __wrap_malloc(size_t size) { size = (size + 511UL) & ~511UL; return sbrk(size); }") # Align to 512 bytes
         self.header.writeline("void __wrap_free(void *ptr) { return; }")
         self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc")
         self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad")
@@ -1060,6 +1060,7 @@ def run_bench(self, nodes, kernel_name, src_code):
                 "vlen" : self.vlen,
                 "arg_attributes" : arg_attributes,
                 "autotune" : True,
+                "origins" : {str(i) for node in nodes for i in node.node.origins},
             },
             source_code=src_code,
         )
diff --git a/PyTorchSimFrontend/mlir/mlir_scheduling.py b/PyTorchSimFrontend/mlir/mlir_scheduling.py
index 2f9c9704..22d1011b 100644
--- a/PyTorchSimFrontend/mlir/mlir_scheduling.py
+++ b/PyTorchSimFrontend/mlir/mlir_scheduling.py
@@ -276,7 +276,7 @@ def codegen_node(self, _node):
         MLIRScheduling.count += 1
         src_code, meta_code = ex_kernel.codegen_nodes(nodes, kernel_name_candidate)
         kernel_name = self.define_kernel(src_code, meta_code, kernel_name_candidate, ex_kernel.vector_lane,
-                           ex_kernel.spad_info, origins= {str(i) for i in nodes[0].node.origins})
+                           ex_kernel.spad_info, origins={str(i) for node in nodes for i in node.node.origins})
         ex_kernel.call_kernel(kernel_name)
         _, args, _, _ = ex_kernel.args.mlir_argdefs()
         args = ", ".join(args)
@@ -332,8 +332,10 @@ def codegen_template(self, template_node, epilogue_nodes, prologue_nodes):
         src_code, meta_code = kernel.codegen_nodes(tile_candidates, render, template_node, prologue_nodes, epilogue_nodes)
 
         with kernel:
+            all_nodes = [template_node] + (epilogue_nodes or []) + (prologue_nodes or [])
+            origins = {str(i) for n in all_nodes for i in n.node.origins}
             kernel_name = self.define_kernel(src_code, meta_code, kernel.kernel_name, kernel.vector_lane, kernel.spad_info,
-                                             kernel.loop_size, origins={str(i) for i in template_node.node.origins})
+                                             kernel.loop_size, origins=origins)
             self.define_function(kernel)
 
         kernel.call_kernel(kernel_name)

From 0189ab978fbe3ce02e72bb77f66c2bd10342babe Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Mar 2026 15:06:28 +0900
Subject: [PATCH 127/194] [TOGSim] Fix local/remote memory stat

---
 TOGSim/src/Simulator.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index b5b9c778..d7fe9f1b 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -121,7 +121,7 @@ void Simulator::icnt_cycle() {
         front->set_core_id(core_id);
         if (!_icnt->is_full(port_id, front)) {
           int node_id = _dram->get_channel_id(front) / _config.dram_channels_per_partitions;
-          if (core_id == node_id)
+          if (get_partition_id(core_id) == node_id)
             _cores[core_id]->inc_numa_local_access();
           else
             _cores[core_id]->inc_numa_remote_access();

From 5268be2df8352f3470bee4e60739b9467fa07ca8 Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Thu, 12 Mar 2026 19:30:04 +0900
Subject: [PATCH 128/194] [Frontend/template] add SPDA decode GQA template
 imlementation

---
 .../mlir/mlir_codegen_backend.py              |   7 +-
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  37 +-
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 888 +++++++++++++++++-
 PyTorchSimFrontend/mlir/mlir_template.py      |   4 +-
 tests/test_sdpa.py                            |  57 +-
 5 files changed, 973 insertions(+), 20 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 24d6636a..38125e31 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -470,7 +470,12 @@ def parse_index_list(self, expr_list:list, offset=sympy.Number(0)) -> common.CSE
                 new_expr_list[idx] = arg.subs(arg.args[1], dim_list[idx])
                 indices.append(str(new_arg))
             elif not arg.is_number:
-                new_arg = sympy.Symbol(str(self.convert_index(arg)))
+                try:
+                    new_arg = sympy.Symbol(str(self.convert_index(arg)))
+                #not implemented case
+                except NotImplementedError:
+                    print(f"Not implemented case: {arg}")
+                    raise NotImplementedError(f"Not implemented case: {arg}")
                 new_expr_list[idx] = new_arg.subs(new_arg, dim_list[idx])
                 indices.append(str(new_arg))
             else:
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 9d49f212..ac7eb853 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -16,9 +16,15 @@
 from PyTorchSimFrontend.mlir.mlir_conv_sb_template import MLIRConvSingleBatchTemplate
 from PyTorchSimFrontend.mlir.mlir_conv_sbs_template import MLIRConvSingleBatchStridedTemplate
 from PyTorchSimFrontend.mlir.mlir_maxpool_template import MLIRMaxPoolTemplate
-from PyTorchSimFrontend.mlir.mlir_sdpa_template import MLIRFlashSDPATemplate, flash_sdpa_args, calculate_scale
 from PyTorchSimFrontend.mlir.mlir_cat_template import MLIRCatTemplate
 from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate
+from PyTorchSimFrontend.mlir.mlir_sdpa_template import (
+    MLIRFlashSDPATemplate,
+    MLIRDecodeGQASDPAPartialTemplate,
+    MLIRDecodeGQASDPAReduceTemplate,
+    flash_sdpa_args,
+    calculate_scale,
+)
 from PyTorchSimFrontend import extension_config
 
 aten = torch.ops.aten
@@ -58,6 +64,35 @@ def tuned_flash_sdpa(
     scale = calculate_scale(query, scale)
     N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value)
     
+    # Decode-only GQA fast path: q is (B,Hq,1,Dh), B==1, Hq!=H, Hq%H==0.
+    # Always use the 2-kernel decode path:
+    # 1) block partials over (kv head, sequence block)
+    # 2) reduce/merge across blocks
+    # This keeps KV shared across qsub, avoids dh0-outer duplication, and
+    # stores compact partials instead of full score/prob tensors in DRAM.
+    if L == 1 and Hq != H and N == 1 and (Hq % H) == 0:
+        g = Hq // H
+        vector_lane = extension_config.vpu_num_lanes
+        tile_e = vector_lane
+        dh_tiles = E // tile_e
+        decode_gqa_block_size = 512
+        BlkS = decode_gqa_block_size if S >= decode_gqa_block_size else int(S)
+        # Padding-based tail handling: allow S not divisible by BlkS.
+        nblk = (S + BlkS - 1) // BlkS
+        HgDhTiles = H * g * dh_tiles
+        tile_pack = tile_e * 2
+
+        partial_layout = ir.FixedLayout(
+            query.get_device(),
+            torch.float32,
+            [HgDhTiles, nblk, tile_pack],
+        )
+        partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS)
+        partial = partial_tmpl.generate().output_node()
+        reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS)
+        out_node = reduce_tmpl.generate().output_node()
+        return (out_node, None, None, None, None, None, None, None, None)
+
     mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale)
 
     # _scaled_dot_product_flash_attention has to return a tuple which has 9 values
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index 05030f27..1cd810e8 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -48,23 +48,28 @@ def flash_sdpa_args(
     s = V.graph.sizevars.guard_equals(sk, sv)
     e = V.graph.sizevars.guard_equals(eq, ek)
 
-    # While there are no theoretical requirements for e == ev, 
-    # this implementation enforces e == ev for simplicity. 
-    # Distinct notations are still maintained to ensure future compatibility and clarity.
+    # While there are no theoretical requirements for e == ev,
+    # this implementation currently enforces e == ev for simplicity.
     if e != ev:
-        raise NotImplementedError("Flash SDPA does not support mismatched head dimensions between query and value.")
-   
-    # Flash attention does not split tiles along the head dimension (e or ev).
-    # Therefore, the head dimension size must be less than or equal to the number of vlanes.
-    vector_lane = extension_config.vpu_num_lanes 
-    if e > vector_lane or ev > vector_lane:
-        raise ValueError(f"The head dimension size must be less than or equal to the number of vlanes (e: {e}, ev: {ev}, vlanes: {vector_lane}).")
+        raise NotImplementedError(
+            "Flash SDPA currently requires matching head dimensions between query and value (e == ev)."
+        )
+
+    # Support head dimensions larger than vector lanes by tiling e/ev.
+    # For now, require multiples of vector lanes (covers 64/128 with vlanes=16).
+    vector_lane = extension_config.vpu_num_lanes
+    if (e % vector_lane) != 0:
+        raise NotImplementedError(
+            f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})."
+        )
     
-    # The aten._scaled_dot_product_flash_attention kernel does not accept an explicit enable_gqa parameter.
-    # Instead, the Flash SDPA implementation infers GQA usage by checking if hq != hk.
-    # The Flash SDPA for GQA will be implemented after implementing its native version.
-    if hq != h :
-        raise NotImplementedError("Flash SDPA for GQA is not supported yet.")
+    # Minimal GQA support (single-batch only for now).
+    # We map each query head to a KV head by grouping: hq = g * h.
+    if hq != h:
+        if n != 1:
+            raise NotImplementedError("Flash SDPA GQA is currently supported only for n == 1.")
+        if (hq % h) != 0:
+            raise NotImplementedError(f"Flash SDPA GQA requires hq % h == 0 (hq: {hq}, h: {h}).")
     
     layout = FixedLayout(
         query.get_device(),
@@ -479,3 +484,856 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
             tile_candidates[idx] = tile_l,tile_s,tile_e,subtile_l,subtile_s,subtile_e
 
         return tile_candidates
+
+
+# ---------------------------
+# Decode-only GQA SDPA (Lq == 1)
+# ---------------------------
+
+DECODE_GQA_SDPA_TEMPLATE = r"""
+// Decode GQA SDPA kernel (Lq == 1)
+// B = {{ B }}
+// Hq = {{ Hq }}
+// H = {{ H }}
+// g = {{ g }}
+// S = {{ S }}
+// Dh = {{ Dh }}
+// BlkS = {{ BlkS }}
+// tile_s = {{ tile_s }}
+// tile_e = {{ tile_e }}
+// dh_tiles = {{ dh_tiles }}
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} {
+  // IO buffers follow input dtype (fp16/bf16/f32)
+  {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
+  // Softmax output used for SV matmul (io dtype)
+  {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }}
+  // Accumulator in fp32 (stable)
+  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
+  // Temp output in io dtype for SV matmul result
+  {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }}
+  // Softmax running stats in fp32
+  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
+
+  %c0 = arith.constant 0.0 : {{ acc_stype }}
+  %c1 = arith.constant 1.0 : {{ acc_stype }}
+  %c_scale = arith.constant {{ scale }} : {{ acc_stype }}
+  %c_neg_inf = arith.constant -1.0e+30 : {{ acc_stype }}
+
+  %v0_e_acc = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ acc_stype }}>
+  %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}>
+  %v0_2x = arith.constant dense<0.0> : vector<2x{{ acc_stype }}>
+  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ acc_stype }}>
+  %v0_s_acc = arith.constant dense<0.0> : vector<{{ tile_s }}x{{ acc_stype }}>
+
+  %v_scale = vector.broadcast %c_scale : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
+
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  // kv_head parallelism is the natural unit for GQA reuse
+  affine.for %kv = 0 to {{ H }} {
+    // Process S in blocks (BlkS). Sequential inside a core.
+    affine.for %blk = 0 to {{ S }} step {{ BlkS }} {
+      // Initialize per-qsub accumulators for this (kv, blk)
+      affine.for %qsub = 0 to {{ g }} {
+        affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+        affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          affine.vector_store %v0_e_acc, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+        }
+      }
+
+      affine.for %s0 = %blk to (%blk + {{ BlkS }}) step {{ tile_s }} {
+        // Accumulate score per qsub so K tiles can be shared across qsub.
+        affine.for %qsub = 0 to {{ g }} {
+          affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
+        }
+
+        affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
+          // Load K slice once for all qsub.
+          {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }}
+          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
+
+          affine.for %qsub = 0 to {{ g }} {
+            {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }}
+            %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
+
+            // mul = k @ q  -> (tile_s x 1) in io dtype, then upcast and accumulate.
+            linalg.matmul
+              { idx_map = array<i32: 1, 0, -1> }
+              ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
+              outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
+
+            %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>
+            %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
+            %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}x{{ acc_stype }}>
+            affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+
+        affine.for %qsub = 0 to {{ g }} {
+          %score_acc = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
+          // scale after full Dh reduction
+          %scaled_mul_vec = arith.mulf %score_acc, %v_scale : vector<{{ tile_s }}x{{ acc_stype }}>
+
+            // Online softmax update (max/sum/out) identical to FLASH_SDPA_TEMPLATE but specialized to Lq==1.
+            %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+            // Reduce max over tile_s
+            %max_init = vector.broadcast %c_neg_inf : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
+            %local_max_vec = arith.maximumf %scaled_mul_vec, %max_init : vector<{{ tile_s }}x{{ acc_stype }}>
+            %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}>
+            %max_red1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}>
+            %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}>
+            %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2x{{ acc_stype }}>
+            %new_max = arith.maximumf %max_red2, %old_max : vector<2x{{ acc_stype }}>
+            affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+
+            // rescale = exp(old_max - new_max)
+            %max_diff = arith.subf %old_max, %new_max : vector<2x{{ acc_stype }}>
+            %max_diff_scalar = vector.extract %max_diff[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
+            %rescale_e = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}>
+            %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}x{{ acc_stype }}>
+            %rescale_2 = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<2x{{ acc_stype }}>
+            %exp_rescale_2 = math.exp %rescale_2 : vector<2x{{ acc_stype }}>
+
+            // out *= rescale
+            %old_out = affine.vector_load %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+            %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ acc_stype }}>
+            affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+
+            // sum *= rescale
+            %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+            %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ acc_stype }}>
+
+            // exp(score - new_max)
+            %new_max_scalar = vector.extract %new_max[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
+            %new_max_bcast = vector.broadcast %new_max_scalar : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
+            %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}>
+            %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}>
+            // For SV matmul: downcast softmax output to io dtype (common in practice)
+            %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>
+            affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+
+            // sum += reduce(exp_scores)
+            %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}>
+            %zero_2x = vector.broadcast %c0 : {{ acc_stype }} to vector<2x{{ acc_stype }}>
+            %sum_red1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}>
+            %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}>
+            %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2x{{ acc_stype }}>
+            %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2x{{ acc_stype }}>
+            affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+
+        } { accumulation_loop=true }
+
+        // 2) SV accumulation: for each output dh tile, load V once and share across qsub.
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
+          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }}
+          %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
+
+          affine.for %qsub = 0 to {{ g }} {
+            %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+            %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
+            linalg.matmul
+              { idx_map = array<i32: 2, 1, -1> }
+              ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }})
+              outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
+
+            %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+            %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>
+            %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+            %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}x{{ acc_stype }}>
+            affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+      } { accumulation_loop=true }
+
+      // finalize per-qsub for this (kv, blk) and store out for all dh tiles
+      affine.for %qsub = 0 to {{ g }} {
+        %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
+        %one_2x = vector.broadcast %c1 : {{ acc_stype }} to vector<2x{{ acc_stype }}>
+        %inv_sum_2x = arith.divf %one_2x, %final_sum : vector<2x{{ acc_stype }}>
+        %inv_sum = vector.extract %inv_sum_2x[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
+        %inv_bcast = vector.broadcast %inv_sum : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}>
+
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
+          %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
+          %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}>
+          %final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>
+          affine.vector_store %final_out_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+          {{ kernel.store_output(indent_size=10) }}
+        }
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+
+  return
+}
+"""
+
+
+class MLIRDecodeGQASDPATemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.scale = scale
+        self.BlkS = BlkS
+
+    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
+        # Decode-only: q is (B,Hq,1,Dh)
+        query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], self.output_node
+
+        # Materialize tensors for stride metadata
+        q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
+        k_tensor4 = empty_strided(key.layout.size, key.layout.stride)
+        v_tensor4 = empty_strided(value.layout.size, value.layout.stride)
+
+        B, Hq, Lq, Dh = q_tensor4.shape
+        Bk, H, S, Dhk = k_tensor4.shape
+        assert B == 1, "Decode GQA template currently supports B==1"
+        assert Lq == 1, "Decode GQA template requires Lq==1"
+        assert Dh == Dhk
+        g = Hq // H
+        BlkS = min(int(self.BlkS), int(S))
+
+        # Use 3D views to match the existing SDPA indexing scheme
+        # q: (Hq, 1, Dh), k/v: (H, S, Dh), out: (Hq, 1, Dh)
+        q_tensor = q_tensor4.view(Hq, 1, Dh)
+        k_tensor = k_tensor4.view(H, S, Dh)
+        v_tensor = v_tensor4.view(H, S, Dh)
+
+        tile_s = kernel.vector_lane
+        tile_e = kernel.vector_lane
+        dh_tiles = int(Dh) // int(tile_e)
+
+        io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()]
+        acc_stype = "f32"
+
+        # SRAM tiles: q(1x1xtile_e), k/v(1xtile_sxtile_e), mul(tile_sx1) in io dtype.
+        # out_acc in f32; out_io temp in io dtype.
+        vlane_stride = 1
+        q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        q_tile_desc.set_name("q_buffer")
+        q_tile_desc.offset = query.get_layout().offset
+
+        k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride)
+        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s])
+        k_tile_desc.set_name("k_buffer")
+        k_tile_desc.offset = key.get_layout().offset
+
+        v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride)
+        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1])
+        v_tile_desc.set_name("v_buffer")
+        v_tile_desc.offset = value.get_layout().offset
+
+        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride)
+        mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1])
+        mul_tile_desc.set_name("mul_buffer")
+
+        score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        score_desc.set_name("score_buffer")
+
+        prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        prob_desc.set_name("prob_buffer")
+
+        # Per-qsub accumulators so KV tiles can be shared across qsub
+        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
+        out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
+        out_acc_tile_desc.set_name("out_acc_buffer")
+
+        out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        out_io_tile_desc.set_name("out_io_buffer")
+
+        max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        max_desc.set_tile_size_stride([g, 2], [2, 1])
+        max_desc.set_name("max_buffer")
+
+        sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        sum_desc.set_tile_size_stride([g, 2], [2, 1])
+        sum_desc.set_name("sum_buffer")
+
+        # Indices
+        kv = sympy.Symbol("kv")
+        qsub = sympy.Symbol("qsub")
+        dh0 = sympy.Symbol("dh0")
+        k0 = sympy.Symbol("k0")
+        s0 = sympy.Symbol("s0")
+        q_head = kv * g + qsub
+
+        q_stride = q_tensor.stride()
+        k_stride = k_tensor.stride()
+        v_stride = v_tensor.stride()
+        # out is (B,Hq,1,Dh) but we address it as (Hq,1,Dh)
+        out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh)
+        out_stride = out_tensor.stride()
+
+        # QK indices use k0 reduction over Dh
+        qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]]
+        kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]]
+        # V and output use dh0 tile offset
+        v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]]
+        out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]]
+
+        kernel.loop_size = [tile_s, tile_e, 1]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            B=B,
+            Hq=Hq,
+            H=H,
+            g=g,
+            S=S,
+            Dh=Dh,
+            dh_tiles=dh_tiles,
+            BlkS=BlkS,
+            tile_s=tile_s,
+            tile_e=tile_e,
+            io_stype=io_stype,
+            acc_stype=acc_stype,
+            scale=self.scale,
+            query=query,
+            key=key,
+            value=value,
+            out=out,
+            q_tile_desc=q_tile_desc,
+            k_tile_desc=k_tile_desc,
+            v_tile_desc=v_tile_desc,
+            out_acc_tile_desc=out_acc_tile_desc,
+            out_io_tile_desc=out_io_tile_desc,
+            mul_tile_desc=mul_tile_desc,
+            score_desc=score_desc,
+            prob_desc=prob_desc,
+            max_desc=max_desc,
+            sum_desc=sum_desc,
+            qk_idx=qk_idx,
+            kk_idx=kk_idx,
+            v_idx=v_idx,
+            out_idx=out_idx,
+            input_reorder=self.input_reorder,
+        )
+
+        kernel.epilogue_info = dict(
+            output_node=self.output_node.name,
+            sram_var="out_io_buffer",
+            dram_var="out",
+            dram_idx=out_idx,
+            dram_tile_desc=out_io_tile_desc,
+            nr_rdim=0,
+            r_dim_size=0,
+            dim_aliasing={"kv": "kv", "qsub": "qsub", "dh0": "dh0", "s0": "s0"},
+        )
+
+        return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options)
+
+
+# ---------------------------
+# Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce)
+# ---------------------------
+
+DECODE_GQA_SDPA_PARTIAL_TEMPLATE = r"""
+// Decode GQA SDPA partial kernel (per sequence block)
+// Produces partials per (kv,qsub,dh_tile,blk):
+// - first half lanes: o_j (tile_e)
+// - second half lanes: [m_j, l_j, 0, 0, ...] (tile_e)
+// QK/softmax is computed once per (kv,qsub,s0) over full Dh using k0 reduction.
+// SV then reuses those probabilities across all dh tiles.
+// H = {{ H }}, g = {{ g }}, Dh = {{ Dh }}, dh_tiles = {{ dh_tiles }}, S = {{ S }}, BlkS = {{ BlkS }}, nblk = {{ nblk }}
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[partial], names_str="query, key, value, partial", input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
+
+  %c0 = arith.constant 0.0 : f32
+  %c_scale = arith.constant {{ scale }} : f32
+  %c_neg_inf = arith.constant -1.0e+30 : f32
+
+  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
+  %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}>
+  %v0_s = arith.constant dense<0.0> : vector<{{ tile_s }}xf32>
+  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
+  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
+  %v_scale = vector.broadcast %c_scale : f32 to vector<{{ tile_s }}xf32>
+
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %kv = 0 to {{ H }} {
+    affine.for %blk = 0 to {{ nblk }} step 1 {
+      // Reset per-block accumulators for all qsub/dh tiles.
+      affine.for %qsub = 0 to {{ g }} {
+        affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+        affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          affine.vector_store %v0_e, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+        }
+      }
+
+      affine.for %s0 = ({{ BlkS }} * %blk) to ({{ BlkS }} * (%blk + 1)) step {{ tile_s }} {
+        // Accumulate score per qsub so K tiles can be shared across qsub.
+        affine.for %qsub = 0 to {{ g }} {
+          affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
+        }
+
+        affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
+          {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }}
+          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
+
+          affine.for %qsub = 0 to {{ g }} {
+            {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }}
+            %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
+            linalg.matmul
+              { idx_map = array<i32: 1, 0, -1> }
+              ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
+              outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
+            %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>
+            %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
+            %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}xf32>
+            affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+
+        // Softmax once per qsub; persist probabilities in SRAM for all SV dh tiles.
+        affine.for %qsub = 0 to {{ g }} {
+          %score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
+          %scaled = arith.mulf %score, %v_scale : vector<{{ tile_s }}xf32>
+
+          %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+          %max_init = vector.broadcast %c_neg_inf : f32 to vector<{{ tile_s }}xf32>
+          %local_max_vec = arith.maximumf %scaled, %max_init : vector<{{ tile_s }}xf32>
+          %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32>
+          %max_red1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32>
+          %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2xf32>, vector<2xf32>
+          %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2xf32>
+          %new_max = arith.maximumf %max_red2, %old_max : vector<2xf32>
+          affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+
+          %max_diff = arith.subf %old_max, %new_max : vector<2xf32>
+          %max_diff_scalar = vector.extract %max_diff[0] : f32 from vector<2xf32>
+          %rescale_e = vector.broadcast %max_diff_scalar : f32 to vector<{{ tile_e }}xf32>
+          %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}xf32>
+          %rescale_2 = vector.broadcast %max_diff_scalar : f32 to vector<2xf32>
+          %exp_rescale_2 = math.exp %rescale_2 : vector<2xf32>
+
+          %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+          %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2xf32>
+
+          affine.for %dht = 0 to {{ dh_tiles }} {
+            %old_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+            %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}xf32>
+            affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+          }
+
+          %new_max_scalar = vector.extract %new_max[0] : f32 from vector<2xf32>
+          %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32>
+          %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32>
+          %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32>
+          %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>
+          affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+
+          %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32>
+          %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32>
+          %sum_red1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32>
+          %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2xf32>, vector<2xf32>
+          %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2xf32>
+          %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2xf32>
+          affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+        } { accumulation_loop=true }
+
+        // For each output dh tile, load V once and share it across qsub.
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
+          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }}
+          %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
+
+          affine.for %qsub = 0 to {{ g }} {
+            %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+            %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
+            linalg.matmul
+              { idx_map = array<i32: 2, 1, -1> }
+              ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }})
+              outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
+
+            %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+            %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32>
+            %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+            %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}xf32>
+            affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+          } { accumulation_loop=true }
+        } { accumulation_loop=true }
+      } { accumulation_loop=true }
+
+      // Store packed partials for all qsub/dh tiles.
+      affine.for %qsub = 0 to {{ g }} {
+        %final_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+        %m_scalar = vector.extract %final_max[0] : f32 from vector<2xf32>
+        %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+        %l_scalar = vector.extract %final_sum[0] : f32 from vector<2xf32>
+        %ml_vec = vector.broadcast %c0 : f32 to vector<{{ tile_e }}xf32>
+        %ml0 = vector.insert %m_scalar, %ml_vec[0] : f32 into vector<{{ tile_e }}xf32>
+        %ml1 = vector.insert %l_scalar, %ml0[1] : f32 into vector<{{ tile_e }}xf32>
+
+        affine.for %dht = 0 to {{ dh_tiles }} {
+          %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+          %packed = vector.concat %out_vec, %ml1 : vector<{{ tile_pack }}xf32>
+          affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
+          {{ kernel.store_output(indent_size=10) }}
+        }
+      } { outer_loop=true }
+    } { outer_loop=true }
+  } { outer_loop=true }
+  return
+}
+"""
+
+
+DECODE_GQA_SDPA_REDUCE_TEMPLATE = r"""
+// Decode GQA SDPA reduce kernel: merge partials across blocks
+// Input partial shape: (HgDhTiles, nblk, tile_pack)
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
+
+  %c0 = arith.constant 0.0 : f32
+  %c1 = arith.constant 1.0 : f32
+  %c_neg_inf = arith.constant -1.0e+30 : f32
+  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
+  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
+  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
+
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %gh = 0 to {{ HgDhTiles }} {
+    // reset merged accumulators
+    affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+    affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+
+    affine.for %blk = 0 to {{ nblk }} {
+      {{ kernel.def_dma_op("MVIN", "partial", partial_idx, partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8) }}
+      %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
+      %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32>
+      %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
+      %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
+      %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32>
+      %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32>
+
+      %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+      %m_old = vector.extract %old_max[0] : f32 from vector<2xf32>
+      %m_new = arith.maximumf %m_old, %m_j : f32
+      %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32>
+      affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+
+      %diff_old = arith.subf %m_old, %m_new : f32
+      %diff_j = arith.subf %m_j, %m_new : f32
+      %scale_old = math.exp %diff_old : f32
+      %scale_j = math.exp %diff_j : f32
+      %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32>
+      %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32>
+
+      %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+      %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32>
+      %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32>
+      %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32>
+      affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+
+      %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+      %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32>
+      %l_new = arith.addf (arith.mulf %l_old, %scale_old : f32), (arith.mulf %l_j, %scale_j : f32) : f32
+      %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32>
+      affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    } { accumulation_loop=true }
+
+    // finalize: out = o / l
+    %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    %l = vector.extract %sum2[0] : f32 from vector<2xf32>
+    %inv = arith.divf %c1, %l : f32
+    %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32>
+    %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+    %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32>
+    %out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>
+    affine.vector_store %out_io, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+    {{ kernel.store_output(indent_size=4) }}
+  } { outer_loop=true }
+  return
+}
+"""
+
+
+class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.scale = scale
+        self.BlkS = BlkS
+
+    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
+        query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2]
+        partial = self.output_node
+
+        q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
+        k_tensor4 = empty_strided(key.layout.size, key.layout.stride)
+        v_tensor4 = empty_strided(value.layout.size, value.layout.stride)
+        B, Hq, Lq, Dh = q_tensor4.shape
+        _, H, S, _ = k_tensor4.shape
+        assert B == 1 and Lq == 1
+        g = Hq // H
+        BlkS = min(int(self.BlkS), int(S))
+        nblk = (int(S) + int(BlkS) - 1) // int(BlkS)
+
+        io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()]
+        tile_s = kernel.vector_lane
+        tile_e = kernel.vector_lane
+        tile_pack = tile_e * 2
+
+        # Use 3D views for indices
+        q_tensor = q_tensor4.view(Hq, 1, Dh)
+        k_tensor = k_tensor4.view(H, S, Dh)
+        v_tensor = v_tensor4.view(H, S, Dh)
+
+        # Flatten (kv,qsub,dh_tile) into GH = H*g*(Dh/tile_e)
+        dh_tiles = int(Dh) // int(tile_e)
+        HgDhTiles = int(H) * int(g) * int(dh_tiles)
+
+        # tile descs
+        vlane_stride = 1
+        q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        q_tile_desc.set_name("q_buffer")
+        q_tile_desc.offset = query.get_layout().offset
+
+        k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride)
+        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s])
+        k_tile_desc.set_name("k_buffer")
+        k_tile_desc.offset = key.get_layout().offset
+
+        v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride)
+        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1])
+        v_tile_desc.set_name("v_buffer")
+        v_tile_desc.offset = value.get_layout().offset
+
+        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride)
+        mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1])
+        mul_tile_desc.set_name("mul_buffer")
+
+        score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        score_desc.set_name("score_buffer")
+
+        prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        prob_desc.set_name("prob_buffer")
+
+        # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles.
+        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
+        out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
+        out_acc_tile_desc.set_name("out_acc_buffer")
+
+        max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        max_desc.set_tile_size_stride([g, 2], [2, 1])
+        max_desc.set_name("max_buffer")
+
+        sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        sum_desc.set_tile_size_stride([g, 2], [2, 1])
+        sum_desc.set_name("sum_buffer")
+
+        out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        out_io_tile_desc.set_name("out_io_buffer")
+
+        partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
+        partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
+        partial_tile_desc.set_name("partial_buffer")
+
+        # Indices
+        kv = sympy.Symbol("kv")
+        qsub = sympy.Symbol("qsub")
+        dht = sympy.Symbol("dht")
+        dh0 = sympy.Symbol("dh0")
+        k0 = sympy.Symbol("k0")
+        blk = sympy.Symbol("blk")
+        s0 = sympy.Symbol("s0")
+        q_head = kv * g + qsub
+
+        q_stride = q_tensor.stride()
+        k_stride = k_tensor.stride()
+        v_stride = v_tensor.stride()
+
+        qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]]
+        kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]]
+        v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]]
+
+        # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous
+        p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack)
+        p_stride = p_tensor.stride()
+        # group head index: ((kv*g + qsub)*dh_tiles + dht)
+        gh = (kv * g + qsub) * dh_tiles + dht
+        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
+
+        kernel.loop_size = [tile_s, tile_e, tile_pack]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            H=H,
+            g=g,
+            Dh=Dh,
+            S=S,
+            BlkS=BlkS,
+            nblk=nblk,
+            tile_s=tile_s,
+            tile_e=tile_e,
+            dh_tiles=dh_tiles,
+            tile_pack=tile_pack,
+            io_stype=io_stype,
+            scale=self.scale,
+            query=query,
+            key=key,
+            value=value,
+            partial=partial,
+            q_tile_desc=q_tile_desc,
+            k_tile_desc=k_tile_desc,
+            v_tile_desc=v_tile_desc,
+            mul_tile_desc=mul_tile_desc,
+            score_desc=score_desc,
+            prob_desc=prob_desc,
+            out_io_tile_desc=out_io_tile_desc,
+            out_acc_tile_desc=out_acc_tile_desc,
+            max_desc=max_desc,
+            sum_desc=sum_desc,
+            partial_tile_desc=partial_tile_desc,
+            qk_idx=qk_idx,
+            kk_idx=kk_idx,
+            v_idx=v_idx,
+            partial_idx=partial_idx,
+            input_reorder=self.input_reorder,
+        )
+
+        kernel.epilogue_info = dict(
+            output_node=self.output_node.name,
+            sram_var="partial_buffer",
+            dram_var="partial",
+            dram_idx=partial_idx,
+            dram_tile_desc=partial_tile_desc,
+            nr_rdim=0,
+            r_dim_size=0,
+            dim_aliasing={"kv": "kv", "qsub": "qsub", "dht": "dht", "dh0": "dh0", "k0": "k0", "blk": "blk", "s0": "s0"},
+        )
+        return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options)
+
+
+class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate):
+    def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None):
+        super().__init__("kernel", input_nodes, layout, input_reorder)
+        self.BlkS = BlkS
+
+    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
+        partial = self.input_nodes[0]
+        out = self.output_node
+
+        tile_e = kernel.vector_lane
+        tile_pack = tile_e * 2
+
+        # Infer sizes from partial layout: (HgDhTiles, nblk, tile_pack)
+        HgDhTiles, nblk, _ = partial.get_size()
+        io_stype = mlir_common.DTYPE_TO_MLIR[out.get_dtype()]
+
+        vlane_stride = 1
+        partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
+        partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
+        partial_tile_desc.set_name("partial_buffer")
+        partial_tile_desc.offset = partial.get_layout().offset
+
+        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        out_acc_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        out_acc_tile_desc.set_name("out_acc_buffer")
+
+        max_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride)
+        max_desc.set_tile_size_stride([1, 2], [2, 1])
+        max_desc.set_name("max_buffer")
+
+        sum_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride)
+        sum_desc.set_tile_size_stride([1, 2], [2, 1])
+        sum_desc.set_name("sum_buffer")
+
+        out_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        out_tile_desc.set_name("out_buffer")
+
+        # Indexing: partial is already 3D; out is (Hq,1,Dh) but view as (Hq*Dh/tile_e, 1, tile_e)
+        p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride)
+        p_stride = p_tensor.stride()
+        gh = sympy.Symbol("gh")
+        blk = sympy.Symbol("blk")
+        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
+
+        # out view
+        out_tensor4 = empty_strided(out.get_layout().size, out.get_layout().stride)
+        B, Hq, Lq, Dh = out_tensor4.shape
+        assert B == 1 and Lq == 1
+        dh_tiles = int(Dh) // int(tile_e)
+        out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e)
+        o_stride = out_tensor.stride()
+        out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)]
+
+        kernel.loop_size = [tile_pack, tile_e, 1]
+
+        kernel.render_options = dict(
+            KERNEL_NAME=self.name,
+            kernel=kernel,
+            HgDhTiles=HgDhTiles,
+            nblk=nblk,
+            tile_e=tile_e,
+            tile_pack=tile_pack,
+            io_stype=io_stype,
+            partial=partial,
+            out=out,
+            partial_tile_desc=partial_tile_desc,
+            out_acc_tile_desc=out_acc_tile_desc,
+            max_desc=max_desc,
+            sum_desc=sum_desc,
+            out_tile_desc=out_tile_desc,
+            partial_idx=partial_idx,
+            out_idx=out_idx,
+            input_reorder=self.input_reorder,
+        )
+
+        kernel.epilogue_info = dict(
+            output_node=self.output_node.name,
+            sram_var="out_buffer",
+            dram_var="out",
+            dram_idx=out_idx,
+            dram_tile_desc=out_tile_desc,
+            nr_rdim=0,
+            r_dim_size=0,
+            dim_aliasing={"gh": "gh", "blk": "blk"},
+        )
+        return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options)
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b2df1d06..53db988b 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -904,7 +904,7 @@ def hook():
 
     def def_dma_op(self, dma_type, dram_var:str, index_list:list, tile_desc:mlir_common.MLIRMultiDimTile,
                    subtile_size:list=[], async_type=None, indent_size=0, priority: int = 5, lazy_mode: bool = True,
-                   dram_stride:list=None, dram_offset=None):
+                   dram_stride:list=None, dram_offset=None, padding: int = 0):
         # Todo. Remove legacy behavior (i.e., index_list parsing)
         def generate_dma_code():
             """Internal method to generate DMA code directly."""
@@ -948,7 +948,7 @@ def generate_dma_code():
                 zero_cse = self.get_const_cse(0, "index")
                 sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
 
-                attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", "padding=0"]
+                attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", f"padding={int(padding)}"]
                 if subtile_size:
                     attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
                 attribute = "  {" + ", ".join(attribute_parts) + "}"
diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py
index 6ffd6f2e..ed7ae8f8 100644
--- a/tests/test_sdpa.py
+++ b/tests/test_sdpa.py
@@ -58,6 +58,60 @@ def test_scaled_dot_product_attention(device, backends="flash"):
     
     print("All tests passed!")
 
+def test_scaled_dot_product_attention_gqa_single_batch(device):
+    """
+    Focused GQA testcases for single-batch (n==1).
+    Shapes:
+      q: (B, Hq, Lq, Dh)
+      k: (B, H,  S,  Dh)
+      v: (B, H,  S,  Dh)
+    """
+    torch.manual_seed(0)
+
+    B = 1
+    # Decode-focused: include a larger S to hit BlkS logic
+    seq_len_list = [128, 256, 1024]
+    head_dim_list = [64, 128]
+    # GQA ratios requested: Hq / H in {4, 5, 8, 16}.
+    # Keep H=1 to directly realize those ratios.
+    gqa_ratios = [4, 5, 8, 16]
+    H = 1
+
+    for seq_len in seq_len_list:
+        for head_dim in head_dim_list:
+            for ratio in gqa_ratios:
+                Hq = ratio * H
+
+                clear_caches()
+                # Decode shape: Lq == 1
+                q = torch.rand(B, Hq, 1, head_dim, dtype=torch.float32)
+                k = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32)
+                v = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32)
+
+                # NPU
+                q_npu = q.to(device=device)
+                k_npu = k.to(device=device)
+                v_npu = v.to(device=device)
+                opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
+                out = opt_fn(q_npu, k_npu, v_npu, attn_mask=None, dropout_p=0.0, is_causal=True, enable_gqa=True)
+
+                # CPU reference
+                cpu_device = torch.device("cpu")
+                cpu_out = F.scaled_dot_product_attention(
+                    q.to(device=cpu_device),
+                    k.to(device=cpu_device),
+                    v.to(device=cpu_device),
+                    attn_mask=None,
+                    dropout_p=0.0,
+                    is_causal=True,
+                    enable_gqa=True,
+                )
+
+                name = f"SDPA-GQA(B: {B}, Hq: {Hq}, H: {H}, S: {seq_len}, head_dim: {head_dim})"
+                test_result(name, out, cpu_out)
+
+    print("All GQA single-batch tests passed!")
+
 def clear_caches():
     import os
     from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
@@ -69,5 +123,6 @@ def clear_caches():
 
 if __name__ == "__main__":    
     device = torch.device('npu:0')
-    test_scaled_dot_product_attention(device, backends="flash")
+    # test_scaled_dot_product_attention(device, backends="flash")
+    test_scaled_dot_product_attention_gqa_single_batch(device)
     
\ No newline at end of file

From 59bd8f8ddc9ff86f35a45a347d7f5c7d5fe8bf7a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 12 Mar 2026 21:29:16 +0900
Subject: [PATCH 129/194] WIP

---
 PyTorchSimFrontend/mlir/mlir_lowering.py      |   1 +
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 660 +++++++++++-------
 2 files changed, 398 insertions(+), 263 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index ac7eb853..7b2c07bf 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -89,6 +89,7 @@ def tuned_flash_sdpa(
         )
         partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS)
         partial = partial_tmpl.generate().output_node()
+        partial.realize()
         reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS)
         out_node = reduce_tmpl.generate().output_node()
         return (out_node, None, None, None, None, None, None, None, None)
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index 1cd810e8..077a8cd2 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -16,17 +16,87 @@
 from PyTorchSimFrontend.mlir.mlir_template import MLIRTemplateKernel
 
 
+def _make_offset_map_with_sym(strides, sym_dim, sym_stride, offset=0):
+    """Like _make_offset_map but injects a block symbol ``s`` into dimension ``sym_dim``.
+
+    The effective index for that dimension becomes ``d{sym_dim} + sym_stride * s``.
+    Use this to keep ``affine.for`` bounds static and encode the block contribution
+    directly inside the ``affine.apply`` call that computes the DRAM offset.
+
+    Args:
+        strides:    per-dimension DRAM strides.
+        sym_dim:    which dimension carries the block symbol.
+        sym_stride: multiplier for the symbol (1 for abs-position loops like FLASH
+                    ``%blk``; ``BlkS`` for block-index loops like PARTIAL ``%blk``).
+        offset:     constant layout offset.
+
+    Returns:
+        MLIR affine_map string with one symbol, e.g.
+        ``affine_map<(d0, d1, d2)[s] -> (d0 * 8192 + (d1 + 128 * s) * 64 + d2)>``
+    """
+    n = len(strides)
+    terms = []
+    for j, sv in enumerate(strides):
+        sv = int(sv)
+        if sv == 0:
+            continue
+        if j == sym_dim:
+            inner = f"d{j} + s" if sym_stride == 1 else f"d{j} + {sym_stride} * s"
+            terms.append(f"({inner})" if sv == 1 else f"({inner}) * {sv}")
+        else:
+            terms.append(f"d{j}" if sv == 1 else f"d{j} * {sv}")
+    try:
+        off = int(offset)
+    except (TypeError, ValueError):
+        off = 0
+    if off:
+        terms.append(str(off))
+    dim_str = ", ".join(f"d{j}" for j in range(n))
+    expr = " + ".join(terms) if terms else "0"
+    return f"affine_map<({dim_str})[s] -> ({expr})>"
+
+
+def _make_offset_map(strides, offset=0):
+    """Generate an MLIR affine_map string for a flat DRAM base-address.
+
+    Args:
+        strides: list of integer per-dimension strides.
+                 A stride of 0 means the dimension does not contribute.
+        offset:  constant layout offset (e.g. from IRNode.get_layout().offset).
+
+    Returns:
+        MLIR affine_map string, e.g. ``affine_map<(d0, d1) -> (d0 * 128 + d1)>``
+    """
+    n = len(strides)
+    terms = []
+    for j, s in enumerate(strides):
+        s = int(s)
+        if s == 1:
+            terms.append(f"d{j}")
+        elif s != 0:
+            terms.append(f"d{j} * {s}")
+    try:
+        off = int(offset)
+    except (TypeError, ValueError):
+        off = 0
+    if off:
+        terms.append(str(off))
+    dim_str = ", ".join(f"d{j}" for j in range(n))
+    expr = " + ".join(terms) if terms else "0"
+    return f"affine_map<({dim_str}) -> ({expr})>"
+
+
 def flash_sdpa_args(
-        query : TensorBox, 
-        key   : TensorBox, 
+        query : TensorBox,
+        key   : TensorBox,
         value : TensorBox) -> list:
     """
     Arg processing for flash SDPA.
-    Its logic is based on: 
+    Its logic is based on:
     mm_args() which is in torch._inductor.kernel.mm_common.py (142 line).
     """
 
-    # Materialize input buffers for the codegen backend. 
+    # Materialize input buffers for the codegen backend.
     query, key, value = realize_inputs(query, key, value)
 
     # query : (n, hq, l, e)
@@ -43,7 +113,7 @@ def flash_sdpa_args(
 
     n = V.graph.sizevars.guard_equals(nq, nk)
     n = V.graph.sizevars.guard_equals(nq, nk)
-    
+
     h = V.graph.sizevars.guard_equals(hk, hv)
     s = V.graph.sizevars.guard_equals(sk, sv)
     e = V.graph.sizevars.guard_equals(eq, ek)
@@ -62,7 +132,7 @@ def flash_sdpa_args(
         raise NotImplementedError(
             f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})."
         )
-    
+
     # Minimal GQA support (single-batch only for now).
     # We map each query head to a KV head by grouping: hq = g * h.
     if hq != h:
@@ -70,14 +140,14 @@ def flash_sdpa_args(
             raise NotImplementedError("Flash SDPA GQA is currently supported only for n == 1.")
         if (hq % h) != 0:
             raise NotImplementedError(f"Flash SDPA GQA requires hq % h == 0 (hq: {hq}, h: {h}).")
-    
+
     layout = FixedLayout(
         query.get_device(),
         query.get_dtype(),
         [n, hq, l, ev]
     )
 
-    return [n, hq, h, l, s, e, ev, layout, query, key, value]    
+    return [n, hq, h, l, s, e, ev, layout, query, key, value]
 
 def calculate_scale(query: torch.Tensor, scale: float) -> float:
     """
@@ -109,7 +179,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
   {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
-  
+
   // Output
   {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }}
 
@@ -117,7 +187,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
   {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
-  
+
   // Constants
   %c0 = arith.constant 0.0 : {{ data_stype }}
   %c1 = arith.constant 1.0 : {{ data_stype }}
@@ -133,33 +203,36 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
   %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ data_stype }}>
 
   %v_scale = vector.broadcast %c_scale : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}>
-  
-  {{ kernel.def_local_vars(indent_size=2) }}  
-  
+
+  {{ kernel.def_local_vars(indent_size=2) }}
+
   affine.for %index0 = 0 to {{ b }} {
     affine.for %index3 = 0 to 1 step 1 {
       affine.for %index1 = 0 to {{ l }} step {{ tile_l }} {
-        {{ kernel.def_dma_op("MVIN", "query", q_idx, q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8) }}  
-        
+        %q_dram_offset = affine.apply {{ q_offset_map }}(%index0, %index1, %index3)
+        {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }}
+
         affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}>
-        affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}> 
+        affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
         affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
-              
+
         %qt_buffer2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ q_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>
         %ot_buffer2D = memref.reinterpret_cast %out_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_l }}], strides: [{{ tile_l }}, 1] : {{ out_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>
 
         affine.for %index2 = 0 to {{ s }} step {{ tile_s }} {
-          {{ kernel.def_dma_op("MVIN", "key", k_idx, k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }} 
-          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10) }}
+          %k_dram_offset = affine.apply {{ k_offset_map }}(%index0, %index2, %index3)
+          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }}
+          %v_dram_offset = affine.apply {{ v_offset_map }}(%index0, %index2, %index3)
+          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }}
 
-          affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}>        
+          affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}>
 
           %k_buffer2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>
           %vt_buffer2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(data_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>
 
-          
+
           // key @ query.t and scaling.
-          linalg.matmul 
+          linalg.matmul
             { idx_map = array<i32: 1, 0, -1> }
             ins(%k_buffer2D, %qt_buffer2D : memref<{{ tile_s }}x{{ tile_e }}x{{ data_stype }}, 1>, memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
             outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(data_stype) }})
@@ -168,7 +241,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
           %scaled_mul_vec = arith.mulf %raw_mul_vec, %v_scale :  vector<{{ tile_s }}x{{ data_stype }}>
           affine.vector_store %scaled_mul_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
 
-          
+
           // Find new max.
           %old_max = affine.vector_load %max_buffer[0,0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
 
@@ -182,22 +255,22 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
           %max_reduced_1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}>
           %max_shuffled = vector.shuffle %max_reduced_1, %max_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}>
           %max_reduced_2 = arith.maximumf %max_reduced_1, %max_shuffled : vector<2x{{ data_stype }}>
-          
-          %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}> 
+
+          %new_max = arith.maximumf %max_reduced_2, %old_max : vector<2x{{ data_stype }}>
           affine.vector_store %new_max, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
-          
+
 
           // Compute rescale factors: exp(old_max - new_max)
           %max_diff = arith.subf %old_max, %new_max : vector<2x{{ data_stype }}>
           %max_diff_scalar = vector.extract %max_diff[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
-          
-          %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}> 
-          %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}> 
+
+          %rescale_bcast_e = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}>
+          %exp_rescale_e = math.exp %rescale_bcast_e : vector<{{ tile_e }}x{{ data_stype }}>
 
           %rescale_bcast_2 = vector.broadcast %max_diff_scalar : {{ data_stype }} to vector<2x{{ data_stype }}>
           %exp_rescale_2 = math.exp %rescale_bcast_2 : vector<2x{{ data_stype }}>
 
-          
+
           // Rescale previous out and sum accumulators
           %old_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
           %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ data_stype }}>
@@ -206,16 +279,16 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
           %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
           %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ data_stype }}>
 
-          
+
           // Shift scores and apply exp: exp(x - new_max)
           %scaled_scores_reload = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
           %new_max_scalar = vector.extract %new_max[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
           %new_max_bcast = vector.broadcast %new_max_scalar : {{ data_stype }} to vector<{{ tile_s }}x{{ data_stype }}>
-          
+
           %shifted_scores = arith.subf %scaled_scores_reload, %new_max_bcast : vector<{{ tile_s }}x{{ data_stype }}>
           %exp_scores = math.exp %shifted_scores :  vector<{{ tile_s }}x{{ data_stype }}>
           affine.vector_store %exp_scores, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ tile_s }}x{{ data_stype }}>
-          
+
 
           // accumulate current sum
           %chunk_sum_res = affine.for %index5 = 0 to {{ tile_s }} step {{ chunk_size }} iter_args(%iter_sum=%v0_c) -> (vector<{{ chunk_size }}x{{ data_stype }}>) {
@@ -223,19 +296,19 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
             %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}>
             affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}>
           }
-          
+
           %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}>
           %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}>
           %sum_reduced_1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}>
           %sum_shuffled = vector.shuffle %sum_reduced_1, %sum_reduced_1 [1, 0] : vector<2x{{ data_stype }}>, vector<2x{{ data_stype }}>
           %sum_reduced_2 = arith.addf %sum_reduced_1, %sum_shuffled : vector<2x{{ data_stype }}>
-          
+
           %new_sum = arith.addf %sum_reduced_2, %rescaled_sum :  vector<2x{{ data_stype }}>
           affine.vector_store %new_sum, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
 
-          
+
           // value.t @ mul
-          linalg.matmul 
+          linalg.matmul
             { idx_map = array<i32: 2, 1, -1> }
             ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }})
             outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
@@ -244,20 +317,21 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
         // out @ row_sum^(-1)
         %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
         %one_2x = vector.broadcast %c1 : {{ data_stype }} to vector<2x{{ data_stype }}>
-        
+
         %reciprocal_row_sum_2x = arith.divf %one_2x, %final_row_sum : vector<2x{{ data_stype }}>
         %reciprocal_scalar = vector.extract %reciprocal_row_sum_2x[0] : {{ data_stype }} from vector<2x{{ data_stype }}>
         %reciprocal_bcast_e = vector.broadcast %reciprocal_scalar : {{ data_stype }} to vector<{{ tile_e }}x{{ data_stype }}>
-        
+
         %accumulated_out = affine.vector_load %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
         %stable_final_out = arith.mulf %accumulated_out, %reciprocal_bcast_e : vector<{{ tile_e }}x{{ data_stype }}>
         affine.vector_store %stable_final_out, %ot_buffer2D[0, 0] : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>, vector<{{ tile_e }}x{{ data_stype }}>
 
-        {{ kernel.store_output(indent_size=8) }}
-      } { accumulation_loop=true } 
+        %out_dram_offset = affine.apply {{ out_offset_map }}(%index0, %index1, %index3)
+        {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=8, dram_stride=out_dram_stride, dram_offset="out_dram_offset") }}
+      } { accumulation_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
-  return 
+  return
 }
 """
 
@@ -273,10 +347,10 @@ def render(self,
                prologue_nodes: Optional[List[IRNode]] = None,
                tile_info = None,
                **kwargs):
-    
+
         # Except for kernel, other arguments are usually None.
         query, key, value, out, q_tensor, k_tensor, v_tensor, out_tensor, b, l, s, e, ev, n_extra_node, n_prologue_node = self.extract_info(template_buffer_node, epilogue_nodes, prologue_nodes)
-       
+
         if tile_info is None:
             tile_l, tile_s, tile_e, subtile_l, subtile_s, subtile_e = self.select_tile(kernel, l, s, e, n_extra_node, 0, n_prologue_node)[0]
         else:
@@ -299,10 +373,10 @@ def render(self,
 
         # Prepare tile descriptors for input and output tensors.
         # Intermediate buffers (transient data) do not require DRAM settings(dram stride and dram indices)
-        # as they are not synchronized with external DRAM. 
+        # as they are not synchronized with external DRAM.
         # DRAM and SRAM tile shapes must match.
         vlane_stride = 1
-        
+
         # (n, l, s, e, ev)
         loop_dim = [sympy.Symbol("index0"), sympy.Symbol("index1"), sympy.Symbol("index2"), sympy.Symbol("index3")]
 
@@ -317,11 +391,10 @@ def render(self,
         q_tile_desc.set_tile_size_stride(q_tile_size, q_tile_stride)
         q_tile_desc.set_name("q_buffer")
         q_tile_desc.offset = query.get_layout().offset
-        # DRAM settings 
+        # DRAM settings
         q_stride = q_tensor.stride()
-        q_idx = [loop_dim[0]*q_stride[0], loop_dim[1]*q_stride[1], loop_dim[3]*q_stride[2]] # To keep index arguemnt order, we used index_list
 
-        # Since we use a weight-stationary approach in the Systolic Array (SA), 
+        # Since we use a weight-stationary approach in the Systolic Array (SA),
         # the split axis of the first operand differs from a standard linear algebra matmul.
         # The first operand (key) must be split along the column axis.
         # This logic aligns with the relationship between the dot product's summation direction and the hardware's accumulation direction in the SA.
@@ -335,7 +408,6 @@ def render(self,
         k_tile_desc.offset = key.get_layout().offset
         # DRAM settings
         k_stride = k_tensor.stride()
-        k_idx = [loop_dim[0]*k_stride[0], loop_dim[2]*k_stride[1], loop_dim[3]*k_stride[2]]
 
         # Since we compute mul = key @ query.t, we perform out.t = (value.t @ Softmax(mul).t).t,
         # which simplifies to (value.t @ Softmax(mul))
@@ -349,19 +421,17 @@ def render(self,
         v_tile_desc.offset = value.get_layout().offset
         # DRAM settings
         v_stride = v_tensor.stride()
-        v_idx = [loop_dim[0]*v_stride[0], loop_dim[2]*v_stride[1], loop_dim[3]*v_stride[2]] # To keep index arguemnt order, we used index_list
 
         # Output is also stored in transposed format to match the value.t @ Softmax(mul) operation.
         # SRAM settings
         vlane_split_axis = 1
-        out_tile_size = [1, tile_l, tile_e] 
-        out_tile_stride=[0, tile_e, 1] 
+        out_tile_size = [1, tile_l, tile_e]
+        out_tile_stride=[0, tile_e, 1]
         out_tile_desc = mlir_common.MLIRMultiDimTile(out_tile_size, kernel.vector_lane, vlane_split_axis, vlane_stride)
         out_tile_desc.set_tile_size_stride(out_tile_size, out_tile_stride)
         out_tile_desc.set_name("out_buffer")
         # DRAM settings
         out_stride = out.get_layout().stride[1:]
-        out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]]
 
         # Intermediate buffers
 
@@ -393,28 +463,46 @@ def render(self,
         # For reduction
         chunk_size = 16
 
+        # DMA strides and offset affine maps (dram_stride + dram_offset style)
+        q_dram_stride  = [int(q_stride[0]), int(q_stride[1]), int(q_stride[2])]
+        k_dram_stride  = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
+        v_dram_stride  = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
+        out_dram_stride = [int(out_stride[0]), int(out_stride[1]), int(out_stride[2])]
+
+        q_offset_map   = _make_offset_map(q_dram_stride,   q_tile_desc.offset)
+        k_offset_map   = _make_offset_map(k_dram_stride,   k_tile_desc.offset)
+        v_offset_map   = _make_offset_map(v_dram_stride,   v_tile_desc.offset)
+        out_offset_map = _make_offset_map(out_dram_stride, 0)
+
+        # Keep out_idx only for epilogue_info (not in render_options)
+        out_idx = [loop_dim[0]*out_stride[0], loop_dim[1]*out_stride[1], loop_dim[3]*out_stride[2]]
+
         kernel.render_options = dict(
             KERNEL_NAME = self.name,
             kernel = kernel,
-            b = b, 
-            l = l, 
-            s = s, 
+            b = b,
+            l = l,
+            s = s,
             e = e,                             # Input sizes (dram)
-            tile_l = tile_l, 
-            tile_s = tile_s, 
+            tile_l = tile_l,
+            tile_s = tile_s,
             tile_e = tile_e,                   # Tile sizes (sram)
-            subtile_l = subtile_l, 
-            subtile_s = subtile_s, 
-            subtile_e = subtile_e,             # Subtile sizes (sram)  
+            subtile_l = subtile_l,
+            subtile_s = subtile_s,
+            subtile_e = subtile_e,             # Subtile sizes (sram)
             data_stype="f32",
-            query = query, 
+            query = query,
             key = key,
-            value = value, 
+            value = value,
             out = out,                         # Inputs and output (dram)
-            q_idx = q_idx,
-            k_idx = k_idx,
-            v_idx = v_idx,
-            out_idx = out_idx,                 # Strides (dram)       
+            q_dram_stride  = q_dram_stride,
+            k_dram_stride  = k_dram_stride,
+            v_dram_stride  = v_dram_stride,
+            out_dram_stride = out_dram_stride, # Per-dim DRAM strides
+            q_offset_map   = q_offset_map,
+            k_offset_map   = k_offset_map,
+            v_offset_map   = v_offset_map,
+            out_offset_map = out_offset_map,   # Affine maps for base address
             q_tile_desc = q_tile_desc,
             k_tile_desc = k_tile_desc,
             v_tile_desc = v_tile_desc,
@@ -423,19 +511,8 @@ def render(self,
             max_desc = max_desc,
             sum_desc = sum_desc,               # Intermediate buffer descriptions (sram)
             scale = self.scale,
-            chunk_size = chunk_size,        
-            input_reorder = self.input_reorder # ETC 
-        )
-
-        kernel.epilogue_info = dict(
-            output_node = self.output_node.name,
-            sram_var = "out_buffer",
-            dram_var = "out",
-            dram_idx = out_idx,
-            dram_tile_desc = out_tile_desc,
-            nr_rdim = nr_rdim,
-            r_dim_size = 0,
-            dim_aliasing = epilogue_dim_aliasing
+            chunk_size = chunk_size,
+            input_reorder = self.input_reorder # ETC
         )
 
         code = self._template_from_string(template).render(**kernel.render_options)
@@ -445,7 +522,7 @@ def render(self,
     def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
         if template_buffer_node is not None:
             self.output_node = template_buffer_node
-        
+
         query = self.input_nodes[0]
         key = self.input_nodes[1]
         value = self.input_nodes[2]
@@ -462,7 +539,7 @@ def extract_info(self, template_buffer_node, epilogue_nodes, prologue_nodes):
         v_tensor = v_tensor.view([-1, v_tensor.shape[-2], v_tensor.shape[-1]])
         out_tensor = out_tensor.view([-1, out_tensor.shape[-2], out_tensor.shape[-1]])
 
-        b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), k_tensor.size(2), v_tensor.size(2) 
+        b, l, s, e, ev = q_tensor.size(0), q_tensor.size(1), k_tensor.size(1), k_tensor.size(2), v_tensor.size(2)
 
         n_extra_node = len(epilogue_nodes) if epilogue_nodes is not None else 0
         n_prologue_node = len(prologue_nodes) if prologue_nodes is not None else 0
@@ -549,7 +626,7 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
         }
       }
 
-      affine.for %s0 = %blk to (%blk + {{ BlkS }}) step {{ tile_s }} {
+      affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
         // Accumulate score per qsub so K tiles can be shared across qsub.
         affine.for %qsub = 0 to {{ g }} {
           affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
@@ -557,11 +634,14 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
 
         affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
           // Load K slice once for all qsub.
-          {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }}
+          %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
+          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
           %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
 
           affine.for %qsub = 0 to {{ g }} {
-            {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }}
+            %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
+            %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0)
+            {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
             %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
 
             // mul = k @ q  -> (tile_s x 1) in io dtype, then upcast and accumulate.
@@ -571,9 +651,9 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
               outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
 
             %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>
+            {% if io_stype != acc_stype %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>{% endif %}
             %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
-            %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}x{{ acc_stype }}>
+            %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != acc_stype else "%raw_mul_io" }} : vector<{{ tile_s }}x{{ acc_stype }}>
             affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
           } { accumulation_loop=true }
         } { accumulation_loop=true }
@@ -618,8 +698,8 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
             %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}>
             %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}>
             // For SV matmul: downcast softmax output to io dtype (common in practice)
-            %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>
-            affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+            {% if io_stype != acc_stype %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %}
+            affine.vector_store {{ "%exp_scores_io" if io_stype != acc_stype else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
 
             // sum += reduce(exp_scores)
             %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}>
@@ -635,7 +715,8 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
         // 2) SV accumulation: for each output dh tile, load V once and share across qsub.
         affine.for %dht = 0 to {{ dh_tiles }} {
           %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
-          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }}
+          %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk]
+          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }}
           %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
 
           affine.for %qsub = 0 to {{ g }} {
@@ -649,9 +730,9 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
               outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
 
             %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>
+            {% if io_stype != acc_stype %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>{% endif %}
             %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-            %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}x{{ acc_stype }}>
+            %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != acc_stype else "%out_io_vec" }} : vector<{{ tile_e }}x{{ acc_stype }}>
             affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
           } { accumulation_loop=true }
         } { accumulation_loop=true }
@@ -669,9 +750,11 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
           %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
           %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
           %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}>
-          %final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>
-          affine.vector_store %final_out_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-          {{ kernel.store_output(indent_size=10) }}
+          {% if io_stype != acc_stype %}%final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %}
+          affine.vector_store {{ "%final_out_io" if io_stype != acc_stype else "%final_out_acc" }}, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+          %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
+          %out_offset = affine.apply {{ out_offset_map }}(%q_head, %dh0)
+          {{ kernel.def_dma_op("MVOUT", "out", [], out_io_tile_desc, indent_size=10, dram_stride=out_dram_stride, dram_offset="out_offset") }}
         }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -690,7 +773,12 @@ def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=N
 
     def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
         # Decode-only: q is (B,Hq,1,Dh)
-        query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], self.output_node
+        # Use template_buffer_node (the actual V.graph-registered CUDATemplateBuffer with its
+        # real name e.g. "buf0") when available, instead of the placeholder self.output_node
+        # (always named "buf_out").  This ensures output_buffers["buf0"] maps correctly
+        # in mlir_argdefs, which looks up buffer_types by the actual DRAM buffer name.
+        query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], \
+            template_buffer_node if template_buffer_node is not None else self.output_node
 
         # Materialize tensors for stride metadata
         q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
@@ -765,14 +853,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         sum_desc.set_tile_size_stride([g, 2], [2, 1])
         sum_desc.set_name("sum_buffer")
 
-        # Indices
-        kv = sympy.Symbol("kv")
-        qsub = sympy.Symbol("qsub")
-        dh0 = sympy.Symbol("dh0")
-        k0 = sympy.Symbol("k0")
-        s0 = sympy.Symbol("s0")
-        q_head = kv * g + qsub
-
+        # Strides from 3D tensor views
         q_stride = q_tensor.stride()
         k_stride = k_tensor.stride()
         v_stride = v_tensor.stride()
@@ -780,11 +861,34 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh)
         out_stride = out_tensor.stride()
 
-        # QK indices use k0 reduction over Dh
-        qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]]
-        kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]]
-        # V and output use dh0 tile offset
-        v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]]
+        # DMA strides (per-dimension DRAM strides for each tile)
+        k_dram_stride  = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
+        # Q: q_head is pre-computed in template; stride[1]=0 since Lq=1
+        q_dram_stride  = [int(q_stride[0]), 0, int(q_stride[2])]
+        v_dram_stride  = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
+        # out: q_head is pre-computed; stride[1]=0 since Lq=1
+        out_dram_stride = [int(out_stride[0]), 0, int(out_stride[2])]
+
+        # Affine maps for flat DRAM base address (used with pre-computed loop var expressions)
+        # K: offset(kv, s0, k0)
+        kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset)
+        # Q: offset(q_head, k0)  -- q_head = kv*g+qsub pre-computed in template
+        qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset)
+        # V: offset(kv, s0, dh0)
+        v_offset_map  = _make_offset_map(v_dram_stride, v_tile_desc.offset)
+        # Out: offset(q_head, dh0)  -- q_head pre-computed in template
+        out_offset_map = _make_offset_map([int(out_stride[0]), int(out_stride[2])], 0)
+        # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is the absolute
+        # block start (steps by BlkS), so actual_s = s0_rel + 1*blk → sym_stride=1.
+        kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=1, offset=k_tile_desc.offset)
+        v_offset_map_blk  = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=1, offset=v_tile_desc.offset)
+
+        # Keep sympy-based out_idx only for epilogue_info (not in render_options)
+        kv      = sympy.Symbol("kv")
+        qsub    = sympy.Symbol("qsub")
+        dh0     = sympy.Symbol("dh0")
+        s0      = sympy.Symbol("s0")
+        q_head  = kv * g + qsub
         out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]]
 
         kernel.loop_size = [tile_s, tile_e, 1]
@@ -819,24 +923,21 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
             prob_desc=prob_desc,
             max_desc=max_desc,
             sum_desc=sum_desc,
-            qk_idx=qk_idx,
-            kk_idx=kk_idx,
-            v_idx=v_idx,
-            out_idx=out_idx,
+            # DMA strides
+            k_dram_stride=k_dram_stride,
+            q_dram_stride=q_dram_stride,
+            v_dram_stride=v_dram_stride,
+            out_dram_stride=out_dram_stride,
+            # Affine offset maps
+            kk_offset_map=kk_offset_map,
+            qk_offset_map=qk_offset_map,
+            v_offset_map=v_offset_map,
+            out_offset_map=out_offset_map,
+            kk_offset_map_blk=kk_offset_map_blk,
+            v_offset_map_blk=v_offset_map_blk,
             input_reorder=self.input_reorder,
         )
 
-        kernel.epilogue_info = dict(
-            output_node=self.output_node.name,
-            sram_var="out_io_buffer",
-            dram_var="out",
-            dram_idx=out_idx,
-            dram_tile_desc=out_io_tile_desc,
-            nr_rdim=0,
-            r_dim_size=0,
-            dim_aliasing={"kv": "kv", "qsub": "qsub", "dh0": "dh0", "s0": "s0"},
-        )
-
         return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options)
 
 
@@ -891,27 +992,30 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         }
       }
 
-      affine.for %s0 = ({{ BlkS }} * %blk) to ({{ BlkS }} * (%blk + 1)) step {{ tile_s }} {
+      affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
         // Accumulate score per qsub so K tiles can be shared across qsub.
         affine.for %qsub = 0 to {{ g }} {
           affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
         }
 
         affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
-          {{ kernel.def_dma_op("MVIN", "key", kk_idx, k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1) }}
+          %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
+          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
           %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
 
           affine.for %qsub = 0 to {{ g }} {
-            {{ kernel.def_dma_op("MVIN", "query", qk_idx, q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12) }}
+            %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
+            %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0)
+            {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
             %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
             linalg.matmul
               { idx_map = array<i32: 1, 0, -1> }
               ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
               outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
             %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            %raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>
+            {% if io_stype != "f32" %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>{% endif %}
             %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
-            %new_score = arith.addf %old_score, %raw_mul : vector<{{ tile_s }}xf32>
+            %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != "f32" else "%raw_mul_io" }} : vector<{{ tile_s }}xf32>
             affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
           } { accumulation_loop=true }
         } { accumulation_loop=true }
@@ -951,8 +1055,8 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
           %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32>
           %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32>
           %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32>
-          %exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>
-          affine.vector_store %exp_scores_io, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
+          {% if io_stype != "f32" %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %}
+          affine.vector_store {{ "%exp_scores_io" if io_stype != "f32" else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
 
           %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32>
           %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32>
@@ -966,7 +1070,8 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         // For each output dh tile, load V once and share it across qsub.
         affine.for %dht = 0 to {{ dh_tiles }} {
           %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
-          {{ kernel.def_dma_op("MVIN", "value", v_idx, v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0) }}
+          %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk]
+          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }}
           %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
 
           affine.for %qsub = 0 to {{ g }} {
@@ -980,9 +1085,9 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
               outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
 
             %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            %out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32>
+            {% if io_stype != "f32" %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32>{% endif %}
             %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-            %out_acc_new = arith.addf %out_acc_vec, %out_io_f32 : vector<{{ tile_e }}xf32>
+            %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != "f32" else "%out_io_vec" }} : vector<{{ tile_e }}xf32>
             affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
           } { accumulation_loop=true }
         } { accumulation_loop=true }
@@ -1000,9 +1105,12 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 
         affine.for %dht = 0 to {{ dh_tiles }} {
           %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-          %packed = vector.concat %out_vec, %ml1 : vector<{{ tile_pack }}xf32>
+          %packed = vector.shuffle %out_vec, %ml1 [{{ range(tile_pack) | join(', ') }}] : vector<{{ tile_e }}xf32>, vector<{{ tile_e }}xf32>
           affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
-          {{ kernel.store_output(indent_size=10) }}
+          %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
+          %gh = affine.apply affine_map<(d0, d1) -> (d0 * {{ dh_tiles }} + d1)>(%q_head, %dht)
+          %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk)
+          {{ kernel.def_dma_op("MVOUT", "partial", [], partial_tile_desc, indent_size=10, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
         }
       } { outer_loop=true }
     } { outer_loop=true }
@@ -1012,83 +1120,6 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 """
 
 
-DECODE_GQA_SDPA_REDUCE_TEMPLATE = r"""
-// Decode GQA SDPA reduce kernel: merge partials across blocks
-// Input partial shape: (HgDhTiles, nblk, tile_pack)
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} {
-  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
-
-  %c0 = arith.constant 0.0 : f32
-  %c1 = arith.constant 1.0 : f32
-  %c_neg_inf = arith.constant -1.0e+30 : f32
-  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
-  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
-  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
-
-  {{ kernel.def_local_vars(indent_size=2) }}
-
-  affine.for %gh = 0 to {{ HgDhTiles }} {
-    // reset merged accumulators
-    affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-    affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-
-    affine.for %blk = 0 to {{ nblk }} {
-      {{ kernel.def_dma_op("MVIN", "partial", partial_idx, partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8) }}
-      %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
-      %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32>
-      %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
-      %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
-      %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32>
-      %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32>
-
-      %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-      %m_old = vector.extract %old_max[0] : f32 from vector<2xf32>
-      %m_new = arith.maximumf %m_old, %m_j : f32
-      %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32>
-      affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-
-      %diff_old = arith.subf %m_old, %m_new : f32
-      %diff_j = arith.subf %m_j, %m_new : f32
-      %scale_old = math.exp %diff_old : f32
-      %scale_j = math.exp %diff_j : f32
-      %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32>
-      %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32>
-
-      %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-      %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32>
-      %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32>
-      %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32>
-      affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-
-      %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-      %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32>
-      %l_new = arith.addf (arith.mulf %l_old, %scale_old : f32), (arith.mulf %l_j, %scale_j : f32) : f32
-      %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32>
-      affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    } { accumulation_loop=true }
-
-    // finalize: out = o / l
-    %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    %l = vector.extract %sum2[0] : f32 from vector<2xf32>
-    %inv = arith.divf %c1, %l : f32
-    %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32>
-    %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-    %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32>
-    %out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>
-    affine.vector_store %out_io, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-    {{ kernel.store_output(indent_size=4) }}
-  } { outer_loop=true }
-  return
-}
-"""
-
-
 class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
@@ -1097,7 +1128,8 @@ def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=N
 
     def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
         query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2]
-        partial = self.output_node
+        # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out".
+        partial = template_buffer_node if template_buffer_node is not None else self.output_node
 
         q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
         k_tensor4 = empty_strided(key.layout.size, key.layout.stride)
@@ -1173,28 +1205,39 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
         partial_tile_desc.set_name("partial_buffer")
 
-        # Indices
-        kv = sympy.Symbol("kv")
-        qsub = sympy.Symbol("qsub")
-        dht = sympy.Symbol("dht")
-        dh0 = sympy.Symbol("dh0")
-        k0 = sympy.Symbol("k0")
-        blk = sympy.Symbol("blk")
-        s0 = sympy.Symbol("s0")
-        q_head = kv * g + qsub
-
+        # Strides from 3D tensor views
         q_stride = q_tensor.stride()
         k_stride = k_tensor.stride()
         v_stride = v_tensor.stride()
 
-        qk_idx = [q_head * q_stride[0], sympy.Integer(0), k0 * q_stride[2]]
-        kk_idx = [kv * k_stride[0], s0 * k_stride[1], k0 * k_stride[2]]
-        v_idx = [kv * v_stride[0], s0 * v_stride[1], dh0 * v_stride[2]]
-
         # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous
         p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack)
         p_stride = p_tensor.stride()
-        # group head index: ((kv*g + qsub)*dh_tiles + dht)
+
+        # DMA strides
+        k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
+        q_dram_stride = [int(q_stride[0]), 0, int(q_stride[2])]
+        v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
+        partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
+
+        # Affine offset maps
+        kk_offset_map   = _make_offset_map(k_dram_stride, k_tile_desc.offset)
+        qk_offset_map   = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset)
+        v_offset_map    = _make_offset_map(v_dram_stride, v_tile_desc.offset)
+        # partial: offset(gh, blk)  -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template
+        partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0)
+        # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is a block index (0..nblk-1),
+        # so actual_s = s0_rel + BlkS * blk → sym_stride=BlkS.
+        kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=k_tile_desc.offset)
+        v_offset_map_blk  = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=v_tile_desc.offset)
+
+        # Keep sympy-based indices only for epilogue_info
+        kv   = sympy.Symbol("kv")
+        qsub = sympy.Symbol("qsub")
+        dht  = sympy.Symbol("dht")
+        dh0  = sympy.Symbol("dh0")
+        blk  = sympy.Symbol("blk")
+        q_head = kv * g + qsub
         gh = (kv * g + qsub) * dh_tiles + dht
         partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
 
@@ -1230,26 +1273,110 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
             max_desc=max_desc,
             sum_desc=sum_desc,
             partial_tile_desc=partial_tile_desc,
-            qk_idx=qk_idx,
-            kk_idx=kk_idx,
-            v_idx=v_idx,
-            partial_idx=partial_idx,
+            # DMA strides
+            k_dram_stride=k_dram_stride,
+            q_dram_stride=q_dram_stride,
+            v_dram_stride=v_dram_stride,
+            partial_dram_stride=partial_dram_stride,
+            # Affine offset maps
+            kk_offset_map=kk_offset_map,
+            qk_offset_map=qk_offset_map,
+            v_offset_map=v_offset_map,
+            partial_offset_map=partial_offset_map,
+            kk_offset_map_blk=kk_offset_map_blk,
+            v_offset_map_blk=v_offset_map_blk,
             input_reorder=self.input_reorder,
         )
 
-        kernel.epilogue_info = dict(
-            output_node=self.output_node.name,
-            sram_var="partial_buffer",
-            dram_var="partial",
-            dram_idx=partial_idx,
-            dram_tile_desc=partial_tile_desc,
-            nr_rdim=0,
-            r_dim_size=0,
-            dim_aliasing={"kv": "kv", "qsub": "qsub", "dht": "dht", "dh0": "dh0", "k0": "k0", "blk": "blk", "s0": "s0"},
-        )
         return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options)
 
 
+DECODE_GQA_SDPA_REDUCE_TEMPLATE = r"""
+// Decode GQA SDPA reduce kernel: merge partials across blocks
+// Input partial shape: (HgDhTiles, nblk, tile_pack)
+{{kernel.def_global_vars()}}
+
+func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} {
+  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
+  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
+
+  %c0 = arith.constant 0.0 : f32
+  %c1 = arith.constant 1.0 : f32
+  %c_neg_inf = arith.constant -1.0e+30 : f32
+  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
+  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
+  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
+
+  {{ kernel.def_local_vars(indent_size=2) }}
+
+  affine.for %gh = 0 to {{ HgDhTiles }} {
+    // reset merged accumulators
+    affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+    affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+
+    affine.for %blk = 0 to {{ nblk }} {
+      %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk)
+      {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
+      %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
+      %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32>
+      %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
+      %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
+      %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32>
+      %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32>
+
+      %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+      %m_old = vector.extract %old_max[0] : f32 from vector<2xf32>
+      %m_new = arith.maximumf %m_old, %m_j : f32
+      %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32>
+      affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
+
+      %diff_old = arith.subf %m_old, %m_new : f32
+      %diff_j = arith.subf %m_j, %m_new : f32
+      %diff_old_v = vector.broadcast %diff_old : f32 to vector<1xf32>
+      %diff_j_v = vector.broadcast %diff_j : f32 to vector<1xf32>
+      %scale_old_v = math.exp %diff_old_v : vector<1xf32>
+      %scale_j_v = math.exp %diff_j_v : vector<1xf32>
+      %scale_old = vector.extract %scale_old_v[0] : f32 from vector<1xf32>
+      %scale_j = vector.extract %scale_j_v[0] : f32 from vector<1xf32>
+      %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32>
+      %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32>
+
+      %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+      %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32>
+      %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32>
+      %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32>
+      affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+
+      %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+      %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32>
+      %l_old_rs = arith.mulf %l_old, %scale_old : f32
+      %l_j_rs = arith.mulf %l_j, %scale_j : f32
+      %l_new = arith.addf %l_old_rs, %l_j_rs : f32
+      %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32>
+      affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    } { accumulation_loop=true }
+
+    // finalize: out = o / l
+    %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
+    %l = vector.extract %sum2[0] : f32 from vector<2xf32>
+    %inv = arith.divf %c1, %l : f32
+    %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32>
+    %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
+    %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32>
+    {% if io_stype != "f32" %}%out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %}
+    affine.vector_store {{ "%out_io" if io_stype != "f32" else "%out_f32" }}, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
+    %out_offset = affine.apply {{ out_offset_map }}(%gh)
+    {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=4, dram_stride=out_dram_stride, dram_offset="out_offset") }}
+  } { outer_loop=true }
+  return
+}
+"""
+
+
 class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate):
     def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None):
         super().__init__("kernel", input_nodes, layout, input_reorder)
@@ -1257,7 +1384,8 @@ def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None):
 
     def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
         partial = self.input_nodes[0]
-        out = self.output_node
+        # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out".
+        out = template_buffer_node if template_buffer_node is not None else self.output_node
 
         tile_e = kernel.vector_lane
         tile_pack = tile_e * 2
@@ -1288,21 +1416,33 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
         out_tile_desc.set_name("out_buffer")
 
-        # Indexing: partial is already 3D; out is (Hq,1,Dh) but view as (Hq*Dh/tile_e, 1, tile_e)
+        # Partial tensor strides
         p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride)
         p_stride = p_tensor.stride()
-        gh = sympy.Symbol("gh")
-        blk = sympy.Symbol("blk")
-        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
 
-        # out view
+        # Out view: (Hq*dh_tiles, 1, tile_e)
         out_tensor4 = empty_strided(out.get_layout().size, out.get_layout().stride)
         B, Hq, Lq, Dh = out_tensor4.shape
         assert B == 1 and Lq == 1
         dh_tiles = int(Dh) // int(tile_e)
         out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e)
         o_stride = out_tensor.stride()
-        out_idx = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)]
+
+        # DMA strides
+        partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
+        out_dram_stride     = [int(o_stride[0]), 0, 0]
+
+        # Affine offset maps
+        # partial: offset(gh, blk)
+        partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], partial_tile_desc.offset)
+        # out: offset(gh)  -- single dimension
+        out_offset_map     = _make_offset_map([int(o_stride[0])], 0)
+
+        # Keep sympy-based indices for epilogue_info
+        gh  = sympy.Symbol("gh")
+        blk = sympy.Symbol("blk")
+        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
+        out_idx     = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)]
 
         kernel.loop_size = [tile_pack, tile_e, 1]
 
@@ -1321,19 +1461,13 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
             max_desc=max_desc,
             sum_desc=sum_desc,
             out_tile_desc=out_tile_desc,
-            partial_idx=partial_idx,
-            out_idx=out_idx,
+            # DMA strides
+            partial_dram_stride=partial_dram_stride,
+            out_dram_stride=out_dram_stride,
+            # Affine offset maps
+            partial_offset_map=partial_offset_map,
+            out_offset_map=out_offset_map,
             input_reorder=self.input_reorder,
         )
 
-        kernel.epilogue_info = dict(
-            output_node=self.output_node.name,
-            sram_var="out_buffer",
-            dram_var="out",
-            dram_idx=out_idx,
-            dram_tile_desc=out_tile_desc,
-            nr_rdim=0,
-            r_dim_size=0,
-            dim_aliasing={"gh": "gh", "blk": "blk"},
-        )
         return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options)

From bfc2b22b334599fe8ddd959adb2e17ac1f576474 Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Fri, 13 Mar 2026 19:37:08 +0900
Subject: [PATCH 130/194] [Frontend/template] SPDA implementation debug

---
 PyTorchSimFrontend/extension_codecache.py     |   2 -
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 592 ++----------------
 2 files changed, 48 insertions(+), 546 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index b1c457d3..d3ac7259 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -37,7 +37,6 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
-            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
@@ -87,7 +86,6 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
-            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index 077a8cd2..adcc7801 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -563,384 +563,6 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
         return tile_candidates
 
 
-# ---------------------------
-# Decode-only GQA SDPA (Lq == 1)
-# ---------------------------
-
-DECODE_GQA_SDPA_TEMPLATE = r"""
-// Decode GQA SDPA kernel (Lq == 1)
-// B = {{ B }}
-// Hq = {{ Hq }}
-// H = {{ H }}
-// g = {{ g }}
-// S = {{ S }}
-// Dh = {{ Dh }}
-// BlkS = {{ BlkS }}
-// tile_s = {{ tile_s }}
-// tile_e = {{ tile_e }}
-// dh_tiles = {{ dh_tiles }}
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} {
-  // IO buffers follow input dtype (fp16/bf16/f32)
-  {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
-  // Softmax output used for SV matmul (io dtype)
-  {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }}
-  // Accumulator in fp32 (stable)
-  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
-  // Temp output in io dtype for SV matmul result
-  {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }}
-  // Softmax running stats in fp32
-  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
-
-  %c0 = arith.constant 0.0 : {{ acc_stype }}
-  %c1 = arith.constant 1.0 : {{ acc_stype }}
-  %c_scale = arith.constant {{ scale }} : {{ acc_stype }}
-  %c_neg_inf = arith.constant -1.0e+30 : {{ acc_stype }}
-
-  %v0_e_acc = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ acc_stype }}>
-  %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}>
-  %v0_2x = arith.constant dense<0.0> : vector<2x{{ acc_stype }}>
-  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2x{{ acc_stype }}>
-  %v0_s_acc = arith.constant dense<0.0> : vector<{{ tile_s }}x{{ acc_stype }}>
-
-  %v_scale = vector.broadcast %c_scale : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
-
-  {{ kernel.def_local_vars(indent_size=2) }}
-
-  // kv_head parallelism is the natural unit for GQA reuse
-  affine.for %kv = 0 to {{ H }} {
-    // Process S in blocks (BlkS). Sequential inside a core.
-    affine.for %blk = 0 to {{ S }} step {{ BlkS }} {
-      // Initialize per-qsub accumulators for this (kv, blk)
-      affine.for %qsub = 0 to {{ g }} {
-        affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-        affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          affine.vector_store %v0_e_acc, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-        }
-      }
-
-      affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
-        // Accumulate score per qsub so K tiles can be shared across qsub.
-        affine.for %qsub = 0 to {{ g }} {
-          affine.vector_store %v0_s_acc, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
-        }
-
-        affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
-          // Load K slice once for all qsub.
-          %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
-          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
-          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
-
-          affine.for %qsub = 0 to {{ g }} {
-            %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
-            %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0)
-            {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
-            %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
-
-            // mul = k @ q  -> (tile_s x 1) in io dtype, then upcast and accumulate.
-            linalg.matmul
-              { idx_map = array<i32: 1, 0, -1> }
-              ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
-              outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
-
-            %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            {% if io_stype != acc_stype %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}x{{ acc_stype }}>{% endif %}
-            %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
-            %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != acc_stype else "%raw_mul_io" }} : vector<{{ tile_s }}x{{ acc_stype }}>
-            affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
-          } { accumulation_loop=true }
-        } { accumulation_loop=true }
-
-        affine.for %qsub = 0 to {{ g }} {
-          %score_acc = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_s }}x{{ acc_stype }}>
-          // scale after full Dh reduction
-          %scaled_mul_vec = arith.mulf %score_acc, %v_scale : vector<{{ tile_s }}x{{ acc_stype }}>
-
-            // Online softmax update (max/sum/out) identical to FLASH_SDPA_TEMPLATE but specialized to Lq==1.
-            %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-            // Reduce max over tile_s
-            %max_init = vector.broadcast %c_neg_inf : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
-            %local_max_vec = arith.maximumf %scaled_mul_vec, %max_init : vector<{{ tile_s }}x{{ acc_stype }}>
-            %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}>
-            %max_red1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}>
-            %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}>
-            %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2x{{ acc_stype }}>
-            %new_max = arith.maximumf %max_red2, %old_max : vector<2x{{ acc_stype }}>
-            affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-
-            // rescale = exp(old_max - new_max)
-            %max_diff = arith.subf %old_max, %new_max : vector<2x{{ acc_stype }}>
-            %max_diff_scalar = vector.extract %max_diff[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
-            %rescale_e = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}>
-            %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}x{{ acc_stype }}>
-            %rescale_2 = vector.broadcast %max_diff_scalar : {{ acc_stype }} to vector<2x{{ acc_stype }}>
-            %exp_rescale_2 = math.exp %rescale_2 : vector<2x{{ acc_stype }}>
-
-            // out *= rescale
-            %old_out = affine.vector_load %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-            %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}x{{ acc_stype }}>
-            affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-
-            // sum *= rescale
-            %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-            %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2x{{ acc_stype }}>
-
-            // exp(score - new_max)
-            %new_max_scalar = vector.extract %new_max[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
-            %new_max_bcast = vector.broadcast %new_max_scalar : {{ acc_stype }} to vector<{{ tile_s }}x{{ acc_stype }}>
-            %shifted = arith.subf %scaled_mul_vec, %new_max_bcast : vector<{{ tile_s }}x{{ acc_stype }}>
-            %exp_scores = math.exp %shifted : vector<{{ tile_s }}x{{ acc_stype }}>
-            // For SV matmul: downcast softmax output to io dtype (common in practice)
-            {% if io_stype != acc_stype %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %}
-            affine.vector_store {{ "%exp_scores_io" if io_stype != acc_stype else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-
-            // sum += reduce(exp_scores)
-            %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}x{{ acc_stype }}> to vector<{{ tile_s // 2 }}x2x{{ acc_stype }}>
-            %zero_2x = vector.broadcast %c0 : {{ acc_stype }} to vector<2x{{ acc_stype }}>
-            %sum_red1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2x{{ acc_stype }}> to vector<2x{{ acc_stype }}>
-            %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2x{{ acc_stype }}>, vector<2x{{ acc_stype }}>
-            %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2x{{ acc_stype }}>
-            %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2x{{ acc_stype }}>
-            affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-
-        } { accumulation_loop=true }
-
-        // 2) SV accumulation: for each output dh tile, load V once and share across qsub.
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
-          %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk]
-          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }}
-          %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
-
-          affine.for %qsub = 0 to {{ g }} {
-            %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
-            linalg.matmul
-              { idx_map = array<i32: 2, 1, -1> }
-              ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }})
-              outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
-
-            %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            {% if io_stype != acc_stype %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}x{{ acc_stype }}>{% endif %}
-            %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-            %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != acc_stype else "%out_io_vec" }} : vector<{{ tile_e }}x{{ acc_stype }}>
-            affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-          } { accumulation_loop=true }
-        } { accumulation_loop=true }
-      } { accumulation_loop=true }
-
-      // finalize per-qsub for this (kv, blk) and store out for all dh tiles
-      affine.for %qsub = 0 to {{ g }} {
-        %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape(acc_stype) }}, vector<2x{{ acc_stype }}>
-        %one_2x = vector.broadcast %c1 : {{ acc_stype }} to vector<2x{{ acc_stype }}>
-        %inv_sum_2x = arith.divf %one_2x, %final_sum : vector<2x{{ acc_stype }}>
-        %inv_sum = vector.extract %inv_sum_2x[0] : {{ acc_stype }} from vector<2x{{ acc_stype }}>
-        %inv_bcast = vector.broadcast %inv_sum : {{ acc_stype }} to vector<{{ tile_e }}x{{ acc_stype }}>
-
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
-          %acc_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape(acc_stype) }}, vector<{{ tile_e }}x{{ acc_stype }}>
-          %final_out_acc = arith.mulf %acc_out, %inv_bcast : vector<{{ tile_e }}x{{ acc_stype }}>
-          {% if io_stype != acc_stype %}%final_out_io = arith.truncf %final_out_acc : vector<{{ tile_e }}x{{ acc_stype }}> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %}
-          affine.vector_store {{ "%final_out_io" if io_stype != acc_stype else "%final_out_acc" }}, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-          %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
-          %out_offset = affine.apply {{ out_offset_map }}(%q_head, %dh0)
-          {{ kernel.def_dma_op("MVOUT", "out", [], out_io_tile_desc, indent_size=10, dram_stride=out_dram_stride, dram_offset="out_offset") }}
-        }
-      } { outer_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
-
-  return
-}
-"""
-
-
-class MLIRDecodeGQASDPATemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None):
-        super().__init__("kernel", input_nodes, layout, input_reorder)
-        self.scale = scale
-        self.BlkS = BlkS
-
-    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
-        # Decode-only: q is (B,Hq,1,Dh)
-        # Use template_buffer_node (the actual V.graph-registered CUDATemplateBuffer with its
-        # real name e.g. "buf0") when available, instead of the placeholder self.output_node
-        # (always named "buf_out").  This ensures output_buffers["buf0"] maps correctly
-        # in mlir_argdefs, which looks up buffer_types by the actual DRAM buffer name.
-        query, key, value, out = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2], \
-            template_buffer_node if template_buffer_node is not None else self.output_node
-
-        # Materialize tensors for stride metadata
-        q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
-        k_tensor4 = empty_strided(key.layout.size, key.layout.stride)
-        v_tensor4 = empty_strided(value.layout.size, value.layout.stride)
-
-        B, Hq, Lq, Dh = q_tensor4.shape
-        Bk, H, S, Dhk = k_tensor4.shape
-        assert B == 1, "Decode GQA template currently supports B==1"
-        assert Lq == 1, "Decode GQA template requires Lq==1"
-        assert Dh == Dhk
-        g = Hq // H
-        BlkS = min(int(self.BlkS), int(S))
-
-        # Use 3D views to match the existing SDPA indexing scheme
-        # q: (Hq, 1, Dh), k/v: (H, S, Dh), out: (Hq, 1, Dh)
-        q_tensor = q_tensor4.view(Hq, 1, Dh)
-        k_tensor = k_tensor4.view(H, S, Dh)
-        v_tensor = v_tensor4.view(H, S, Dh)
-
-        tile_s = kernel.vector_lane
-        tile_e = kernel.vector_lane
-        dh_tiles = int(Dh) // int(tile_e)
-
-        io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()]
-        acc_stype = "f32"
-
-        # SRAM tiles: q(1x1xtile_e), k/v(1xtile_sxtile_e), mul(tile_sx1) in io dtype.
-        # out_acc in f32; out_io temp in io dtype.
-        vlane_stride = 1
-        q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        q_tile_desc.set_name("q_buffer")
-        q_tile_desc.offset = query.get_layout().offset
-
-        k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride)
-        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s])
-        k_tile_desc.set_name("k_buffer")
-        k_tile_desc.offset = key.get_layout().offset
-
-        v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride)
-        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1])
-        v_tile_desc.set_name("v_buffer")
-        v_tile_desc.offset = value.get_layout().offset
-
-        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride)
-        mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1])
-        mul_tile_desc.set_name("mul_buffer")
-
-        score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        score_desc.set_name("score_buffer")
-
-        prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        prob_desc.set_name("prob_buffer")
-
-        # Per-qsub accumulators so KV tiles can be shared across qsub
-        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
-        out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
-        out_acc_tile_desc.set_name("out_acc_buffer")
-
-        out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        out_io_tile_desc.set_name("out_io_buffer")
-
-        max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        max_desc.set_tile_size_stride([g, 2], [2, 1])
-        max_desc.set_name("max_buffer")
-
-        sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        sum_desc.set_tile_size_stride([g, 2], [2, 1])
-        sum_desc.set_name("sum_buffer")
-
-        # Strides from 3D tensor views
-        q_stride = q_tensor.stride()
-        k_stride = k_tensor.stride()
-        v_stride = v_tensor.stride()
-        # out is (B,Hq,1,Dh) but we address it as (Hq,1,Dh)
-        out_tensor = empty_strided(out.get_layout().size, out.get_layout().stride).view(Hq, 1, Dh)
-        out_stride = out_tensor.stride()
-
-        # DMA strides (per-dimension DRAM strides for each tile)
-        k_dram_stride  = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
-        # Q: q_head is pre-computed in template; stride[1]=0 since Lq=1
-        q_dram_stride  = [int(q_stride[0]), 0, int(q_stride[2])]
-        v_dram_stride  = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
-        # out: q_head is pre-computed; stride[1]=0 since Lq=1
-        out_dram_stride = [int(out_stride[0]), 0, int(out_stride[2])]
-
-        # Affine maps for flat DRAM base address (used with pre-computed loop var expressions)
-        # K: offset(kv, s0, k0)
-        kk_offset_map = _make_offset_map(k_dram_stride, k_tile_desc.offset)
-        # Q: offset(q_head, k0)  -- q_head = kv*g+qsub pre-computed in template
-        qk_offset_map = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset)
-        # V: offset(kv, s0, dh0)
-        v_offset_map  = _make_offset_map(v_dram_stride, v_tile_desc.offset)
-        # Out: offset(q_head, dh0)  -- q_head pre-computed in template
-        out_offset_map = _make_offset_map([int(out_stride[0]), int(out_stride[2])], 0)
-        # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is the absolute
-        # block start (steps by BlkS), so actual_s = s0_rel + 1*blk → sym_stride=1.
-        kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=1, offset=k_tile_desc.offset)
-        v_offset_map_blk  = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=1, offset=v_tile_desc.offset)
-
-        # Keep sympy-based out_idx only for epilogue_info (not in render_options)
-        kv      = sympy.Symbol("kv")
-        qsub    = sympy.Symbol("qsub")
-        dh0     = sympy.Symbol("dh0")
-        s0      = sympy.Symbol("s0")
-        q_head  = kv * g + qsub
-        out_idx = [q_head * out_stride[0], sympy.Integer(0), dh0 * out_stride[2]]
-
-        kernel.loop_size = [tile_s, tile_e, 1]
-
-        kernel.render_options = dict(
-            KERNEL_NAME=self.name,
-            kernel=kernel,
-            B=B,
-            Hq=Hq,
-            H=H,
-            g=g,
-            S=S,
-            Dh=Dh,
-            dh_tiles=dh_tiles,
-            BlkS=BlkS,
-            tile_s=tile_s,
-            tile_e=tile_e,
-            io_stype=io_stype,
-            acc_stype=acc_stype,
-            scale=self.scale,
-            query=query,
-            key=key,
-            value=value,
-            out=out,
-            q_tile_desc=q_tile_desc,
-            k_tile_desc=k_tile_desc,
-            v_tile_desc=v_tile_desc,
-            out_acc_tile_desc=out_acc_tile_desc,
-            out_io_tile_desc=out_io_tile_desc,
-            mul_tile_desc=mul_tile_desc,
-            score_desc=score_desc,
-            prob_desc=prob_desc,
-            max_desc=max_desc,
-            sum_desc=sum_desc,
-            # DMA strides
-            k_dram_stride=k_dram_stride,
-            q_dram_stride=q_dram_stride,
-            v_dram_stride=v_dram_stride,
-            out_dram_stride=out_dram_stride,
-            # Affine offset maps
-            kk_offset_map=kk_offset_map,
-            qk_offset_map=qk_offset_map,
-            v_offset_map=v_offset_map,
-            out_offset_map=out_offset_map,
-            kk_offset_map_blk=kk_offset_map_blk,
-            v_offset_map_blk=v_offset_map_blk,
-            input_reorder=self.input_reorder,
-        )
-
-        return self._template_from_string(DECODE_GQA_SDPA_TEMPLATE).render(**kernel.render_options)
-
-
 # ---------------------------
 # Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce)
 # ---------------------------
@@ -960,13 +582,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
   {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
   {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("score", score_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("prob", prob_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("out_io", out_io_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
+
 
   %c0 = arith.constant 0.0 : f32
   %c_scale = arith.constant {{ scale }} : f32
@@ -984,135 +600,21 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
   affine.for %kv = 0 to {{ H }} {
     affine.for %blk = 0 to {{ nblk }} step 1 {
       // Reset per-block accumulators for all qsub/dh tiles.
-      affine.for %qsub = 0 to {{ g }} {
-        affine.vector_store %v_neg_inf_2x, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-        affine.vector_store %v0_2x, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          affine.vector_store %v0_e, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-        }
-      }
-
+      %qk_offset = affine.apply {{ qk_offset_map }}(%kv)
+      {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[Dh, 1, g_size], indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
+      %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1>
       affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
-        // Accumulate score per qsub so K tiles can be shared across qsub.
-        affine.for %qsub = 0 to {{ g }} {
-          affine.vector_store %v0_s, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
-        }
-
         affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
           %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
           {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
-          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }}, 1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
-
-          affine.for %qsub = 0 to {{ g }} {
-            %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
-            %qk_offset = affine.apply {{ qk_offset_map }}(%q_head, %k0)
-            {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, 1, tile_e], indent_size=12, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
-            %q2D = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
-            linalg.matmul
-              { idx_map = array<i32: 1, 0, -1> }
-              ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
-              outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
-            %raw_mul_io = affine.vector_load %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            {% if io_stype != "f32" %}%raw_mul = arith.extf %raw_mul_io : vector<{{ tile_s }}x{{ io_stype }}> to vector<{{ tile_s }}xf32>{% endif %}
-            %old_score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
-            %new_score = arith.addf %old_score, {{ "%raw_mul" if io_stype != "f32" else "%raw_mul_io" }} : vector<{{ tile_s }}xf32>
-            affine.vector_store %new_score, %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
-          } { accumulation_loop=true }
-        } { accumulation_loop=true }
-
-        // Softmax once per qsub; persist probabilities in SRAM for all SV dh tiles.
-        affine.for %qsub = 0 to {{ g }} {
-          %score = affine.vector_load %score_buffer[%qsub, 0] : {{ score_desc.get_mlir_shape("f32") }}, vector<{{ tile_s }}xf32>
-          %scaled = arith.mulf %score, %v_scale : vector<{{ tile_s }}xf32>
-
-          %old_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-          %max_init = vector.broadcast %c_neg_inf : f32 to vector<{{ tile_s }}xf32>
-          %local_max_vec = arith.maximumf %scaled, %max_init : vector<{{ tile_s }}xf32>
-          %max_cast = vector.shape_cast %local_max_vec : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32>
-          %max_red1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32>
-          %max_shuf = vector.shuffle %max_red1, %max_red1 [1, 0] : vector<2xf32>, vector<2xf32>
-          %max_red2 = arith.maximumf %max_red1, %max_shuf : vector<2xf32>
-          %new_max = arith.maximumf %max_red2, %old_max : vector<2xf32>
-          affine.vector_store %new_max, %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-
-          %max_diff = arith.subf %old_max, %new_max : vector<2xf32>
-          %max_diff_scalar = vector.extract %max_diff[0] : f32 from vector<2xf32>
-          %rescale_e = vector.broadcast %max_diff_scalar : f32 to vector<{{ tile_e }}xf32>
-          %exp_rescale_e = math.exp %rescale_e : vector<{{ tile_e }}xf32>
-          %rescale_2 = vector.broadcast %max_diff_scalar : f32 to vector<2xf32>
-          %exp_rescale_2 = math.exp %rescale_2 : vector<2xf32>
-
-          %old_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-          %rescaled_sum = arith.mulf %old_sum, %exp_rescale_2 : vector<2xf32>
-
-          affine.for %dht = 0 to {{ dh_tiles }} {
-            %old_out = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-            %rescaled_out = arith.mulf %exp_rescale_e, %old_out : vector<{{ tile_e }}xf32>
-            affine.vector_store %rescaled_out, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-          }
-
-          %new_max_scalar = vector.extract %new_max[0] : f32 from vector<2xf32>
-          %new_max_bcast = vector.broadcast %new_max_scalar : f32 to vector<{{ tile_s }}xf32>
-          %shifted = arith.subf %scaled, %new_max_bcast : vector<{{ tile_s }}xf32>
-          %exp_scores = math.exp %shifted : vector<{{ tile_s }}xf32>
-          {% if io_stype != "f32" %}%exp_scores_io = arith.truncf %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s }}x{{ io_stype }}>{% endif %}
-          affine.vector_store {{ "%exp_scores_io" if io_stype != "f32" else "%exp_scores" }}, %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-
-          %sum_cast = vector.shape_cast %exp_scores : vector<{{ tile_s }}xf32> to vector<{{ tile_s // 2 }}x2xf32>
-          %zero_2x = vector.broadcast %c0 : f32 to vector<2xf32>
-          %sum_red1 = vector.multi_reduction <add>, %sum_cast, %zero_2x [0] : vector<{{ tile_s // 2 }}x2xf32> to vector<2xf32>
-          %sum_shuf = vector.shuffle %sum_red1, %sum_red1 [1, 0] : vector<2xf32>, vector<2xf32>
-          %sum_red2 = arith.addf %sum_red1, %sum_shuf : vector<2xf32>
-          %new_sum = arith.addf %sum_red2, %rescaled_sum : vector<2xf32>
-          affine.vector_store %new_sum, %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-        } { accumulation_loop=true }
+          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
+          %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>
+          linalg.matmul
+            ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>)
+            outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
 
-        // For each output dh tile, load V once and share it across qsub.
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          %dh0 = affine.apply affine_map<(d0) -> (d0 * {{ tile_e }})>(%dht)
-          %v_offset = affine.apply {{ v_offset_map_blk }}(%kv, %s0, %dh0)[%blk]
-          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=0, dram_stride=v_dram_stride, dram_offset="v_offset") }}
-          %v2D = memref.reinterpret_cast %v_buffer to offset: [0], sizes: [{{ tile_e }}, {{ tile_s }}], strides: [{{ tile_s }}, 1] : {{ v_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>
-
-          affine.for %qsub = 0 to {{ g }} {
-            %prob_vec = affine.vector_load %prob_buffer[%qsub, 0] : {{ prob_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            affine.vector_store %prob_vec, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_s }}x{{ io_stype }}>
-            affine.vector_store %v0_e_io, %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            %out_io_2D = memref.reinterpret_cast %out_io_buffer to offset: [0], sizes: [{{ tile_e }}, 1], strides: [1, 1] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_e }}x1x{{ io_stype }}, 1>
-            linalg.matmul
-              { idx_map = array<i32: 2, 1, -1> }
-              ins(%v2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ io_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(io_stype) }})
-              outs(%out_io_2D : memref<{{ tile_e }}x1x{{ io_stype }}, 1>)
-
-            %out_io_vec = affine.vector_load %out_io_buffer[0, 0, 0] : {{ out_io_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-            {% if io_stype != "f32" %}%out_io_f32 = arith.extf %out_io_vec : vector<{{ tile_e }}x{{ io_stype }}> to vector<{{ tile_e }}xf32>{% endif %}
-            %out_acc_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-            %out_acc_new = arith.addf %out_acc_vec, {{ "%out_io_f32" if io_stype != "f32" else "%out_io_vec" }} : vector<{{ tile_e }}xf32>
-            affine.vector_store %out_acc_new, %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-          } { accumulation_loop=true }
         } { accumulation_loop=true }
       } { accumulation_loop=true }
-
-      // Store packed partials for all qsub/dh tiles.
-      affine.for %qsub = 0 to {{ g }} {
-        %final_max = affine.vector_load %max_buffer[%qsub, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-        %m_scalar = vector.extract %final_max[0] : f32 from vector<2xf32>
-        %final_sum = affine.vector_load %sum_buffer[%qsub, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-        %l_scalar = vector.extract %final_sum[0] : f32 from vector<2xf32>
-        %ml_vec = vector.broadcast %c0 : f32 to vector<{{ tile_e }}xf32>
-        %ml0 = vector.insert %m_scalar, %ml_vec[0] : f32 into vector<{{ tile_e }}xf32>
-        %ml1 = vector.insert %l_scalar, %ml0[1] : f32 into vector<{{ tile_e }}xf32>
-
-        affine.for %dht = 0 to {{ dh_tiles }} {
-          %out_vec = affine.vector_load %out_acc_buffer[%qsub, %dht, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-          %packed = vector.shuffle %out_vec, %ml1 [{{ range(tile_pack) | join(', ') }}] : vector<{{ tile_e }}xf32>, vector<{{ tile_e }}xf32>
-          affine.vector_store %packed, %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
-          %q_head = affine.apply affine_map<(d0, d1) -> (d0 * {{ g }} + d1)>(%kv, %qsub)
-          %gh = affine.apply affine_map<(d0, d1) -> (d0 * {{ dh_tiles }} + d1)>(%q_head, %dht)
-          %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk)
-          {{ kernel.def_dma_op("MVOUT", "partial", [], partial_tile_desc, indent_size=10, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
-        }
-      } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
   return
@@ -1138,6 +640,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
         _, H, S, _ = k_tensor4.shape
         assert B == 1 and Lq == 1
         g = Hq // H
+        g_size = g
         BlkS = min(int(self.BlkS), int(S))
         nblk = (int(S) + int(BlkS) - 1) // int(BlkS)
 
@@ -1157,53 +660,53 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 
         # tile descs
         vlane_stride = 1
-        q_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        q_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        q_tile_desc = mlir_common.MLIRMultiDimTile([Dh, 1, g_size], kernel.vector_lane, 2, vlane_stride)
+        q_tile_desc.set_tile_size_stride([Dh, 1, g_size], [g_size, 1, 1])
         q_tile_desc.set_name("q_buffer")
         q_tile_desc.offset = query.get_layout().offset
 
         k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride)
-        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, 1, tile_s])
+        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, 1, tile_s])
         k_tile_desc.set_name("k_buffer")
         k_tile_desc.offset = key.get_layout().offset
 
         v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride)
-        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [0, tile_e, 1])
+        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, tile_e, 1])
         v_tile_desc.set_name("v_buffer")
         v_tile_desc.offset = value.get_layout().offset
 
-        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, 1], kernel.vector_lane, 1, vlane_stride)
-        mul_tile_desc.set_tile_size_stride([tile_s, 1], [1, 1])
+        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, g_size], kernel.vector_lane, 1, vlane_stride)
+        mul_tile_desc.set_tile_size_stride([tile_s, g_size], [1, tile_s])
         mul_tile_desc.set_name("mul_buffer")
 
-        score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        score_desc.set_name("score_buffer")
+        # score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        # score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        # score_desc.set_name("score_buffer")
 
-        prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        prob_desc.set_name("prob_buffer")
+        # prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
+        # prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
+        # prob_desc.set_name("prob_buffer")
 
-        # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles.
-        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
-        out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
-        out_acc_tile_desc.set_name("out_acc_buffer")
+        # # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles.
+        # out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
+        # out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
+        # out_acc_tile_desc.set_name("out_acc_buffer")
 
-        max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        max_desc.set_tile_size_stride([g, 2], [2, 1])
-        max_desc.set_name("max_buffer")
+        # max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        # max_desc.set_tile_size_stride([g, 2], [2, 1])
+        # max_desc.set_name("max_buffer")
 
-        sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        sum_desc.set_tile_size_stride([g, 2], [2, 1])
-        sum_desc.set_name("sum_buffer")
+        # sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
+        # sum_desc.set_tile_size_stride([g, 2], [2, 1])
+        # sum_desc.set_name("sum_buffer")
 
-        out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        out_io_tile_desc.set_name("out_io_buffer")
+        # out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
+        # out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
+        # out_io_tile_desc.set_name("out_io_buffer")
 
-        partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
-        partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
-        partial_tile_desc.set_name("partial_buffer")
+        # partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
+        # partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
+        # partial_tile_desc.set_name("partial_buffer")
 
         # Strides from 3D tensor views
         q_stride = q_tensor.stride()
@@ -1216,13 +719,13 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 
         # DMA strides
         k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
-        q_dram_stride = [int(q_stride[0]), 0, int(q_stride[2])]
+        q_dram_stride = [int(q_stride[2]), 0, int(q_stride[1])]
         v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
         partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
 
         # Affine offset maps
         kk_offset_map   = _make_offset_map(k_dram_stride, k_tile_desc.offset)
-        qk_offset_map   = _make_offset_map([int(q_stride[0]), int(q_stride[2])], q_tile_desc.offset)
+        qk_offset_map   = _make_offset_map([int(g) * int(q_stride[2])], q_tile_desc.offset)
         v_offset_map    = _make_offset_map(v_dram_stride, v_tile_desc.offset)
         # partial: offset(gh, blk)  -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template
         partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0)
@@ -1254,6 +757,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
             nblk=nblk,
             tile_s=tile_s,
             tile_e=tile_e,
+            g_size=g_size,
             dh_tiles=dh_tiles,
             tile_pack=tile_pack,
             io_stype=io_stype,
@@ -1266,13 +770,13 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
             k_tile_desc=k_tile_desc,
             v_tile_desc=v_tile_desc,
             mul_tile_desc=mul_tile_desc,
-            score_desc=score_desc,
-            prob_desc=prob_desc,
-            out_io_tile_desc=out_io_tile_desc,
-            out_acc_tile_desc=out_acc_tile_desc,
-            max_desc=max_desc,
-            sum_desc=sum_desc,
-            partial_tile_desc=partial_tile_desc,
+            # score_desc=score_desc,
+            # prob_desc=prob_desc,
+            # out_io_tile_desc=out_io_tile_desc,
+            # out_acc_tile_desc=out_acc_tile_desc,
+            # max_desc=max_desc,
+            # sum_desc=sum_desc,
+            # partial_tile_desc=partial_tile_desc,
             # DMA strides
             k_dram_stride=k_dram_stride,
             q_dram_stride=q_dram_stride,

From ce9330670c60bb4debf795c4771b8d80057e92e5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Mar 2026 21:30:07 +0900
Subject: [PATCH 131/194] [Template/SPDA] Remove subtile size temporarily

---
 PyTorchSimFrontend/extension_codecache.py     |  2 ++
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 18 ++++++------------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index d3ac7259..b1c457d3 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -37,6 +37,7 @@ def mlir_compile_command(filename, vectorlane_size, vlen=256):
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding \
+            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size}" \
@@ -86,6 +87,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
         f"""
             {extension_config.CONFIG_TORCHSIM_LLVM_PATH}/mlir-opt \
             -test-loop-padding='timing_mode=1' \
+            -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
             -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index adcc7801..b1569be6 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -169,9 +169,6 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
 // tile_l = {{ tile_l }}
 // tile_s = {{ tile_s }}
 // tile_e = {{ tile_e }}
-// subtile_l = {{ subtile_l }}
-// subtile_s = {{ subtile_s }}
-// subtile_e = {{ subtile_e }}
 {{kernel.def_global_vars()}}
 
 func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[out], names_str="query, key, value, out", input_reorder=input_reorder)}} {
@@ -210,7 +207,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
     affine.for %index3 = 0 to 1 step 1 {
       affine.for %index1 = 0 to {{ l }} step {{ tile_l }} {
         %q_dram_offset = affine.apply {{ q_offset_map }}(%index0, %index1, %index3)
-        {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[1, subtile_l, subtile_e], indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }}
+        {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="q_dram_offset") }}
 
         affine.vector_store %v0_l, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_l, tile_e) }}x{{ data_stype }}>
         affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
@@ -221,9 +218,9 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
 
         affine.for %index2 = 0 to {{ s }} step {{ tile_s }} {
           %k_dram_offset = affine.apply {{ k_offset_map }}(%index0, %index2, %index3)
-          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }}
+          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, dram_stride=k_dram_stride, dram_offset="k_dram_offset") }}
           %v_dram_offset = affine.apply {{ v_offset_map }}(%index0, %index2, %index3)
-          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, subtile_size=[1, subtile_s, subtile_e], indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }}
+          {{ kernel.def_dma_op("MVIN", "value", [], v_tile_desc, indent_size=10, dram_stride=v_dram_stride, dram_offset="v_dram_offset") }}
 
           affine.vector_store %v0_s, %mul_buffer[0, 0] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ kernel.get_spad_size_per_lane(tile_s, tile_l) }}x{{ data_stype }}>
 
@@ -487,9 +484,6 @@ def render(self,
             tile_l = tile_l,
             tile_s = tile_s,
             tile_e = tile_e,                   # Tile sizes (sram)
-            subtile_l = subtile_l,
-            subtile_s = subtile_s,
-            subtile_e = subtile_e,             # Subtile sizes (sram)
             data_stype="f32",
             query = query,
             key = key,
@@ -601,12 +595,12 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
     affine.for %blk = 0 to {{ nblk }} step 1 {
       // Reset per-block accumulators for all qsub/dh tiles.
       %qk_offset = affine.apply {{ qk_offset_map }}(%kv)
-      {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, subtile_size=[Dh, 1, g_size], indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
+      {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
       %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1>
       affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
         affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
           %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
-          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, subtile_size=[1, tile_s, tile_e], indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
+          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
           %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
           %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>
           linalg.matmul
@@ -824,7 +818,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 
     affine.for %blk = 0 to {{ nblk }} {
       %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk)
-      {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, subtile_size=[1, 1, tile_pack], indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
+      {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
       %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
       %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32>
       %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>

From f2717e1cd117b5229f769ebf3a7040185c984891 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 13 Mar 2026 22:45:00 +0900
Subject: [PATCH 132/194] [Template/SPDA] minor fix

---
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index b1569be6..be6e7124 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -713,7 +713,7 @@ def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue
 
         # DMA strides
         k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
-        q_dram_stride = [int(q_stride[2]), 0, int(q_stride[1])]
+        q_dram_stride = [int(q_stride[2]), 0, int(q_stride[0])]
         v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
         partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
 

From be23638400926454d8be17742eff4b6fc358b750 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 16 Mar 2026 20:43:21 +0900
Subject: [PATCH 133/194] [Cleanup] Unflag debug option

---
 PyTorchSimFrontend/extension_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index fe8cc380..1b7ccf8d 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -130,7 +130,7 @@ def load_plan_from_module(module_path):
 
 CONFIG_USE_TIMING_POOLING = int(os.environ.get('TORCHSIM_USE_TIMING_POOLING', default=0))
 
-CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=1))
+CONFIG_DEBUG_MODE = int(os.environ.get('TORCHSIM_DEBUG_MODE', default=0))
 
 
 def setup_logger(name=None, level=None):

From e925ae45cad8cebca98e42de5c1cfb8c01cd35bf Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 16 Mar 2026 22:02:07 +0900
Subject: [PATCH 134/194] [CI] Add deepseek test case

---
 .github/workflows/pytorchsim_test.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index eaaa7e50..36a62b68 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -726,6 +726,27 @@ jobs:
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py
 
+  test_deepseek:
+    name: Run test_deepseek
+    runs-on: self-hosted
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run test_deepseek_v3_base.py
+        run: |
+          echo "Running test_deepseek_v3_base.py"
+          docker run --rm \
+            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
+            -e TORCHSIM_DUMP_PATH=/dump \
+            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
+            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py
+
   test_accuracy:
     name: Run test_accuracy
     runs-on: self-hosted

From db859911ed73b21db65031f84dc47dc4555dcc3f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 17 Mar 2026 16:24:45 +0900
Subject: [PATCH 135/194] [Template/SPDA] Cleanup test case + Add an activate
 option

---
 PyTorchSimDevice/csrc/aten/native/Extra.cpp   |  34 +-
 .../torch_openreg/openreg/__init__.py         |   5 +
 PyTorchSimFrontend/mlir/mlir_lowering.py      |  60 +--
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 423 +-----------------
 tests/test_sdpa.py                            | 241 +++++-----
 5 files changed, 181 insertions(+), 582 deletions(-)

diff --git a/PyTorchSimDevice/csrc/aten/native/Extra.cpp b/PyTorchSimDevice/csrc/aten/native/Extra.cpp
index aaf28e1a..eb76f5d7 100644
--- a/PyTorchSimDevice/csrc/aten/native/Extra.cpp
+++ b/PyTorchSimDevice/csrc/aten/native/Extra.cpp
@@ -20,8 +20,38 @@ int64_t _fused_sdp_choice(
     std::optional<double> scale,
     bool enable_gqa) {
 
-  auto backend = sdp::SDPBackend::overrideable;
-  return static_cast<int64_t>(backend);
+  sdp::sdp_params params{query, key, value, attn_mask, dropout_p, is_causal, enable_gqa};
+
+  // Reject inputs that are fundamentally unsupported (e.g. wrong rank)
+  if (!sdp::check_tensor_shapes(params, /*debug=*/false)) {
+    return static_cast<int64_t>(sdp::SDPBackend::error);
+  }
+
+  // q: (B, Hq, L, E)   k/v: (B, H, S, E)
+  const int64_t Hq = query.size(-3);
+  const int64_t H  = key.size(-3);
+  const int64_t L  = query.size(-2);  // query sequence length
+  const int64_t S  = key.size(-2);    // key/value sequence length
+
+  // Conditions required by the MLIR FlashSDPA kernel:
+  // Prefill only  : L == S  (decode has L == 1, not supported)
+  // Non-GQA       : Hq == H (equal query and KV heads)
+  // No dropout    : template has no dropout implementation
+  // Dense tensors : no nested tensor support
+  const bool can_use_mlir_flash =
+      (L == S) &&
+      (Hq == H) && !enable_gqa &&
+      sdp::check_for_dropout(params, /*debug=*/false) &&
+      sdp::check_nested_tensor(params, /*debug=*/false);
+
+  const bool ctx_flash        = at::globalContext().userEnabledFlashSDP();
+  const bool ctx_math         = at::globalContext().userEnabledMathSDP();
+
+  if (ctx_flash && can_use_mlir_flash) {
+    return static_cast<int64_t>(sdp::SDPBackend::overrideable);
+  }
+
+  return static_cast<int64_t>(sdp::SDPBackend::math);
 }
 
 void quantize_tensor_per_tensor_affine_stub(
diff --git a/PyTorchSimDevice/torch_openreg/openreg/__init__.py b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
index f674ec06..592011aa 100644
--- a/PyTorchSimDevice/torch_openreg/openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/openreg/__init__.py
@@ -73,6 +73,11 @@ def _lazy_init():
     register_interface_for_device(custom_device(), ExtensionDeviceInterface)
     _initialized = True
 
+    # Set default SDPA backend to math-only for this device.
+    torch._C._set_sdp_use_flash(False)
+    torch._C._set_sdp_use_overrideable(False)
+    torch._C._set_sdp_use_math(True)
+
     # Create default streams for all devices
     num_devices = device_count()
     for device_idx in range(num_devices):
diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index 7b2c07bf..b717089f 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -20,8 +20,6 @@
 from PyTorchSimFrontend.mlir.mlir_sort_template import MLIRSortTemplate, MLIRStableSortTemplate
 from PyTorchSimFrontend.mlir.mlir_sdpa_template import (
     MLIRFlashSDPATemplate,
-    MLIRDecodeGQASDPAPartialTemplate,
-    MLIRDecodeGQASDPAReduceTemplate,
     flash_sdpa_args,
     calculate_scale,
 )
@@ -51,56 +49,27 @@ def tuned_bmm(mat1, mat2, *, layout=None):
 
 
 def tuned_flash_sdpa(
-        query             : TensorBox, 
-        key               : TensorBox, 
-        value             : TensorBox, 
+        query             : TensorBox,
+        key               : TensorBox,
+        value             : TensorBox,
         attn_bias         : Optional[TensorBox] = None,
-        dropout_p         : float = 0.0, 
-        is_causal         : bool = False, 
+        dropout_p         : float = 0.0,
+        is_causal         : bool = False,
         return_debug_mask : bool = False,
-        scale             : Optional[float] = None) -> tuple: 
-    
-    
+        scale             : Optional[float] = None,
+        enable_gqa        : bool = False) -> tuple:
+    # _fused_sdp_choice in C++ already guarantees:
+    #   L == S (prefill), Hq == H (non-GQA), dropout_p == 0.0
+    # before routing here via SDPBackend::overrideable.
+    # Non-matching shapes fall back to SDPBackend::math in C++ and decompose
+    # into primitive ops (matmul/softmax) before reaching this lowering.
     scale = calculate_scale(query, scale)
     N, Hq, H, L, S, E, Ev, layout, query, key, value = flash_sdpa_args(query, key, value)
-    
-    # Decode-only GQA fast path: q is (B,Hq,1,Dh), B==1, Hq!=H, Hq%H==0.
-    # Always use the 2-kernel decode path:
-    # 1) block partials over (kv head, sequence block)
-    # 2) reduce/merge across blocks
-    # This keeps KV shared across qsub, avoids dh0-outer duplication, and
-    # stores compact partials instead of full score/prob tensors in DRAM.
-    if L == 1 and Hq != H and N == 1 and (Hq % H) == 0:
-        g = Hq // H
-        vector_lane = extension_config.vpu_num_lanes
-        tile_e = vector_lane
-        dh_tiles = E // tile_e
-        decode_gqa_block_size = 512
-        BlkS = decode_gqa_block_size if S >= decode_gqa_block_size else int(S)
-        # Padding-based tail handling: allow S not divisible by BlkS.
-        nblk = (S + BlkS - 1) // BlkS
-        HgDhTiles = H * g * dh_tiles
-        tile_pack = tile_e * 2
-
-        partial_layout = ir.FixedLayout(
-            query.get_device(),
-            torch.float32,
-            [HgDhTiles, nblk, tile_pack],
-        )
-        partial_tmpl = MLIRDecodeGQASDPAPartialTemplate([query, key, value], partial_layout, scale, BlkS=BlkS)
-        partial = partial_tmpl.generate().output_node()
-        partial.realize()
-        reduce_tmpl = MLIRDecodeGQASDPAReduceTemplate([partial], layout, BlkS=BlkS)
-        out_node = reduce_tmpl.generate().output_node()
-        return (out_node, None, None, None, None, None, None, None, None)
-
     mlir_template = MLIRFlashSDPATemplate([query, key, value], layout, scale)
-
-    # _scaled_dot_product_flash_attention has to return a tuple which has 9 values
-    # since its backward(_scaled_dot_product_flash_attention_backward) needs that values.
-    # (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)
     return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None)
 
+
+
 def conv_layout(
     x: TensorBox,
     weight: TensorBox,
@@ -345,5 +314,4 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool):
     
 if extension_config.CONFIG_USE_TIMING_POOLING:
     lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
-
 lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()})
diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index be6e7124..37db4956 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -125,14 +125,6 @@ def flash_sdpa_args(
             "Flash SDPA currently requires matching head dimensions between query and value (e == ev)."
         )
 
-    # Support head dimensions larger than vector lanes by tiling e/ev.
-    # For now, require multiples of vector lanes (covers 64/128 with vlanes=16).
-    vector_lane = extension_config.vpu_num_lanes
-    if (e % vector_lane) != 0:
-        raise NotImplementedError(
-            f"Flash SDPA currently requires e to be a multiple of vlanes (e: {e}, vlanes: {vector_lane})."
-        )
-
     # Minimal GQA support (single-batch only for now).
     # We map each query head to a KV head by grouping: hq = g * h.
     if hq != h:
@@ -309,7 +301,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
             { idx_map = array<i32: 2, 1, -1> }
             ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }})
             outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
-        }
+        } {inner_loop=true}
 
         // out @ row_sum^(-1)
         %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
@@ -556,416 +548,3 @@ def select_tile(self, kernel, l, s, e, n_extra_node, n_extra_read, n_prologue_no
 
         return tile_candidates
 
-
-# ---------------------------
-# Decode-only GQA SDPA: 2-kernel pipeline (partial blocks + reduce)
-# ---------------------------
-
-DECODE_GQA_SDPA_PARTIAL_TEMPLATE = r"""
-// Decode GQA SDPA partial kernel (per sequence block)
-// Produces partials per (kv,qsub,dh_tile,blk):
-// - first half lanes: o_j (tile_e)
-// - second half lanes: [m_j, l_j, 0, 0, ...] (tile_e)
-// QK/softmax is computed once per (kv,qsub,s0) over full Dh using k0 reduction.
-// SV then reuses those probabilities across all dh tiles.
-// H = {{ H }}, g = {{ g }}, Dh = {{ Dh }}, dh_tiles = {{ dh_tiles }}, S = {{ S }}, BlkS = {{ BlkS }}, nblk = {{ nblk }}
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[query, key, value], outputs=[partial], names_str="query, key, value, partial", input_reorder=input_reorder)}} {
-  {{ kernel.def_sram_buffer("query", q_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("key", k_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("value", v_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("mul", mul_tile_desc, indent_size=2) }}
-
-
-  %c0 = arith.constant 0.0 : f32
-  %c_scale = arith.constant {{ scale }} : f32
-  %c_neg_inf = arith.constant -1.0e+30 : f32
-
-  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
-  %v0_e_io = arith.constant dense<0.0> : vector<{{ tile_e }}x{{ io_stype }}>
-  %v0_s = arith.constant dense<0.0> : vector<{{ tile_s }}xf32>
-  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
-  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
-  %v_scale = vector.broadcast %c_scale : f32 to vector<{{ tile_s }}xf32>
-
-  {{ kernel.def_local_vars(indent_size=2) }}
-
-  affine.for %kv = 0 to {{ H }} {
-    affine.for %blk = 0 to {{ nblk }} step 1 {
-      // Reset per-block accumulators for all qsub/dh tiles.
-      %qk_offset = affine.apply {{ qk_offset_map }}(%kv)
-      {{ kernel.def_dma_op("MVIN", "query", [], q_tile_desc, indent_size=8, dram_stride=q_dram_stride, dram_offset="qk_offset") }}
-      %q2D_buffer = memref.reinterpret_cast %q_buffer to offset: [0], sizes: [{{ Dh }}, {{ g_size }}], strides: [{{g_size}}, 1] : {{ q_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1>
-      affine.for %s0 = 0 to {{ BlkS }} step {{ tile_s }} {
-        affine.for %k0 = 0 to {{ Dh }} step {{ tile_e }} {
-          %kk_offset = affine.apply {{ kk_offset_map_blk }}(%kv, %s0, %k0)[%blk]
-          {{ kernel.def_dma_op("MVIN", "key", [], k_tile_desc, indent_size=10, padding=1, dram_stride=k_dram_stride, dram_offset="kk_offset") }}
-          %k2D = memref.reinterpret_cast %k_buffer to offset: [0], sizes: [{{ tile_s }}, {{ tile_e }}], strides: [{{ tile_e }},1] : {{ k_tile_desc.get_mlir_shape(io_stype) }} to memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>
-          %q2D = memref.reinterpret_cast %q2D_buffer to offset: [%k0], sizes: [{{ tile_e }}, {{ g_size }}], strides: [{{ g_size }}, 1] : memref<{{ Dh }}x{{ g_size }}x{{ io_stype }}, 1> to memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>
-          linalg.matmul
-            ins(%k2D, %q2D : memref<{{ tile_s }}x{{ tile_e }}x{{ io_stype }}, 1>, memref<{{ tile_e }}x{{ g_size }}x{{ io_stype }}, 1>)
-            outs(%mul_buffer : {{ mul_tile_desc.get_mlir_shape(io_stype) }})
-
-        } { accumulation_loop=true }
-      } { accumulation_loop=true }
-    } { outer_loop=true }
-  } { outer_loop=true }
-  return
-}
-"""
-
-
-class MLIRDecodeGQASDPAPartialTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, scale, BlkS: int = 1024, input_reorder=None):
-        super().__init__("kernel", input_nodes, layout, input_reorder)
-        self.scale = scale
-        self.BlkS = BlkS
-
-    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
-        query, key, value = self.input_nodes[0], self.input_nodes[1], self.input_nodes[2]
-        # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out".
-        partial = template_buffer_node if template_buffer_node is not None else self.output_node
-
-        q_tensor4 = empty_strided(query.layout.size, query.layout.stride)
-        k_tensor4 = empty_strided(key.layout.size, key.layout.stride)
-        v_tensor4 = empty_strided(value.layout.size, value.layout.stride)
-        B, Hq, Lq, Dh = q_tensor4.shape
-        _, H, S, _ = k_tensor4.shape
-        assert B == 1 and Lq == 1
-        g = Hq // H
-        g_size = g
-        BlkS = min(int(self.BlkS), int(S))
-        nblk = (int(S) + int(BlkS) - 1) // int(BlkS)
-
-        io_stype = mlir_common.DTYPE_TO_MLIR[query.get_dtype()]
-        tile_s = kernel.vector_lane
-        tile_e = kernel.vector_lane
-        tile_pack = tile_e * 2
-
-        # Use 3D views for indices
-        q_tensor = q_tensor4.view(Hq, 1, Dh)
-        k_tensor = k_tensor4.view(H, S, Dh)
-        v_tensor = v_tensor4.view(H, S, Dh)
-
-        # Flatten (kv,qsub,dh_tile) into GH = H*g*(Dh/tile_e)
-        dh_tiles = int(Dh) // int(tile_e)
-        HgDhTiles = int(H) * int(g) * int(dh_tiles)
-
-        # tile descs
-        vlane_stride = 1
-        q_tile_desc = mlir_common.MLIRMultiDimTile([Dh, 1, g_size], kernel.vector_lane, 2, vlane_stride)
-        q_tile_desc.set_tile_size_stride([Dh, 1, g_size], [g_size, 1, 1])
-        q_tile_desc.set_name("q_buffer")
-        q_tile_desc.offset = query.get_layout().offset
-
-        k_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 2, vlane_stride)
-        k_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, 1, tile_s])
-        k_tile_desc.set_name("k_buffer")
-        k_tile_desc.offset = key.get_layout().offset
-
-        v_tile_desc = mlir_common.MLIRMultiDimTile([1, tile_s, tile_e], kernel.vector_lane, 1, vlane_stride)
-        v_tile_desc.set_tile_size_stride([1, tile_s, tile_e], [1, tile_e, 1])
-        v_tile_desc.set_name("v_buffer")
-        v_tile_desc.offset = value.get_layout().offset
-
-        mul_tile_desc = mlir_common.MLIRMultiDimTile([tile_s, g_size], kernel.vector_lane, 1, vlane_stride)
-        mul_tile_desc.set_tile_size_stride([tile_s, g_size], [1, tile_s])
-        mul_tile_desc.set_name("mul_buffer")
-
-        # score_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        # score_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        # score_desc.set_name("score_buffer")
-
-        # prob_desc = mlir_common.MLIRMultiDimTile([g, tile_s], kernel.vector_lane, 1, vlane_stride)
-        # prob_desc.set_tile_size_stride([g, tile_s], [tile_s, 1])
-        # prob_desc.set_name("prob_buffer")
-
-        # # Per-qsub, per-dh-tile accumulators so QK is computed once and SV expands across dh tiles.
-        # out_acc_tile_desc = mlir_common.MLIRMultiDimTile([g, dh_tiles, tile_e], kernel.vector_lane, 2, vlane_stride)
-        # out_acc_tile_desc.set_tile_size_stride([g, dh_tiles, tile_e], [dh_tiles * tile_e, tile_e, 1])
-        # out_acc_tile_desc.set_name("out_acc_buffer")
-
-        # max_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        # max_desc.set_tile_size_stride([g, 2], [2, 1])
-        # max_desc.set_name("max_buffer")
-
-        # sum_desc = mlir_common.MLIRMultiDimTile([g, 2], kernel.vector_lane, 0, vlane_stride)
-        # sum_desc.set_tile_size_stride([g, 2], [2, 1])
-        # sum_desc.set_name("sum_buffer")
-
-        # out_io_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        # out_io_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        # out_io_tile_desc.set_name("out_io_buffer")
-
-        # partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
-        # partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
-        # partial_tile_desc.set_name("partial_buffer")
-
-        # Strides from 3D tensor views
-        q_stride = q_tensor.stride()
-        k_stride = k_tensor.stride()
-        v_stride = v_tensor.stride()
-
-        # partial tensor is view(HgDhTiles, nblk, tile_pack) contiguous
-        p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride).view(HgDhTiles, nblk, tile_pack)
-        p_stride = p_tensor.stride()
-
-        # DMA strides
-        k_dram_stride = [int(k_stride[0]), int(k_stride[1]), int(k_stride[2])]
-        q_dram_stride = [int(q_stride[2]), 0, int(q_stride[0])]
-        v_dram_stride = [int(v_stride[0]), int(v_stride[1]), int(v_stride[2])]
-        partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
-
-        # Affine offset maps
-        kk_offset_map   = _make_offset_map(k_dram_stride, k_tile_desc.offset)
-        qk_offset_map   = _make_offset_map([int(g) * int(q_stride[2])], q_tile_desc.offset)
-        v_offset_map    = _make_offset_map(v_dram_stride, v_tile_desc.offset)
-        # partial: offset(gh, blk)  -- gh = (kv*g+qsub)*dh_tiles+dht, pre-computed in template
-        partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], 0)
-        # Blk-symbol variants: %s0 is relative (0..BlkS-1), %blk is a block index (0..nblk-1),
-        # so actual_s = s0_rel + BlkS * blk → sym_stride=BlkS.
-        kk_offset_map_blk = _make_offset_map_with_sym(k_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=k_tile_desc.offset)
-        v_offset_map_blk  = _make_offset_map_with_sym(v_dram_stride, sym_dim=1, sym_stride=int(BlkS), offset=v_tile_desc.offset)
-
-        # Keep sympy-based indices only for epilogue_info
-        kv   = sympy.Symbol("kv")
-        qsub = sympy.Symbol("qsub")
-        dht  = sympy.Symbol("dht")
-        dh0  = sympy.Symbol("dh0")
-        blk  = sympy.Symbol("blk")
-        q_head = kv * g + qsub
-        gh = (kv * g + qsub) * dh_tiles + dht
-        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
-
-        kernel.loop_size = [tile_s, tile_e, tile_pack]
-
-        kernel.render_options = dict(
-            KERNEL_NAME=self.name,
-            kernel=kernel,
-            H=H,
-            g=g,
-            Dh=Dh,
-            S=S,
-            BlkS=BlkS,
-            nblk=nblk,
-            tile_s=tile_s,
-            tile_e=tile_e,
-            g_size=g_size,
-            dh_tiles=dh_tiles,
-            tile_pack=tile_pack,
-            io_stype=io_stype,
-            scale=self.scale,
-            query=query,
-            key=key,
-            value=value,
-            partial=partial,
-            q_tile_desc=q_tile_desc,
-            k_tile_desc=k_tile_desc,
-            v_tile_desc=v_tile_desc,
-            mul_tile_desc=mul_tile_desc,
-            # score_desc=score_desc,
-            # prob_desc=prob_desc,
-            # out_io_tile_desc=out_io_tile_desc,
-            # out_acc_tile_desc=out_acc_tile_desc,
-            # max_desc=max_desc,
-            # sum_desc=sum_desc,
-            # partial_tile_desc=partial_tile_desc,
-            # DMA strides
-            k_dram_stride=k_dram_stride,
-            q_dram_stride=q_dram_stride,
-            v_dram_stride=v_dram_stride,
-            partial_dram_stride=partial_dram_stride,
-            # Affine offset maps
-            kk_offset_map=kk_offset_map,
-            qk_offset_map=qk_offset_map,
-            v_offset_map=v_offset_map,
-            partial_offset_map=partial_offset_map,
-            kk_offset_map_blk=kk_offset_map_blk,
-            v_offset_map_blk=v_offset_map_blk,
-            input_reorder=self.input_reorder,
-        )
-
-        return self._template_from_string(DECODE_GQA_SDPA_PARTIAL_TEMPLATE).render(**kernel.render_options)
-
-
-DECODE_GQA_SDPA_REDUCE_TEMPLATE = r"""
-// Decode GQA SDPA reduce kernel: merge partials across blocks
-// Input partial shape: (HgDhTiles, nblk, tile_pack)
-{{kernel.def_global_vars()}}
-
-func.func @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[partial], outputs=[out], names_str="partial, out", input_reorder=input_reorder)}} {
-  {{ kernel.def_sram_buffer("partial", partial_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("out_acc", out_acc_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("out", out_tile_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("max", max_desc, indent_size=2) }}
-  {{ kernel.def_sram_buffer("sum", sum_desc, indent_size=2) }}
-
-  %c0 = arith.constant 0.0 : f32
-  %c1 = arith.constant 1.0 : f32
-  %c_neg_inf = arith.constant -1.0e+30 : f32
-  %v0_e = arith.constant dense<0.0> : vector<{{ tile_e }}xf32>
-  %v0_2x = arith.constant dense<0.0> : vector<2xf32>
-  %v_neg_inf_2x = arith.constant dense<-1.0e+30> : vector<2xf32>
-
-  {{ kernel.def_local_vars(indent_size=2) }}
-
-  affine.for %gh = 0 to {{ HgDhTiles }} {
-    // reset merged accumulators
-    affine.vector_store %v0_e, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-    affine.vector_store %v_neg_inf_2x, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    affine.vector_store %v0_2x, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-
-    affine.for %blk = 0 to {{ nblk }} {
-      %partial_offset = affine.apply {{ partial_offset_map }}(%gh, %blk)
-      {{ kernel.def_dma_op("MVIN", "partial", [], partial_tile_desc, indent_size=8, dram_stride=partial_dram_stride, dram_offset="partial_offset") }}
-      %p = affine.vector_load %partial_buffer[0, 0, 0] : {{ partial_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_pack }}xf32>
-      %p2 = vector.shape_cast %p : vector<{{ tile_pack }}xf32> to vector<2x{{ tile_e }}xf32>
-      %o_j = vector.extract %p2[0] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
-      %ml_j = vector.extract %p2[1] : vector<{{ tile_e }}xf32> from vector<2x{{ tile_e }}xf32>
-      %m_j = vector.extract %ml_j[0] : f32 from vector<{{ tile_e }}xf32>
-      %l_j = vector.extract %ml_j[1] : f32 from vector<{{ tile_e }}xf32>
-
-      %old_max = affine.vector_load %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-      %m_old = vector.extract %old_max[0] : f32 from vector<2xf32>
-      %m_new = arith.maximumf %m_old, %m_j : f32
-      %m_new2 = vector.broadcast %m_new : f32 to vector<2xf32>
-      affine.vector_store %m_new2, %max_buffer[0, 0] : {{ max_desc.get_mlir_shape("f32") }}, vector<2xf32>
-
-      %diff_old = arith.subf %m_old, %m_new : f32
-      %diff_j = arith.subf %m_j, %m_new : f32
-      %diff_old_v = vector.broadcast %diff_old : f32 to vector<1xf32>
-      %diff_j_v = vector.broadcast %diff_j : f32 to vector<1xf32>
-      %scale_old_v = math.exp %diff_old_v : vector<1xf32>
-      %scale_j_v = math.exp %diff_j_v : vector<1xf32>
-      %scale_old = vector.extract %scale_old_v[0] : f32 from vector<1xf32>
-      %scale_j = vector.extract %scale_j_v[0] : f32 from vector<1xf32>
-      %scale_old_e = vector.broadcast %scale_old : f32 to vector<{{ tile_e }}xf32>
-      %scale_j_e = vector.broadcast %scale_j : f32 to vector<{{ tile_e }}xf32>
-
-      %o_old = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-      %o_old_rs = arith.mulf %o_old, %scale_old_e : vector<{{ tile_e }}xf32>
-      %o_j_rs = arith.mulf %o_j, %scale_j_e : vector<{{ tile_e }}xf32>
-      %o_new = arith.addf %o_old_rs, %o_j_rs : vector<{{ tile_e }}xf32>
-      affine.vector_store %o_new, %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-
-      %old_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-      %l_old = vector.extract %old_sum[0] : f32 from vector<2xf32>
-      %l_old_rs = arith.mulf %l_old, %scale_old : f32
-      %l_j_rs = arith.mulf %l_j, %scale_j : f32
-      %l_new = arith.addf %l_old_rs, %l_j_rs : f32
-      %l_new2 = vector.broadcast %l_new : f32 to vector<2xf32>
-      affine.vector_store %l_new2, %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    } { accumulation_loop=true }
-
-    // finalize: out = o / l
-    %sum2 = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape("f32") }}, vector<2xf32>
-    %l = vector.extract %sum2[0] : f32 from vector<2xf32>
-    %inv = arith.divf %c1, %l : f32
-    %inv_e = vector.broadcast %inv : f32 to vector<{{ tile_e }}xf32>
-    %o = affine.vector_load %out_acc_buffer[0, 0, 0] : {{ out_acc_tile_desc.get_mlir_shape("f32") }}, vector<{{ tile_e }}xf32>
-    %out_f32 = arith.mulf %o, %inv_e : vector<{{ tile_e }}xf32>
-    {% if io_stype != "f32" %}%out_io = arith.truncf %out_f32 : vector<{{ tile_e }}xf32> to vector<{{ tile_e }}x{{ io_stype }}>{% endif %}
-    affine.vector_store {{ "%out_io" if io_stype != "f32" else "%out_f32" }}, %out_buffer[0, 0, 0] : {{ out_tile_desc.get_mlir_shape(io_stype) }}, vector<{{ tile_e }}x{{ io_stype }}>
-    %out_offset = affine.apply {{ out_offset_map }}(%gh)
-    {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=4, dram_stride=out_dram_stride, dram_offset="out_offset") }}
-  } { outer_loop=true }
-  return
-}
-"""
-
-
-class MLIRDecodeGQASDPAReduceTemplate(MLIRTemplate):
-    def __init__(self, input_nodes, layout, BlkS: int = 1024, input_reorder=None):
-        super().__init__("kernel", input_nodes, layout, input_reorder)
-        self.BlkS = BlkS
-
-    def render(self, kernel: MLIRTemplateKernel, template_buffer_node=None, epilogue_nodes=None, prologue_nodes=None, tile_info=None, **kwargs):
-        partial = self.input_nodes[0]
-        # Use the actual registered buffer node (e.g. "buf0") instead of the placeholder "buf_out".
-        out = template_buffer_node if template_buffer_node is not None else self.output_node
-
-        tile_e = kernel.vector_lane
-        tile_pack = tile_e * 2
-
-        # Infer sizes from partial layout: (HgDhTiles, nblk, tile_pack)
-        HgDhTiles, nblk, _ = partial.get_size()
-        io_stype = mlir_common.DTYPE_TO_MLIR[out.get_dtype()]
-
-        vlane_stride = 1
-        partial_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_pack], kernel.vector_lane, 1, vlane_stride)
-        partial_tile_desc.set_tile_size_stride([1, 1, tile_pack], [0, tile_pack, 1])
-        partial_tile_desc.set_name("partial_buffer")
-        partial_tile_desc.offset = partial.get_layout().offset
-
-        out_acc_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        out_acc_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        out_acc_tile_desc.set_name("out_acc_buffer")
-
-        max_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride)
-        max_desc.set_tile_size_stride([1, 2], [2, 1])
-        max_desc.set_name("max_buffer")
-
-        sum_desc = mlir_common.MLIRMultiDimTile([1, 2], kernel.vector_lane, 0, vlane_stride)
-        sum_desc.set_tile_size_stride([1, 2], [2, 1])
-        sum_desc.set_name("sum_buffer")
-
-        out_tile_desc = mlir_common.MLIRMultiDimTile([1, 1, tile_e], kernel.vector_lane, 1, vlane_stride)
-        out_tile_desc.set_tile_size_stride([1, 1, tile_e], [0, tile_e, 1])
-        out_tile_desc.set_name("out_buffer")
-
-        # Partial tensor strides
-        p_tensor = empty_strided(partial.get_layout().size, partial.get_layout().stride)
-        p_stride = p_tensor.stride()
-
-        # Out view: (Hq*dh_tiles, 1, tile_e)
-        out_tensor4 = empty_strided(out.get_layout().size, out.get_layout().stride)
-        B, Hq, Lq, Dh = out_tensor4.shape
-        assert B == 1 and Lq == 1
-        dh_tiles = int(Dh) // int(tile_e)
-        out_tensor = out_tensor4.view(Hq * dh_tiles, 1, tile_e)
-        o_stride = out_tensor.stride()
-
-        # DMA strides
-        partial_dram_stride = [int(p_stride[0]), int(p_stride[1]), 1]
-        out_dram_stride     = [int(o_stride[0]), 0, 0]
-
-        # Affine offset maps
-        # partial: offset(gh, blk)
-        partial_offset_map = _make_offset_map([int(p_stride[0]), int(p_stride[1])], partial_tile_desc.offset)
-        # out: offset(gh)  -- single dimension
-        out_offset_map     = _make_offset_map([int(o_stride[0])], 0)
-
-        # Keep sympy-based indices for epilogue_info
-        gh  = sympy.Symbol("gh")
-        blk = sympy.Symbol("blk")
-        partial_idx = [gh * p_stride[0], blk * p_stride[1], sympy.Integer(0)]
-        out_idx     = [gh * o_stride[0], sympy.Integer(0), sympy.Integer(0)]
-
-        kernel.loop_size = [tile_pack, tile_e, 1]
-
-        kernel.render_options = dict(
-            KERNEL_NAME=self.name,
-            kernel=kernel,
-            HgDhTiles=HgDhTiles,
-            nblk=nblk,
-            tile_e=tile_e,
-            tile_pack=tile_pack,
-            io_stype=io_stype,
-            partial=partial,
-            out=out,
-            partial_tile_desc=partial_tile_desc,
-            out_acc_tile_desc=out_acc_tile_desc,
-            max_desc=max_desc,
-            sum_desc=sum_desc,
-            out_tile_desc=out_tile_desc,
-            # DMA strides
-            partial_dram_stride=partial_dram_stride,
-            out_dram_stride=out_dram_stride,
-            # Affine offset maps
-            partial_offset_map=partial_offset_map,
-            out_offset_map=out_offset_map,
-            input_reorder=self.input_reorder,
-        )
-
-        return self._template_from_string(DECODE_GQA_SDPA_REDUCE_TEMPLATE).render(**kernel.render_options)
diff --git a/tests/test_sdpa.py b/tests/test_sdpa.py
index ed7ae8f8..c4825731 100644
--- a/tests/test_sdpa.py
+++ b/tests/test_sdpa.py
@@ -1,128 +1,145 @@
 import sys
-import math
+import os
 import torch
-import inspect
-from typing import List
+import torch._dynamo
 import torch.nn.functional as F
-from torch.nn.attention import SDPBackend, sdpa_kernel 
-from torch.fx.passes.graph_drawer import FxGraphDrawer
-from torch._inductor.decomposition import decompositions
 
-def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
-    message = f"|{name} Test Passed|"
+base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
+sys.path.append(base_dir)
+
+device = torch.device("npu:0")
+
+# ---------------------------------------------------------------------------
+# Default sweep configs - edit here to change what gets tested
+# ---------------------------------------------------------------------------
+SDPA_DEFAULTS = dict(
+    n_batch_list  = [1, 4, 8, 16],
+    n_head_list   = [4, 6, 8, 12],
+    n_token_list  = [128, 256, 512, 1024],
+    head_dim_list = [32, 64, 128],
+    is_causal     = False,
+)
+
+GQA_DEFAULTS = dict(
+    batch_list      = [1],
+    num_kv_heads    = 1,
+    gqa_ratios      = [4, 5, 8, 16],   # Hq = ratio * num_kv_heads
+    seq_len_list    = [128, 256, 1024],
+    head_dim_list   = [64, 128],
+    query_len       = 1,               # decode shape: Lq == 1
+    is_causal       = True,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def clear_caches():
+    from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
+    from torch._inductor.codecache import FxGraphCache
+    AOTAutogradCache.clear()
+    torch._dynamo.reset()
+    os.environ["TORCHINDUCTOR_CACHE"] = "0"
+    FxGraphCache.clear()
+
+
+def assert_close(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    msg = f"|{name} Test Passed|"
     if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
-        print("-" * len(message))
-        print(message)
-        print("-" * len(message))
-        pass
+        print("-" * len(msg))
+        print(msg)
+        print("-" * len(msg))
     else:
-        print("custom out: ", out.cpu())
-        print("cpu out: ", cpu_out)
+        print(f"[FAIL] {name}")
+        print("  device out:", out.cpu())
+        print("  cpu    out:", cpu_out)
         exit(1)
 
-def test_scaled_dot_product_attention(device, backends="flash"):
+
+def _run_sdpa(device, q, k, v, **kwargs):
+    """Compile and run SDPA on device; return result on device."""
+    opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
+    return opt_fn(q.to(device), k.to(device), v.to(device), **kwargs)
+
+
+def _cpu_sdpa(q, k, v, **kwargs):
+    """Run reference SDPA on CPU."""
+    return F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+def test_sdpa(
+    device,
+    n_batch_list  = SDPA_DEFAULTS["n_batch_list"],
+    n_head_list   = SDPA_DEFAULTS["n_head_list"],
+    n_token_list  = SDPA_DEFAULTS["n_token_list"],
+    head_dim_list = SDPA_DEFAULTS["head_dim_list"],
+    is_causal     = SDPA_DEFAULTS["is_causal"],
+):
     torch.manual_seed(0)
-    n_batch_list = [1, 4, 8, 16]
-    n_head_list = [1, 4, 8, 12]
-    n_token_list = [128, 256, 512, 1024]
-    head_dim_list = [32, 64, 128]
-
-    for n_batch in n_batch_list:
-        for n_head in n_head_list:
-            for n_token in n_token_list:
-                for head_dim in head_dim_list:
-                    # Inputs
+    sdpa_kwargs = dict(attn_mask=None, dropout_p=0.0, is_causal=is_causal)
+
+    for B in n_batch_list:
+        for H in n_head_list:
+            for S in n_token_list:
+                for D in head_dim_list:
                     clear_caches()
-                    query = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
-                    key = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
-                    value = torch.rand(n_batch, n_head, n_token, head_dim, dtype=torch.float32)
-
-                    # With NPU
-                    query = query.to(device=device)
-                    key = key.to(device=device)
-                    value = value.to(device=device)
-
-                    opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
-                    out = opt_fn(query, key, value)
-                    out = out.to(device)
-
-                    # With CPU
-                    cpu_device = torch.device('cpu')
-                    query = query.to(device=cpu_device)
-                    key = key.to(device=cpu_device)
-                    value = value.to(device=cpu_device)
-                    cpu_out = F.scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
-
-                    name = f"SDPA(n_batch: {n_batch}, n_head: {n_head}, n_token: {n_token}, head_dim: {head_dim})"
-                    test_result(name, out, cpu_out)
-    
-    print("All tests passed!")
-
-def test_scaled_dot_product_attention_gqa_single_batch(device):
+                    q = torch.rand(B, H, S, D, dtype=torch.float32)
+                    k = torch.rand(B, H, S, D, dtype=torch.float32)
+                    v = torch.rand(B, H, S, D, dtype=torch.float32)
+
+                    out     = _run_sdpa(device, q, k, v, **sdpa_kwargs)
+                    cpu_out = _cpu_sdpa(q, k, v, **sdpa_kwargs)
+
+                    assert_close(f"SDPA(B:{B}, H:{H}, S:{S}, D:{D})", out, cpu_out)
+
+    print("All SDPA tests passed!")
+
+
+def test_gqa(
+    device,
+    batch_list   = GQA_DEFAULTS["batch_list"],
+    num_kv_heads = GQA_DEFAULTS["num_kv_heads"],
+    gqa_ratios   = GQA_DEFAULTS["gqa_ratios"],
+    seq_len_list = GQA_DEFAULTS["seq_len_list"],
+    head_dim_list= GQA_DEFAULTS["head_dim_list"],
+    query_len    = GQA_DEFAULTS["query_len"],
+    is_causal    = GQA_DEFAULTS["is_causal"],
+):
     """
-    Focused GQA testcases for single-batch (n==1).
-    Shapes:
-      q: (B, Hq, Lq, Dh)
-      k: (B, H,  S,  Dh)
-      v: (B, H,  S,  Dh)
+    GQA sweep: q shape (B, Hq, Lq, D), kv shape (B, H, S, D).
+    Hq = ratio * num_kv_heads for each ratio in gqa_ratios.
     """
     torch.manual_seed(0)
+    sdpa_kwargs = dict(attn_mask=None, dropout_p=0.0, is_causal=is_causal, enable_gqa=True)
 
-    B = 1
-    # Decode-focused: include a larger S to hit BlkS logic
-    seq_len_list = [128, 256, 1024]
-    head_dim_list = [64, 128]
-    # GQA ratios requested: Hq / H in {4, 5, 8, 16}.
-    # Keep H=1 to directly realize those ratios.
-    gqa_ratios = [4, 5, 8, 16]
-    H = 1
-
-    for seq_len in seq_len_list:
-        for head_dim in head_dim_list:
-            for ratio in gqa_ratios:
-                Hq = ratio * H
-
-                clear_caches()
-                # Decode shape: Lq == 1
-                q = torch.rand(B, Hq, 1, head_dim, dtype=torch.float32)
-                k = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32)
-                v = torch.rand(B, H, seq_len, head_dim, dtype=torch.float32)
-
-                # NPU
-                q_npu = q.to(device=device)
-                k_npu = k.to(device=device)
-                v_npu = v.to(device=device)
-                opt_fn = torch.compile(dynamic=False)(F.scaled_dot_product_attention)
-                out = opt_fn(q_npu, k_npu, v_npu, attn_mask=None, dropout_p=0.0, is_causal=True, enable_gqa=True)
-
-                # CPU reference
-                cpu_device = torch.device("cpu")
-                cpu_out = F.scaled_dot_product_attention(
-                    q.to(device=cpu_device),
-                    k.to(device=cpu_device),
-                    v.to(device=cpu_device),
-                    attn_mask=None,
-                    dropout_p=0.0,
-                    is_causal=True,
-                    enable_gqa=True,
-                )
-
-                name = f"SDPA-GQA(B: {B}, Hq: {Hq}, H: {H}, S: {seq_len}, head_dim: {head_dim})"
-                test_result(name, out, cpu_out)
-
-    print("All GQA single-batch tests passed!")
+    for B in batch_list:
+        for S in seq_len_list:
+            for D in head_dim_list:
+                for ratio in gqa_ratios:
+                    Hq = ratio * num_kv_heads
+                    clear_caches()
+                    q = torch.rand(B, Hq, query_len, D, dtype=torch.float32)
+                    k = torch.rand(B, num_kv_heads, S, D, dtype=torch.float32)
+                    v = torch.rand(B, num_kv_heads, S, D, dtype=torch.float32)
 
-def clear_caches():
-    import os
-    from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
-    from torch._inductor.codecache import FxGraphCache
-    AOTAutogradCache.clear()
-    torch._dynamo.reset()
-    os.environ["TORCHINDUCTOR_CACHE"] = "0"
-    FxGraphCache.clear()
+                    out     = _run_sdpa(device, q, k, v, **sdpa_kwargs)
+                    cpu_out = _cpu_sdpa(q, k, v, **sdpa_kwargs)
+
+                    assert_close(
+                        f"GQA(B:{B}, Hq:{Hq}, H:{num_kv_heads}, S:{S}, D:{D})",
+                        out, cpu_out,
+                    )
+
+    print("All GQA tests passed!")
+
+
+if __name__ == "__main__":
+    with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.FLASH_ATTENTION]):
+        test_sdpa(device)
+    #test_gqa(device)
 
-if __name__ == "__main__":    
-    device = torch.device('npu:0')
-    # test_scaled_dot_product_attention(device, backends="flash")
-    test_scaled_dot_product_attention_gqa_single_batch(device)
-    
\ No newline at end of file
+    # Example: quick single-config run
+    # test_gqa(device, batch_list=[1], gqa_ratios=[5], seq_len_list=[32], head_dim_list=[128])

From dd71c70766a06149a975615585f51536d1ea2904 Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Tue, 17 Mar 2026 02:31:49 +0000
Subject: [PATCH 136/194] [Frontend] Handle RecompileSignal in MLIRKernel code
 generation

---
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 38125e31..672c35f7 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -964,7 +964,10 @@ def make_choices(self, nodes, kernel_name):
 
             # Try initial tile size
             self.reset(None)
-            src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
+            try:
+                src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
+            except mlir_common.RecompileSignal:
+                continue
             current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size())
             search_space.add(current_tile_sz)
 
@@ -986,14 +989,12 @@ def make_choices(self, nodes, kernel_name):
                     # Try increase tile size for this axis
                     try:
                         self.kernel_group.tile_desc.scale_tile_dim(axis, prev_ranges[axis], 2)
-                    except extension_codecache.TileSizeError as e:
-                        # Failed to find proper tile size
+                        self.reset(None)
+                        src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
+                    except (extension_codecache.TileSizeError, mlir_common.RecompileSignal):
                         candidate_axes.remove(axis)
                         self.reset(None)
                         continue
-
-                    self.reset(None)
-                    src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
                     current_tile_sz = tuple(self.kernel_group.tile_desc.get_tile_size())
 
                     # FIXME. How to intergrate this constraint to tile system?

From c5f085ece4e9523ca1e97ee165c6cb976df5427c Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Tue, 17 Mar 2026 02:39:03 +0000
Subject: [PATCH 137/194] [Frontend] Enhance vector size handling for
 low-precision paths in MLIR kernels

---
 PyTorchSimFrontend/mlir/mlir_common.py   | 73 +++++++++++++++++++++---
 PyTorchSimFrontend/mlir/mlir_template.py |  4 +-
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 9f5dc6ab..32805261 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -103,14 +103,17 @@ def get_dtype_nbytes(dtype):
 
 MLIR_INF = {
     "inf" : {
+        "f16" : 0x7C00,
         "f32" : 0x7F800000,
         "f64" : 0x7FF0000000000000
     },
     "-inf" : {
+        "f16" : 0xFC00,
         "f32" : 0xFF800000,
         "f64" : 0xFFF0000000000000
     },
     "nan" : {
+        "f16" : 0x7C00,
         "f32" : 0x7FC00000,
         "f64" : 0x7FF8000000000000
     }
@@ -260,17 +263,23 @@ def get_tile_stride_per_lane(self, tile_size: list[int], tile_stride: list[int])
         return tile_stride
 
     def get_compute_vec_size(self, tile_size: list[int], reduction_numel: int, nr_rdim: int) -> int:
-        if self.forced_vec_size is not None:
-            return self.forced_vec_size
-
         per_lane = self.get_numel_per_lane(tile_size)
         stride = self.vlane_stride
         if nr_rdim:
             val = per_lane // max(reduction_numel, 1)
+            result = val
             for mult in [8, 4, 2]:
                 if per_lane >= val * mult:
-                    return val * mult
-            return val
+                    result = val * mult
+                    break
+            if self.forced_vec_size is not None:
+                # Cap while keeping result divisible by val (= reduction_size).
+                # This preserves the assert(vec_len % reduction_size == 0) invariant.
+                capped = (min(result, self.forced_vec_size) // max(val, 1)) * max(val, 1)
+                result = max(capped, val)
+            return result
+        if self.forced_vec_size is not None:
+            return self.forced_vec_size
         for mult in [8, 4, 2]:
             if (per_lane // stride) >= mult:
                 return stride * mult
@@ -787,10 +796,24 @@ def codegen_nodes(self, nodes, kernel_name):
             # Set node range info
             vars, reduction_vars = self.set_ranges(group, reduction_group)
             tile_desc = self.compute_tile_size(nodes, vars, reduction_vars)
+            _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
+            safe_vec_size = self.get_safe_vec_size(tile_desc.get_compute_vec_size())
+            # For pointwise (non-reduction) kernels, cap the MLIR vector size so that
+            # f16->f32 widening stays within LMUL<=4 (step and forced_vec_size must match).
+            # Reduction kernels are left unchanged: their accumulator/multi_reduction
+            # structure assumes compute_vec_size == step, so we must not split them here.
+            tile_desc.vmap.forced_vec_size = safe_vec_size
+            compute_vec = tile_desc.get_compute_vec_size()
+            # RVV requires vector lengths that produce integer power-of-2 LMUL values.
+            # Non-power-of-2 element counts (e.g. 24) cause LLVM WidenVectorResult crashes.
+            # Raise BEFORE the try/except so this propagates to make_choices (not retried).
+            if compute_vec > 1 and (compute_vec & (compute_vec - 1)) != 0:
+                raise RecompileSignal(
+                    f"Non-power-of-2 compute_vec_size {compute_vec}: tile rejected (RVV requires power-of-2 LMUL)"
+                )
             self.compute_body_loop.size = tile_desc.get_numel_per_lane()
-            self.compute_body_loop.step = tile_desc.get_compute_vec_size()
+            self.compute_body_loop.step = compute_vec
             try:
-                _, _, _, self.buffer_types = self.kernel_group.args.mlir_argdefs()
                 with self as kernel:
                     for node in nodes:
                         node.run(vars, reduction_vars)
@@ -1035,6 +1058,42 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self._nested_context_depth -= 1
         if self._nested_context_depth == 0:
             super().__exit__(exc_type, exc_val, exc_tb)
+    
+    def get_safe_vec_size(self, default_vec_size: int = 64) -> int:
+        """
+        Cap forced vector size for low-precision paths so widening ops
+        (e.g., f16/bf16 -> f32) do not exceed RVV LMUL limits.
+
+        Widening is legal up to source LMUL<=4 (destination LMUL<=8).
+        Using RVV relation LMUL = (SEW * VL) / VLEN, the safe source VL is:
+            VL <= 4 * VLEN / SEW
+        """
+
+        if not hasattr(self, "buffer_types") or not self.buffer_types:
+            return default_vec_size
+
+        lowp_bits = []
+        for info in self.buffer_types.values():
+            dtype = info[0] if info else None
+            if dtype in DTYPE_LOWP_FP:
+                mlir_dtype = DTYPE_TO_MLIR[dtype]
+                lowp_bits.append(MLIR_TO_BIT[mlir_dtype])
+
+        if not lowp_bits:
+            return default_vec_size
+
+        min_lowp_bits = min(lowp_bits)
+        # Constraint: Vector element count must be compatible across all types.
+        # VLEN=256: f16 (LMUL=2) and f32 (LMUL=4) both yield 32 elements.
+        # Note: Gem5 version restricts widening ops to LMUL < 8 for destination registers.
+        # Max LMUL set to 2 to ensure compatibility/safety.
+
+        widen_safe_cap = self.vlen * 2 // min_lowp_bits
+        if widen_safe_cap <= 0:
+            return default_vec_size
+
+        vec_size = min(default_vec_size, widen_safe_cap)
+        return vec_size
 
 @dataclasses.dataclass
 class LoopLevel:
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 53db988b..851f070f 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -1255,7 +1255,7 @@ def set_tile_size(self, template_fusion_info, prologue=False):
             numel_per_lane = tile_desc.get_numel_per_lane()
             r_tile_size = tile_desc.get_tile_size()[-1]
             nr_outer_loop = (numel_per_lane + r_tile_size-1) // r_tile_size
-            tile_desc.vmap.forced_vec_size = nr_outer_loop * 32 # Why? Emprically selected, other option failed to functionality...
+            tile_desc.vmap.forced_vec_size = self.get_safe_vec_size(nr_outer_loop * 32) # Why? Emprically selected, other option failed to functionality...
 
             self.reduction_fusion = True
             self.r_tile_size = tile_desc.get_tile_size()[-1]
@@ -1266,7 +1266,7 @@ def set_tile_size(self, template_fusion_info, prologue=False):
             self.compute_body_loop.step = tile_desc.get_compute_vec_size() // nr_outer_loop
             self.reduction_body_loop = mlir_common.LoopLevel(self.reduction_loop_idx, nr_outer_loop)
         else:
-            tile_desc.vmap.forced_vec_size = 64
+            tile_desc.vmap.forced_vec_size = self.get_safe_vec_size(64)
 
             if prologue:
                 self.prologue_compute_body_loop.size = tile_desc.get_numel_per_lane()

From fdd5b5459c41892b4d1a738b5baa3e21cd945b31 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Mar 2026 02:01:08 +0900
Subject: [PATCH 138/194] [Refactor] move to TOGSimulator-based scheduler API

---
 experiments/BERT.py      | 77 ++++++++++++++++------------------------
 experiments/attention.py | 70 +++++++++++++-----------------------
 experiments/conv.py      | 76 +++++++++++++++------------------------
 experiments/gemm.py      | 61 +++++++++++--------------------
 experiments/layernorm.py | 59 +++++++++++-------------------
 experiments/resnet18.py  | 57 ++++++++++-------------------
 experiments/resnet50.py  | 57 ++++++++++-------------------
 experiments/softmax.py   | 58 +++++++++++-------------------
 tests/Fusion/__init__.py |  0
 tests/__init__.py        |  0
 10 files changed, 182 insertions(+), 333 deletions(-)
 create mode 100644 tests/Fusion/__init__.py
 create mode 100644 tests/__init__.py

diff --git a/experiments/BERT.py b/experiments/BERT.py
index fd671833..b938f4e6 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -1,57 +1,42 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
 
-def run_BERT(size, input_seq, config):
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    # from tests.test_transformer import EncoderBlock
-    from tests.Fusion.test_transformer_fusion import EncoderBlock
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    hidden_dim = {'base': 768, 'large': 1024, 'xlarge': 2048}
-    embedding_size = {'base': 768, 'large': 1024, 'xlarge': 2048}
-    heads = {'base': 12, 'large': 16, 'xlarge': 32} # hidden/64 https://arxiv.org/pdf/1909.11942
-    cpu_query = torch.randn(input_seq, hidden_dim[size])
-    encoder_block = EncoderBlock(embedding_size[size], heads[size]).eval()
-
-    query = cpu_query.clone().to(device=device)
-    opt_fn = torch.compile(dynamic=False)(encoder_block.to(device=device))
+import torch
+from Simulator.simulator import TOGSimulator
 
-    SchedulerDNNModel.register_model(f"BERT-{size}", opt_fn)
-    request = Request(f"BERT-{size}", [query], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
-    # Run scheduler
-    while not scheduler.is_finished():
-        with torch.no_grad():
-            scheduler.schedule()
+# Try Fusion EncoderBlock first, fall back to standard test_transformer
+try:
+    from tests.Fusion.test_transformer_fusion import EncoderBlock
+except ImportError:
+    from tests.test_transformer import EncoderBlock
 
-    print(f"BERT-{size} Simulation Done")
+HIDDEN_DIM = {'base': 768, 'large': 1024, 'xlarge': 2048}
+EMBEDDING_SIZE = {'base': 768, 'large': 1024, 'xlarge': 2048}
+HEADS = {'base': 12, 'large': 16, 'xlarge': 32}
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path FIXME: gem5 result is different as directoy name
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
-    args.add_argument('--size', type=str, default='base')
-    args.add_argument('--dump_path', type=str, default='results')
+    args.add_argument('--size', type=str, default='base', choices=['base', 'large', 'xlarge'])
     args.add_argument('--input_size', type=int, default=512)
     args = args.parse_args()
-    size = args.size
-    input_seq = args.input_size
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"BERT_{size}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
-
-    run_BERT(size, input_seq, config)
+
+    hidden_dim = HIDDEN_DIM[args.size]
+    embedding_size = EMBEDDING_SIZE[args.size]
+    heads = HEADS[args.size]
+
+    device = torch.device("npu:0")
+    model = EncoderBlock(embedding_size, heads).eval().to(device=device)
+    model_input = torch.randn(args.input_size, hidden_dim).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(model)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"BERT-{args.size} Simulation Done")
diff --git a/experiments/attention.py b/experiments/attention.py
index 211433f1..b56ed537 100644
--- a/experiments/attention.py
+++ b/experiments/attention.py
@@ -1,56 +1,36 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
+import math
 import argparse
-import datetime
 
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-def run_attention(size, config):
-    def attention(query, key, value):
-        import math
-        d_k = query.size(-1)
-        scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k)
-        p_attn = scores.softmax(dim=-2)
-        return torch.matmul(value.transpose(-1, -2), p_attn)
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    query = torch.randn(size).to(device=device)
-    key = torch.randn(size).to(device=device)
-    value = torch.randn(size).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(attention)
-
-    SchedulerDNNModel.register_model("attention", opt_fn)
-    request = Request("attention", [query, key, value], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
+import torch
+from Simulator.simulator import TOGSimulator
 
-    # Run scheduler
-    while not scheduler.is_finished():
-        with torch.no_grad():
-            scheduler.schedule()
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
-    print(f"Attention {str(size)} Simulation Done")
+def attention(query, key, value):
+    d_k = query.size(-1)
+    scores = torch.matmul(key, query.transpose(-2, -1)) / math.sqrt(d_k)
+    p_attn = scores.softmax(dim=-2)
+    return torch.matmul(value.transpose(-1, -2), p_attn)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[12, 512, 64], help='Tensor Shape')
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    size = args.size
-    size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"attention_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
+    size = tuple(args.size)
+
+    device = torch.device("npu:0")
+    query = torch.randn(*size).to(device=device)
+    key = torch.randn(*size).to(device=device)
+    value = torch.randn(*size).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(attention)
 
-    run_attention(size, config)
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, query, key, value, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"Attention {size} Simulation Done")
diff --git a/experiments/conv.py b/experiments/conv.py
index 61f7ad80..98391fae 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -1,57 +1,39 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
 
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
+
+import torch
+from Simulator.simulator import TOGSimulator
+
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
-def run_conv2d(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding, config):
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    def custom_conv2d(a, b, bias):
-        i_c = a.shape[1]
-        o_c = b.shape[0]
-        conv2d = torch.nn.Conv2d(i_c, o_c, b.shape[-1], stride=stride, padding=padding, dilation=1, bias=False)
+def conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding):
+    def _conv(a, b, bias):
+        conv2d = torch.nn.Conv2d(i_c, o_c, kernel_size, stride=stride, padding=padding, dilation=1, bias=False)
         conv2d.weight = torch.nn.Parameter(b)
-        # conv2d.bias = torch.nn.Parameter(bias)
         return conv2d(a)
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
+    return _conv
+
+if __name__ == "__main__":
+    args = argparse.ArgumentParser()
+    args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1],
+                      help='B H W I_C O_C K S P')
+    args = args.parse_args()
+    batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding = args.size
+
+    device = torch.device("npu:0")
     conv_input = torch.randn(batch_size, i_c, i_h, i_w).to(memory_format=torch.channels_last, device=device)
     conv_kernel = torch.randn(o_c, i_c, kernel_size, kernel_size).to(memory_format=torch.channels_last, device=device)
     conv_bias = torch.randn(o_c).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(custom_conv2d)
-
-    SchedulerDNNModel.register_model("CONV", opt_fn)
-    request = Request("CONV", [conv_input, conv_kernel, conv_bias], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
-
-    # Run scheduler
-    while not scheduler.is_finished():
-        with torch.no_grad():
-            scheduler.schedule()
 
-    print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} (B_H_W_I_C_O_C_K_S_P) Simulation Done")
+    custom_conv = conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding)
+    opt_fn = torch.compile(dynamic=False)(custom_conv)
 
-if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
-    args = argparse.ArgumentParser()
-    args.add_argument('--size', nargs='+', type=int, default=[8, 28, 28, 128, 128, 3, 1, 1], help='B H W I_C O_C K S P')
-    args.add_argument('--dump_path', type=str, default='results')
-    args = args.parse_args()
-    size = args.size
-    size_str = "_".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"CONV_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
-
-    run_conv2d(size[0], size[1], size[2], size[3], size[4], size[5], size[6], size[7], config)
\ No newline at end of file
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, conv_input, conv_kernel, conv_bias, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} Simulation Done")
diff --git a/experiments/gemm.py b/experiments/gemm.py
index 0e1a15e4..d256e931 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -1,51 +1,32 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
-
 
-def run_matmul(input_size, hidden_size, output_size, config):
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    def custom_matmul(a, b):
-        return torch.matmul(a, b)
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    torch.manual_seed(0)
-    input = torch.randn(input_size, hidden_size).to(device=device)
-    weight = torch.randn(hidden_size, output_size).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(custom_matmul)
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    SchedulerDNNModel.register_model("GEMM", opt_fn)
-    request = Request("GEMM", [input, weight], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
+import torch
+from Simulator.simulator import TOGSimulator
 
-    # Run scheduler
-    while not scheduler.is_finished():
-        scheduler.schedule()
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
-    print(f"GEMM {input_size}x{hidden_size}x{output_size} (MxKxN) Simulation Done")
+def matmul_fn(a, b):
+    return torch.matmul(a, b)
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[128, 128, 128], help='M K N')
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    size = args.size
-    size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"GEMM_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
+    M, K, N = args.size[0], args.size[1], args.size[2]
 
-    run_matmul(size[0], size[1], size[2], config)
+    device = torch.device("npu:0")
+    torch.manual_seed(0)
+    input_a = torch.randn(M, K).to(device=device)
+    input_b = torch.randn(K, N).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(matmul_fn)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, input_a, input_b, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"GEMM {M}x{K}x{N} (MxKxN) Simulation Done")
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
index a6b16986..a9170c6b 100644
--- a/experiments/layernorm.py
+++ b/experiments/layernorm.py
@@ -1,48 +1,29 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
-
 
-def run_layernorm(size, config):
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    input = torch.randn(size).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(torch.nn.LayerNorm(size[-1]).to(device=device))
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    SchedulerDNNModel.register_model("LayerNorm", opt_fn)
-    request = Request("LayerNorm", [input], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
-
-    # Run scheduler
-    while not scheduler.is_finished():
-        scheduler.schedule()
+import torch
+from Simulator.simulator import TOGSimulator
 
-    print(f"LayerNorm {str(size)} Simulation Done")
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[512, 768], help='Tensor Shape')
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    size = args.size
-    size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"LayerNorm_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    os.environ['TORCHSIM_FUSION_REDUCTION_REDUCTION'] = "0"
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
-
-    run_layernorm(size, config)
+    size = tuple(args.size)
+    normalized_shape = size[-1]
+
+    device = torch.device("npu:0")
+    model = torch.nn.LayerNorm(normalized_shape).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(model)
+    model_input = torch.randn(*size).to(device=device)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"LayerNorm {size} Simulation Done")
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index c7763d86..38fb80fe 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -1,49 +1,28 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
 
-def run_resnet(batch, config):
-    from torchvision.models import resnet18
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    model = resnet18().eval()
-    input = torch.randn(batch, 3, 224, 224).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    SchedulerDNNModel.register_model("resnet18", opt_fn)
-    request = Request("resnet18", [input], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
+import torch
+from torchvision.models import resnet18
+from Simulator.simulator import TOGSimulator
 
-    # Run scheduler
-    while not scheduler.is_finished():
-        with torch.no_grad():
-            scheduler.schedule()
-
-    print("ResNet18 Simulation Done")
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet18_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
 
-    run_resnet(batch, config)
+    device = torch.device("npu:0")
+    model = resnet18().eval().to(device=device, memory_format=torch.channels_last)
+    opt_fn = torch.compile(dynamic=False)(model)
+    model_input = torch.randn(args.batch, 3, 224, 224).to(device=device)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print("ResNet18 Simulation Done")
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 4e611541..5b134c13 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -1,49 +1,28 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
 
-def run_resnet(batch, config):
-    from torchvision.models import resnet50
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    model = resnet50().eval()
-    input = torch.randn(batch, 3, 224, 224).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    SchedulerDNNModel.register_model("resnet50", opt_fn)
-    request = Request("resnet50", [input], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
+import torch
+from torchvision.models import resnet50
+from Simulator.simulator import TOGSimulator
 
-    # Run scheduler
-    while not scheduler.is_finished():
-        with torch.no_grad():
-            scheduler.schedule()
-
-    print("ResNet50 Simulation Done")
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--batch', type=int, default=1)
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    batch = args.batch
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"resnet50_{batch}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    os.environ['TORCHSIM_USE_TIMING_POOLING'] = "1"
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
 
-    run_resnet(batch, config)
+    device = torch.device("npu:0")
+    model = resnet50().eval().to(device=device, memory_format=torch.channels_last)
+    opt_fn = torch.compile(dynamic=False)(model)
+    model_input = torch.randn(args.batch, 3, 224, 224).to(device=device)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print("ResNet50 Simulation Done")
diff --git a/experiments/softmax.py b/experiments/softmax.py
index d30559f7..b86febe0 100644
--- a/experiments/softmax.py
+++ b/experiments/softmax.py
@@ -1,47 +1,29 @@
-import torch
-import torch._dynamo
-import torch.utils.cpp_extension
-
+import os
+import sys
 import argparse
-import datetime
-
 
-def run_softmax(size, config, dim=1):
-    from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-    scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
-    device = scheduler.execution_engine.module.custom_device()
-    input = torch.randn(size).to(device=device)
-    opt_fn = torch.compile(dynamic=False)(torch.nn.Softmax(dim=dim).to(device=device))
+base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+sys.path.insert(0, base_path)
 
-    SchedulerDNNModel.register_model("Softmax", opt_fn)
-    request = Request("Softmax", [input], [], request_queue_idx=0)
-    scheduler.add_request(request, request_time=0)
-
-    # Run scheduler
-    while not scheduler.is_finished():
-        scheduler.schedule()
+import torch
+from Simulator.simulator import TOGSimulator
 
-    print(f"Softmax {str(size)} Simulation Done")
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
+os.environ['TOGSIM_CONFIG'] = config
 
 if __name__ == "__main__":
-    import os
-    import sys
-    base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-    config = os.environ.get('TORCHSIM_CONFIG', default=f'{base_dir}/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml')
-    config_prefix = config.split('/')[-1].split('.')[0][9:] # extract config name from config path
-    sys.path.append(base_dir)
     args = argparse.ArgumentParser()
     args.add_argument('--size', nargs='+', type=int, default=[512, 512], help='Tensor Shape')
-    args.add_argument('--dump_path', type=str, default='results')
     args = args.parse_args()
-    size = args.size
-    size_str = "x".join([str(i) for i in size])
-    result_path = os.path.join(base_dir, args.dump_path, config_prefix, f"Softmax_{size_str}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
-    # setting environment variables
-    os.environ['TORCHSIM_LOG_PATH'] = result_path
-    # only timing simulation
-    os.environ['TORCHSIM_VALIDATION_MODE'] = "0"
-    if 'pytorchsim_functional_mode' in os.environ:
-        del os.environ['pytorchsim_functional_mode']
-
-    run_softmax(size, config)
+    size = tuple(args.size)
+    dim = 1
+
+    device = torch.device("npu:0")
+    model = torch.nn.Softmax(dim=dim).to(device=device)
+    opt_fn = torch.compile(dynamic=False)(model)
+    model_input = torch.randn(*size).to(device=device)
+
+    with TOGSimulator(config_path=config):
+        torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
+        torch.npu.synchronize()
+    print(f"Softmax {size} Simulation Done")
diff --git a/tests/Fusion/__init__.py b/tests/Fusion/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b

From 3847f9b28053ef0c02b65f0b92d5babd8f01211d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 19 Mar 2026 14:28:36 +0900
Subject: [PATCH 139/194] [CI] Add missing package + Add test cases

---
 .github/workflows/docker-image-2-8.yml | 10 +++++++++-
 Dockerfile.base                        |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml
index f1e915d6..52464dff 100644
--- a/.github/workflows/docker-image-2-8.yml
+++ b/.github/workflows/docker-image-2-8.yml
@@ -52,10 +52,18 @@ jobs:
           echo "Image did not become available in GHCR within expected time."
           exit 1
 
-  test-pytorchsim-wrapper:
+  test-pytorchsim-wrapper1:
     needs: build-and-test
     uses: ./.github/workflows/pytorchsim_test.yml
     with:
       image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
       vector_lane: 128
       spad_size: 128
+
+  test-pytorchsim-wrapper2:
+    needs: build-and-test
+    uses: ./.github/workflows/pytorchsim_test.yml
+    with:
+      image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
+      vector_lane: 32
+      spad_size: 32
diff --git a/Dockerfile.base b/Dockerfile.base
index e8504bcf..05444d41 100644
--- a/Dockerfile.base
+++ b/Dockerfile.base
@@ -34,7 +34,7 @@ RUN apt -y update && \
     python3-dev python-is-python3 libboost-all-dev \
     libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \
     python3-venv black libssl-dev libasan5 libubsan1 curl device-tree-compiler wget ninja-build && \
-    pip install onnx matplotlib scikit-learn pydot tabulate && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/*
+    pip install onnx matplotlib scikit-learn pydot tabulate flash_attn && pip install --user conan==1.56.0 cmake==3.26.4 && rm -rf /var/lib/apt/lists/*
 
 # Download RISC-V tool chain
 RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \

From 1d7a3a919d7310a03427b2e1b38dcb9067e4f317 Mon Sep 17 00:00:00 2001
From: student-Jungmin <wjdals020503@naver.com>
Date: Sun, 22 Mar 2026 15:02:24 +0000
Subject: [PATCH 140/194] [FIX] Fix zero systolic array utilization during SDPA
 execution in TOGSim

---
 PyTorchSimFrontend/mlir/mlir_sdpa_template.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
index 37db4956..a3ae6192 100644
--- a/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_sdpa_template.py
@@ -238,7 +238,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
             %chunk_val = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}>
             %local_max = arith.maximumf %chunk_val, %iter_max : vector<{{ chunk_size }}x{{ data_stype }}>
             affine.yield %local_max : vector<{{ chunk_size }}x{{ data_stype }}>
-          }
+          } { accumulation_loop=true }
 
           %max_cast = vector.shape_cast %chunk_max_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}>
           %max_reduced_1 = vector.multi_reduction <maximumf>, %max_cast, %v_neg_inf_2x [0] : vector<8x2x{{ data_stype }}> to vector<2x{{ data_stype }}>
@@ -284,7 +284,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
             %chunk_exp = affine.vector_load %mul_buffer[0, %index5] : {{ mul_tile_desc.get_mlir_shape(data_stype) }}, vector<{{ chunk_size }}x{{ data_stype }}>
             %local_sum = arith.addf %chunk_exp, %iter_sum : vector<{{ chunk_size }}x{{ data_stype }}>
             affine.yield %local_sum : vector<{{ chunk_size }}x{{ data_stype }}>
-          }
+          } { accumulation_loop=true }
 
           %zero_2x = vector.broadcast %c0 : {{ data_stype }} to vector<2x{{ data_stype }}>
           %sum_cast = vector.shape_cast %chunk_sum_res : vector<{{ chunk_size }}x{{ data_stype }}> to vector<{{ chunk_size // 2 }}x2x{{ data_stype }}>
@@ -301,7 +301,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
             { idx_map = array<i32: 2, 1, -1> }
             ins(%vt_buffer2D, %mul_buffer : memref<{{ tile_e }}x{{ tile_s }}x{{ data_stype }}, 1>, {{ mul_tile_desc.get_mlir_shape(data_stype) }})
             outs(%ot_buffer2D : memref<{{ tile_e }}x{{ tile_l }}x{{ data_stype }}, 1>)
-        } {inner_loop=true}
+        } { accumulation_loop=true }
 
         // out @ row_sum^(-1)
         %final_row_sum = affine.vector_load %sum_buffer[0, 0] : {{ sum_desc.get_mlir_shape(data_stype) }}, vector<2x{{ data_stype }}>
@@ -317,7 +317,7 @@ def calculate_scale(query: torch.Tensor, scale: float) -> float:
 
         %out_dram_offset = affine.apply {{ out_offset_map }}(%index0, %index1, %index3)
         {{ kernel.def_dma_op("MVOUT", "out", [], out_tile_desc, indent_size=8, dram_stride=out_dram_stride, dram_offset="out_dram_offset") }}
-      } { accumulation_loop=true }
+      } { outer_loop=true }
     } { outer_loop=true }
   } { outer_loop=true }
   return

From 10f592388013bf6b4b0dde0970f1291fe89da569 Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Tue, 17 Mar 2026 04:18:42 +0000
Subject: [PATCH 141/194] [Frontend/Fix] Enforce vector length constraints and
 resolve ext() widening errors

Updated the frontend to strictly validate vector element counts, preventing invalid LMUL=8 configurations in Gem5. Fixed a mismatch in the ext() operation's type-checking logic.
---
 PyTorchSimFrontend/mlir/mlir_common.py | 4 ++--
 PyTorchSimFrontend/mlir/mlir_ops.py    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 32805261..23c02066 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -1086,9 +1086,9 @@ def get_safe_vec_size(self, default_vec_size: int = 64) -> int:
         # Constraint: Vector element count must be compatible across all types.
         # VLEN=256: f16 (LMUL=2) and f32 (LMUL=4) both yield 32 elements.
         # Note: Gem5 version restricts widening ops to LMUL < 8 for destination registers.
-        # Max LMUL set to 2 to ensure compatibility/safety.
+        # Max LMUL set to 1 to ensure compatibility/safety.
 
-        widen_safe_cap = self.vlen * 2 // min_lowp_bits
+        widen_safe_cap = self.vlen // min_lowp_bits
         if widen_safe_cap <= 0:
             return default_vec_size
 
diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 76a0e273..218f60a9 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -1041,7 +1041,7 @@ def ext(operand, dtype, *args, **kwargs):
         op_type = V.kernel.var_info[operand]
         shape = f"vector<{op_type[0]}x{op_type[1]}>" if op_type[0] > 1 else f"{op_type[1]}"
         target_type = f"vector<{op_type[0]}x{dtype}>" if op_type[0] > 1 else f"{dtype}"
-        if op_type[0] == "f":
+        if dtype[0] == "f":
             opcode = f'arith.extf'
         else:
             opcode = f'arith.extui'

From a32f9e04d74bb9035b11facf8aae6c7661d41a6f Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Tue, 17 Mar 2026 04:20:31 +0000
Subject: [PATCH 142/194] [Frontend] Add optimized GQA decode implementation
 with tile-based softmax

Note: Known compilation errors persist when using smaller tile sizes; investigation into the tile-stride logic is ongoing.
---
 tests/test_gqa_decode.py | 216 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 tests/test_gqa_decode.py

diff --git a/tests/test_gqa_decode.py b/tests/test_gqa_decode.py
new file mode 100644
index 00000000..3605d638
--- /dev/null
+++ b/tests/test_gqa_decode.py
@@ -0,0 +1,216 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+import sys
+import math
+import argparse
+from Simulator.simulator import TOGSimulator
+from Scheduler.scheduler import PyTorchSimRunner
+device = PyTorchSimRunner.setup_device().custom_device()
+# ─────────────────────────────────────────────────────────────────────────────
+# Optimized: Flash-Decode style — tile S upfront, batch in B dimension
+# ─────────────────────────────────────────────────────────────────────────────
+
+class GQADecodeOptimized(nn.Module):
+    """Flash-Decode style GQA decode for multi-core NPU.
+
+    Splits the KV-cache sequence into n_tiles chunks and folds them into the
+    BMM batch dimension (B_total = H_kv × n_tiles).  Both the QK and SV
+    matrix multiplications are issued as a *single* batched BMM with a short
+    inner-K loop, so the NPU scheduler can distribute all B_total tiles across
+    available cores simultaneously.
+
+    Improvement over GQABaseline
+    ─────────────────────────────
+    Baseline QK : B=H_kv=1,      M=G, N=S(large), K=D  → 640 N-tile iters on 1 batch
+    Optimized QK: B=H_kv*n_tiles, M=G, N=T(small), K=D  → n_tiles batch slots for cores
+
+    Baseline SV : B=H_kv=1,      M=G, N=D, K=S   → K-loop=640, only 8 outer tiles
+    Optimized SV: B=H_kv*n_tiles, M=G, N=D, K=T   → K-loop=T/TILE_K, n_tiles outer tiles
+
+    Memory layout improvements
+    ──────────────────────────
+    • K/V tiles are generated with a single contiguous view+reshape (no mid-loop transpose).
+    • Avoids materializing the full score tensor [H_kv, G, S] in DRAM before tiling.
+    • Softmax intermediates are kept in smaller [B_total, G, T] buffers.
+
+    Input conventions
+    ─────────────────
+        q : [H_kv, G, D]  – one decode-step query token per KV head
+        k : [H_kv, S, D]  – KV-cache keys   (NOT pre-transposed)
+        v : [H_kv, S, D]  – KV-cache values
+
+    tile_size selection
+    ───────────────────
+        Ideal: tile_size = round_up(S * H_kv / num_cores, vpu_num_lanes)
+        so that B_total ≈ num_cores.  Must also satisfy the SPAD budget:
+            (G*T + T*D + G*D) * bytes ≤ spad_per_core   (for sub-tile occupancy)
+        Default 512 works for (G=5, D=128, fp16, 16-lane × 8 KB/lane SPAD).
+    """
+
+    def __init__(self, tile_size: int = 512):
+        super().__init__()
+        self.tile_size = tile_size
+
+    def forward(
+        self,
+        q: torch.Tensor,   # [H_kv, G, D]
+        k: torch.Tensor,   # [H_kv, S, D]
+        v: torch.Tensor,   # [H_kv, S, D]
+        scale: float,
+    ) -> torch.Tensor:
+        H_kv, G, D = q.shape
+        _, S, _    = k.shape
+        T          = self.tile_size
+        n_tiles    = (S + T - 1) // T
+        pad_len    = n_tiles * T - S
+        B_total    = H_kv * n_tiles
+
+        # ── 1. Pad S → multiple of T ───────────────────────────────────────
+        if pad_len > 0:
+            k = F.pad(k, (0, 0, 0, pad_len))   # [H_kv, S', D]
+            v = F.pad(v, (0, 0, 0, pad_len))   # [H_kv, S', D]
+
+        # ── 2. Tile K, V → [B_total, T, D]  (contiguous, no copy) ─────────
+        # k is [H_kv, S', D]; view splits S' → n_tiles×T along dim-1
+        k_tiles = k.view(H_kv, n_tiles, T, D).reshape(B_total, T, D)
+        v_tiles = v.view(H_kv, n_tiles, T, D).reshape(B_total, T, D)
+
+        # ── 3. Expand Q → [B_total, G, D] ─────────────────────────────────
+        # expand: zero-copy view; reshape: contiguous copy (small: B_total*G*D elems)
+        q_exp = q.unsqueeze(1).expand(H_kv, n_tiles, G, D).reshape(B_total, G, D)
+
+        # ── 4. Batched QK BMM ──────────────────────────────────────────────
+        # [B_total, G, D] × [B_total, D, T] → [B_total, G, T]
+        # NPU mapping: B=B_total, M=G, N=T, K=D
+        #   → outer tiles = B_total × M_tiles × N_tiles  (all parallelizable)
+        #   → inner K-loop = D/TILE_K  (short, D=128)
+        k_t    = k_tiles.transpose(1, 2)            # [B_total, D, T]
+        scores = torch.bmm(q_exp, k_t) * scale      # [B_total, G, T]
+
+        # ── 5. Tile-local softmax (fp32 accumulation) ──────────────────────
+        # All ops are elementwise on [B_total, G, T] → torch.compile fuses them
+        scores_f32 = scores.float()
+        local_max  = scores_f32.amax(dim=-1, keepdim=True)  # [B_total, G, 1]
+        local_exp  = (scores_f32 - local_max).exp()          # [B_total, G, T]
+        local_sum  = local_exp.sum(dim=-1, keepdim=True)     # [B_total, G, 1]
+
+        # ── 6. Batched SV BMM ──────────────────────────────────────────────
+        # [B_total, G, T] × [B_total, T, D] → [B_total, G, D]
+        # NPU mapping: B=B_total, M=G, N=D, K=T
+        #   → outer tiles = B_total × M_tiles × N_tiles  (parallelizable)
+        #   → inner K-loop = T/TILE_K  (controlled, T≪S)
+        sv = torch.bmm(local_exp.to(q.dtype), v_tiles)     # [B_total, G, D]
+
+        # ── 7. Online-softmax global reduction (elementwise, fused) ────────
+        local_max = local_max.view(H_kv, n_tiles, G, 1)
+        local_sum = local_sum.view(H_kv, n_tiles, G, 1)
+        sv        = sv.view(H_kv, n_tiles, G, D)
+
+        global_max    = local_max.amax(dim=1, keepdim=True)     # [H_kv, 1, G, 1]
+        rescale       = (local_max - global_max).exp()           # [H_kv, n_tiles, G, 1]
+        corrected_sv  = (sv        * rescale).sum(dim=1)         # [H_kv, G, D]
+        corrected_sum = (local_sum * rescale).sum(dim=1)         # [H_kv, G, 1]
+
+        return (corrected_sv / corrected_sum.clamp_min(1e-12)).to(q.dtype)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Test
+# ─────────────────────────────────────────────────────────────────────────────
+
+MODEL_CONFIGS = {
+    "LLAMA4_TP8": {
+        "HEAD_DIM":     128,
+        "NUM_HEADS":    5,    # = 40 total / TP8
+        "NUM_KV_HEADS": 1,    # =  8 total / TP8
+    },
+    "QWEN3-235B_TP4": {
+        "HEAD_DIM":     128,
+        "NUM_HEADS": 16,
+        "NUM_KV_HEADS": 1,
+    },
+    "GPT-OSS_TP1": {
+        "HEAD_DIM":     64,
+        "NUM_HEADS": 64,
+        "NUM_KV_HEADS": 8,
+    },
+    "GPT-OSS_TP2": {
+        "HEAD_DIM":     64,
+        "NUM_HEADS": 32,
+        "NUM_KV_HEADS": 4,
+    },
+    "GPT-OSS_TP4": {
+        "HEAD_DIM":     64,
+        "NUM_HEADS": 16,
+        "NUM_KV_HEADS": 2,
+    },
+    "GPT-OSS_TP8": {
+        "HEAD_DIM":     64,
+        "NUM_HEADS":  8,
+        "NUM_KV_HEADS": 1,
+    },
+}
+
+
+def _make_inputs(cfg, seq_len, dtype):
+    H_kv  = cfg["NUM_KV_HEADS"]
+    G     = cfg["NUM_HEADS"] // cfg["NUM_KV_HEADS"]
+    D     = cfg["HEAD_DIM"]
+    scale = 1.0 / math.sqrt(D)
+
+    q = torch.randn(H_kv, G, D,        dtype=dtype)
+    k = torch.randn(H_kv, seq_len, D,  dtype=dtype)   # NOT pre-transposed
+    v = torch.randn(H_kv, seq_len, D,  dtype=dtype)
+    return q, k, v, scale
+
+
+def test_gqa_decode_optimized(model, device, seq_len: int = 10240, tile_size: int = 512):
+
+    cfg = MODEL_CONFIGS[model] if model is not None else MODEL_CONFIGS["LLAMA4_TP8"]
+    dtype = torch.float16
+
+    model = GQADecodeOptimized(tile_size=tile_size).eval()
+
+    # ── NPU run ────────────────────────────────────────────────────────────
+    q, k, v, scale = _make_inputs(cfg, seq_len, dtype)
+    model_dev      = model.to(device)
+    compiled       = torch.compile(model_dev, dynamic=False)
+
+    q_dev, k_dev, v_dev = q.to(device), k.to(device), v.to(device)
+    with torch.no_grad():
+        with TOGSimulator():
+            out_dev = compiled(q_dev, k_dev, v_dev, scale=scale)
+
+    # ── CPU reference ──────────────────────────────────────────────────────
+    with torch.no_grad():
+        out_cpu = model.cpu()(q, k, v, scale=scale)
+
+    max_diff = (out_dev.cpu() - out_cpu).abs().max().item()
+
+    with torch.no_grad():#CPU reference
+        out_library = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, enable_gqa=True)
+
+    max_diff_library = (out_library.cpu() - out_cpu).abs().max().item()
+
+    print(f"[GQADecodeOptimized] seq_len={seq_len}, tile_size={tile_size}")
+    print(f"  max |npu - cpu| = {max_diff:.6f}")
+    print(f"  npu out max     = {out_dev.cpu().abs().max().item():.6f}")
+    print(f"  cpu out max     = {out_cpu.abs().max().item():.6f}")
+    print(f"  library out max = {out_library.abs().max().item():.6f}")
+    print("  PASS" if max_diff < 0.05 else "  FAIL (diff too large)")
+
+
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser(description="Test GQA Attention Implementations")
+    argparser.add_argument("--model", type=str, default="LLAMA4_TP8", choices=MODEL_CONFIGS.keys(), help="Model configuration to test")
+    argparser.add_argument("--context_length", type=int, default=10240, help="Sequence length (context length) for the attention test")
+    argparser.add_argument("--tile_size", type=int, default=4096, help="Tile size for the optimized attention implementation")
+    args = argparser.parse_args()
+    model = args.model
+    base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
+    sys.path.append(base_dir)
+    test_gqa_decode_optimized(model=model, device=device, seq_len=args.context_length, tile_size=args.tile_size)

From 9e20d955720eecad26b649c747a9c43f0e2d4f53 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 16:41:18 +0900
Subject: [PATCH 143/194] [PyTorchSim/Frontend] Use kernel specific filelock to
 avoid race

---
 PyTorchSimFrontend/extension_codecache.py      | 18 +++++++++++-------
 .../mlir/mlir_codegen_backend.py               |  8 ++++++--
 PyTorchSimFrontend/mlir/mlir_template.py       | 16 ++++++++++------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index b1c457d3..6463dbac 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -4,7 +4,7 @@
 import subprocess
 import torch
 
-from torch._inductor.codecache import get_lock_dir, get_hash, write
+from torch._inductor.codecache import get_hash, write
 from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
@@ -22,6 +22,11 @@ def hash_prefix(hash_value):
 def get_write_path(src_code):
     return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip())))
 
+
+def get_lock_path(write_path):
+    """Return lock file path for the given write_path (per-source_code lock)."""
+    return os.path.join(write_path, ".compile.lock")
+
 def dump_metadata(args, arg_attributes, path):
     meta_path = os.path.join(path, "meta.txt")
     if os.path.isfile(meta_path):
@@ -161,8 +166,8 @@ def load(cls, source_code,
         gem5_cmds = mlir_gem5_compile_command(new_input_path, sample_mlir_path, raw_tog_path, vectorlane_size)
 
         from filelock import FileLock
-        lock_dir = get_lock_dir()
-        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        os.makedirs(write_path, exist_ok=True)
+        lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
 
         if spad_info is not None:
             link_option = f"-Wl,--section-start=.spad=0x{spad_info['spad_vaddr']:x}"
@@ -212,7 +217,7 @@ def load(cls, source_code,
         gem5_translate_cmd = shlex.split(gem5_cmds[1])
         gem5_llc_cmd = shlex.split(gem5_cmds[2])
 
-        lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+        lock = FileLock(get_lock_path(write_path), timeout=LOCK_TIMEOUT)
         with lock:
             try:
                 result = subprocess.check_output(gem5_sample_cmd)
@@ -278,11 +283,10 @@ def run_kernel_simulation(*args, **kwargs):
             # Wait for compilation
             key = future.result()
             from filelock import FileLock
-            lock_dir = get_lock_dir()
-            lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
+            result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key))
+            lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT)
             with lock:
                 # Run simulator pass
-                result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key))
                 # Dump arguments and meta data
                 dump_metadata(args, arg_attributes, result_path)
                 runtime_path = FunctionalSimulator.get_runtime_dump_path(result_path)
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 672c35f7..17a60b44 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1091,6 +1091,8 @@ def codegen_nodes(self, nodes, kernel_name):
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
+        from filelock import FileLock
+
         write_path = extension_codecache.get_write_path(src_code)
         os.makedirs(write_path, exist_ok=True)
 
@@ -1101,8 +1103,10 @@ def _prepare_simulator_headers(self, src_code):
         spad_section_end_symbol = (
             f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
         )
-        write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
-        write_atomic(gem5_write_path, self.gem5_header.getvalue())
+        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
+        with lock:
+            write_atomic(spike_write_path, self.header.getvalue() + spad_end_symbol + spad_section_end_symbol)
+            write_atomic(gem5_write_path, self.gem5_header.getvalue())
 
     def get_arg_info(self, name):
         arg_info = dict()
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 851f070f..b126d3af 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -613,18 +613,22 @@ def codegen_nodes(self, tile_candidates, render, template_node, prologue_nodes,
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
+        from filelock import FileLock
+
         spad_end_symbol = f"int spad_end[0] __attribute__ ((section(\".spad\")));\n"
         spad_section_end_symbol = f"int spad_section_end[0] __attribute__ ((section(\".spad\"), aligned({self.spad_info['spad_size']*self.vector_lane})));"
 
         write_path = extension_codecache.get_write_path(src_code)
-        if not os.path.exists(write_path):
-            os.makedirs(write_path, exist_ok=True)
+        os.makedirs(write_path, exist_ok=True)
         spike_write_path = os.path.join(write_path, "global_var.h")
         gem5_write_path = os.path.join(write_path, "gem5_global_var.h")
-        if not os.path.exists(spike_write_path):
-            write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol)
-        if not os.path.exists(gem5_write_path):
-            write_atomic(gem5_write_path, self.gem5_header.getvalue())
+
+        lock = FileLock(extension_codecache.get_lock_path(write_path), timeout=extension_codecache.LOCK_TIMEOUT)
+        with lock:
+            if not os.path.exists(spike_write_path):
+                write_atomic(spike_write_path, self.header.getvalue()+spad_end_symbol+spad_section_end_symbol)
+            if not os.path.exists(gem5_write_path):
+                write_atomic(gem5_write_path, self.gem5_header.getvalue())
 
     def codegen_prologue_body(self):
         body = IndentedBuffer()

From 070c43a6ae7b194a15119b3abcece4a4ee40a539 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 17:00:04 +0900
Subject: [PATCH 144/194] [Fix] replace outdated config name

---
 README.md                                            |  2 +-
 experiments/artifact/cycle_validation/run_cycle.sh   |  2 +-
 .../artifact/speedup/scripts/run_speed_ils_bert.sh   |  2 +-
 .../artifact/speedup/scripts/run_speed_ils_conv.sh   |  2 +-
 .../artifact/speedup/scripts/run_speed_ils_matmul.sh |  2 +-
 .../artifact/speedup/scripts/run_speed_ils_resnet.sh |  2 +-
 scripts/CompilerOpt_experiment/DMAopt.sh             |  2 +-
 scripts/sparsity_experiment/run.sh                   | 12 ++++++------
 tests/Yolov5/test_yolov5.py                          |  2 +-
 9 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index f55995c9..c6280498 100644
--- a/README.md
+++ b/README.md
@@ -414,7 +414,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 ```
 You can set TOGSim config path as below.
 ```bash
-export TORCHSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+export TOGSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
 ```
 ## Future Works
 Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon.
diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
index 9cfd1e98..ebf0b11f 100755
--- a/experiments/artifact/cycle_validation/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-export TORCHSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 mkdir -p $LOG_DIR
 
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
index 467949af..35d744bf 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
@@ -33,7 +33,7 @@ for i in "${config[@]}"; do
       output=$(bash -c "
         export TORCHSIM_TLS_MODE=0;
         export TORCHSIM_VALIDATION_MODE=0;
-        export TORCHSIM_CONFIG=$config_path;
+        export TOGSIM_CONFIG=$config_path;
         export AUTOTUNE=0;
         printenv;
         python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
index fb681c74..f85b4c40 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
@@ -34,7 +34,7 @@ for i in "${config[@]}"; do
       output=$(bash -c "
         export TORCHSIM_TLS_MODE=0;
         export TORCHSIM_VALIDATION_MODE=0;
-        export TORCHSIM_CONFIG=$config_path;
+        export TOGSIM_CONFIG=$config_path;
         export AUTOTUNE=0;
         printenv;
         python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
index dc0fdd20..b38848d0 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
@@ -32,7 +32,7 @@ for i in "${config[@]}"; do
       output=$(bash -c "
         export TORCHSIM_TLS_MODE=0;
         export TORCHSIM_VALIDATION_MODE=1;
-        export TORCHSIM_CONFIG=$config_path;
+        export TOGSIM_CONFIG=$config_path;
         export AUTOTUNE=0;
         printenv;
         python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
index 2346ab3c..689e6913 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
@@ -40,7 +40,7 @@ for i in "${config[@]}"; do
         output=$(bash -c "
           export TORCHSIM_TLS_MODE=0;
           export TORCHSIM_VALIDATION_MODE=0;
-          export TORCHSIM_CONFIG=$config_path;
+          export TOGSIM_CONFIG=$config_path;
           export AUTOTUNE=0;
           printenv;
           python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
diff --git a/scripts/CompilerOpt_experiment/DMAopt.sh b/scripts/CompilerOpt_experiment/DMAopt.sh
index 9e494d9b..9f3a9df2 100644
--- a/scripts/CompilerOpt_experiment/DMAopt.sh
+++ b/scripts/CompilerOpt_experiment/DMAopt.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export TORCHSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml"
+export TOGSIM_CONFIG="/root/workspace/PyTorchSim/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml"
 
 # None FG DMA
 export TORCHSIM_SUBTILE=0
diff --git a/scripts/sparsity_experiment/run.sh b/scripts/sparsity_experiment/run.sh
index da9b73cc..7996b5ab 100755
--- a/scripts/sparsity_experiment/run.sh
+++ b/scripts/sparsity_experiment/run.sh
@@ -5,7 +5,7 @@ export TORCHSIM_FORCE_TIME_M=8
 export TORCHSIM_FORCE_TIME_N=8
 
 OUTPUT_DIR="12GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_12G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -13,7 +13,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_24G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -21,7 +21,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c1_48G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -29,7 +29,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="12GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_12G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -37,7 +37,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="24GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_24G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
@@ -45,7 +45,7 @@ python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.6  > ${OUTPUT
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.8  > ${OUTPUT_DIR}/0.8
 
 OUTPUT_DIR="48GB_2core"
-export TORCHSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml"
+export TOGSIM_CONFIG="/workspace/PyTorchSim/configs/systolic_ws_8x8_c2_48G_simple_noc.yml"
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.0  > ${OUTPUT_DIR}/0.0
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.2  > ${OUTPUT_DIR}/0.2
 python3 /workspace/PyTorchSim/tests/test_sparsity.py --sparsity  0.4  > ${OUTPUT_DIR}/0.4
diff --git a/tests/Yolov5/test_yolov5.py b/tests/Yolov5/test_yolov5.py
index 1262dfb9..d98828bd 100644
--- a/tests/Yolov5/test_yolov5.py
+++ b/tests/Yolov5/test_yolov5.py
@@ -241,7 +241,7 @@ def concat_fn(x1, x2, x3):
 
     base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
     config = os.environ.get(
-        "TORCHSIM_CONFIG",
+        "TOGSIM_CONFIG",
         default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml",
     )
     args = argparse.ArgumentParser()

From 9fc08116a6dac34a1b2ebd4346401fe3df5c8cdb Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 17:06:39 +0900
Subject: [PATCH 145/194] [Experiment] use timing mode for validation script

---
 ...28x128_c1_simple_noc_tpuv3_timing_only.yml | 30 +++++++++++++++++++
 experiments/BERT.py                           |  4 +--
 .../artifact/cycle_validation/run_cycle.sh    | 22 +++++++-------
 experiments/attention.py                      |  2 +-
 experiments/conv.py                           |  2 +-
 experiments/gemm.py                           |  2 +-
 experiments/layernorm.py                      |  2 +-
 experiments/resnet18.py                       |  2 +-
 experiments/resnet50.py                       |  2 +-
 experiments/softmax.py                        |  2 +-
 10 files changed, 50 insertions(+), 20 deletions(-)
 create mode 100644 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml

diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
new file mode 100644
index 00000000..f8ac0a54
--- /dev/null
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
@@ -0,0 +1,30 @@
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 16
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 0
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/experiments/BERT.py b/experiments/BERT.py
index b938f4e6..12e3cb33 100644
--- a/experiments/BERT.py
+++ b/experiments/BERT.py
@@ -8,7 +8,7 @@
 import torch
 from Simulator.simulator import TOGSimulator
 
-config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml')
+config = os.environ.get('TOGSIM_CONFIG', f'{base_path}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml')
 os.environ['TOGSIM_CONFIG'] = config
 
 # Try Fusion EncoderBlock first, fall back to standard test_transformer
@@ -36,7 +36,7 @@
     model_input = torch.randn(args.input_size, hidden_dim).to(device=device)
     opt_fn = torch.compile(dynamic=False)(model)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"BERT-{args.size} Simulation Done")
diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
index ebf0b11f..7406f356 100755
--- a/experiments/artifact/cycle_validation/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 mkdir -p $LOG_DIR
 
@@ -33,16 +33,6 @@ for sz in \
   python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log
 done
 
-# Attention
-for sz in "12 512 64" "16 512 64" "32 512 64"; do
-  name="attention_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running Attention size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
-done
-
 # LayerNorm
 for sz in "512 768" "2048 768" "8192 768"; do
   name="layernorm_${sz// /x}"
@@ -63,6 +53,16 @@ for sz in "512 512" "2048 2048" "8192 8192"; do
   python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log
 done
 
+# Attention
+for sz in "12 512 64" "16 512 64" "32 512 64"; do
+  name="attention_${sz// /x}"
+  echo ""
+  echo "==================================================="
+  echo "[*] Running Attention size=$sz"
+  echo "==================================================="
+  python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
+done
+
 # ResNet
 for model in "resnet18" "resnet50"; do
   echo ""
diff --git a/experiments/attention.py b/experiments/attention.py
index b56ed537..db0f45bb 100644
--- a/experiments/attention.py
+++ b/experiments/attention.py
@@ -30,7 +30,7 @@ def attention(query, key, value):
     value = torch.randn(*size).to(device=device)
     opt_fn = torch.compile(dynamic=False)(attention)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, query, key, value, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"Attention {size} Simulation Done")
diff --git a/experiments/conv.py b/experiments/conv.py
index 98391fae..65e52635 100644
--- a/experiments/conv.py
+++ b/experiments/conv.py
@@ -33,7 +33,7 @@ def _conv(a, b, bias):
     custom_conv = conv2d_fn(batch_size, i_h, i_w, i_c, o_c, kernel_size, stride, padding)
     opt_fn = torch.compile(dynamic=False)(custom_conv)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, conv_input, conv_kernel, conv_bias, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"CONV {batch_size}_{i_h}_{i_w}_{i_c}_{o_c}_{kernel_size}_{stride}_{padding} Simulation Done")
diff --git a/experiments/gemm.py b/experiments/gemm.py
index d256e931..dbbba3ea 100644
--- a/experiments/gemm.py
+++ b/experiments/gemm.py
@@ -26,7 +26,7 @@ def matmul_fn(a, b):
     input_b = torch.randn(K, N).to(device=device)
     opt_fn = torch.compile(dynamic=False)(matmul_fn)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, input_a, input_b, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"GEMM {M}x{K}x{N} (MxKxN) Simulation Done")
diff --git a/experiments/layernorm.py b/experiments/layernorm.py
index a9170c6b..375f98e9 100644
--- a/experiments/layernorm.py
+++ b/experiments/layernorm.py
@@ -23,7 +23,7 @@
     opt_fn = torch.compile(dynamic=False)(model)
     model_input = torch.randn(*size).to(device=device)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"LayerNorm {size} Simulation Done")
diff --git a/experiments/resnet18.py b/experiments/resnet18.py
index 38fb80fe..ffec9a50 100644
--- a/experiments/resnet18.py
+++ b/experiments/resnet18.py
@@ -22,7 +22,7 @@
     opt_fn = torch.compile(dynamic=False)(model)
     model_input = torch.randn(args.batch, 3, 224, 224).to(device=device)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print("ResNet18 Simulation Done")
diff --git a/experiments/resnet50.py b/experiments/resnet50.py
index 5b134c13..d886c159 100644
--- a/experiments/resnet50.py
+++ b/experiments/resnet50.py
@@ -22,7 +22,7 @@
     opt_fn = torch.compile(dynamic=False)(model)
     model_input = torch.randn(args.batch, 3, 224, 224).to(device=device)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print("ResNet50 Simulation Done")
diff --git a/experiments/softmax.py b/experiments/softmax.py
index b86febe0..05024121 100644
--- a/experiments/softmax.py
+++ b/experiments/softmax.py
@@ -23,7 +23,7 @@
     opt_fn = torch.compile(dynamic=False)(model)
     model_input = torch.randn(*size).to(device=device)
 
-    with TOGSimulator(config_path=config):
+    with TOGSimulator(config_path=config), torch.no_grad():
         torch.npu.launch_model(opt_fn, model_input, stream_index=0, timestamp=0)
         torch.npu.synchronize()
     print(f"Softmax {size} Simulation Done")

From cf56c596b05457ecd1a3093574a87722e677862e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 20:36:06 +0900
Subject: [PATCH 146/194] [CI] Run validation script only for vector_lane==128

---
 .github/workflows/pytorchsim_test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 36a62b68..3a383137 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -750,6 +750,7 @@ jobs:
   test_accuracy:
     name: Run test_accuracy
     runs-on: self-hosted
+    if: inputs.vector_lane == 128
     steps:
       - name: Log in to GitHub Container Registry
         uses: docker/login-action@v3

From 8d22583c1b15eb04c83875f39a0f7bdd140ec967 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 20:44:54 +0900
Subject: [PATCH 147/194] [TOGSim] Add error handling of idle stat couting

---
 TOGSim/src/Core.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc
index 30858193..1f831661 100644
--- a/TOGSim/src/Core.cc
+++ b/TOGSim/src/Core.cc
@@ -62,9 +62,9 @@ void Core::vu_cycle() {
     if (!_vu_compute_pipeline.empty()) {
       _stat_vu_compute_cycle++;
       if(_vu_compute_pipeline.front()->finish_cycle <= _core_cycle) {
-        int bubble = _vu_compute_pipeline.front()->bubble_cycle;
+        cycle_type bubble = _vu_compute_pipeline.front()->bubble_cycle;
         _stat_vu_compute_idle_cycle += bubble;
-        _stat_vu_compute_cycle -= bubble;
+        _stat_vu_compute_cycle = (bubble < _stat_vu_compute_cycle) ? (_stat_vu_compute_cycle - bubble) : 0;
         finish_instruction(_vu_compute_pipeline.front());
         _vu_compute_pipeline.pop();
       } else {
@@ -83,9 +83,10 @@ void Core::sa_cycle() {
     while (retry) {
       if (!_sa_compute_pipeline.at(i).empty()) {
         if(_sa_compute_pipeline.at(i).front()->finish_cycle <= _core_cycle) {
-          int bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle;
+          cycle_type bubble = _sa_compute_pipeline.at(i).front()->bubble_cycle;
           _stat_sa_compute_idle_cycle.at(i) += bubble;
-          _stat_sa_compute_cycle.at(i) -= bubble;
+          cycle_type& stat = _stat_sa_compute_cycle.at(i);
+          stat = (bubble < stat) ? (stat - bubble) : 0;
           finish_instruction(_sa_compute_pipeline.at(i).front());
           _sa_compute_pipeline.at(i).pop();
         } else {

From 0b60ddde6369fa028037c50e281ac36a2ba5e6c4 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 23 Mar 2026 16:17:18 +0900
Subject: [PATCH 148/194] [TOGSim] Update DRAM Bw stat with exact number

---
 TOGSim/extern/ramulator2 | 2 +-
 TOGSim/src/Dram.cc       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2
index 748cd709..49556128 160000
--- a/TOGSim/extern/ramulator2
+++ b/TOGSim/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 748cd7099778d7196326aeb6384da92efb0c34c9
+Subproject commit 495561282d99f2ef2652618710e98c4a287025da
diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc
index 089c582e..656e57f8 100644
--- a/TOGSim/src/Dram.cc
+++ b/TOGSim/src/Dram.cc
@@ -54,7 +54,8 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle)
   _mem.resize(_n_ch);
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch] = std::make_unique<Ramulator2>(
-      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl);
+      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl,
+      _req_size, config.dram_freq_mhz);
   }
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;

From 6bc1204b802b518afc8216318e94812529b549a7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 24 Mar 2026 15:22:14 +0900
Subject: [PATCH 149/194] [Experiment] Fix ils script to use updated config

---
 TOGSim/src/main.cc                            |  8 +++
 ...lic_ws_128x128_c2_simple_noc_tpuv3_ils.yml | 33 ++++++++++
 experiments/artifact/speedup/run_speedup.sh   | 61 +++++++++----------
 .../speedup/scripts/run_speed_ils_bert.sh     | 13 +---
 .../speedup/scripts/run_speed_ils_conv.sh     | 13 +---
 .../speedup/scripts/run_speed_ils_matmul.sh   | 13 +---
 .../speedup/scripts/run_speed_ils_resnet.sh   | 13 +---
 7 files changed, 83 insertions(+), 71 deletions(-)
 create mode 100644 configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml

diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index cda8f986..57e0e696 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -115,6 +115,14 @@ int main(int argc, char** argv) {
   // Check if help was requested
   cmd_parser.print_help_message_if_required();
 
+  // Dump full command for copy-paste re-run
+  std::ostringstream cmd_oss;
+  for (int i = 0; i < argc; ++i) {
+    if (i > 0) cmd_oss << " ";
+    cmd_oss << argv[i];
+  }
+  spdlog::info("[TOGSim] Run command: {}", cmd_oss.str());
+
   std::string level = "info";
   cmd_parser.set_if_defined("log_level", &level);
   if (level == "trace")
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
new file mode 100644
index 00000000..ce2d932d
--- /dev/null
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
@@ -0,0 +1,33 @@
+# ILS (Instruction-Level Simulation) 전용 config
+# - pytorchsim_functional_mode: 0 (timing only, no validation)
+# - codegen_mapping_strategy: heuristic (no autotune)
+num_cores: 2
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+dram_type: ramulator2
+dram_freq_mhz: 940
+dram_channels: 32
+dram_req_size_byte: 32
+dram_num_burst_length: 2
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+
+icnt_type: simple
+icnt_latency_cycles: 10
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+
+pytorchsim_functional_mode: 0
+pytorchsim_timing_mode: 1
+
+codegen_mapping_strategy: heuristic
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all
diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh
index e84ab1a9..cb5ee511 100755
--- a/experiments/artifact/speedup/run_speedup.sh
+++ b/experiments/artifact/speedup/run_speedup.sh
@@ -1,7 +1,11 @@
 #!/bin/bash
+set -e
+
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 CONFIG_DIR="$TORCHSIM_DIR/configs"
-SIMULATOR_BIN="$TORCHSIM_DIR/TOGSim/build/bin/Simulator"
+EXTRACT_TRACE="$TORCHSIM_DIR/experiments/artifact/speedup/scripts/extract_trace_from_log.py"
+TRACE_CACHE_DIR="$TORCHSIM_DIR/experiments/artifact/speedup/trace_cache"
+mkdir -p "$TRACE_CACHE_DIR"
 
 configs=(
     "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
@@ -25,9 +29,11 @@ output_dir="$TORCHSIM_DIR/experiments/artifact/speedup/results"
 mkdir -p "$output_dir"
 
 echo "[*] Scanning log files in: $LOG_DIR"
+echo "[*] Extracting [TOGSim] Run command and trace from logs"
 echo ""
 
 for log_file in "$LOG_DIR"/*.log; do
+  [[ -f "$log_file" ]] || continue
   filename=$(basename "$log_file")
   workload="${filename%.log}"
 
@@ -36,45 +42,38 @@ for log_file in "$LOG_DIR"/*.log; do
   fi
   echo "==> Workload: $workload"
 
-  declare -a ONNX_ATTR_PAIRS=()
+  # === Extract [TOGSim] Run command from log ===
+  base_cmd=$(grep "\[TOGSim\] Run command:" "$log_file" 2>/dev/null | sed 's/.*\[TOGSim\] Run command: //' | head -1)
+  if [[ -z "$base_cmd" ]]; then
+    echo "    Skipping: no [TOGSim] Run command found in $log_file"
+    continue
+  fi
 
-  # === Grep launch line ===
-  while IFS= read -r line; do
-    if [[ "$line" == launch* ]]; then
-      read -r _ onnx_path attr_path _ <<< "$line"
-      ONNX_ATTR_PAIRS+=("$onnx_path|$attr_path")
-    fi
-  done < "$log_file"
+  # === Get trace file (replace FIFO in command; stored trace or generate from log) ===
+  trace_file=$(python3 "$EXTRACT_TRACE" "$log_file" "$TRACE_CACHE_DIR/${workload}.trace" 2>/dev/null) || true
+  if [[ -z "$trace_file" || ! -f "$trace_file" ]]; then
+    echo "    Skipping: could not extract trace from $log_file"
+    continue
+  fi
 
   # Normal configs
   for config in "${configs[@]}"; do
-    output_file="$output_dir/${workload}_${config}.txt" 
-    echo "Running with config=$config"
-    echo "===== config=$config | model=$workload =====" >> "$output_file"
+    output_file="$output_dir/${workload}_${config}.txt"
+    echo "===== config=$config | model=$workload =====" > "$output_file"
     sum_all_iters=0.0
     iter_count=0
 
-     # === Run 5 iterations ===
     for iter in {1..5}; do
       echo "[Iter $iter] Running simulation for workload=$workload config=$config"
-      cmd=""
-      for pair in "${ONNX_ATTR_PAIRS[@]}"; do
-        IFS="|" read -r onnx_path attr_path <<< "$pair"
-        cmd+=" $SIMULATOR_BIN --config $CONFIG_DIR/$config --models_list $onnx_path --attributes_list $attr_path;"
-      done
-
-      output=$(bash -c "$cmd")
-      sim_times=$(echo "$output" | grep "Simulation time:" | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
-
-      if [[ -n "$sim_times" ]]; then
-        sum_per_iter=0.0
-        while IFS= read -r sim_time; do
-          echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
-          sum_per_iter=$(awk -v a="$sum_per_iter" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
-        done <<< "$sim_times"
-
-        echo "Iteration $iter: total_simulation_time = $sum_per_iter" >> "$output_file"
-        sum_all_iters=$(awk -v a="$sum_all_iters" -v b="$sum_per_iter" 'BEGIN {printf "%.6f", a + b}')
+      # Build command: replace --config and --models_list in base_cmd with our config and trace
+      cmd=$(echo "$base_cmd" | sed -E "s|--config [^ ]+|--config $CONFIG_DIR/$config|" | sed -E "s|--models_list [^ ]+|--models_list $trace_file|")
+      echo "$cmd"
+      output=$(bash -c "$cmd" 2>&1) || true
+      sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/')
+
+      if [[ -n "$sim_time" ]]; then
+        echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
+        sum_all_iters=$(awk -v a="$sum_all_iters" -v b="$sim_time" 'BEGIN {printf "%.6f", a + b}')
         iter_count=$((iter_count + 1))
       else
         echo "Iteration $iter: No simulation time found." >> "$output_file"
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
index 35d744bf..642fec34 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
@@ -2,10 +2,7 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.yml"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SIZE_LIST=(
@@ -31,15 +28,11 @@ for i in "${config[@]}"; do
     for iter in {1..5}; do
       echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config"
       output=$(bash -c "
-        export TORCHSIM_TLS_MODE=0;
-        export TORCHSIM_VALIDATION_MODE=0;
         export TOGSIM_CONFIG=$config_path;
-        export AUTOTUNE=0;
-        printenv;
-        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+        cd $TORCHSIM_DIR && python3 $workload 2>&1
       ")
 
-      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+      sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/')
 
       if [[ -n "$sim_time" ]]; then
         echo "Iteration $iter: Simulation time = $sim_time"
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
index f85b4c40..f5602668 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
@@ -2,10 +2,7 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.yml"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SHAPE_LIST=(
@@ -32,15 +29,11 @@ for i in "${config[@]}"; do
     for iter in {1..5}; do
       echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config"
       output=$(bash -c "
-        export TORCHSIM_TLS_MODE=0;
-        export TORCHSIM_VALIDATION_MODE=0;
         export TOGSIM_CONFIG=$config_path;
-        export AUTOTUNE=0;
-        printenv;
-        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+        cd $TORCHSIM_DIR && python3 $workload 2>&1
       ")
 
-      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+      sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/')
 
       if [[ -n "$sim_time" ]]; then
         echo "Iteration $iter: Simulation time = $sim_time"
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
index b38848d0..bc912aa6 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
@@ -2,10 +2,7 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.yml"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SHAPE_LIST=(
@@ -30,15 +27,11 @@ for i in "${config[@]}"; do
     for iter in {1..5}; do
       echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config"
       output=$(bash -c "
-        export TORCHSIM_TLS_MODE=0;
-        export TORCHSIM_VALIDATION_MODE=1;
         export TOGSIM_CONFIG=$config_path;
-        export AUTOTUNE=0;
-        printenv;
-        python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+        cd $TORCHSIM_DIR && python3 $workload 2>&1
       ")
 
-      sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+      sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/')
 
       if [[ -n "$sim_time" ]]; then
         echo "Iteration $iter: simulation_time = $sim_time" >> "$output_file"
diff --git a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
index 689e6913..b1a43cb5 100755
--- a/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+++ b/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
@@ -2,10 +2,7 @@
 
 base_dir=$TORCHSIM_DIR/experiments/artifact/speedup
 config=(
-    # "systolic_ws_8x8_c1_simple_noc.yml"
-    "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
-    #"systolic_ws_128x128_c2_booksim_tpuv3.yml"
-    # "systolic_ws_128x128_c2_simple_noc_tpuv4.yml"
+    "systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml"
 )
 TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
 SIZE_LIST=(
@@ -38,15 +35,11 @@ for i in "${config[@]}"; do
       for iter in {1..5}; do
         echo "[Iter $iter] Running simulation for workload=ils_$ops config=$config"
         output=$(bash -c "
-          export TORCHSIM_TLS_MODE=0;
-          export TORCHSIM_VALIDATION_MODE=0;
           export TOGSIM_CONFIG=$config_path;
-          export AUTOTUNE=0;
-          printenv;
-          python3 $workload 2> /dev/null | $TORCHSIM_DIR/experiments/artifact/speedup/scripts/ils_parser.sh
+          cd $TORCHSIM_DIR && python3 $workload 2>&1
         ")
 
-        sim_time=$(echo "$output" | grep "Simulation time:" | tail -n 1 | sed -E 's/.*Simulation time: ([0-9]+\.[0-9]+).*/\1/')
+        sim_time=$(echo "$output" | grep "Wall-clock time for simulation:" | tail -n 1 | sed -E 's/.*Wall-clock time for simulation: ([0-9]+\.[0-9]+) seconds.*/\1/')
 
         if [[ -n "$sim_time" ]]; then
           echo "Iteration $iter: Simulation time = $sim_time"

From 336fdf375ac60b066d41a4906df8fc554944e1b9 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 24 Mar 2026 15:26:59 +0900
Subject: [PATCH 150/194] [CI] Remove dump folder mount for test

---
 .github/workflows/pytorchsim_test.yml | 89 +--------------------------
 1 file changed, 2 insertions(+), 87 deletions(-)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 3a383137..2a9d60a1 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -31,8 +31,6 @@ jobs:
         run: |
           echo "Running test_add.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_add.py
@@ -52,8 +50,6 @@ jobs:
         run: |
           echo "Running test_transcendental.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transcendental.py
@@ -73,8 +69,6 @@ jobs:
         run: |
           echo "Running test_activation.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_activation.py
@@ -94,8 +88,6 @@ jobs:
         run: |
           echo "Running test_batchnorm.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_batchnorm.py
@@ -115,8 +107,6 @@ jobs:
         run: |
           echo "Running test_bmm.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_bmm.py
@@ -136,8 +126,6 @@ jobs:
         run: |
           echo "Running test_cnn.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cnn.py
@@ -157,8 +145,6 @@ jobs:
         run: |
           echo "Running test_conv2d.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_conv2d.py
@@ -178,8 +164,6 @@ jobs:
         run: |
           echo "Running test_cat.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cat.py
@@ -199,8 +183,6 @@ jobs:
         run: |
           echo "Running test_matmul.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_matmul.py
@@ -220,8 +202,6 @@ jobs:
         run: |
           echo "Running test_reduce.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_reduce.py
@@ -241,8 +221,6 @@ jobs:
         run: |
           echo "Running test_softmax.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_softmax.py
@@ -262,8 +240,6 @@ jobs:
         run: |
           echo "Running test_transpose2D.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose2D.py
@@ -283,8 +259,6 @@ jobs:
         run: |
           echo "Running test_view3D_2D.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_view3D_2D.py
@@ -304,8 +278,6 @@ jobs:
         run: |
           echo "Running test_layernorm.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_layernorm.py
@@ -325,8 +297,6 @@ jobs:
         run: |
           echo "Running test_mlp.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_mlp.py
@@ -346,8 +316,6 @@ jobs:
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py
@@ -356,8 +324,6 @@ jobs:
         run: |
           echo "Running test_resnet.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50
@@ -377,8 +343,6 @@ jobs:
         run: |
           echo "Running test_transformer.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transformer.py
@@ -398,8 +362,6 @@ jobs:
         run: |
           echo "Running test_transpose3D.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose3D.py
@@ -419,8 +381,6 @@ jobs:
         run: |
           echo "Running test_sparsity.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sparsity.py
@@ -440,8 +400,6 @@ jobs:
         run: |
           echo "Running test_pool.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_pool.py
@@ -461,8 +419,6 @@ jobs:
         run: |
           echo "Running test_single_perceptron.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_single_perceptron.py
@@ -482,8 +438,6 @@ jobs:
         run: |
           echo "Running test_addmm_residual.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py
@@ -492,8 +446,6 @@ jobs:
         run: |
           echo "Running test_matmul_activation.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py
@@ -502,8 +454,6 @@ jobs:
         run: |
           echo "Running test_matmul_scalar.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py
@@ -512,8 +462,6 @@ jobs:
         run: |
           echo "Running test_matmul_reduction.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py
@@ -522,8 +470,6 @@ jobs:
         run: |
           echo "Running test_bmm_reduction.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py
@@ -532,8 +478,6 @@ jobs:
         run: |
           echo "Running test_prologue_fusion.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py
@@ -542,8 +486,6 @@ jobs:
         run: |
           echo "Running test_transformer_fusion.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py
@@ -552,8 +494,6 @@ jobs:
         run: |
           echo "Running test_conv_fusion.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py
@@ -573,8 +513,6 @@ jobs:
         run: |
           echo "Running test_moe.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/MoE/test_moe.py
@@ -594,8 +532,6 @@ jobs:
         run: |
           echo "Running test_mistral.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py
@@ -615,8 +551,6 @@ jobs:
         run: |
           echo "Running test_vit.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_vit.py
@@ -636,8 +570,6 @@ jobs:
         run: |
           echo "Running test_diffusion.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Diffusion/test_diffusion.py
@@ -657,8 +589,6 @@ jobs:
         run: |
           echo "Running test_indirect.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_indirect_access.py
@@ -678,8 +608,6 @@ jobs:
         run: |
           echo "Running test_scheduler.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py
@@ -699,8 +627,6 @@ jobs:
         run: |
           echo "Running test_llama.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Llama/test_llama.py
@@ -720,8 +646,6 @@ jobs:
         run: |
           echo "Running test_yolov5.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/Yolov5/test_yolov5.py
@@ -741,8 +665,6 @@ jobs:
         run: |
           echo "Running test_deepseek_v3_base.py"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py
@@ -759,25 +681,18 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Prepare volume directory
-        run: mkdir -p /tmp/torchsim-ci/${GITHUB_SHA}
-
       - name: Run run_cycle.sh
         run: |
           echo "Running run_cycle.sh"
           docker run --rm \
-            -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \
-            -e TORCHSIM_DUMP_PATH=/dump \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} bash -c \
-            "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh && \
-            cp PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out /dump/summary_cycle.out"
-          ls /tmp/torchsim-ci/${GITHUB_SHA}
+            "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh >/dev/null 2>&1 && cat PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out" > summary_cycle.out
 
       - name: Upload Accuracy Report Artifact
         uses: actions/upload-artifact@v4
         with:
           name: accuracy-report
-          path: /tmp/torchsim-ci/${{ github.sha }}/summary_cycle.out
+          path: summary_cycle.out
           if-no-files-found: error

From 8838bfe361cdadf5a9516d15e737d9443522a84e Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 24 Mar 2026 22:09:45 +0900
Subject: [PATCH 151/194] [Decompse] Add naive group convolution decomposition
 + test

---
 .../mlir/mlir_codegen_backend.py              |   8 +-
 PyTorchSimFrontend/mlir/mlir_decomposition.py | 127 +++++++++++++++++-
 tests/test_group_conv.py                      |  79 +++++++++++
 3 files changed, 212 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_group_conv.py

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 17a60b44..3ecf3b53 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -285,7 +285,13 @@ def __init__(self, kernel_group, reason=None):
         self.gem5_header = IndentedBuffer()
         self.header.writeline("#include <unistd.h>")
         self.header.writeline("#include <stdlib.h>")
-        self.header.writeline("void* __wrap_malloc(size_t size) { size = (size + 511UL) & ~511UL; return sbrk(size); }") # Align to 512 bytes
+        self.header.writeline("#include <stdio.h>")
+        self.header.writeline("void* __wrap_malloc(size_t size) {")  # Align to 512 bytes
+        self.header.writeline("    size_t aligned = (size + 511UL) & ~511UL;")
+        self.header.writeline("    void *p = sbrk(aligned);")
+        #self.header.writeline('    fprintf(stderr, "[SPIKE][__wrap_malloc] addr=%p size=%zu (req=%zu)\\n", p, aligned, size);')
+        self.header.writeline("    return p;")
+        self.header.writeline("}")
         self.header.writeline("void __wrap_free(void *ptr) { return; }")
         self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc")
         self.spad_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="spad")
diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py
index 284d25d7..122c2677 100644
--- a/PyTorchSimFrontend/mlir/mlir_decomposition.py
+++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py
@@ -1,9 +1,134 @@
 import math
+import operator
+from typing import Optional, Sequence, Tuple, Union
+
 import torch
 import torch.nn.functional as F
 from torch._inductor.decomposition import register_decomposition
 
-aten = torch.ops.aten
+aten = torch.ops.aten  # only for @register_decomposition target
+
+
+def _pair_2d(seq: Sequence[int]) -> Tuple[int, int]:
+    if len(seq) == 1:
+        v = int(seq[0])
+        return v, v
+    return int(seq[0]), int(seq[1])
+
+
+def _group_conv_cin1_cout1(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    stride: Tuple[int, ...],
+    padding: Tuple[int, ...],
+    dilation: Tuple[int, ...],
+    groups: int,
+) -> torch.Tensor:
+    """
+    Grouped conv with ``Cin//groups == 1`` and ``Cout//groups == 1`` (input ``[N,G,H,W]``, weight ``[G,1,Kh,Kw]``).
+
+    1. Symmetric spatial padding on the input.
+    2. For each kernel position ``(kh, kw)``, gather the output grid from the padded tensor and
+       multiply by ``weight[:, 0, kh, kw]`` (broadcast over ``N``), then sum over ``(kh, kw)``.
+
+    Note
+    ----
+    This is not a performance-optimized kernel: it is explicit gather–multiply–accumulate over
+    kernel elements. For competitive performance, add a dedicated template (or fused) kernel
+    instead of relying on this decomposition.
+    """
+    n, c_in, _, _ = input.shape
+    # PyTorch layout: ``[Cout, Cin/groups, Kh, Kw]`` i.e. ``[G, 1, Kh, Kw]`` here.
+    c_out, cin_pg, kh, kw = weight.shape
+    g = groups
+    assert c_in == g and c_out == g and cin_pg == 1, (c_in, c_out, cin_pg, g)
+
+    sh, sw = _pair_2d(stride)
+    ph, pw = _pair_2d(padding)
+    d_h, d_w = _pair_2d(dilation)
+
+    # (left, right, top, bottom) for last two dims
+    x_pad = F.pad(input, (pw, pw, ph, ph))
+    _, _, hp, wp = x_pad.shape
+
+    h_out = (hp - d_h * (kh - 1) - 1) // sh + 1
+    w_out = (wp - d_w * (kw - 1) - 1) // sw + 1
+
+    out = torch.zeros(n, g, h_out, w_out, dtype=input.dtype, device=input.device)
+    for ki in range(kh):
+        rows = torch.arange(h_out, device=input.device, dtype=torch.long) * sh + ki * d_h
+        for kj in range(kw):
+            cols = torch.arange(w_out, device=input.device, dtype=torch.long) * sw + kj * d_w
+            sub = x_pad[:, :, rows[:, None], cols[None, :]]
+            wgk = weight[:, 0, ki, kj].reshape(1, g, 1, 1)
+            out = out + sub * wgk
+
+    if bias is not None:
+        out = out + bias.reshape(1, g, 1, 1)
+    return out
+
+
+@register_decomposition(aten.convolution.default)
+def decompose_group_convolution(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Union[torch.Tensor, None],
+    stride: Sequence[int],
+    padding: Sequence[int],
+    dilation: Sequence[int],
+    transposed: bool,
+    output_padding: Sequence[int],
+    groups: Union[int, torch.SymInt],
+):
+    """
+    Lower grouped ``aten.convolution`` only when each group has a single input and output
+    channel (``Cin//groups == Cout//groups == 1``), via ``_group_conv_cin1_cout1``.
+
+    Note
+    ----
+    The lowered path is not a performance-optimized kernel; it exists for correctness and
+    lowering experiments. For speed, implement a separate template (fused) kernel for group
+    convolution.
+
+    Non-static ``groups`` (cannot ``int()``) falls back: returns ``NotImplemented`` so the
+    default ``aten.convolution`` is used. ``groups==1`` also returns ``NotImplemented``.
+    """
+    try:
+        gcount = operator.index(groups)
+    except (TypeError, ValueError):
+        return NotImplemented
+    # groups==1: do not decompose; Inductor keeps the default aten.convolution (plain conv).
+    if gcount == 1:
+        return NotImplemented
+
+    cin = input.shape[1]
+    cout = weight.shape[0]
+    cin_pg = cin // gcount
+    cout_pg = cout // gcount
+    supported = (
+        not transposed
+        and cin % gcount == 0
+        and cout % gcount == 0
+        and cin_pg == 1
+        and cout_pg == 1
+        and weight.shape[1] == 1
+    )
+    if not supported:
+        raise NotImplementedError(
+            "PyTorchSim aten.convolution decomposition supports grouped conv only when "
+            "Cin//groups == 1 and Cout//groups == 1 (i.e. per-group Cin and Cout are 1). "
+            "For general group convolution, use the default kernel or a dedicated template kernel."
+        )
+    return _group_conv_cin1_cout1(
+        input,
+        weight,
+        bias,
+        tuple(stride),
+        tuple(padding),
+        tuple(dilation),
+        gcount,
+    )
 
 @register_decomposition(aten._native_multi_head_attention.default)
 def decompose_native_multi_head_attention(
diff --git a/tests/test_group_conv.py b/tests/test_group_conv.py
new file mode 100644
index 00000000..4f97cff6
--- /dev/null
+++ b/tests/test_group_conv.py
@@ -0,0 +1,79 @@
+import torch
+import torch._dynamo
+from Simulator.simulator import TOGSimulator
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+def test_group_convolution(
+    device,
+    groups=2,
+    stride=1,
+    padding=1,
+    batch_size=2,
+    c_per_group=8,
+    out_per_group=12,
+    spatial=16,
+    kernel_size=3,
+    seed=0,
+):
+    """``torch.compile`` on NPU vs CPU reference — same structure as ``test_matmul`` / ``test_conv2d``."""
+
+    def custom_group_conv(a, weight, bias):
+        return torch.convolution(
+            a,
+            weight,
+            bias,
+            (stride, stride),
+            (padding, padding),
+            (1, 1),
+            False,
+            (0, 0),
+            groups,
+        )
+
+    torch.manual_seed(seed)
+    c_in = c_per_group * groups
+    c_out = out_per_group * groups
+    k = kernel_size
+    x = torch.randn(batch_size, c_in, spatial, spatial)
+    wgt = torch.randn(c_out, c_in // groups, k, k)
+    b = torch.randn(c_out)
+
+    x1 = x.to(device=device, memory_format=torch.channels_last)
+    w1 = wgt.to(device=device, memory_format=torch.channels_last)
+    b1 = b.to(device=device)
+    x2 = x.to("cpu", memory_format=torch.channels_last)
+    w2 = wgt.to("cpu", memory_format=torch.channels_last)
+    b2 = b.to("cpu")
+
+    opt_fn = torch.compile(dynamic=False)(custom_group_conv)
+    res = opt_fn(x1, w1, b1)
+    y = custom_group_conv(x2, w2, b2)
+    label = f"Group Conv Forward (groups={groups}, stride={stride}, pad={padding})"
+    test_result(label, res, y, rtol=1e-3, atol=1e-3)
+    print("Max diff > ", torch.max(torch.abs(res.cpu() - y)))
+
+
+if __name__ == "__main__":
+    device = torch.device("npu:0")
+    with torch.no_grad():
+        #test_group_convolution(device, batch_size=1, groups=2, stride=1, padding=1, seed=0)
+        #test_group_convolution(device, batch_size=1, groups=4, stride=1, padding=1, seed=1)
+        #test_group_convolution(device, batch_size=1, groups=2, stride=2, padding=1, seed=2)
+        test_group_convolution(device, batch_size=1, groups=240, stride=2, padding=1, seed=2, c_per_group=1, out_per_group=1, spatial=40)
+
+        #test_group_convolution(device, batch_size=1, groups=240, stride=2, padding=1, seed=2, c_per_group=1, out_per_group=1)
+    print("test_group_conv_decomposition: all passed")

From 9b0ab3babd9c00d006016b841da92daac82a0e55 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 25 Mar 2026 14:22:18 +0900
Subject: [PATCH 152/194] [Frontend] Fix attribute passing to TOGSIM

---
 PyTorchSimFrontend/extension_codecache.py |  9 ++++---
 Scheduler/scheduler.py                    |  2 +-
 Simulator/simulator.py                    | 32 +++++++++++++++++------
 3 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 6463dbac..ac711650 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -302,15 +302,16 @@ def run_kernel_simulation(*args, **kwargs):
 
                 # Prepare arguments for launch kernel
                 onnx_path = os.path.join(result_path, "tile_graph.onnx")
-                attribute_path = os.path.join(runtime_path, "attribute")
+                attribute_dir = os.path.join(runtime_path, "attribute")
+                kernel_attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_dir, args)
 
                 TOGSim = torch.npu.get_tog_simulator()
                 if not autotune and TOGSim is not None:
-                    attribute_path = TOGSim.create_attribute_file(attribute_path, args)
-                    torch.npu.launch_kernel(onnx_path, attribute_path)
+                    torch.npu.launch_kernel(onnx_path, kernel_attribute_path)
                     result = None # No result for non-autotune mode
                 else:
-                    result_path = TOGSimulator.run_standalone(onnx_path, attribute_path, autotune_mode=autotune)
+                    result_path = TOGSimulator.run_standalone(
+                        onnx_path, kernel_attribute_path, autotune_mode=autotune)
                     result = TOGSimulator.get_result_from_file(result_path)
                 return result
         return run_kernel_simulation
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 77e218ea..732f2841 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -231,7 +231,7 @@ def prepare_launch_kernel(self, kernel, inputs):
         onnx_path = os.path.join(result_path, "tile_graph.onnx")
 
         attribute_path = os.path.join(runtime_path, "attribute")
-        attribute_path = self.tog_simulator.create_attribute_file(attribute_path, inputs)
+        attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_path, inputs)
         return onnx_path, attribute_path
 
     def launch_kernel(self, current_cycle, partion_idx=0):
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index f24835ba..a02d8fc9 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -427,28 +427,44 @@ def sram_dealloc(cls, buf_name, addr_range):
         if buf_name in cls.ALLOC_POOL:
             del cls.ALLOC_POOL[buf_name]
 
-    def create_attribute_file(self, attribute_path, inputs, **kwargs):
+    @staticmethod
+    def write_kernel_attribute_file(attribute_dir, inputs, alloc_pool=None):
+        """
+        Write kernel attribute YAML (address_info + sram_alloc) under attribute_dir.
+
+        Does not require a TOGSimulator instance. alloc_pool defaults to class ALLOC_POOL.
+
+        Args:
+            attribute_dir: Directory to hold numbered attribute files (created if needed)
+            inputs: Kernel input tensors (data_ptr used for address_info)
+            alloc_pool: Optional dict like ALLOC_POOL; defaults to TOGSimulator.ALLOC_POOL
+
+        Returns:
+            Path to the written YAML file.
+        """
+        if alloc_pool is None:
+            alloc_pool = TOGSimulator.ALLOC_POOL
         address_info = {}
         sram_buffer = {}
         yaml_content = {}
 
-        os.makedirs(attribute_path, exist_ok=True)
-        index = str(len(os.listdir(attribute_path)))
-        attribute_path = os.path.join(attribute_path, index)
+        os.makedirs(attribute_dir, exist_ok=True)
+        index = str(len(os.listdir(attribute_dir)))
+        attribute_file = os.path.join(attribute_dir, index)
 
         for idx, tensor in enumerate(inputs):
             address_info[f"arg{idx}"] = tensor.data_ptr()
         yaml_content["address_info"] = address_info
 
-        for buf_name, range in self.ALLOC_POOL.items():
+        for buf_name, range in alloc_pool.items():
             sram_buffer[buf_name] = range
         yaml_content["sram_alloc"] = sram_buffer
 
-        with open(attribute_path, "w") as f:
+        with open(attribute_file, "w") as f:
             yaml.dump(yaml_content, f, default_flow_style=False)
             f.flush()
-            os.fsync(f.fileno()) # There could be a race condition.
-        return attribute_path
+            os.fsync(f.fileno())
+        return attribute_file
 
     def load_yaml(self, config_path):
         config_path = Path(config_path)

From 5cbe9d1fbd6432550e73ca71d5afa0b4939c3543 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 25 Mar 2026 17:11:48 +0900
Subject: [PATCH 153/194] [Frontend] Fix loop_size argument passing

---
 PyTorchSimFrontend/mlir/mlir_autotune.py        | 2 +-
 PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index caf4d6da..b8f5eaf9 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -83,7 +83,7 @@ def cached_run_fn(*args, **kwargs):
         # Run a candidate code
         run_method = custom_async_compile.mlir(
             self.source_code, vectorlane_size=self.extra_args["vector_lane"],
-            loop_size=None, spad_info=self.extra_args["spad_info"],
+            loop_size=self.extra_args["loop_size"], spad_info=self.extra_args["spad_info"],
             vlen=self.extra_args["vlen"], arg_attributes=self.extra_args["arg_attributes"],
             origins=self.extra_args["origins"], silent_mode=True,
             autotune=self.extra_args['autotune'])
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 3ecf3b53..8bfdc57f 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -319,6 +319,7 @@ def __init__(self, kernel_group, reason=None):
         self.reduce_iterator = {}
         self.spad_buffer_dict = dict()
         self.base_vector_initialized = False
+        self.loop_size = None
 
     def reset(self, reason):
         save = self.exit_stack, self._nested_context_depth
@@ -1072,6 +1073,7 @@ def run_bench(self, nodes, kernel_name, src_code):
                 "vlen" : self.vlen,
                 "arg_attributes" : arg_attributes,
                 "autotune" : True,
+                "loop_size" : self.loop_size,
                 "origins" : {str(i) for node in nodes for i in node.node.origins},
             },
             source_code=src_code,

From f03f72731b5d06f2b49e422de1b059d8b1235a2a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 25 Mar 2026 17:12:08 +0900
Subject: [PATCH 154/194] [Script] Add utility option

---
 .../artifact/cycle_validation/run_cycle.sh    | 194 ++++++++++++------
 1 file changed, 131 insertions(+), 63 deletions(-)

diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
index 7406f356..e49538d0 100755
--- a/experiments/artifact/cycle_validation/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -1,85 +1,153 @@
 #!/bin/bash
 set -e
 
+usage() {
+  cat <<'EOF'
+Usage: run_cycle.sh [--only SECTION[,SECTION...]]
+
+  Run cycle validation benchmarks. Default: all sections + summary.
+
+  SECTION (comma-separated for --only):
+    matmul      GEMM sizes
+    conv        Conv2d sizes
+    layernorm   LayerNorm sizes
+    softmax     Softmax sizes
+    attention   Attention sizes
+    resnet      resnet18, resnet50
+    bert        BERT base/large/xlarge
+    summary     summary_cycle.py (reads logs under experiments/artifact/logs)
+
+Examples:
+  ./run_cycle.sh
+  ./run_cycle.sh --only matmul
+  ./run_cycle.sh --only matmul,conv,summary
+EOF
+}
+
+ONLY=""
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --only)
+      ONLY="${2:-}"
+      if [[ -z "$ONLY" ]]; then echo "error: --only needs a value"; exit 1; fi
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "error: unknown argument: $1" >&2
+      usage >&2
+      exit 1
+      ;;
+  esac
+done
+
+# If ONLY is set, run section NAME only when ",$NAME," appears in ",$ONLY,"
+should_run() {
+  local name=$1
+  if [[ -z "$ONLY" ]]; then
+    return 0
+  fi
+  [[ ",${ONLY}," == *",${name},"* ]]
+}
+
 export TOGSIM_CONFIG=$TORCHSIM_DIR/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 mkdir -p $LOG_DIR
 
 # Matmul
-for sz in "256 256 256" "512 512 512" "1024 1024 1024" "2048 2048 2048"; do
-  name="gemm_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running Matmul size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log
-done
+if should_run matmul; then
+  for sz in "256 256 256" "512 512 512" "1024 1024 1024" "2048 2048 2048"; do
+    name="gemm_${sz// /x}"
+    echo ""
+    echo "==================================================="
+    echo "[*] Running Matmul size=$sz"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log
+  done
+fi
 
 # Conv
-for sz in \
-  "1 56 56 64 64 3 1 1" \
-  "1 28 28 128 128 3 1 1" \
-  "1 14 14 256 256 3 1 1" \
-  "1 7 7 512 512 3 1 1" \
-  "64 56 56 64 64 3 1 1" \
-  "64 28 28 128 128 3 1 1" \
-  "64 14 14 256 256 3 1 1" \
-  "64 7 7 512 512 3 1 1"; do
-  name="conv_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running Conv size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log
-done
+if should_run conv; then
+  for sz in \
+    "1 56 56 64 64 3 1 1" \
+    "1 28 28 128 128 3 1 1" \
+    "1 14 14 256 256 3 1 1" \
+    "1 7 7 512 512 3 1 1" \
+    "64 56 56 64 64 3 1 1" \
+    "64 28 28 128 128 3 1 1" \
+    "64 14 14 256 256 3 1 1" \
+    "64 7 7 512 512 3 1 1"; do
+    name="conv_${sz// /x}"
+    echo ""
+    echo "==================================================="
+    echo "[*] Running Conv size=$sz"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log
+  done
+fi
 
 # LayerNorm
-for sz in "512 768" "2048 768" "8192 768"; do
-  name="layernorm_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running LayerNorm size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log
-done
+if should_run layernorm; then
+  for sz in "512 768" "2048 768" "8192 768"; do
+    name="layernorm_${sz// /x}"
+    echo ""
+    echo "==================================================="
+    echo "[*] Running LayerNorm size=$sz"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log
+  done
+fi
 
 # Softmax
-for sz in "512 512" "2048 2048" "8192 8192"; do
-  name="softmax_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running Softmax size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log
-done
+if should_run softmax; then
+  for sz in "512 512" "2048 2048" "8192 8192"; do
+    name="softmax_${sz// /x}"
+    echo ""
+    echo "==================================================="
+    echo "[*] Running Softmax size=$sz"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log
+  done
+fi
 
 # Attention
-for sz in "12 512 64" "16 512 64" "32 512 64"; do
-  name="attention_${sz// /x}"
-  echo ""
-  echo "==================================================="
-  echo "[*] Running Attention size=$sz"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
-done
+if should_run attention; then
+  for sz in "12 512 64" "16 512 64" "32 512 64"; do
+    name="attention_${sz// /x}"
+    echo ""
+    echo "==================================================="
+    echo "[*] Running Attention size=$sz"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
+  done
+fi
 
 # ResNet
-for model in "resnet18" "resnet50"; do
-  echo ""
-  echo "==================================================="
-  echo "[*] Running $model"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log
-done
+if should_run resnet; then
+  for model in "resnet18" "resnet50"; do
+    echo ""
+    echo "==================================================="
+    echo "[*] Running $model"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log
+  done
+fi
 
 # BERT
-for model in "base" "large" "xlarge"; do
-  echo ""
-  echo "==================================================="
-  echo "[*] Running BERT size=$model"
-  echo "==================================================="
-  python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log
-done
+if should_run bert; then
+  for model in "base" "large" "xlarge"; do
+    echo ""
+    echo "==================================================="
+    echo "[*] Running BERT size=$model"
+    echo "==================================================="
+    python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log
+  done
+fi
 
 # Cycle Summary
-python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out"
\ No newline at end of file
+if should_run summary; then
+  python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out"
+fi

From 1ae39bfb4926b8a6b42500b997154329dfc56051 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 26 Mar 2026 12:40:55 +0900
Subject: [PATCH 155/194] [Cleanup] #219 cleanup the  deprecated scheduler
 module

---
 README.md                                    | 126 +++--
 Scheduler/scheduler.py                       | 533 +------------------
 Simulator/simulator.py                       |  21 +-
 tests/MLP/test_mlp.py                        |   4 +-
 tests/MoE/test_moe.py                        |   4 +-
 tests/test_compile_overhead.py               |  45 --
 tests/test_gqa_decode.py                     |   3 +-
 tests/test_hetro.py                          |  76 ++-
 tests/test_scheduler.py                      |   4 +-
 tests/test_scheduler_batching.py             |  41 --
 tests/test_sort.py                           |   5 +-
 tests/test_sparse_core.py                    |   5 +-
 tests/test_spmm_scheduler.py                 |  66 ---
 tutorial/session1/CompilerOptimization.ipynb |   3 +-
 tutorial/session1/ExecutionMode.ipynb        |   3 +-
 tutorial/session1/Inference.ipynb            |   3 +-
 tutorial/session1/LogAnalysis.ipynb          |   3 +-
 tutorial/session1/Mapping.ipynb              |   3 +-
 tutorial/session1/Training.ipynb             |   3 +-
 tutorial/session2/Hands_on.ipynb             |   4 +-
 20 files changed, 139 insertions(+), 816 deletions(-)
 delete mode 100644 tests/test_compile_overhead.py
 delete mode 100644 tests/test_scheduler_batching.py
 delete mode 100644 tests/test_spmm_scheduler.py

diff --git a/README.md b/README.md
index c6280498..03041355 100644
--- a/README.md
+++ b/README.md
@@ -106,9 +106,8 @@ You can run your own PyTorch model on PyTorchSim by setting up a custom NPU devi
 This method also applies when you want to simulate models beyond the provided examples.
 ```python
 import torch
-from Scheduler.scheduler import PyTorchSimRunner
-# Declare a custom NPU device
-device = PyTorchSimRunner.setup_device().custom_device()
+
+device = torch.device("npu:0")
 
 # Declare you own model (e.g. resnet18 from torchvision)
 from torchvision.models import resnet18
@@ -215,76 +214,95 @@ opt_step()
 `tests/test_mlp.py` provides an example of MLP training.
 
 ## Multi-tenancy
-Our load generator supports multi-tenancy experiments. You can run a simple example by executing `tests/test_scheduler.py`.
-```bash
-python tests/test_scheduler.py
-```
-Below is an example code of multi-tenancy `resnet18` and `EncoderBlock`.
-In this example, the `Scheduler` is initialized with a number of request queues, a scheduling policy, and a TOGSimulator config file(`.yml`). The compiled PyTorch models are then registered with a unique model id.
 
-```python3
-import os
-import sys
+While the **`with TOGSimulator(config_path=...)`** block is active, **`TOGSIM_CONFIG`** is set to that YAML so **compilation and TOGSim use the same** hardware description.
+
+### 1. One TOGSim session, one continuous log
+
+If you want **one** log where kernels are simulated **in sequence** as a single run, wrap the code you already use to execute the compiled model with **`with TOGSimulator(config_path=...)`**. No other API is required; every forward inside the block shares that session.
+
+```python
 import torch
-from torchvision.models import resnet18
-base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
+from Simulator.simulator import TOGSimulator
 
-sys.path.append(base_path)
-from tests.test_transformer import EncoderBlock
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
-scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE, togsim_config=config)
+# ... build model, torch.compile, tensors on npu:0 as usual ...
 
-# Register compiled model
-target_model0 = resnet18().eval()
-target_model1 = EncoderBlock(768, 12).eval()
-opt_model0 = torch.compile(target_model0.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last))
-opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
-SchedulerDNNModel.register_model("model0", opt_model0)
-SchedulerDNNModel.register_model("model1", opt_model1)
+with TOGSimulator(config_path=config):
+    y = compiled_model(x)
 ```
 
-The config file(`.yml`) specifies two key items:
-- `num_partition`: The total number of independent request queues to create.
-- `partition`: Defines the hardware mapping, assigning each queue (identified by its index) to a specific physical core.
-For example, the configuration below creates two scheduling queues (`0` and `1`) and maps `core_0` to queue `0` and `core_1` to queue `1`:
+### 2. Multi-tenancy and explicit scheduling (`launch_model`)
+
+For **multi-tenant** or **interleaved** execution, you usually need to attach a **timestamp** and a **`stream_index`** to each launch so the simulator can order work correctly. Use **`torch.npu.launch_model(compiled_model, *inputs, stream_index=..., timestamp=...)`** for that; plain `compiled_model(x)` does not carry those parameters.
+
+**`stream_index`** is the **request-queue / partition index** in the TOGSim config: it must match the **values** in the **`partition`** map (each queue index is mapped to a **core**). For example, `stream_index=0` goes to the queue bound to `core_0`, `stream_index=1` to the queue for `core_1`, and so on.
+
+**`timestamp`** is in **nanoseconds** (simulation time for ordering launches). Use `0` when you do not need explicit times beyond submission order.
+
+```python
+with TOGSimulator(config_path=config):
+    torch.npu.launch_model(opt_model1, x1, stream_index=0, timestamp=0)
+    torch.npu.launch_model(opt_model2, x2, stream_index=1, timestamp=0)
+    torch.npu.synchronize()
+    torch.npu.launch_model(opt_model1, x1, stream_index=0, timestamp=0)
+    torch.npu.launch_model(opt_model2, x2, stream_index=1, timestamp=0)
+```
+
+Here **`synchronize()`** acts as a barrier: it does not return until every **`launch_model`** issued **above** it has finished in the simulator. The later pair of `launch_model` calls therefore runs only after those earlier models have fully completed—so the sync is the point in the timeline where **all preceding launches are done**.
+
+```bash
+python tests/test_scheduler.py
+```
+
+Use a TOGSim config(`.yml`) that defines **partitions** when mapping queues to cores, for example:
+
+- **`num_partition`**: Number of independent request queues (valid **`stream_index`** values are `0 … num_partition-1`).
+- **`partition`**: Maps each **core** name to a **queue index**; that index is the same **`stream_index`** you pass to **`launch_model`**.
+
 ```
   "num_partition" : 2,
   "partition": {
-    "core_0":0,
-    "core_1":1
+    "core_0": 0,
+    "core_1": 1
   }
 ```
 
-Next, DNN model requests are generated and submitted. We provide a `poisson_request_generator` utility, which generates request arrival times.
-Each `Request` is created with its model name, data, and a request_queue_idx to specify its target queue, then added via `scheduler.add_request`.
-As shown in the code, `model0` requests are queued to `request_queue_idx=0`, while `model1` requests are queued to `request_queue_idx=1`.
-```python3
-# Load Generation
+Here `stream_index=0` selects queue `0` (core_0), `stream_index=1` selects queue `1` (core_1).
+
+### 3. Load generation (Poisson arrivals)
+
+The **`poisson_request_generator`** in **`Scheduler.scheduler`** yields synthetic **arrival times** (in **milliseconds**). Merge those with **`launch_model`**: convert each time to **nanoseconds** for **`timestamp`**, set **`stream_index`** to the target partition queue, and run all launches inside one **`with TOGSimulator(...)`** so a **single** log captures the full trace.
+
+```python
+from Scheduler.scheduler import poisson_request_generator
+
 model0_lambda = 5.0
 model1_lambda = 3.0
-max_time = 1000.0 # [s]
+max_time_msec = 1000.0  # Poisson horizon [ms]
 
-# Generate Possion distribution requests for model0
-for model0_request_time in poisson_request_generator(model0_lambda, max_msec_time=max_time):
-    x = torch.randn(1, 3, 224, 224)
-    new_request = Request("model0", [x], [], request_queue_idx=0)
-    scheduler.add_request(new_request, request_time=model0_request_time)
+events = []
+for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec):
+    x = torch.randn(1, 3, 224, 224, device=device)
+    events.append((t, 0, opt_model0, (x,)))  # stream_index 0 → queue / partition 0
 
-# Generate Possion distribution requests for model1
-for model1_request_time in poisson_request_generator(model1_lambda, max_msec_time=max_time):
-    x = torch.randn(128, 768)
-    new_request = Request("model1", [x], [], request_queue_idx=1)
-    scheduler.add_request(new_request, request_time=model1_request_time)
-```
+for t in poisson_request_generator(model1_lambda, max_msec_time=max_time_msec):
+    x = torch.randn(128, 768, device=device)
+    events.append((t, 1, opt_model1, (x,)))  # stream_index 1 → queue / partition 1
 
-Finally, `scheduler.schedule()` is called in a loop until all requests are processed.
-```python3
-# Run scheduler
-while not scheduler.is_finished():
-    scheduler.schedule()
+events.sort(key=lambda e: e[0])
+
+with TOGSimulator(config_path=config):
+    for t_msec, stream_index, model, args in events:
+        torch.npu.launch_model(
+            model,
+            *args,
+            stream_index=stream_index,
+            timestamp=int(t_msec * 1e6),
+        )  # ms → ns
 ```
 
+The two Poisson streams are **combined and sorted by time** so launches follow a single global arrival order.
+
 ## Compiler Optimizations
 PyTorchSim compiler supports several fusion optimizations:
 - GEMM prologue fusion
diff --git a/Scheduler/scheduler.py b/Scheduler/scheduler.py
index 732f2841..2b3aac92 100644
--- a/Scheduler/scheduler.py
+++ b/Scheduler/scheduler.py
@@ -1,34 +1,11 @@
-from typing import List
-import os
-import sys
-import numpy as np
-import torch
-from pathlib import Path
-import importlib.util
-from PyTorchSimFrontend.extension_codecache import hash_prefix
-from Simulator.simulator import TOGSimulator
-from PyTorchSimFrontend import extension_config
-
-# Configure logger for Scheduler module
-logger = extension_config.setup_logger()
-
+"""Poisson load helpers for synthetic request arrival times."""
 
-def import_module_from_path(module_name, path):
-    module_path = Path(path)  # Convert to Path object for safety
-    if not module_path.exists() or not module_path.is_file():
-        raise FileNotFoundError(f"No such file: '{module_path}'")
-
-    spec = importlib.util.spec_from_file_location(module_name, module_path)
-    if spec is None:
-        raise ImportError(f"Could not load module from path: '{module_path}'")
-
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
+import numpy as np
 
-    return module
 
 def poisson_request_generator(lambda_requests, max_msec_time=None):
-    current_time = 0.0 # msec
+    """Yield synthetic arrival times in milliseconds (first sample is 0)."""
+    current_time = 0.0  # msec
 
     yield 0
     while max_msec_time is None or current_time < max_msec_time:
@@ -39,505 +16,3 @@ def poisson_request_generator(lambda_requests, max_msec_time=None):
             break
 
         yield current_time
-
-class Request:
-    """ Each request has model name, it's own id, and requested time. """
-    request_id = 0
-    QUEUED     = 1
-    RUNNING    = 2
-    INCREMENT  = 3
-    FINISHED   = 4
-    def __init__(self, model:str, batchable_input_tensor : List[torch.Tensor],
-                 shared_input_tensor: List[torch.tensor], request_queue_idx=0) -> None:
-        self.model = model
-        self.batchable_input_tensor = batchable_input_tensor
-        self.shared_input_tensor = shared_input_tensor
-        self.arrival_time = None
-        self.start_time = []
-        self.finish_time = []
-        self.state = self.QUEUED
-        self.id = self.allocate_id()
-        self.request_queue_idx = request_queue_idx
-
-    def allocate_id(self):
-        allocated_id = Request.request_id
-        Request.request_id += 1
-        return allocated_id
-
-    def set_start(self, start_time):
-        self.state = self.RUNNING
-        self.start_time.append(start_time)
-
-    def set_finished(self, finish_time):
-        self.state = self.FINISHED
-        self.finish_time.append(finish_time)
-
-    def get_latency(self):
-        # Todo. Provide Toke-By-Token
-        if self.state == self.FINISHED:
-            turnaround_time = self.finish_time[-1] - self.arrival_time
-        else:
-            turnaround_time = None
-
-        if self.start_time:
-            response_time = self.start_time[0] - self.arrival_time
-        else:
-            response_time = None
-
-        if self.start_time and self.finish_time:
-            tbt_time = [i-j for i,j in zip(self.finish_time, self.start_time)]
-        else:
-            tbt_time = []
-
-        return turnaround_time, response_time, tbt_time
-
-    def free_memory(self):
-        """ Free memory resources that are allocated for handle this request """
-        return
-
-    def __str__(self) -> str:
-        return f"Request{self.id} Model: '{self.model}', Arrival: {self.arrival_time}, Start: {self.start_time}, End: {self.finish_time}, State: {self.state}, Partion: {self.request_queue_idx}"
-
-class RequestReturn:
-    INCREMENT = 0
-    FINISHED = 1
-    def __init__(self, state) -> None:
-        self.state = state
-
-    def is_finished(self):
-        return self.state == self.FINISHED
-
-    def is_increment(self):
-        return self.state == self.INCREMENT
-
-class SchedulerDNNModel:
-    MODEL_MAP = {}
-    def __init__(self, batched_req : List[Request], partition_idx) -> None:
-        self.model_name = batched_req[0].model
-        self.batched_req = batched_req
-        self.args = None
-        self.model = self.find_model(self.model_name)
-        self.partition_idx = partition_idx
-
-    def find_model(self, model_name : str):
-        if model_name in SchedulerDNNModel.MODEL_MAP:
-            return SchedulerDNNModel.MODEL_MAP[model_name]
-        else:
-            raise KeyError(f'[Scheduler] Requested model "{model_name}" is not registered...')
-
-    def get_batchable_input(self):
-        batched_input_tensor = []
-        for i in range(len(self.batched_req[0].batchable_input_tensor)):
-            tensor_list = [req.batchable_input_tensor[i] for req in self.batched_req]
-            batched_input_tensor.append(torch.concat(tensor_list, dim=0))
-        return batched_input_tensor
-
-    def get_shared_input(self):
-        return self.batched_req[0].shared_input_tensor
-
-    def get_input(self):
-        return self.get_batchable_input() + self.get_shared_input()
-
-    def __str__(self):
-        return f"DNN Model: {self.model_name}, Partion idx: {self.partition_idx} Req: {self.batched_req[0]}"
-
-    @staticmethod
-    def register_model(model_name : str, compiled_model):
-        SchedulerDNNModel.MODEL_MAP[model_name] = compiled_model
-
-class PyTorchSimRunner:
-    PARTITION_BUSY = 0
-    PARTITION_IDLE = 1
-    SELECT_NOTHING = 2
-    NPU_MODULE = None
-    def __init__(self, tog_simulator : TOGSimulator, num_partion=1) -> None:
-        self.module = self.setup_device()
-        self.num_partion = num_partion
-        self.launch_model_dicts = []
-        self.nested_launch_model_dicts = []
-        self.partition_state = []
-        for i in range(self.num_partion):
-            self.launch_model_dicts.append({})
-            self.nested_launch_model_dicts.append({})
-            self.partition_state.append(self.PARTITION_IDLE)
-
-        self.finish_req_dict = {}
-        self.tog_simulator = tog_simulator
-
-        # Dry run for compile and create generator
-        os.environ["TOGSIM_EAGER_MODE"] = "1"
-
-    @classmethod
-    def setup_device(cls):
-        if cls.NPU_MODULE is not None:
-            return cls.NPU_MODULE
-
-        try:
-            from torch._inductor.codegen.common import register_backend_for_device
-            from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen
-            from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling
-        except ImportError as e:
-            logger.error(f"Failed to import torch_openreg: {e}")
-            logger.error("Please ensure PyTorchSimDevice2 is installed: pip install -e PyTorchSimDevice2")
-            raise
-
-        register_backend_for_device(
-            "npu",
-            lambda scheduling: MLIRScheduling(scheduling),
-            ExtensionWrapperCodegen
-        )
-
-        cls.NPU_MODULE = torch.npu
-        return cls.NPU_MODULE
-
-    def submit(self, batched_req, partition_idx) -> List[RequestReturn]:
-        # FIXME. Construct SchedulerDNNModel
-        batched_req_model = self.get_compiled_model(batched_req, partition_idx)
-        self.prepare_model(batched_req_model)
-
-    def get_compiled_model(self, batched_req: List[Request], request_queue_idx):
-        compiled_model = SchedulerDNNModel(batched_req, request_queue_idx)
-        return compiled_model
-
-    def is_partition_idle(self, partition_idx):
-        return len(self.launch_model_dicts[partition_idx]) == 0
-
-    def is_any_idle(self, skip_list):
-        return any([self.is_partition_idle(i) and not skip_list[i] for i in range(self.num_partion)])
-
-    def is_all_idle(self):
-        return all([self.is_partition_idle(i) for i in range(self.num_partion)])
-
-    def prepare_model(self, req_model: SchedulerDNNModel):
-        result_path = os.path.join(extension_config.CONFIG_TORCHSIM_LOG_PATH, "togsim_result", req_model.model_name)
-        os.makedirs(result_path, exist_ok=True)
-        index = str(len(os.listdir(result_path)))
-
-        # Prepare input tensor
-        input_tensor_list = req_model.get_input()
-        input_tensor_list = [input_tensor.to(device=self.module.custom_device()) for input_tensor in input_tensor_list]
-
-        # This model-call will return generator
-        ret = req_model.model(*input_tensor_list)
-        self.launch_model_dicts[req_model.partition_idx][req_model] = ret
-
-    def finish_model(self, model : SchedulerDNNModel, output : torch.Tensor):
-        for req in model.batched_req:
-            # TODO. finish time
-            self.finish_req_dict[req] = RequestReturn(RequestReturn.FINISHED)
-
-    def prepare_launch_kernel(self, kernel, inputs):
-        result_path, runtime_path, _ = kernel(*inputs)
-        onnx_path = os.path.join(result_path, "tile_graph.onnx")
-
-        attribute_path = os.path.join(runtime_path, "attribute")
-        attribute_path = TOGSimulator.write_kernel_attribute_file(attribute_path, inputs)
-        return onnx_path, attribute_path
-
-    def launch_kernel(self, current_cycle, partion_idx=0):
-        # Check partition is busy
-        if self.partition_state[partion_idx] != self.PARTITION_IDLE:
-            return self.partition_state[partion_idx]
-        result = self.select_kernel(partion_idx)
-        if result == self.SELECT_NOTHING:
-            return self.SELECT_NOTHING
-        kernel, inputs = result
-        if not isinstance(kernel, str):
-            onnx_path, attribute_path = self.prepare_launch_kernel(kernel, inputs)
-        else:
-            onnx_path, attribute_path = kernel, inputs
-        self.partition_state[partion_idx] = self.PARTITION_BUSY
-        return self.tog_simulator.launch(onnx_path, attribute_path, current_cycle, partion_idx)
-
-class FIFORunner(PyTorchSimRunner):
-    def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None:
-        super().__init__(tog_simulator, num_partion)
-
-    def select_kernel(self, partition_idx):
-        while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]):
-            if len(self.nested_launch_model_dicts[partition_idx]):
-                target_dict = self.nested_launch_model_dicts
-            else:
-                target_dict = self.launch_model_dicts
-
-            # Select FIFO manner
-            req, target_model = next(iter(target_dict[partition_idx].items()))
-            try:
-                kernel, inputs = next(target_model)
-
-                # For extern call
-                if isinstance(kernel, str):
-                    return kernel, inputs
-
-                # For convolution...
-                if not hasattr(kernel, "future"):
-                    nested_gen = kernel(*inputs)
-                    self.nested_launch_model_dicts[partition_idx] = {req : nested_gen}
-                    kernel, inputs = \
-                        next(self.nested_launch_model_dicts[partition_idx][req])
-                return kernel, inputs
-            except StopIteration as e:
-                # Retry
-                if target_dict == self.launch_model_dicts:
-                    self.finish_model(req, e.value)
-                del target_dict[partition_idx][req]
-        # No proper kernel now
-        return self.SELECT_NOTHING
-
-class RoundRobinRunner(PyTorchSimRunner):
-    def __init__(self, tog_simulator: TOGSimulator, num_partion=1) -> None:
-        super().__init__(tog_simulator, num_partion)
-        self.next_pointer = None
-
-    def select_kernel(self, partition_idx):
-        while len(self.nested_launch_model_dicts[partition_idx]) or len(self.launch_model_dicts[partition_idx]):
-            if len(self.nested_launch_model_dicts[partition_idx]):
-                target_dict = self.nested_launch_model_dicts
-            else:
-                target_dict = self.launch_model_dicts
-
-            req_list = list(target_dict[partition_idx].keys())
-            # Select RR manner
-            if self.next_pointer is None or self.next_pointer not in req_list:
-                req = req_list[0]
-                pos = 0
-            else:
-                req = self.next_pointer
-                pos = req_list.index(req)
-
-            # Set Next pointer
-            if pos + 1 < len(req_list):
-                self.next_pointer = req_list[pos+1]
-            else:
-                self.next_pointer = req_list[0]
-
-            target_model = self.launch_model_dicts[partition_idx][req]
-            try:
-                kernel, inputs = next(target_model)
-
-                # For convolution...
-                if not hasattr(kernel, "future"):
-                    nested_gen = kernel(*inputs)
-                    self.nested_launch_model_dicts[partition_idx] = {req : nested_gen}
-                    kernel, inputs = \
-                        next(self.nested_launch_model_dicts[partition_idx][req])
-                return kernel, inputs
-            except StopIteration as e:
-                # Retry
-                if target_dict == self.launch_model_dicts:
-                    self.finish_model(req, e.value)
-                del self.launch_model_dicts[partition_idx][req]
-        # No proper kernel now
-        return self.SELECT_NOTHING
-
-class Scheduler:
-
-    FIFO_ENGINE = 0
-    RR_ENGINE = 1
-    def __init__(self, num_request_queue=1, max_batch=1, engine_select=FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG) -> None:
-        self.current_cycle = 0
-        self.max_batch = max_batch
-        self.num_request_queue = num_request_queue
-        self.request_queue : List[List[Request]] = []
-        for i in range(self.num_request_queue):
-            self.request_queue.append([])
-        self.finish_queue : List[Request] = []
-
-        self.tog_simulator = TOGSimulator(togsim_config)
-        if self.tog_simulator.config_yaml['pytorchsim_timing_mode'] == 0:
-            # Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).
-            logger.error(f"pytorchsim_timing_mode is set to 0 in config file '{togsim_config}'. ")
-            logger.error(f"Scheduler requires timing mode to be enabled (pytorchsim_timing_mode != 0).")
-            exit(0)
-
-        os.environ['TOGSIM_CONFIG'] = togsim_config
-        self.tog_simulator.interactive_simulation()
-        if engine_select == Scheduler.FIFO_ENGINE:
-            self.execution_engine = FIFORunner(self.tog_simulator, self.num_request_queue)
-        elif engine_select == Scheduler.RR_ENGINE:
-            self.execution_engine = RoundRobinRunner(self.tog_simulator, self.num_request_queue)
-        else:
-            logger.error(f"Not supported engine type {engine_select}")
-            exit(1)
-
-    def add_request(self, request: Request, request_time=-1):
-        """register model at timestamp time
-            request_time : msec
-        """
-        request_time = self.current_time() if request_time == -1 else request_time
-        request.arrival_time = request_time
-        self.request_queue[request.request_queue_idx].append(request)
-
-    def request_empty(self, request_queue_idx):
-        return len(self.request_queue[request_queue_idx])==0
-
-    def select(self, request_queue_idx=0) -> List[Request]:
-        """
-        Select 1 request from request_queue in FCFS manner.
-        If there is no proper request, return None
-        """
-        candidate_req = []
-        if not self.request_queue[request_queue_idx]:
-            return candidate_req
-        for req in self.request_queue[request_queue_idx]:
-
-            if self.msec_to_cycle(req.arrival_time) <= self.current_cycle and req.state == Request.QUEUED:
-                candidate_req.append(req)
-
-                # Stop batching
-                if self.max_batch <= len(candidate_req):
-                    break
-        return candidate_req
-
-    def next_request_time(self, request_queue_idx=0):
-        for req in self.request_queue[request_queue_idx]:
-            if req.state == Request.QUEUED:
-                return req, req.arrival_time
-        return None, -1
-
-    def nearest_next_reqeust_time(self):
-        nearest_req = None
-        nearest_arrival_time = -1
-        for i in range(self.num_request_queue):
-            req, arrival_time = self.next_request_time(i)
-            if nearest_arrival_time == -1 and arrival_time != -1:
-                nearest_req = req
-                nearest_arrival_time = arrival_time
-            elif arrival_time != -1 and nearest_arrival_time > arrival_time:
-                nearest_req = req
-                nearest_arrival_time = arrival_time
-        return nearest_req, nearest_arrival_time
-
-    def finish_request(self, req : Request):
-        req.set_finished(self.current_time())
-
-        # Free resources
-        req.free_memory()
-
-        # Move to finish queue
-        self.finish_queue.append(req)
-        self.request_queue[req.request_queue_idx].remove(req)
-        turnaround_time, response_time, tbt_time = req.get_latency()
-        logger.info(
-            f"[Request-{req.id} finished] partition: {req.request_queue_idx} arrival_time: "
-            f"{req.arrival_time} start_time: {req.start_time[0]} turnaround latency: {turnaround_time}, "
-            f"response time: {response_time} tbt_time: {tbt_time}"
-        )
-
-    def per_schedule(self, request_queue_idx):
-        # Wait partition is idle
-        if not self.execution_engine.is_partition_idle(request_queue_idx):
-            return False
-
-        request_list = self.select(request_queue_idx)
-        if not request_list:
-            return False
-
-        logger.info(f"[Request issue] partition: {request_queue_idx} batch size: {len(request_list)}")
-        for req in request_list:
-            req.set_start(self.current_time())
-            logger.info(
-                f"[Request-{req.id} issue] partition: {req.request_queue_idx} "
-                f"arrival_time: {req.arrival_time} start_time: {req.start_time[0]}"
-            )
-        # Submit batched request
-        self.execution_engine.submit(request_list, request_queue_idx)
-
-        return True
-
-    def check_finish_request(self):
-        # Check finished request
-        while self.execution_engine.finish_req_dict:
-            req, req_ret = next(iter(self.execution_engine.finish_req_dict.items()))
-            self.finish_request(req)
-            del self.execution_engine.finish_req_dict[req]
-
-    def schedule(self):
-        # Try schedule all request queue
-        result = []
-        for i in range(self.num_request_queue):
-            result.append(self.per_schedule(i))
-
-        # Try move to next nearest request time
-        next_req, next_time = self.nearest_next_reqeust_time()
-        if next_req is None and self.execution_engine.is_all_idle():
-            # No request remained...
-            return
-
-        # Need to forward the time until next_arrival_time
-        if self.execution_engine.is_all_idle():
-            reason = self.tog_simulator.until(self.msec_to_cycle(next_time))
-            self.current_cycle = self.tog_simulator.cycle()
-        else:
-            self.run(next_time)
-        return
-
-    def run(self, until_time):
-        req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)]
-        def execute_cycle():
-            launch_ret_info = []
-            for i in range(self.execution_engine.num_partion):
-                if self.execution_engine.partition_state[i] == PyTorchSimRunner.PARTITION_IDLE:
-                    ret = self.execution_engine.launch_kernel(self.current_cycle, i)
-                    launch_ret_info.append(ret)
-
-            self.check_finish_request()
-            # Check if the stop condition is met
-            if self.execution_engine.is_any_idle(req_empty_info) or self.execution_engine.is_all_idle(): # Ignore empty request queue
-                return []
-
-            # Schedule jobs and update the current time
-            result_list = self.tog_simulator.until(self.msec_to_cycle(until_time))
-            self.current_cycle = self.tog_simulator.cycle()
-
-            for core_idx in result_list:
-                # Kernel is finished. So set idle state
-                self.execution_engine.partition_state[core_idx] = PyTorchSimRunner.PARTITION_IDLE
-
-            return result_list
-
-        if self.current_cycle >= self.msec_to_cycle(until_time):
-            until_time = -1
-
-        if until_time == -1:
-            while not self.execution_engine.is_any_idle(req_empty_info):
-                result = execute_cycle()
-                req_empty_info = [self.request_empty(i) for i in range(self.execution_engine.num_partion)]
-                # if result is not -1, schedule new request
-                if len(result)==0:
-                    break
-
-        else:
-            while self.current_cycle <= self.msec_to_cycle(until_time) and not self.execution_engine.is_all_idle():
-                result = execute_cycle()
-                # if result is not -1, schedule new request
-                if len(result)==0:
-                    break
-        return
-
-    def is_request_queue_empty(self):
-        result = True
-        for i in range(self.num_request_queue):
-            result = result and (not len(self.request_queue[i]))
-        return result
-
-    def is_finished(self):
-        if self.is_request_queue_empty() and self.execution_engine.is_all_idle():
-            self.tog_simulator.wait()
-            return True
-        return False
-
-    def current_time(self):
-        return self.cycle_to_msec(self.current_cycle)
-
-    def cycle_to_msec(self, cycle):
-        freq = self.tog_simulator.get_core_freq()
-        return cycle / (freq  / 1000)
-
-    def msec_to_cycle(self, msec):
-        # We treat -1 as special time
-        if (msec == -1):
-            return msec
-
-        freq = self.tog_simulator.get_core_freq()
-        return int(msec * (freq / 1000))
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index a02d8fc9..5b00d5d4 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -223,6 +223,7 @@ class TOGSimulator():
     TOGSIM_RESULT_PATH_KEY = "TOGSIM_RESULT_PATH"
     FINISH_STR = "Simulation finished"
     ALLOC_POOL = dict() # For eagermode buffer plan
+    _TOGSIM_CONFIG_ENV_UNSET = object()
     def __init__(self, config_path=None, togsim_path=None) -> None:
         if config_path is None:
             config_path = extension_config.CONFIG_TOGSIM_CONFIG
@@ -258,18 +259,32 @@ def __init__(self, config_path=None, togsim_path=None) -> None:
             raise RuntimeError(f"Failed to open trace file: {e}")
 
     def __enter__(self):
-        """Context manager entry."""
-        # Set this simulator instance as the global TOGSimulator
+        """Context manager entry.
+
+        Sets ``TOGSIM_CONFIG`` to this instance's config path so that compilation
+        (``extension_config`` / codegen) uses the same YAML as TOGSim. Previous
+        value is restored in ``__exit__``.
+        """
+        if "TOGSIM_CONFIG" in os.environ:
+            self._old_togsim_config_env = os.environ["TOGSIM_CONFIG"]
+        else:
+            self._old_togsim_config_env = self._TOGSIM_CONFIG_ENV_UNSET
+        os.environ["TOGSIM_CONFIG"] = os.path.abspath(self.config_path)
+
         self.old_tog_simulator = torch.npu.get_tog_simulator()
         torch.npu.set_tog_simulator(self)
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit - automatically cleanup."""
-        # Reset global TOGSimulator to None
         self.until()
         torch.npu.set_tog_simulator(self.old_tog_simulator)
 
+        if self._old_togsim_config_env is self._TOGSIM_CONFIG_ENV_UNSET:
+            os.environ.pop("TOGSIM_CONFIG", None)
+        else:
+            os.environ["TOGSIM_CONFIG"] = self._old_togsim_config_env
+
     def _start_process(self):
         cmd = f"{self.get_togsim_command(self.config_path, self.base_dir)} --models_list {self.trace_file_path}"
         if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
diff --git a/tests/MLP/test_mlp.py b/tests/MLP/test_mlp.py
index 31bcefdf..c910729e 100644
--- a/tests/MLP/test_mlp.py
+++ b/tests/MLP/test_mlp.py
@@ -281,10 +281,8 @@ def train(model, device):
     return
 
 if __name__ == "__main__":
-    from Scheduler.scheduler import PyTorchSimRunner
     torch.set_printoptions(threshold=float('inf'), linewidth=600)
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     test_mlp(device)
     # test_train_mlp(device)
diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py
index f9c96aff..d4cd98f1 100644
--- a/tests/MoE/test_moe.py
+++ b/tests/MoE/test_moe.py
@@ -807,10 +807,8 @@ def evaluation(model, evaluation_loader):
         train(opt_model, train_loader)
 
 if __name__ == "__main__":
-    from Scheduler.scheduler import PyTorchSimRunner
     torch.set_printoptions(threshold=float('inf'), linewidth=600)
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     test_moe(device)
     # train_moe(device)
diff --git a/tests/test_compile_overhead.py b/tests/test_compile_overhead.py
deleted file mode 100644
index 449707a5..00000000
--- a/tests/test_compile_overhead.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import os
-import time
-import sys
-import torch
-from torchvision.models import resnet18 as model1
-import argparse
-import shutil
-
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
-CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-
-if __name__ == "__main__":
-    target_model1 = model1().eval()
-
-    # Init scheduler
-    for i in range(1):
-        timestamp = time.time()  # 현재 타임스탬프 (초 단위)
-        print(f"[{i}] Time Stamp: {timestamp:.6f}")  # 소수점 6자리까지 출력
-        #try:
-        #    shutil.rmtree("/tmp/torchinductor")
-        #except FileNotFoundError:
-        #    print("no cache")
-        scheduler = Scheduler(num_request_queue=1, max_batch=4, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml")
-        # Register compiled model
-        opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
-        SchedulerDNNModel.register_model("resnet18", opt_model1)
-
-        # Generate time stamp
-        for request_time in [0]*12:
-            # Init input data
-            model_input1 = torch.randn(1, 3, 224, 224)
-
-            # Init request
-            new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
-
-            # Add request to scheduler
-            print("[Reqest] Resnet18 request time: ", request_time, flush=True)
-            scheduler.add_request(new_request1, request_time=request_time)
-
-        # Run scheduler
-        while not scheduler.is_finished():
-            scheduler.schedule()
-
-    print("Done", file=sys.stderr)
\ No newline at end of file
diff --git a/tests/test_gqa_decode.py b/tests/test_gqa_decode.py
index 3605d638..7a7ab06c 100644
--- a/tests/test_gqa_decode.py
+++ b/tests/test_gqa_decode.py
@@ -6,8 +6,7 @@
 import math
 import argparse
 from Simulator.simulator import TOGSimulator
-from Scheduler.scheduler import PyTorchSimRunner
-device = PyTorchSimRunner.setup_device().custom_device()
+device = torch.device("npu:0")
 # ─────────────────────────────────────────────────────────────────────────────
 # Optimized: Flash-Decode style — tile S upfront, batch in B dimension
 # ─────────────────────────────────────────────────────────────────────────────
diff --git a/tests/test_hetro.py b/tests/test_hetro.py
index 9fac8c65..eaf145d4 100644
--- a/tests/test_hetro.py
+++ b/tests/test_hetro.py
@@ -2,28 +2,31 @@
 import sys
 import torch
 import argparse
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
+
+sys.path.append(os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim"))
+
+from Simulator.simulator import TOGSimulator
 from test_stonne import sparse_matmul
 
+
 def custom_matmul(a, b):
     return torch.matmul(a, b)
+
+
 torch.manual_seed(0)
-CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
+CONFIG_TORCHSIM_DIR = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="")
     parser.add_argument("--M", type=int, default=128, help="Batch size")
     parser.add_argument("--N", type=int, default=128, help="Input layer size")
     parser.add_argument("--K", type=int, default=128, help="Hidden layer size")
-    parser.add_argument("--sparsity", type=float, default=0.9, help="Output layer size")
-    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="Output layer size")
-    parser.add_argument("--mode", type=int, default=0, help="Output layer size")
+    parser.add_argument("--sparsity", type=float, default=0.9, help="Sparsity")
+    parser.add_argument("--config", type=str, default="stonne_big_c1_simple_noc.yml", help="TOGSim config file name under configs/")
+    parser.add_argument("--mode", type=int, default=0, help="0=spmm only, 1=dense matmul only, 2=both partitions")
     args = parser.parse_args()
 
-    M = args.M
-    N = args.N
-    K = args.K
+    M, N, K = args.M, args.N, args.K
     sparsity = args.sparsity
     mode = args.mode
     config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}"
@@ -33,45 +36,30 @@ def custom_matmul(a, b):
     print("K: ", K)
     print("sparsity: ", sparsity)
 
-    with torch.no_grad():
-        # Init scheduler
-        scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                            togsim_config=config_path)
-
-        # Register compiled model
-        opt_model1 = torch.compile(custom_matmul)
-        opt_model2 = torch.compile(sparse_matmul)
-        SchedulerDNNModel.register_model("matmul", opt_model1)
-        SchedulerDNNModel.register_model("spmm", opt_model2)
+    device = torch.device("npu:0")
 
-        # Init input data
-        for i in range(1):
-            dense_input1 = torch.randn(M, K)
-            dense_input2 = torch.randn(K, N)
+    opt_model1 = torch.compile(custom_matmul)
+    opt_model2 = torch.compile(sparse_matmul)
 
-            sparse_input1 = torch.randn(128, 128)
-            sparse_input2 = torch.randn(128, 128)
-            mask1 = torch.rand(sparse_input1.shape) > sparsity
-            mask2 = torch.rand(sparse_input2.shape) > sparsity
+    dense_input1 = torch.randn(M, K, device=device)
+    dense_input2 = torch.randn(K, N, device=device)
 
-            sparse_input1 = sparse_input1 * mask1
-            sparse_input2 = sparse_input2 * mask2
+    sparse_input1 = torch.randn(128, 128, device=device)
+    sparse_input2 = torch.randn(128, 128, device=device)
+    mask1 = torch.rand(sparse_input1.shape, device=device) > sparsity
+    mask2 = torch.rand(sparse_input2.shape, device=device) > sparsity
+    sparse_input1 = sparse_input1 * mask1
+    sparse_input2 = sparse_input2 * mask2
 
-            # Init request
+    with torch.no_grad():
+        with TOGSimulator(config_path=config_path):
             if mode == 0:
-                new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0)
-                scheduler.add_request(new_request1, request_time=0)
+                torch.npu.launch_model(opt_model2, sparse_input1, sparse_input2, stream_index=0, timestamp=0)
             elif mode == 1:
-                new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=0)
-                scheduler.add_request(new_request2, request_time=0)
+                torch.npu.launch_model(opt_model1, dense_input1, dense_input2, stream_index=0, timestamp=0)
             elif mode == 2:
-                new_request1 = Request("spmm", [sparse_input1, sparse_input2], [], request_queue_idx=0)
-                new_request2 = Request("matmul", [dense_input1, dense_input2], [], request_queue_idx=1)
-
-                # Add request to scheduler
-                scheduler.add_request(new_request1, request_time=0)
-                scheduler.add_request(new_request2, request_time=0)
-
-        # Run scheduler
-        while not scheduler.is_finished():
-            scheduler.schedule()
\ No newline at end of file
+                torch.npu.launch_model(opt_model2, sparse_input1, sparse_input2, stream_index=0, timestamp=0)
+                torch.npu.launch_model(opt_model1, dense_input1, dense_input2, stream_index=1, timestamp=0)
+            else:
+                raise ValueError(f"unknown mode {mode}")
+            torch.npu.synchronize()
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 724c10d0..beab8054 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -1,5 +1,4 @@
 import os
-import sys
 import torch
 from torchvision.models import resnet18 as model1
 from test_transformer import EncoderBlock as model2
@@ -7,7 +6,6 @@
 
 base_path = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
 config = f'{base_path}/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml'
-os.environ['TOGSIM_CONFIG'] = config
 
 target_model1 = model1().eval()
 target_model2 = model2(768, 12).eval()
@@ -24,4 +22,4 @@
     torch.npu.synchronize()
     torch.npu.launch_model(opt_model1, model_input1, stream_index=0, timestamp=0)
     torch.npu.launch_model(opt_model2, model_input2, stream_index=1, timestamp=0)
-print("Done")
\ No newline at end of file
+print("Done")
diff --git a/tests/test_scheduler_batching.py b/tests/test_scheduler_batching.py
deleted file mode 100644
index 65213ef0..00000000
--- a/tests/test_scheduler_batching.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import os
-import sys
-import torch
-from torchvision.models import resnet18 as model1
-import argparse
-
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator
-CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Poisson Request Generator (ms)")
-    parser.add_argument("lambda_requests", nargs="?", type=int, help="Average requests per second (λ)", default=2000)
-    parser.add_argument("max_time", nargs="?", type=int, help="Maximum simulation time in milliseconds", default=30)
-
-    args = parser.parse_args()
-    target_model1 = model1().eval()
-
-    # Init scheduler
-    scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=f"{CONFIG_TORCHSIM_DIR}/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml")
-    # Register compiled model
-    opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)
-    SchedulerDNNModel.register_model("resnet18", opt_model1)
-
-    # Generate time stamp
-    for request_time in poisson_request_generator(args.lambda_requests, args.max_time):
-        # Init input data
-        model_input1 = torch.randn(1, 3, 224, 224)
-
-        # Init request
-        new_request1 = Request("resnet18", [model_input1], [], request_queue_idx=0)
-
-        # Add request to scheduler
-        print("[Reqest] Resnet18 request time: ", request_time, flush=True)
-        scheduler.add_request(new_request1, request_time=request_time)
-
-    # Run scheduler
-    while not scheduler.is_finished():
-        scheduler.schedule()
-
-    print("Done", file=sys.stderr)
\ No newline at end of file
diff --git a/tests/test_sort.py b/tests/test_sort.py
index 05afe92b..5bce2532 100644
--- a/tests/test_sort.py
+++ b/tests/test_sort.py
@@ -115,10 +115,7 @@ def sort_test_unstable(inp):
 
     shape = tuple(map(int, args.shape.strip("()").split(",")))
 
-    from Scheduler.scheduler import PyTorchSimRunner
-
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
 
     test_sort_stable_suite(device)
     test_sort_duplicate_cases(device)
\ No newline at end of file
diff --git a/tests/test_sparse_core.py b/tests/test_sparse_core.py
index 72eda0c8..bb4ff630 100644
--- a/tests/test_sparse_core.py
+++ b/tests/test_sparse_core.py
@@ -80,9 +80,6 @@ def test_sparse_mlp(device, batch_size=32, input_size=128, hidden_size=128, outp
     import os
     import sys
     sys.path.append(os.environ.get('TORCHSIM_DIR', default='/root/workspace/PyTorchSim'))
-    from Scheduler.scheduler import PyTorchSimRunner
-
-    module = PyTorchSimRunner.setup_device()
-    device = module.custom_device()
+    device = torch.device("npu:0")
     test_sparse_mlp(device, batch_size=8, input_size=16, hidden_size=32, output_size=64)
     
diff --git a/tests/test_spmm_scheduler.py b/tests/test_spmm_scheduler.py
deleted file mode 100644
index 71594eb2..00000000
--- a/tests/test_spmm_scheduler.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-import sys
-import torch
-import argparse
-sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
-from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request
-from test_sparse_core import SparseMLP as model1
-from test_transformer import EncoderBlock as model2
-CONFIG_TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="")
-    parser.add_argument("--batch_size", type=int, default=128, help="Batch size")
-    parser.add_argument("--input_size", type=int, default=128, help="Input layer size")
-    parser.add_argument("--hidden_size", type=int, default=128, help="Hidden layer size")
-    parser.add_argument("--output_size", type=int, default=128, help="Output layer size")
-    parser.add_argument("--w1_sparsity", type=float, default=0.5, help="Sparsity of first layer weights (0 to 1)")
-    parser.add_argument("--w2_sparsity", type=float, default=0.5, help="Sparsity of second layer weights (0 to 1)")
-    parser.add_argument("--config", type=str)
-    args = parser.parse_args()
-
-    batch_size = args.batch_size
-    input_size = args.input_size
-    hidden_size = args.hidden_size
-    output_size = args.output_size
-    w1_sparsity = args.w1_sparsity
-    w2_sparsity = args.w2_sparsity
-    config_path = f"{CONFIG_TORCHSIM_DIR}/configs/{args.config}"
-
-    print("batch_size: ", batch_size)
-    print("input_size: ", input_size)
-    print("hidden_size: ", hidden_size)
-    print("output_size: ", output_size)
-    print("w1_sparsity: ", w1_sparsity)
-    print("w2_sparsity: ", w2_sparsity)
-
-    with torch.no_grad():
-        # Init scheduler
-        scheduler = Scheduler(num_request_queue=2, engine_select=Scheduler.FIFO_ENGINE,
-                              togsim_config=config_path)
-
-        target_model1 = model1(input_size, hidden_size, output_size, w1_sparsity, w2_sparsity, scheduler.execution_engine.module.custom_device()).eval()
-        target_model2 = model2(768, 12).eval()
-
-        # Register compiled model
-        opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device()))
-        opt_model2 = torch.compile(target_model2.to(device=scheduler.execution_engine.module.custom_device()))
-        SchedulerDNNModel.register_model("mlp", opt_model1)
-        SchedulerDNNModel.register_model("bert", opt_model2)
-
-        # Init input data
-        model_input1 = torch.randn(batch_size, input_size)
-        model_input2 = torch.randn(1, 512, 768)
-
-        # Init request
-        new_request1 = Request("mlp", [model_input1], [], request_queue_idx=0)
-        #new_request2 = Request("bert", [model_input2], [], request_queue_idx=1)
-
-
-        # Add request to scheduler
-        scheduler.add_request(new_request1, request_time=0)
-        #scheduler.add_request(new_request2, request_time=0)
-
-        # Run scheduler
-        while not scheduler.is_finished():
-            scheduler.schedule()
\ No newline at end of file
diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb
index ead695c0..d17a6b25 100644
--- a/tutorial/session1/CompilerOptimization.ipynb
+++ b/tutorial/session1/CompilerOptimization.ipynb
@@ -35,8 +35,7 @@
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n",
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "device = PyTorchSimRunner.setup_device().custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb
index b6f0e048..d94323db 100644
--- a/tutorial/session1/ExecutionMode.ipynb
+++ b/tutorial/session1/ExecutionMode.ipynb
@@ -33,8 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "device = PyTorchSimRunner.setup_device().custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb
index a49e2440..6fd54aed 100644
--- a/tutorial/session1/Inference.ipynb
+++ b/tutorial/session1/Inference.ipynb
@@ -57,8 +57,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "device = PyTorchSimRunner.setup_device().custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "torch.manual_seed(0)\n",
     "input = torch.randn(128, 128).to(device)\n",
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index a82737db..24dae52b 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -35,8 +35,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "device = PyTorchSimRunner.setup_device().custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb
index 684b69c0..0b978bcb 100644
--- a/tutorial/session1/Mapping.ipynb
+++ b/tutorial/session1/Mapping.ipynb
@@ -33,8 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "device = PyTorchSimRunner.setup_device().custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb
index 0c6b138a..badf7ed7 100644
--- a/tutorial/session1/Training.ipynb
+++ b/tutorial/session1/Training.ipynb
@@ -20,8 +20,7 @@
     "sys.path.append(base_dir)\n",
     "\n",
     "cpu_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "npu_device = PyTorchSimRunner.setup_device().custom_device()"
+    "npu_device = torch.device(\"npu:0\")"
    ]
   },
   {
diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb
index 2964f293..9a7c35e3 100644
--- a/tutorial/session2/Hands_on.ipynb
+++ b/tutorial/session2/Hands_on.ipynb
@@ -35,9 +35,7 @@
     "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")\n",
     "sys.path.append(base_dir)\n",
     "\n",
-    "from Scheduler.scheduler import PyTorchSimRunner\n",
-    "module = PyTorchSimRunner.setup_device()\n",
-    "device = module.custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):\n",
     "    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",

From 8ca844a6b1227839208874502e8680d74d390fd3 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 26 Mar 2026 20:56:11 +0900
Subject: [PATCH 156/194] [Frontend/MobileNet] Add MobileNet CI and 1x1 spatial
 conv linear decomposition (#205)

---
 .github/workflows/pytorchsim_test.yml         |  19 ++++
 PyTorchSimFrontend/mlir/mlir_conv_common.py   |   2 +-
 PyTorchSimFrontend/mlir/mlir_decomposition.py | 100 ++++++++++++++++--
 tests/test_conv2d.py                          |   1 +
 4 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index 2a9d60a1..a7613b6e 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -328,6 +328,25 @@ jobs:
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
             ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50
 
+  test_mobilenet:
+    name: Run test_mobilenet.py
+    runs-on: self-hosted
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Run test_mobilenet.py
+        run: |
+          echo "Running test_mobilenet.py"
+          docker run --rm \
+            -e vpu_num_lanes="${{ inputs.vector_lane }}" \
+            -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
+            ${{ inputs.image_name }} python3 PyTorchSim/tests/MobileNet/test_mobilenet.py
+
   test_transformer:
     name: Run test_transformer.py
     runs-on: self-hosted
diff --git a/PyTorchSimFrontend/mlir/mlir_conv_common.py b/PyTorchSimFrontend/mlir/mlir_conv_common.py
index 386e9bd5..d577dbd8 100644
--- a/PyTorchSimFrontend/mlir/mlir_conv_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_conv_common.py
@@ -123,6 +123,6 @@ def compute_stride(shape):
             return stride
 
         X_stride = compute_stride(X_shape)
-        arg_attributes.append([X.data.data.name, [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]])
+        arg_attributes.append([X.get_name(), [MLIRKernelArgs.MLIR_ARGS_IN, X.layout.dtype, math.prod(X_shape), X_shape, X_stride]])
 
         return arg_attributes
diff --git a/PyTorchSimFrontend/mlir/mlir_decomposition.py b/PyTorchSimFrontend/mlir/mlir_decomposition.py
index 122c2677..0f443cf8 100644
--- a/PyTorchSimFrontend/mlir/mlir_decomposition.py
+++ b/PyTorchSimFrontend/mlir/mlir_decomposition.py
@@ -16,6 +16,73 @@ def _pair_2d(seq: Sequence[int]) -> Tuple[int, int]:
     return int(seq[0]), int(seq[1])
 
 
+def _int_eq(x, v: int) -> bool:
+    try:
+        return int(x) == v
+    except (TypeError, ValueError):
+        return False
+
+
+def _can_rewrite_pointwise_conv_on_1x1_spatial_to_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    stride: Sequence[int],
+    padding: Sequence[int],
+    dilation: Sequence[int],
+    transposed: bool,
+    output_padding: Sequence[int],
+    groups: int,
+) -> bool:
+    """
+    Whether this ``aten.convolution`` is **exactly** ``F.linear`` on ``[N, C]`` (then reshaped
+    to ``[N, C_out, 1, 1]``): 1x1 kernel, spatial size 1x1, ``groups==1``, stride 1, no padding,
+    dilation 1 (typical SE line after global pool).
+
+    If True, use ``_apply_pointwise_conv_on_1x1_spatial_as_linear``; if False, keep normal conv.
+    """
+    if transposed or input.dim() != 4 or weight.dim() != 4:
+        return False
+    if groups != 1:
+        return False
+    if not (
+        _int_eq(input.shape[2], 1)
+        and _int_eq(input.shape[3], 1)
+        and _int_eq(weight.shape[2], 1)
+        and _int_eq(weight.shape[3], 1)
+    ):
+        return False
+
+    sh, sw = _pair_2d(stride)
+    ph, pw = _pair_2d(padding)
+    dh, dw = _pair_2d(dilation)
+    if sh != 1 or sw != 1 or ph != 0 or pw != 0 or dh != 1 or dw != 1:
+        return False
+    if len(output_padding) and any(not _int_eq(o, 0) for o in output_padding):
+        return False
+
+    _, cin, _, _ = input.shape
+    _, cin_w, _, _ = weight.shape
+    try:
+        if int(cin_w) != int(cin):
+            return False
+    except (TypeError, ValueError):
+        return False
+    return True
+
+
+def _apply_pointwise_conv_on_1x1_spatial_as_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Same numerics as ``convolution``; call only when ``_can_rewrite_...`` is True."""
+    n, cin, _, _ = input.shape
+    cout, _, _, _ = weight.shape
+    x = input.reshape(n, cin)
+    w = weight.reshape(cout, cin)
+    return F.linear(x, w, bias).reshape(n, cout, 1, 1)
+
+
 def _group_conv_cin1_cout1(
     input: torch.Tensor,
     weight: torch.Tensor,
@@ -70,7 +137,7 @@ def _group_conv_cin1_cout1(
 
 
 @register_decomposition(aten.convolution.default)
-def decompose_group_convolution(
+def decompose_convolution(
     input: torch.Tensor,
     weight: torch.Tensor,
     bias: Union[torch.Tensor, None],
@@ -82,23 +149,36 @@ def decompose_group_convolution(
     groups: Union[int, torch.SymInt],
 ):
     """
-    Lower grouped ``aten.convolution`` only when each group has a single input and output
-    channel (``Cin//groups == Cout//groups == 1``), via ``_group_conv_cin1_cout1``.
+    1. Pointwise 1x1 on spatial 1x1 (groups==1): rewrite to F.linear so backends
+       that struggle with tiny spatial convs (e.g. SE after AdaptiveAvgPool2d(1)) see
+       aten.mm / linear lowering instead.
+
+    2. Grouped conv when Cin//groups == Cout//groups == 1: _group_conv_cin1_cout1.
+
+    Otherwise returns NotImplemented (Inductor uses the default aten.convolution).
 
     Note
     ----
-    The lowered path is not a performance-optimized kernel; it exists for correctness and
-    lowering experiments. For speed, implement a separate template (fused) kernel for group
-    convolution.
-
-    Non-static ``groups`` (cannot ``int()``) falls back: returns ``NotImplemented`` so the
-    default ``aten.convolution`` is used. ``groups==1`` also returns ``NotImplemented``.
+    The grouped path is not performance-optimized; it exists for correctness experiments.
     """
     try:
         gcount = operator.index(groups)
     except (TypeError, ValueError):
         return NotImplemented
-    # groups==1: do not decompose; Inductor keeps the default aten.convolution (plain conv).
+
+    if _can_rewrite_pointwise_conv_on_1x1_spatial_to_linear(
+        input,
+        weight,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        gcount,
+    ):
+        return _apply_pointwise_conv_on_1x1_spatial_as_linear(input, weight, bias)
+
+    # groups==1, non-1x1 spatial: keep default aten.convolution (plain conv).
     if gcount == 1:
         return NotImplemented
 
diff --git a/tests/test_conv2d.py b/tests/test_conv2d.py
index 533a04db..313003b1 100644
--- a/tests/test_conv2d.py
+++ b/tests/test_conv2d.py
@@ -50,3 +50,4 @@ def custom_conv2d(a, b, bias):
         test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=2, kernel_size=1, stride=1, padding=0)
         test_conv2d(device, batch_size=1, in_channels=128, out_channels=256, input_size=14, kernel_size=1, stride=2, padding=0)
         test_conv2d(device, batch_size=1, in_channels=3, out_channels=768, input_size=224, kernel_size=16,stride=16, padding=0)
+        test_conv2d(device, batch_size=1, in_channels=8, out_channels=16, input_size=1, kernel_size=1,stride=1, padding=0)

From 6f747224377f839f9e1dac62aaa08bf874c6b85c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 31 Mar 2026 18:43:13 +0900
Subject: [PATCH 157/194] [Test] Add missing mobilenet test script

---
 tests/MobileNet/test_mobilenet.py | 106 ++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 tests/MobileNet/test_mobilenet.py

diff --git a/tests/MobileNet/test_mobilenet.py b/tests/MobileNet/test_mobilenet.py
new file mode 100644
index 00000000..966d479a
--- /dev/null
+++ b/tests/MobileNet/test_mobilenet.py
@@ -0,0 +1,106 @@
+import argparse
+import copy
+import os
+
+import torch
+import torch._dynamo
+import torch.utils.cpp_extension
+from torchvision.models import mobilenet_v2
+
+
+def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):
+    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):
+        message = f"|{name} Test Passed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+    else:
+        message = f"|{name} Test Failed|"
+        print("-" * len(message))
+        print(message)
+        print("-" * len(message))
+        print("custom out: ", out.cpu())
+        print("cpu out: ", cpu_out)
+        exit(1)
+
+
+def _mobilenet_v2():
+    try:
+        from torchvision.models import MobileNet_V2_Weights
+
+        return mobilenet_v2(weights=MobileNet_V2_Weights.DEFAULT).cpu().eval()
+    except Exception:
+        return mobilenet_v2().cpu().eval()
+
+
+def run_mobilenet(batch, config):
+    device = torch.device("npu:0")
+
+    torch._dynamo.config.recompile_limit = 64
+    torch._dynamo.config.cache_size_limit = 128
+
+    model = _mobilenet_v2()
+    imgsz = 224
+    x = torch.randn(batch, 3, imgsz, imgsz)
+
+    model_cpu = copy.deepcopy(model).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    if isinstance(y_cpu, (list, tuple)):
+        for i, (out_npu, out_cpu) in enumerate(zip(y_npu, y_cpu)):
+            test_result(f"MobileNet Output {i}", out_npu, out_cpu)
+    else:
+        test_result("MobileNet Output", y_npu, y_cpu)
+
+    print("MobileNet Simulation Done")
+
+
+def test_inverted_residual_module(device, batch=1, inp=32, oup=32, stride=1, expand_ratio=6, h=28, w=28):
+    from torchvision.models.mobilenetv2 import InvertedResidual
+
+    torch.manual_seed(0)
+
+    x = torch.randn(batch, inp, h, w)
+
+    model_cpu = InvertedResidual(inp, oup, stride, expand_ratio).cpu().eval()
+    x_cpu = copy.deepcopy(x).cpu()
+    y_cpu = model_cpu(x_cpu)
+
+    model_npu = model_cpu.to(device).eval()
+    x_npu = copy.deepcopy(x).to(device)
+    compiled_model_npu = torch.compile(dynamic=False)(model_npu)
+    y_npu = compiled_model_npu(x_npu)
+
+    test_result("InvertedResidual Module", y_npu, y_cpu)
+    print("InvertedResidual Module Test Done")
+
+
+if __name__ == "__main__":
+    base_dir = os.environ.get("TORCHSIM_DIR", default="/workspace/PyTorchSim")
+    config = os.environ.get(
+        "TOGSIM_CONFIG",
+        default=f"{base_dir}/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml",
+    )
+    args = argparse.ArgumentParser()
+    args.add_argument("--batch", type=int, default=1)
+    args.add_argument("--dump_path", type=str, default="results")
+    args = args.parse_args()
+    batch = args.batch
+
+    device = torch.device("npu:0")
+
+    # print("\n" + "=" * 80)
+    # print("Testing InvertedResidual Module")
+    # print("=" * 80)
+    # test_inverted_residual_module(device, batch=batch, inp=32, oup=32, stride=1, expand_ratio=6, h=28, w=28)
+
+    print("\n" + "=" * 80)
+    print("Testing Full MobileNet V2 Model")
+    print("=" * 80)
+    run_mobilenet(batch, config)

From 7b6cfe549aefea70ab7830edf8b474a73c60de2a Mon Sep 17 00:00:00 2001
From: HamHyungkyu <hhk971@postech.ac.kr>
Date: Sat, 4 Apr 2026 16:30:12 +0900
Subject: [PATCH 158/194] [TOGSim] Migration to Ramulator2.1

-Update Ramulator version to 2.1
-Update Ramulator2 DRAM configs
---
 TOGSim/extern/ramulator2                   |   2 +-
 configs/ramulator2_configs/DDR4.yaml       | 446 +++++++++++++++++-
 configs/ramulator2_configs/HBM2.yaml       | 501 ++++++++++++++++++++-
 configs/ramulator2_configs/HBM2_TPUv3.yaml | 501 ++++++++++++++++++++-
 configs/ramulator2_configs/LPDDR5.yaml     | 494 ++++++++++++++++++++
 configs/ramulator2_configs/LPDDR5X.yaml    | 494 ++++++++++++++++++++
 configs/ramulator2_configs/gen_configs.py  | 109 +++++
 7 files changed, 2471 insertions(+), 76 deletions(-)
 create mode 100644 configs/ramulator2_configs/LPDDR5.yaml
 create mode 100644 configs/ramulator2_configs/LPDDR5X.yaml
 create mode 100644 configs/ramulator2_configs/gen_configs.py

diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2
index 49556128..70e85563 160000
--- a/TOGSim/extern/ramulator2
+++ b/TOGSim/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 495561282d99f2ef2652618710e98c4a287025da
+Subproject commit 70e855630b7f582bc8fa7370bfd582dc71d8af63
diff --git a/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml
index e65528ed..45799436 100644
--- a/configs/ramulator2_configs/DDR4.yaml
+++ b/configs/ramulator2_configs/DDR4.yaml
@@ -1,25 +1,421 @@
-Frontend:
-  impl: GEM5            
-
-MemorySystem:
-  impl: GenericDRAM
-  clock_ratio: 1
-
-  DRAM:
-    impl: DDR4
-    org:
-      preset: DDR4_16Gb_x4
-      channel: 1
-    timing:
-      preset: DDR4_1600J
-
-  Controller:
-    impl: Generic
-    Scheduler:
-      impl: FRFCFS
-    RefreshManager:
-      impl: AllBank
-    plugins:
-
-  AddrMapper:
-    impl: RoBaRaCoCh 
\ No newline at end of file
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "GenericDDR",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 32,
+        "write_buffer_size": 32,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "Rank"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "DDR4",
+          "org": {
+            "dq": 8,
+            "count": [
+              1,
+              1,
+              4,
+              4,
+              65536,
+              1024
+            ]
+          },
+          "timing": [
+            3200,
+            4,
+            22,
+            22,
+            22,
+            52,
+            74,
+            24,
+            12,
+            16,
+            4,
+            8,
+            4,
+            8,
+            4,
+            12,
+            34,
+            576,
+            12480,
+            2,
+            625
+          ],
+          "channel_width": 64,
+          "read_latency": 26,
+          "timing_constraints": [
+            [
+              0,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              4
+            ],
+            [
+              0,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                4,
+                6
+              ],
+              12
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              24
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                4,
+                5,
+                6
+              ],
+              6,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              0,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                2
+              ],
+              12
+            ],
+            [
+              1,
+              [
+                4
+              ],
+              [
+                2
+              ],
+              44
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              34,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              52
+            ],
+            [
+              1,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              22
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                7
+              ],
+              74
+            ],
+            [
+              1,
+              [
+                1,
+                2
+              ],
+              [
+                7
+              ],
+              22
+            ],
+            [
+              1,
+              [
+                5
+              ],
+              [
+                7
+              ],
+              34
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                7
+              ],
+              66
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                0,
+                2
+              ],
+              576
+            ],
+            [
+              2,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              8
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              8
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              32
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              8
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              74
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                3,
+                4,
+                5,
+                6
+              ],
+              22
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                1
+              ],
+              52
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                0
+              ],
+              22
+            ],
+            [
+              3,
+              [
+                3
+              ],
+              [
+                1
+              ],
+              12
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                1
+              ],
+              44
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                0
+              ],
+              34
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                0
+              ],
+              66
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml
index 70cddef0..2bdd1705 100644
--- a/configs/ramulator2_configs/HBM2.yaml
+++ b/configs/ramulator2_configs/HBM2.yaml
@@ -1,25 +1,476 @@
-Frontend:
-  impl: GEM5            
-
-MemorySystem:
-  impl: GenericDRAM
-  clock_ratio: 1
-
-  DRAM:
-    impl: HBM2
-    org:
-      preset: HBM2_8Gb
-      channel: 1
-    timing:
-      preset: HBM2_1.4Gbps
-
-  Controller:
-    impl: Generic
-    Scheduler:
-      impl: FRFCFS
-    RefreshManager:
-      impl: AllBank
-    plugins:
-
-  AddrMapper:
-    impl: RoBaRaCoCh
\ No newline at end of file
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "GenericDDR",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 32,
+        "write_buffer_size": 32,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "PseudoChannel"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "HBM2",
+          "org": {
+            "dq": 64,
+            "count": [
+              1,
+              2,
+              4,
+              4,
+              65536,
+              32
+            ]
+          },
+          "timing": [
+            2000,
+            2,
+            14,
+            14,
+            12,
+            14,
+            34,
+            48,
+            16,
+            5,
+            5,
+            2,
+            4,
+            4,
+            4,
+            6,
+            8,
+            15,
+            350,
+            160,
+            8,
+            3900,
+            122,
+            1000
+          ],
+          "channel_width": 64,
+          "read_latency": 16,
+          "timing_constraints": [
+            [
+              0,
+              [
+                0
+              ],
+              [
+                0,
+                1,
+                2,
+                7,
+                8
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                4,
+                6
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                2
+              ],
+              5
+            ],
+            [
+              1,
+              [
+                4
+              ],
+              [
+                2
+              ],
+              23
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              15,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              35
+            ],
+            [
+              1,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                7
+              ],
+              49
+            ],
+            [
+              1,
+              [
+                1,
+                2
+              ],
+              [
+                7
+              ],
+              14
+            ],
+            [
+              1,
+              [
+                5
+              ],
+              [
+                7
+              ],
+              19
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                7
+              ],
+              37
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                0
+              ],
+              349
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                2
+              ],
+              350
+            ],
+            [
+              1,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              7
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              5
+            ],
+            [
+              2,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              15
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              48
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                3,
+                5
+              ],
+              15
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                4,
+                6
+              ],
+              13
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                1
+              ],
+              35
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                0
+              ],
+              13
+            ],
+            [
+              3,
+              [
+                3
+              ],
+              [
+                1
+              ],
+              5
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                1
+              ],
+              23
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                0
+              ],
+              18
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                0
+              ],
+              36
+            ],
+            [
+              3,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              159
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              49
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                8
+              ],
+              14
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml
index e6543d14..2bdd1705 100644
--- a/configs/ramulator2_configs/HBM2_TPUv3.yaml
+++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml
@@ -1,25 +1,476 @@
-Frontend:
-  impl: GEM5
-
-MemorySystem:
-  impl: GenericDRAM
-  clock_ratio: 1
-
-  DRAM:
-    impl: HBM2
-    org:
-      preset: HBM2_8Gb
-      channel: 1
-    timing:
-      preset: HBM2_1.8Gbps
-
-  Controller:
-    impl: Generic
-    Scheduler:
-      impl: FRFCFS
-    RefreshManager:
-      impl: AllBank
-    plugins:
-
-  AddrMapper:
-    impl: RoBaRaCoCh
\ No newline at end of file
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "GenericDDR",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 32,
+        "write_buffer_size": 32,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "PseudoChannel"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "HBM2",
+          "org": {
+            "dq": 64,
+            "count": [
+              1,
+              2,
+              4,
+              4,
+              65536,
+              32
+            ]
+          },
+          "timing": [
+            2000,
+            2,
+            14,
+            14,
+            12,
+            14,
+            34,
+            48,
+            16,
+            5,
+            5,
+            2,
+            4,
+            4,
+            4,
+            6,
+            8,
+            15,
+            350,
+            160,
+            8,
+            3900,
+            122,
+            1000
+          ],
+          "channel_width": 64,
+          "read_latency": 16,
+          "timing_constraints": [
+            [
+              0,
+              [
+                0
+              ],
+              [
+                0,
+                1,
+                2,
+                7,
+                8
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                4,
+                6
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                2
+              ],
+              5
+            ],
+            [
+              1,
+              [
+                4
+              ],
+              [
+                2
+              ],
+              23
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              15,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              35
+            ],
+            [
+              1,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                7
+              ],
+              49
+            ],
+            [
+              1,
+              [
+                1,
+                2
+              ],
+              [
+                7
+              ],
+              14
+            ],
+            [
+              1,
+              [
+                5
+              ],
+              [
+                7
+              ],
+              19
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                7
+              ],
+              37
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                0
+              ],
+              349
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                2
+              ],
+              350
+            ],
+            [
+              1,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              7
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              5
+            ],
+            [
+              2,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              15
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              48
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                3,
+                5
+              ],
+              15
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                4,
+                6
+              ],
+              13
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                1
+              ],
+              35
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                0
+              ],
+              13
+            ],
+            [
+              3,
+              [
+                3
+              ],
+              [
+                1
+              ],
+              5
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                1
+              ],
+              23
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                0
+              ],
+              18
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                0
+              ],
+              36
+            ],
+            [
+              3,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              159
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              49
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                8
+              ],
+              14
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/LPDDR5.yaml b/configs/ramulator2_configs/LPDDR5.yaml
new file mode 100644
index 00000000..bf039f9f
--- /dev/null
+++ b/configs/ramulator2_configs/LPDDR5.yaml
@@ -0,0 +1,494 @@
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "GenericDDR",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 32,
+        "write_buffer_size": 32,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "Rank"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "LPDDR5",
+          "org": {
+            "dq": 16,
+            "count": [
+              1,
+              1,
+              4,
+              4,
+              32768,
+              1024
+            ]
+          },
+          "timing": [
+            6400,
+            2,
+            17,
+            15,
+            15,
+            17,
+            34,
+            49,
+            28,
+            8,
+            9,
+            2,
+            2,
+            4,
+            2,
+            4,
+            4,
+            4,
+            5,
+            10,
+            16,
+            168,
+            96,
+            3125,
+            391,
+            1,
+            0,
+            8,
+            2,
+            1250
+          ],
+          "channel_width": 16,
+          "read_latency": 19,
+          "timing_constraints": [
+            [
+              0,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              2
+            ],
+            [
+              0,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              2
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                6,
+                8
+              ],
+              0
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                7,
+                9
+              ],
+              0
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                7,
+                9
+              ],
+              12
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              16
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                7,
+                8,
+                9
+              ],
+              4,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              12,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                3
+              ],
+              8
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                3
+              ],
+              39
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              16,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                3
+              ],
+              34
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                0
+              ],
+              17
+            ],
+            [
+              1,
+              [
+                2,
+                3
+              ],
+              [
+                2,
+                3
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                10
+              ],
+              49
+            ],
+            [
+              1,
+              [
+                2,
+                3
+              ],
+              [
+                10
+              ],
+              15
+            ],
+            [
+              1,
+              [
+                8
+              ],
+              [
+                10
+              ],
+              23
+            ],
+            [
+              1,
+              [
+                9
+              ],
+              [
+                10
+              ],
+              54
+            ],
+            [
+              1,
+              [
+                10
+              ],
+              [
+                0,
+                3
+              ],
+              168
+            ],
+            [
+              2,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              21
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              49
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                6,
+                7,
+                8,
+                9
+              ],
+              15
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              34
+            ],
+            [
+              3,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              15
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                2
+              ],
+              8
+            ],
+            [
+              3,
+              [
+                7
+              ],
+              [
+                2
+              ],
+              39
+            ],
+            [
+              3,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              23
+            ],
+            [
+              3,
+              [
+                9
+              ],
+              [
+                0
+              ],
+              54
+            ],
+            [
+              3,
+              [
+                11
+              ],
+              [
+                0
+              ],
+              96
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                11
+              ],
+              49
+            ],
+            [
+              3,
+              [
+                2
+              ],
+              [
+                11
+              ],
+              15
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/LPDDR5X.yaml b/configs/ramulator2_configs/LPDDR5X.yaml
new file mode 100644
index 00000000..4309aa6c
--- /dev/null
+++ b/configs/ramulator2_configs/LPDDR5X.yaml
@@ -0,0 +1,494 @@
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "GenericDDR",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 32,
+        "write_buffer_size": 32,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "Rank"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "LPDDR5",
+          "org": {
+            "dq": 16,
+            "count": [
+              1,
+              1,
+              4,
+              4,
+              32768,
+              1024
+            ]
+          },
+          "timing": [
+            8533,
+            2,
+            23,
+            20,
+            20,
+            23,
+            46,
+            65,
+            38,
+            11,
+            12,
+            2,
+            2,
+            4,
+            2,
+            4,
+            6,
+            6,
+            7,
+            14,
+            22,
+            224,
+            128,
+            4165,
+            521,
+            1,
+            0,
+            8,
+            2,
+            938
+          ],
+          "channel_width": 16,
+          "read_latency": 25,
+          "timing_constraints": [
+            [
+              0,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              2
+            ],
+            [
+              0,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              2
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                6,
+                8
+              ],
+              0
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                7,
+                9
+              ],
+              0
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                7,
+                9
+              ],
+              15
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              21
+            ],
+            [
+              1,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                7,
+                8,
+                9
+              ],
+              4,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              15,
+              1,
+              true
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                3
+              ],
+              11
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                3
+              ],
+              52
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              6
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              22,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                3
+              ],
+              46
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                0
+              ],
+              23
+            ],
+            [
+              1,
+              [
+                2,
+                3
+              ],
+              [
+                2,
+                3
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                10
+              ],
+              65
+            ],
+            [
+              1,
+              [
+                2,
+                3
+              ],
+              [
+                10
+              ],
+              20
+            ],
+            [
+              1,
+              [
+                8
+              ],
+              [
+                10
+              ],
+              31
+            ],
+            [
+              1,
+              [
+                9
+              ],
+              [
+                10
+              ],
+              72
+            ],
+            [
+              1,
+              [
+                10
+              ],
+              [
+                0,
+                3
+              ],
+              224
+            ],
+            [
+              2,
+              [
+                6,
+                8
+              ],
+              [
+                6,
+                8
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                7,
+                9
+              ],
+              [
+                7,
+                9
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                7,
+                9
+              ],
+              [
+                6,
+                8
+              ],
+              28
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              6
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              65
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                6,
+                7,
+                8,
+                9
+              ],
+              20
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              46
+            ],
+            [
+              3,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              20
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                2
+              ],
+              11
+            ],
+            [
+              3,
+              [
+                7
+              ],
+              [
+                2
+              ],
+              52
+            ],
+            [
+              3,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              31
+            ],
+            [
+              3,
+              [
+                9
+              ],
+              [
+                0
+              ],
+              72
+            ],
+            [
+              3,
+              [
+                11
+              ],
+              [
+                0
+              ],
+              128
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                11
+              ],
+              65
+            ],
+            [
+              3,
+              [
+                2
+              ],
+              [
+                11
+              ],
+              20
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py
new file mode 100644
index 00000000..64eb62d2
--- /dev/null
+++ b/configs/ramulator2_configs/gen_configs.py
@@ -0,0 +1,109 @@
+"""
+Generate machine-readable ramulator2 v2.1 config files for PyTorchSim.
+
+Usage:
+    python gen_configs.py
+
+Each function generates a JSON config that C++ can load directly via
+Config::parse_config_file(). No preset resolution happens in C++ anymore.
+"""
+
+import json
+import sys
+import os
+
+# Add ramulator2 Python DSL to path
+RAMULATOR_PYTHON = os.path.join(os.path.dirname(__file__),
+                                "../../TOGSim/extern/ramulator2/python")
+sys.path.insert(0, RAMULATOR_PYTHON)
+
+import ramulator
+import ramulator.dram
+import ramulator.controller
+import ramulator.scheduler
+import ramulator.refresh_manager
+import ramulator.row_policy
+import ramulator.addr_mapper
+import ramulator.channel_mapper
+import ramulator.memory_system
+
+
+def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"):
+    """Wrap a DRAM object in a single-channel GenericDRAM config for PyTorchSim.
+
+    PyTorchSim creates one Ramulator2 instance per channel, so each config
+    always has exactly one controller (channel=1 in org is enforced by v2.1).
+    The wrapper overrides 'frontend' to ExternalFrontEnd automatically.
+
+    refresh_scope: level name for AllBank refresh.
+      - DDR4 / LPDDR5 / LPDDR5X → "Rank"
+      - HBM2 / HBM3              → "PseudoChannel"
+    """
+    ctrl = ramulator.controller.GenericDDR(
+        dram=dram_obj,
+        scheduler=ramulator.scheduler.FRFCFS(),
+        refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
+        row_policy=ramulator.row_policy.Open(),
+        addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(),
+    )
+    ms = ramulator.memory_system.GenericDRAM(
+        clock_ratio=clock_ratio,
+        controllers=[ctrl],
+        # Single-channel per Ramulator2 instance — passthrough maps everything to ch 0
+        channel_mapper=ramulator.channel_mapper.PassThroughChannelMapper(),
+    )
+    return {
+        "frontend": {"impl": "External", "clock_ratio": 1},
+        "memory_system": ms.to_config(),
+    }
+
+
+def gen_hbm2():
+    # Available timing presets: HBM2_1600Mbps, HBM2_2000Mbps, HBM2_2400Mbps
+    # HBM2 has no Rank level — AllBank refresh scope must be PseudoChannel
+    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps")
+    return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
+
+
+def gen_hbm2_tpuv3():
+    # TPUv3 HBM2: 900MHz → ~1.8 Gbps. Closest available preset: HBM2_2000Mbps
+    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps")
+    return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
+
+
+def gen_ddr4():
+    # Available timing presets — check python/ramulator/dram/ddr4.py
+    dram = ramulator.dram.DDR4(org_preset="DDR4_8Gb_x8", timing_preset="DDR4_3200AA")
+    return make_config(dram, clock_ratio=1)
+
+
+def gen_lpddr5():
+    dram = ramulator.dram.LPDDR5(org_preset="LPDDR5_8Gb_x16", timing_preset="LPDDR5_6400")
+    return make_config(dram, clock_ratio=1)
+
+
+def gen_lpddr5x():
+    # LPDDR5X_8533: 8533 MT/s, tCK=938ps, CK=1066MHz
+    dram = ramulator.dram.LPDDR5(org_preset="LPDDR5_8Gb_x16", timing_preset="LPDDR5X_8533")
+    return make_config(dram, clock_ratio=1)
+
+
+CONFIGS = {
+    "HBM2.yaml":        gen_hbm2,
+    "HBM2_TPUv3.yaml":  gen_hbm2_tpuv3,
+    "DDR4.yaml":        gen_ddr4,
+    "LPDDR5.yaml":      gen_lpddr5,
+    "LPDDR5X.yaml":     gen_lpddr5x,
+}
+
+
+if __name__ == "__main__":
+    out_dir = os.path.dirname(os.path.abspath(__file__))
+    for filename, gen_fn in CONFIGS.items():
+        cfg = gen_fn()
+        out_path = os.path.join(out_dir, filename)
+        with open(out_path, "w") as f:
+            # json is valid yaml — C++ parse_config_file reads either
+            json.dump(cfg, f, indent=2)
+        print(f"Generated {out_path}")
+

From dd991c137c186dc06798b9fe6ef94295198f05f7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 9 Apr 2026 21:30:27 +0900
Subject: [PATCH 159/194] [CI] Add thirdparty release manifest; pin base image
 tag and build on demand in docker-image workflow

---
 .github/workflows/docker-base-image-2-8.yml |  71 ----------
 .github/workflows/docker-image-2-8.yml      |  69 ---------
 .github/workflows/docker-image.yml          | 149 ++++++++++++++++++++
 .github/workflows/tag_release.yml           |  76 +++++++++-
 scripts/ci/thirdparty_base_pin.sh           |   6 +
 scripts/ci/thirdparty_github_asset_env.sh   |  54 +++++++
 thirdparty/github-releases.json             |  19 +++
 7 files changed, 303 insertions(+), 141 deletions(-)
 delete mode 100644 .github/workflows/docker-base-image-2-8.yml
 delete mode 100644 .github/workflows/docker-image-2-8.yml
 create mode 100644 .github/workflows/docker-image.yml
 create mode 100755 scripts/ci/thirdparty_base_pin.sh
 create mode 100755 scripts/ci/thirdparty_github_asset_env.sh
 create mode 100644 thirdparty/github-releases.json

diff --git a/.github/workflows/docker-base-image-2-8.yml b/.github/workflows/docker-base-image-2-8.yml
deleted file mode 100644
index 74e81e07..00000000
--- a/.github/workflows/docker-base-image-2-8.yml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: Docker Base Image CI (PyTorch 2.8)
-
-on:
-  push:
-    branches: [ "base_v2.8" ]
-  workflow_dispatch:
-  repository_dispatch:
-    types: [ build_base ]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-
-      - name: Log in to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Set environment
-        env:
-          GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          if [ -n "${{ github.event.pull_request.head.sha }}" ]; then
-            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
-            echo "GITHUB_SHA=${{ github.event.pull_request.head.sha }}"
-          else
-            echo "GITHUB_SHA=${{ github.sha }}" >> $GITHUB_ENV
-            echo "GITHUB_SHA=${{ github.sha }}"
-          fi
-
-          gem5_response_file=/tmp/releases-gem5-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file}
-          GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file})
-          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID"
-          echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV
-
-          llvm_response_file=/tmp/releases-gem5-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file}
-          LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file})
-          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID"
-          echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV
-
-          spike_response_file=/tmp/releases-spike-latest.json
-          curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file}
-          SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file})
-          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID"
-          echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV
-
-      - name: Build and Push Docker Image (PyTorch 2.8)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          file: ./Dockerfile.base
-          push: true
-          build-args: |
-            PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel
-            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
-            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
-            SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}
-          tags: |
-            ghcr.io/psal-postech/torchsim_base_2_8:latest
diff --git a/.github/workflows/docker-image-2-8.yml b/.github/workflows/docker-image-2-8.yml
deleted file mode 100644
index 52464dff..00000000
--- a/.github/workflows/docker-image-2-8.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-name: Docker image CI (PyTorch 2.8)
-
-on:
-  pull_request:
-    branches: [ "master", "develop" ]
-  workflow_dispatch:
-
-jobs:
-  build-and-test:
-    runs-on: self-hosted
-
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout Code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-          submodules: recursive
-
-      - name: Login to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build and Push Docker Image (PyTorch 2.8)
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: ./Dockerfile
-          push: true
-          no-cache: true
-          build-args: |
-            BASE_IMAGE=ghcr.io/psal-postech/torchsim_base_2_8:latest
-          tags: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
-
-      - name: Wait for GHCR propagation
-        run: |
-          for i in {1..30}; do
-            echo "Checking if image exists in GHCR (attempt $i)..."
-            if docker manifest inspect ghcr.io/psal-postech/torchsim-test-2-8:${GITHUB_SHA} > /dev/null 2>&1; then
-              echo "Image is now available in GHCR."
-              exit 0
-            fi
-            echo "Image not yet available, retrying in 30 seconds..."
-            sleep 20
-          done
-          echo "Image did not become available in GHCR within expected time."
-          exit 1
-
-  test-pytorchsim-wrapper1:
-    needs: build-and-test
-    uses: ./.github/workflows/pytorchsim_test.yml
-    with:
-      image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
-      vector_lane: 128
-      spad_size: 128
-
-  test-pytorchsim-wrapper2:
-    needs: build-and-test
-    uses: ./.github/workflows/pytorchsim_test.yml
-    with:
-      image_name: ghcr.io/psal-postech/torchsim-test-2-8:${{ github.sha }}
-      vector_lane: 32
-      spad_size: 32
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 00000000..67140c89
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,149 @@
+name: Docker image CI
+
+on:
+  pull_request:
+    branches: [ "master", "develop" ]
+  workflow_dispatch:
+
+env:
+  BASE_IMAGE_REPO: ghcr.io/psal-postech/torchsim_base
+  # PR: head commit; otherwise workflow_dispatch uses the branch SHA
+  SOURCE_SHA: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+
+jobs:
+  ensure-base:
+    runs-on: ubuntu-latest
+    outputs:
+      base_image: ${{ steps.pin.outputs.base_image }}
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.SOURCE_SHA }}
+          submodules: recursive
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: PyTorch base image from manifest
+        run: |
+          PYTORCH_IMAGE=$(python3 -c "import json; from pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')")
+          if [ -z "$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi
+          echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV"
+
+      - name: Thirdparty pin
+        id: pin
+        run: |
+          PIN="$(bash scripts/ci/thirdparty_base_pin.sh)"
+          echo "pin=${PIN}" >> "$GITHUB_OUTPUT"
+          echo "base_image=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_OUTPUT"
+          echo "BASE_IMAGE=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_ENV"
+
+      - name: Check base image exists
+        id: exists
+        run: |
+          if docker manifest inspect "${BASE_IMAGE}" > /dev/null 2>&1; then
+            echo "ok=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "ok=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Resolve GitHub release asset IDs
+        if: steps.exists.outputs.ok != 'true'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bash scripts/ci/thirdparty_github_asset_env.sh >> "$GITHUB_ENV"
+
+      - name: Build and push base image (missing pin)
+        if: steps.exists.outputs.ok != 'true'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.base
+          push: true
+          build-args: |
+            PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }}
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}
+          tags: ${{ env.BASE_IMAGE }}
+
+  build-and-test:
+    needs: ensure-base
+    runs-on: self-hosted
+
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.SOURCE_SHA }}
+          submodules: recursive
+
+      - name: Login to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and Push Docker Image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          no-cache: true
+          build-args: |
+            BASE_IMAGE=${{ needs.ensure-base.outputs.base_image }}
+          tags: ghcr.io/psal-postech/torchsim-test:${{ env.SOURCE_SHA }}
+
+      # Do not use GITHUB_SHA here: on pull_request it is the merge commit, while the image tag uses SOURCE_SHA (PR head).
+      - name: Wait for GHCR propagation
+        env:
+          IMAGE_SHA: ${{ env.SOURCE_SHA }}
+        run: |
+          IMG="ghcr.io/psal-postech/torchsim-test:${IMAGE_SHA}"
+          echo "Verifying tag matches push: ${IMAGE_SHA}"
+          for i in $(seq 1 30); do
+            echo "Checking if image exists in GHCR (attempt $i)..."
+            if docker buildx imagetools inspect "$IMG" > /dev/null 2>&1; then
+              echo "Image is now available in GHCR."
+              exit 0
+            fi
+            if [ "$i" -eq 1 ]; then
+              echo "buildx imagetools inspect failed; stderr (first attempt):"
+              docker buildx imagetools inspect "$IMG" 2>&1 || true
+            fi
+            echo "Image not yet available, retrying in 20 seconds..."
+            sleep 20
+          done
+          echo "Image did not become available in GHCR within expected time."
+          exit 1
+
+  test-pytorchsim-wrapper1:
+    needs: build-and-test
+    uses: ./.github/workflows/pytorchsim_test.yml
+    with:
+      image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      vector_lane: 128
+      spad_size: 128
+
+  test-pytorchsim-wrapper2:
+    needs: build-and-test
+    uses: ./.github/workflows/pytorchsim_test.yml
+    with:
+      image_name: ghcr.io/psal-postech/torchsim-test:${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      vector_lane: 32
+      spad_size: 32
diff --git a/.github/workflows/tag_release.yml b/.github/workflows/tag_release.yml
index 0728a583..f92fc060 100644
--- a/.github/workflows/tag_release.yml
+++ b/.github/workflows/tag_release.yml
@@ -5,8 +5,80 @@ on:
     tags:
       - 'v*'
 
+env:
+  BASE_IMAGE_REPO: ghcr.io/psal-postech/torchsim_base
+
 jobs:
+  ensure-base:
+    runs-on: ubuntu-latest
+    outputs:
+      base_image: ${{ steps.pin.outputs.base_image }}
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          repository: PSAL-POSTECH/PyTorchSim
+          ref: ${{ github.sha }}
+          submodules: recursive
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: PyTorch base image from manifest
+        run: |
+          PYTORCH_IMAGE=$(python3 -c "import json; from pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')")
+          if [ -z "$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi
+          echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV"
+
+      - name: Thirdparty pin
+        id: pin
+        run: |
+          PIN="$(bash scripts/ci/thirdparty_base_pin.sh)"
+          echo "pin=${PIN}" >> "$GITHUB_OUTPUT"
+          echo "base_image=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_OUTPUT"
+          echo "BASE_IMAGE=${BASE_IMAGE_REPO}:thirdparty-${PIN}" >> "$GITHUB_ENV"
+
+      - name: Check base image exists
+        id: exists
+        run: |
+          if docker manifest inspect "${BASE_IMAGE}" > /dev/null 2>&1; then
+            echo "ok=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "ok=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Resolve GitHub release asset IDs
+        if: steps.exists.outputs.ok != 'true'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: bash scripts/ci/thirdparty_github_asset_env.sh >> "$GITHUB_ENV"
+
+      - name: Build and push base image (missing pin)
+        if: steps.exists.outputs.ok != 'true'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.base
+          push: true
+          build-args: |
+            PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }}
+            GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }}
+            LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }}
+            SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }}
+          tags: |
+            ${{ env.BASE_IMAGE }}
+            ${{ env.BASE_IMAGE_REPO }}:latest
+
   build:
+    needs: ensure-base
     runs-on: self-hosted
 
     permissions:
@@ -42,4 +114,6 @@ jobs:
           push: true
           secrets: |
             GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }}
-          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}}
\ No newline at end of file
+          build-args: |
+            BASE_IMAGE=${{ needs.ensure-base.outputs.base_image }}
+          tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }}
diff --git a/scripts/ci/thirdparty_base_pin.sh b/scripts/ci/thirdparty_base_pin.sh
new file mode 100755
index 00000000..6cfc7d9a
--- /dev/null
+++ b/scripts/ci/thirdparty_base_pin.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Deterministic short pin for tagging torchsim_base images (thirdparty + base Dockerfile).
+set -euo pipefail
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+cd "$ROOT"
+{ cat thirdparty/github-releases.json; cat Dockerfile.base; } | sha256sum | awk '{print substr($1,1,12)}'
diff --git a/scripts/ci/thirdparty_github_asset_env.sh b/scripts/ci/thirdparty_github_asset_env.sh
new file mode 100755
index 00000000..8cbe9e12
--- /dev/null
+++ b/scripts/ci/thirdparty_github_asset_env.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Emit GEM5_ASSET_ID, LLVM_ASSET_ID, SPIKE_ASSET_ID lines for appending to GITHUB_ENV.
+# Requires: jq, curl, GITHUB_TOKEN, repo root as cwd or GITHUB_WORKSPACE.
+set -euo pipefail
+ROOT="${GITHUB_WORKSPACE:-$(cd "$(dirname "$0")/../.." && pwd)}"
+MANIFEST="${ROOT}/thirdparty/github-releases.json"
+if [ ! -f "$MANIFEST" ]; then
+  echo "Missing thirdparty manifest: $MANIFEST" >&2
+  exit 1
+fi
+if [ -z "${GITHUB_TOKEN:-}" ]; then
+  echo "GITHUB_TOKEN is not set" >&2
+  exit 1
+fi
+
+thirdparty_asset_id() {
+  local key="$1"
+  local out_var="$2"
+  local repo release_tag asset_name owner name api_url tmp id
+  repo=$(jq -r --arg k "$key" '.[$k].repository' "$MANIFEST")
+  release_tag=$(jq -r --arg k "$key" '.[$k].release_tag' "$MANIFEST")
+  asset_name=$(jq -r --arg k "$key" '.[$k].asset_name // ""' "$MANIFEST")
+  owner="${repo%%/*}"
+  name="${repo##*/}"
+  if [ "$release_tag" = "latest" ]; then
+    api_url="https://api.github.com/repos/${owner}/${name}/releases/latest"
+  else
+    api_url="https://api.github.com/repos/${owner}/${name}/releases/tags/${release_tag}"
+  fi
+  tmp=$(mktemp)
+  if ! curl -fsS -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+    -H "Accept: application/vnd.github+json" \
+    -H "X-GitHub-Api-Version: 2022-11-28" \
+    "$api_url" -o "$tmp"; then
+    echo "Failed to fetch release metadata for ${key} (${owner}/${name}, ${release_tag})" >&2
+    rm -f "$tmp"
+    exit 1
+  fi
+  if [ -n "$asset_name" ]; then
+    id=$(jq -r --arg n "$asset_name" '.assets[] | select(.name == $n) | .id' "$tmp" | head -n1)
+  else
+    id=$(jq -r '.assets[0].id' "$tmp")
+  fi
+  rm -f "$tmp"
+  if [ -z "$id" ] || [ "$id" = "null" ]; then
+    echo "Could not resolve asset id for ${key} (${owner}/${name}, tag=${release_tag}, asset_name=${asset_name:-<first>})" >&2
+    exit 1
+  fi
+  echo "${out_var}=${id}"
+}
+
+thirdparty_asset_id gem5 GEM5_ASSET_ID
+thirdparty_asset_id llvm_project LLVM_ASSET_ID
+thirdparty_asset_id spike SPIKE_ASSET_ID
diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json
new file mode 100644
index 00000000..25c220c9
--- /dev/null
+++ b/thirdparty/github-releases.json
@@ -0,0 +1,19 @@
+{
+  "description": "GitHub release pins for CI (docker base image). pytorch_image is the ARG PYTORCH_IMAGE for Dockerfile.base. Use release_tag \"latest\" or an exact release tag for GitHub deps. asset_name must match the release attachment filename. CI builds ghcr.io/.../torchsim_base:thirdparty-<12 hex> when missing (pin = sha256 of this file plus Dockerfile.base) and updates :latest on that push.",
+  "pytorch_image": "pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel",
+  "gem5": {
+    "repository": "PSAL-POSTECH/gem5",
+    "release_tag": "v1.0.1",
+    "asset_name": "gem5-release.tar.gz"
+  },
+  "llvm_project": {
+    "repository": "PSAL-POSTECH/llvm-project",
+    "release_tag": "v1.0.6",
+    "asset_name": "riscv-llvm-release.tar.gz"
+  },
+  "spike": {
+    "repository": "PSAL-POSTECH/riscv-isa-sim",
+    "release_tag": "v1.0.1",
+    "asset_name": "spike-release.tar.gz"
+  }
+}

From 54ccd4c897e563650647e532fa698b1f0f57d542 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 7 Apr 2026 16:33:47 +0900
Subject: [PATCH 160/194] [Frontend] Put TorchInductor cache under
 TORCHSIM_DUMP_PATH

---
 PyTorchSimDevice/torch_openreg/__init__.py    |  1 +
 PyTorchSimFrontend/extension_codecache.py     |  6 +--
 PyTorchSimFrontend/extension_config.py        | 38 +++++++++++++++++--
 PyTorchSimFrontend/mlir/mlir_autotune.py      |  4 +-
 .../mlir/mlir_codegen_backend.py              |  3 +-
 PyTorchSimFrontend/mlir/mlir_common.py        |  3 +-
 PyTorchSimFrontend/mlir/mlir_template.py      |  2 +-
 7 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/PyTorchSimDevice/torch_openreg/__init__.py b/PyTorchSimDevice/torch_openreg/__init__.py
index 5e404f7d..e8158391 100644
--- a/PyTorchSimDevice/torch_openreg/__init__.py
+++ b/PyTorchSimDevice/torch_openreg/__init__.py
@@ -17,6 +17,7 @@
 torch.utils.generate_methods_for_privateuse1_backend(for_storage=True)
 
 sys.path.append(os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim'))
+import PyTorchSimFrontend.extension_config  # noqa: F401
 from PyTorchSimFrontend.mlir.mlir_codegen_backend import ExtensionWrapperCodegen
 from PyTorchSimFrontend.mlir.mlir_scheduling import MLIRScheduling
 torch._inductor.codegen.common.register_backend_for_device(
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index ac711650..65c96f11 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -4,11 +4,11 @@
 import subprocess
 import torch
 
+from PyTorchSimFrontend import extension_config
 from torch._inductor.codecache import get_hash, write
 from torch._inductor.async_compile import AsyncCompile
 from AsmParser.tog_generator import tog_generator
 from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen
-from PyTorchSimFrontend import extension_config
 from Simulator.simulator import FunctionalSimulator, CycleSimulator, TOGSimulator
 
 # Configure logger for extension_codecache module (WARNING level by default)
@@ -20,7 +20,7 @@ def hash_prefix(hash_value):
     return hash_value[1:12]
 
 def get_write_path(src_code):
-    return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip())))
+    return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(get_hash(src_code.strip())))
 
 
 def get_lock_path(write_path):
@@ -283,7 +283,7 @@ def run_kernel_simulation(*args, **kwargs):
             # Wait for compilation
             key = future.result()
             from filelock import FileLock
-            result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key))
+            result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(key))
             lock = FileLock(get_lock_path(result_path), timeout=LOCK_TIMEOUT)
             with lock:
                 # Run simulator pass
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 1b7ccf8d..5dec8a4b 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -8,8 +8,42 @@
 CONFIG_GEM5_PATH = os.environ.get('GEM5_PATH', default="/workspace/gem5/build/RISCV/gem5.opt")
 CONFIG_TORCHSIM_LLVM_PATH = os.environ.get('TORCHSIM_LLVM_PATH', default="/usr/bin")
 
+CONFIG_TORCHSIM_TOG_HOST_CC = os.environ.get("TORCHSIM_TOG_HOST_CC", "gcc")
+
+def _default_tog_host_cflags():
+    """Host flags for ``dlopen``'d ``*_tog.so`` / ``tile_operation_graph.so``."""
+    if os.environ.get("TORCHSIM_TOG_HOST_CFLAGS"):
+        return os.environ["TORCHSIM_TOG_HOST_CFLAGS"]
+    if True: #int(os.environ.get("TORCHSIM_TOG_SO_DEBUG", "0")):
+        return (
+            "-g -Og -fno-omit-frame-pointer -fPIC -std=c11 "
+            "-Wall -Wextra -Wno-unused-variable -Wno-unused-parameter"
+        )
+    return (
+        "-O2 -fPIC -std=c11 -Wall -Wextra -Wno-unused-variable -Wno-unused-parameter"
+    )
+
+
+CONFIG_TORCHSIM_TOG_HOST_CFLAGS = _default_tog_host_cflags()
+
+
+def _default_tog_host_ldflags():
+    if os.environ.get("TORCHSIM_TOG_HOST_LDFLAGS"):
+        return os.environ["TORCHSIM_TOG_HOST_LDFLAGS"]
+    # Keep debug sections in .so; optional build-id helps GDB locate DWARF.
+    base = "-shared"
+    if int(os.environ.get("TORCHSIM_TOG_SO_DEBUG", "0")):
+        return base + " -Wl,--build-id"
+    return base
+
+
+CONFIG_TORCHSIM_TOG_HOST_LDFLAGS = _default_tog_host_ldflags()
+
 CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False))
 CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False))
+CONFIG_TORCHSIM_DUMP_PATH = os.environ.get("TORCHSIM_DUMP_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "outputs"))
+CONFIG_TORCHSIM_LOG_PATH = os.environ.get("TORCHSIM_LOG_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
+os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(CONFIG_TORCHSIM_DUMP_PATH, ".torchinductor")
 
 def __getattr__(name):
     # TOGSim config
@@ -99,10 +133,6 @@ def __getattr__(name):
 
     if name == "CONFIG_TOGSIM_DEBUG_LEVEL":
         return os.environ.get("TOGSIM_DEBUG_LEVEL", "")
-    if name == "CONFIG_TORCHSIM_DUMP_PATH":
-        return os.environ.get('TORCHSIM_DUMP_PATH', default = CONFIG_TORCHSIM_DIR)
-    if name == "CONFIG_TORCHSIM_LOG_PATH":
-        return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
 
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index b8f5eaf9..fe1f86a1 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -21,7 +21,7 @@ def hash_prefix(hash_value):
     return hash_value[1:12]
 
 def get_write_path(src_code):
-    return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(get_hash(src_code.strip())))
+    return os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(get_hash(src_code.strip())))
 
 @dataclasses.dataclass
 class MLIRBenchmarkRequest():
@@ -61,7 +61,7 @@ def make_run_fn(
         # Check already cached result.
         write_path = get_write_path(self.source_code)
         key,  _ = write(self.source_code, "mlir", specified_dir=write_path)
-        result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "outputs", hash_prefix(key), "togsim_result")
+        result_dir = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, hash_prefix(key), "togsim_result")
 
         # Find the most recent .log file in the result directory
         if os.path.exists(result_dir) and os.path.isdir(result_dir):
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 8bfdc57f..05102c79 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -9,6 +9,8 @@
 from typing import Optional
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
+
+from PyTorchSimFrontend import extension_config
 from torch._dynamo.testing import rand_strided
 from torch._inductor.autotune_process import TensorMeta
 from torch._dynamo.utils import dynamo_timed
@@ -23,7 +25,6 @@
 )
 from torch.utils._sympy.functions import ModularIndexing, FloorDiv
 from PyTorchSimFrontend import extension_codecache
-from PyTorchSimFrontend import extension_config
 from . import mlir_common
 from .mlir_common import LoopLevel, LoopNest
 from .mlir_ops import ExtensionOverrides
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 23c02066..7d604c3a 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -9,6 +9,8 @@
 from functools import reduce
 from operator import mul
 import torch
+
+from PyTorchSimFrontend import extension_config
 from torch._inductor.codegen import common
 from torch._inductor.codegen import cpp
 from torch._inductor.virtualized import V
@@ -30,7 +32,6 @@
     sympy_subs,
     unique,
 )
-from PyTorchSimFrontend import extension_config
 from PyTorchSimFrontend import extension_codecache
 
 from PyTorchSimFrontend.extension_utils import (
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index b126d3af..6eb6efb4 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -13,6 +13,7 @@
 from typing import List, Optional
 from unittest.mock import patch
 
+from PyTorchSimFrontend import extension_config
 from torch._inductor.codegen.common import KernelTemplate, CSE, DeferredLine
 from torch._inductor.ir import Buffer, IRNode, TemplateBuffer, ChoiceCaller, ir_node_to_tensor
 from torch._inductor.select_algorithm import PartialRender
@@ -29,7 +30,6 @@
 from PyTorchSimFrontend.mlir.mlir_scheduling import SchedulerNode
 from torch._inductor.codegen import common
 
-from PyTorchSimFrontend import extension_config
 from . import mlir_common
 
 # Configure logger for mlir_template module

From 7019ff23c2f3ae2f82a6e21e0ecb3c59bd78f869 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 1 Apr 2026 23:46:05 +0900
Subject: [PATCH 161/194] [Frontend] Modify dma_start attribute position

---
 .../mlir/mlir_codegen_backend.py              |  6 ++---
 PyTorchSimFrontend/mlir/mlir_common.py        | 24 +++++++++++++++++--
 PyTorchSimFrontend/mlir/mlir_template.py      | 21 +++++++++-------
 3 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 05102c79..58d6a70d 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -536,7 +536,7 @@ def load(self, name: str, index: sympy.Expr):
         compute_index_var = ",".join(sram_index_var.split(",")[:-1] + [f"%{self.compute_idx}"])
 
         # MVIN Encoding
-        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding={padding}}}"
+        attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, int(padding))
         code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  dram_shape, tile_shape, attribute)
         self.cse.generate(dma_buffer, code, assignment = False) # FIXME: assignment = False does not support caching
@@ -607,7 +607,7 @@ def store(self, name: str, index: sympy.Expr, value, mode=None, *args, **kwargs)
             sram_index_var = self.spad_buffer_dict[str(value)][3]
 
         # Generate DMA instruction
-        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
+        attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(common.DeferredLine(name, code))
@@ -736,7 +736,7 @@ def store_reduction(self, name, index, value):
                 ops._store(value, sram_var, sram_index_var, tile_shape, buffer_name=name)
 
             # Generate DMA instruction
-            attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
+            attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0)
             code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                     dram_shape, tile_shape, attribute)
             self.reductions_suffix.writeline(common.DeferredLine(name, code))
diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py
index 7d604c3a..5cde19eb 100644
--- a/PyTorchSimFrontend/mlir/mlir_common.py
+++ b/PyTorchSimFrontend/mlir/mlir_common.py
@@ -3,8 +3,7 @@
 import contextvars
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Dict
-from typing import List
+from typing import Dict, Iterable, List, Optional, Sequence, Union
 from collections import defaultdict
 from functools import reduce
 from operator import mul
@@ -120,6 +119,27 @@ def get_dtype_nbytes(dtype):
     }
 }
 
+def format_dma_op_attributes(
+    dram_stride: Sequence,
+    sram_stride: Sequence,
+    padding: int = 0,
+    *,
+    subtile_size: Optional[Sequence] = None,
+    async_type: Optional[int] = None,
+) -> str:
+    """Attribute dict for memref.dma_start; stride lists as bracketed integer lists."""
+    parts = [
+        f"dram_stride = {dram_stride}",
+        f"sram_stride = {sram_stride}",
+        f"padding = {int(padding)}",
+    ]
+    if subtile_size:
+        parts.append(f"subtile_size = {subtile_size}")
+        av = int(async_type) if async_type is not None else 1
+        parts.append(f"async = {av} : i64")
+    return "{" + ", ".join(parts) + "}"
+
+
 class ParallelLoopBuffer(IndentedBuffer):
     def indent(self, offset=1, attribute="", suffix=""):
         @contextlib.contextmanager
diff --git a/PyTorchSimFrontend/mlir/mlir_template.py b/PyTorchSimFrontend/mlir/mlir_template.py
index 6eb6efb4..c8fc036f 100644
--- a/PyTorchSimFrontend/mlir/mlir_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_template.py
@@ -952,14 +952,19 @@ def generate_dma_code():
                 zero_cse = self.get_const_cse(0, "index")
                 sram_index_var = ", ".join([f"%{str(zero_cse)}"]*tile_desc.get_nr_dim())
 
-                attribute_parts = [f"dram_stride={_dram_stride}", f"sram_stride={sram_strides}", f"padding={int(padding)}"]
                 if subtile_size:
-                    attribute_parts.append(f"subtile_size={subtile_size}, async={int(async_type) if async_type is not None else 1}")
-                attribute = "  {" + ", ".join(attribute_parts) + "}"
+                    attribute = mlir_common.format_dma_op_attributes(
+                        _dram_stride,
+                        sram_strides,
+                        int(padding),
+                        subtile_size=subtile_size,
+                        async_type=int(async_type) if async_type is not None else None,
+                    )
+                else:
+                    attribute = mlir_common.format_dma_op_attributes(_dram_stride, sram_strides, int(padding))
                 code = self.get_dma_code(dma_type, vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
-                                        dram_shape, tile_shape, "")
+                                        dram_shape, tile_shape, attribute)
                 local_code.writeline(code)
-                local_code.writeline(attribute)
             return textwrap.indent(local_code.getvalue(), " "*indent_size).strip()
 
         if not lazy_mode:
@@ -1025,7 +1030,7 @@ def load_epilogue(self, name: str, index: sympy.Expr):
             # Allocate sram buffer
             dram_shape = mlir_common.MLIRKernelArgs.get_mlir_shape(self.buffer_types[name])
             sram_var, sram_index_var = self.get_scratchpad_buffer(dtype, name, self.kernel_group.tile_desc, index)
-            attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
+            attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0)
             code = self.get_dma_code("MVIN", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                      dram_shape, tile_shape, attribute)
             self.cse.generate(self.dma_loads, code, assignment = False)
@@ -1093,7 +1098,7 @@ def store_epilogue(self, name: str, index: sympy.Expr, value, *args, **kwargs):
             ops._store(value, sram_var, compute_index_var, tile_shape, buffer_name=buffer_name)
 
         # Generate DMA instruction
-        attribute = f"{{dram_stride={dram_stride}, sram_stride={tile_stride}, padding=0}}"
+        attribute = mlir_common.format_dma_op_attributes(dram_stride, tile_stride, 0)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                  dram_shape, tile_shape, attribute)
         self.dma_stores.writeline(DeferredLine(name, code))
@@ -1244,7 +1249,7 @@ def store_reduction_epilogue(self, name, index, value):
 
         # MVOUT Encoding
         # Generate DMA instruction
-        attribute = f"{{dram_stride={dram_stride}, sram_stride={final_tile_stride}, padding=0}}"
+        attribute = mlir_common.format_dma_op_attributes(dram_stride, final_tile_stride, 0)
         code = self.get_dma_code("MVOUT", vlane_split_axis, vlane_stride, mlir_dtype, dram_var, index_var, sram_var, sram_index_var,
                                 dram_shape, final_tile_shape, attribute)
         self.reductions_suffix.writeline(DeferredLine(name, code))

From 178ef52143be2429c6d12070e6313ed2e1a550e6 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 14 Apr 2026 13:23:00 +0900
Subject: [PATCH 162/194] [TOGSim] DMA tag keys int64; Core trace helpers; log
 elem_bits

Refactor Core instruction/DMA traces into core_trace_log; add InstFinishTraceTag;
extend DMA/Instruction/TileGraphParser for int64_t tag keys and elem_bits in traces.
---
 TOGSim/include/Core.h            |  11 ++-
 TOGSim/include/CoreTraceLog.h    |  36 ++++++++
 TOGSim/include/DMA.h             |  31 +++----
 TOGSim/include/Instruction.h     |  37 ++++----
 TOGSim/include/SparseCore.h      |   3 +-
 TOGSim/include/TileGraphParser.h |  32 +++----
 TOGSim/include/TraceLogTags.h    |  34 ++++++++
 TOGSim/src/Core.cc               | 145 ++++++++++++++++++-------------
 TOGSim/src/CoreTraceLog.cc       | 122 ++++++++++++++++++++++++++
 TOGSim/src/DMA.cc                |  31 +++++--
 TOGSim/src/Instruction.cc        |  36 ++++++--
 TOGSim/src/SparseCore.cc         |  51 +++++++++--
 TOGSim/src/TileGraphParser.cc    |  70 +++++++--------
 13 files changed, 468 insertions(+), 171 deletions(-)
 create mode 100644 TOGSim/include/CoreTraceLog.h
 create mode 100644 TOGSim/include/TraceLogTags.h
 create mode 100644 TOGSim/src/CoreTraceLog.cc

diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h
index e4d2f30a..286feb5f 100644
--- a/TOGSim/include/Core.h
+++ b/TOGSim/include/Core.h
@@ -10,6 +10,14 @@
 #include "Tile.h"
 #include "SimulationConfig.h"
 #include "DMA.h"
+#include "TraceLogTags.h"
+
+/** Log tag kind for Core::finish_instruction (see TraceLogTag names in TraceLogTags.h). */
+enum class InstFinishTraceTag {
+  Fnshed,
+  DmaIssueComplete,
+  DmaRespComplete,
+};
 
 class Core {
  public:
@@ -22,7 +30,8 @@ class Core {
   virtual void cycle();
   virtual void print_stats();
   virtual void print_current_stats();
-  virtual void finish_instruction(std::shared_ptr<Instruction>& inst);
+  virtual void finish_instruction(std::shared_ptr<Instruction>& inst,
+                                  InstFinishTraceTag tag = InstFinishTraceTag::Fnshed);
   virtual bool has_memory_request();
   virtual void pop_memory_request();
   virtual mem_fetch* top_memory_request() { return _request_queue.front(); }
diff --git a/TOGSim/include/CoreTraceLog.h b/TOGSim/include/CoreTraceLog.h
new file mode 100644
index 00000000..e78c1ef2
--- /dev/null
+++ b/TOGSim/include/CoreTraceLog.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "Instruction.h"
+#include "TraceLogTags.h"
+
+/**
+ * Instruction / tile trace formatting and Core spdlog::trace helpers.
+ * Keeps Core.cc focused on simulation logic.
+ */
+namespace core_trace_log {
+
+std::string format_dma_inst_issued_detail(Instruction& inst);
+/** Opcode + (detail...) for DMA issue / skip traces. */
+std::string format_dma_inst_issued_trace_line(Instruction& inst);
+/** Opcode + (detail...) for COMP / BAR / MOVIN / MOVOUT finished or issued lines. */
+std::string format_instruction_detail_line(Instruction& inst);
+
+void trace_tile_scheduled(cycle_type core_cycle, uint32_t core_id, const std::string& tag15);
+
+void trace_instruction_line(cycle_type core_cycle,
+                            uint32_t core_id,
+                            const std::string& tag15,
+                            uint64_t global_inst_id,
+                            const std::string& message);
+
+void log_error_dma_instruction_invalid(cycle_type core_cycle, uint32_t core_id);
+void log_error_dram_responses_trace_not_finished(cycle_type core_cycle, uint32_t core_id);
+void log_error_instruction_already_finished(cycle_type core_cycle,
+                                            uint32_t core_id,
+                                            const std::string& opcode_name);
+void log_error_undefined_opcode();
+
+}  // namespace core_trace_log
diff --git a/TOGSim/include/DMA.h b/TOGSim/include/DMA.h
index 3056c626..08bdcab4 100644
--- a/TOGSim/include/DMA.h
+++ b/TOGSim/include/DMA.h
@@ -12,41 +12,41 @@
 #include "Memfetch.h"
 
 struct VectorCompare {
-    bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
+    bool operator()(const std::vector<int64_t>& a, const std::vector<int64_t>& b) const {
         return a < b;
     }
 };
 
 class DMA {
  public:
-  DMA(uint32_t id, uint32_t dram_req_size);
+  DMA(uint32_t id, uint32_t dram_req_size, bool l2_datacache_enabled);
 
   void issue_tile(std::shared_ptr<Instruction> inst);
   bool is_finished() { return _finished; }
   bool empty() { return _current_inst==nullptr; }
-  void register_tag(int subgraph_id, std::vector<int>& key) {
+  void register_tag(int subgraph_id, std::vector<int64_t>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
-      tag_table[subgraph_id] = std::map<std::vector<int>, uint32_t>();
-      waiters[subgraph_id] = std::map<std::vector<int>, std::vector<std::shared_ptr<Instruction>>>();
+      tag_table[subgraph_id] = std::map<std::vector<int64_t>, uint32_t>();
+      waiters[subgraph_id] = std::map<std::vector<int64_t>, std::vector<std::shared_ptr<Instruction>>>();
     }
     tag_table[subgraph_id][key] = 0;
     waiters[subgraph_id][key] = std::vector<std::shared_ptr<Instruction>>();
   }
-  void set_tag_finish(int subgraph_id, std::vector<int>& key) {
+  void set_tag_finish(int subgraph_id, std::vector<int64_t>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
     tag_table[subgraph_id][key] = 1;
   }
 
-  void set_tag_sparse(int subgraph_id, std::vector<int>& key) {
+  void set_tag_sparse(int subgraph_id, std::vector<int64_t>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     }
     tag_table[subgraph_id][key] = -1;
   }
 
-  void mark_tag_used(int subgraph_id, std::vector<int>& key) {
+  void mark_tag_used(int subgraph_id, std::vector<int64_t>& key) {
     if (tag_table.find(subgraph_id) == tag_table.end()) {
       throw std::runtime_error("Subgraph does not exist in tag_table");
     } else if (!tag_table[subgraph_id][key]) {
@@ -59,7 +59,7 @@ class DMA {
     for (const auto& entry: tag_table) {
       auto subgraph_id = entry.first;
       for (const auto& tag_entry: tag_table[subgraph_id]) {
-        const std::vector<int>& tag_key = tag_entry.first;
+        const std::vector<int64_t>& tag_key = tag_entry.first;
         uint32_t value = tag_entry.second;
         if (value == 1) {
           spdlog::debug("[Tag Table][{}] Unused tag found: (key={}, val={})",
@@ -69,7 +69,7 @@ class DMA {
     }
   }
 
-  bool tag_key_exist(int subgraph_id, std::vector<int>& key) {
+  bool tag_key_exist(int subgraph_id, std::vector<int64_t>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     if (subgraph_it == tag_table.end())
       return false;
@@ -78,7 +78,7 @@ class DMA {
     auto key_it = key_map.find(key);
     return key_it != key_map.end();
   }
-  uint32_t get_tag_finish(int subgraph_id, std::vector<int>& key) {
+  uint32_t get_tag_finish(int subgraph_id, std::vector<int64_t>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -95,7 +95,7 @@ class DMA {
     tag_table.erase(subgraph_id);
     waiters.erase(subgraph_id);
   }
-  void register_tag_waiter(int subgraph_id, std::vector<int>& key, std::shared_ptr<Instruction> inst) {
+  void register_tag_waiter(int subgraph_id, std::vector<int64_t>& key, std::shared_ptr<Instruction> inst) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -104,7 +104,7 @@ class DMA {
     }
     waiters[subgraph_id][key].push_back(inst);
   }
-  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, std::vector<int>& key) {
+  std::vector<std::shared_ptr<Instruction>>& get_tag_waiter(int subgraph_id, std::vector<int64_t>& key) {
     auto subgraph_it = tag_table.find(subgraph_id);
     auto& key_map = subgraph_it->second;
     auto key_it = key_map.find(key);
@@ -129,8 +129,9 @@ class DMA {
   size_t _tile_idx_stride=1;
   uint32_t _tile_idx;
   bool _finished=true;
-  std::map<int, std::map<std::vector<int>, uint32_t>> tag_table;
-  std::map<int, std::map<std::vector<int>, std::vector<std::shared_ptr<Instruction>>>> waiters;
+  bool _l2_datacache_enabled = false;
+  std::map<int, std::map<std::vector<int64_t>, uint32_t>> tag_table;
+  std::map<int, std::map<std::vector<int64_t>, std::vector<std::shared_ptr<Instruction>>>> waiters;
   std::queue<mem_fetch*> _pending_accesses;
   bool _generated_once = false;
 };
diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h
index 9fad13f4..bb62a440 100644
--- a/TOGSim/include/Instruction.h
+++ b/TOGSim/include/Instruction.h
@@ -18,13 +18,14 @@ typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
 
 std::string opcode_to_string(Opcode opcode);
+std::string format_tag_key_list_hex(const std::vector<int64_t>& tag_keys);
 
 class Instruction : public std::enable_shared_from_this<Instruction> {
  public:
   Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents, addr_type dram_addr,
-              std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t precision,
-              std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
-              std::vector<int> accum_tag_idx_list);
+              std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t elem_bits,
+              std::vector<int64_t> tag_idx_list, std::vector<int64_t> tag_stride_list,
+              std::vector<int64_t> accum_tag_idx_list);
   Instruction(Opcode opcode);
   void finish_instruction();
   void add_child(std::shared_ptr<Instruction> child);
@@ -32,6 +33,7 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   const Opcode get_opcode() { return opcode; }
   bool is_dma_read() { return opcode == Opcode::MOVIN; }
   bool is_dma_write() { return opcode == Opcode::MOVOUT; }
+  bool is_dma_instruction() const { return opcode == Opcode::MOVIN || opcode == Opcode::MOVOUT; }
   bool is_async_dma() { return _is_async_dma; }
   bool is_indirect_mode() { return _is_indirect_mode; }
   std::string get_indirect_index_path() { return _indirect_index_path; }
@@ -45,11 +47,12 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
     }
   }
   size_t get_tile_numel() { return _tile_numel; }
-  size_t get_precision() { return _precision; }
+  size_t get_elem_bits() const { return _elem_bits; }
   void inc_waiting_request();
   void dec_waiting_request();
   size_t get_waiting_request() { return _nr_waiting_request; }
   std::vector<size_t>& get_tile_size() { return tile_size; }
+  std::vector<int>& get_tile_stride() { return tile_stride; }
   void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; }
   cycle_type get_overlapping_cycle() { return overlapping_cycle; }
   cycle_type get_compute_cycle() { return compute_cycle; }
@@ -68,12 +71,12 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   int get_compute_type() { return _compute_type; }
   void set_numa_id(int numa_id) { _numa_id = numa_id; }
   uint32_t get_numa_id() { return _numa_id; }
-  std::vector<int>& get_tag_idx_list() { return _tag_idx_list; }
-  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
-  std::vector<int>& get_tag_id() { return _tag_key; }
-  void set_addr_name(std::string name, int id) { _addr_name = name; _addr_id = id; }
+  std::vector<int64_t>& get_tag_idx_list() { return _tag_idx_list; }
+  std::vector<int64_t>& get_tag_stride_list() { return _tag_stride_list; }
+  std::vector<int64_t>& get_tag_id() { return _tag_key; }
+  void set_addr_name(std::string name, int64_t id) { _addr_name = name; _addr_id = id; }
   std::string get_addr_name() { return _addr_name; }
-  int get_addr_id() { return _addr_id; }
+  int64_t get_addr_id() { return _addr_id; }
   void set_nr_inner_loop(int nr) { _nr_inner_loop = nr; }
   int get_nr_inner_loop() { return _nr_inner_loop; }
   void set_is_async(bool is_async) { _is_async_dma = is_async; }
@@ -81,6 +84,7 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   bool is_sparse_inst() { return _is_sparse_inst; }
   void set_sparse_state(bool state) { _is_sparse_inst = state; }
   std::set<std::shared_ptr<Instruction>>& get_child_inst() { return child_inst; }
+  uint64_t get_global_inst_id() const { return _global_inst_id; }
 
   cycle_type start_cycle;
   cycle_type finish_cycle;
@@ -89,6 +93,9 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   bool finished=false;
   int subgraph_id;
  private:
+  uint64_t _global_inst_id = 0;
+  static uint64_t _next_global_inst_id;
+
   void *_owner = nullptr;
   std::list<std::shared_ptr<Instruction>>* _owner_ready_queue_ref = nullptr;
   Opcode opcode;
@@ -100,17 +107,17 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   std::vector<int> tile_stride;
   size_t _tile_numel;
   size_t _nr_waiting_request=0;
-  size_t _precision=0;
+  size_t _elem_bits = 0;
   addr_type dram_addr;
   uint32_t _numa_id = 0; // For DMA instruction
   int _compute_type = 0;
-  std::vector<int> _tag_idx_list;
-  std::vector<int> _tag_stride_list;
-  std::vector<int> _tag_key;
-  std::vector<int> _accum_tag_idx_list;
+  std::vector<int64_t> _tag_idx_list;
+  std::vector<int64_t> _tag_stride_list;
+  std::vector<int64_t> _tag_key;
+  std::vector<int64_t> _accum_tag_idx_list;
   std::vector<addr_type> _trace_address;
   std::string _addr_name;
-  int _addr_id;
+  int64_t _addr_id = 0;
   int _nr_inner_loop = 0;
   bool _is_async_dma=false;
   bool _is_indirect_mode=false;
diff --git a/TOGSim/include/SparseCore.h b/TOGSim/include/SparseCore.h
index 02781ab3..a91004ed 100644
--- a/TOGSim/include/SparseCore.h
+++ b/TOGSim/include/SparseCore.h
@@ -59,7 +59,8 @@ class SparseCore : public Core {
   void print_stats() override;
   void print_current_stats() override;
   std::shared_ptr<Tile> pop_finished_tile() override;
-  void finish_instruction(std::shared_ptr<Instruction>& inst) override;
+  void finish_instruction(std::shared_ptr<Instruction>& inst,
+                          InstFinishTraceTag tag = InstFinishTraceTag::Fnshed) override;
   void dumpTrace(int stonne_core_id, const std::string& path);
   bool isTraceMode(int stonne_core_id) { return traceMode.at(stonne_core_id); }
   void setTraceMode(int stonne_core_id, bool mode) { traceMode.at(stonne_core_id) = mode; }
diff --git a/TOGSim/include/TileGraphParser.h b/TOGSim/include/TileGraphParser.h
index f067fb2d..d255a735 100644
--- a/TOGSim/include/TileGraphParser.h
+++ b/TOGSim/include/TileGraphParser.h
@@ -80,9 +80,9 @@ class TileGraphParser {
   int getCoreIdFromConfig(const YAML::Node& attribute_config, int subgraph_id);
   std::string getMetaByName(std::string key) { return _tog_meta[key]; }
   const YAML::Node& get_attribute_file() { return _attribute_config; }
-  std::vector<int> calc_tag(std::vector<int>& accum_tag, std::vector<int>& tag_idx, std::vector<int>& tag_stride);
-  void register_memory_tag(std::string name, std::vector<int>& tag_key);
-  bool check_memory_tag(std::string name, std::vector<int>& tag_key);
+  std::vector<int64_t> calc_tag(std::vector<int64_t>& accum_tag, std::vector<int64_t>& tag_idx, std::vector<int64_t>& tag_stride);
+  void register_memory_tag(std::string name, std::vector<int64_t>& tag_key);
+  bool check_memory_tag(std::string name, std::vector<int64_t>& tag_key);
   void clear_tag_table() { _tag_table.clear(); }
   std::string get_indirect_path() {
     namespace fs = std::filesystem;
@@ -118,12 +118,12 @@ class TileGraphParser {
   uint64_t get_dma_counter() { return dma_counter; }
   void inc_dma_counter() { dma_counter++; }
   bool is_sparse_tile(uint64_t idx) { return sparse_tile_set.find(idx) != sparse_tile_set.end(); }
-  int register_addr_name(const std::string& addr_name) {
+  int64_t register_addr_name(const std::string& addr_name) {
     if (_addr_name_map.find(addr_name) == _addr_name_map.end())
-      _addr_name_map[addr_name] = _addr_name_map.size();
+      _addr_name_map[addr_name] = static_cast<int64_t>(_addr_name_map.size());
     return _addr_name_map[addr_name];
   }
-  int get_addr_name_id(const std::string& addr_name) { return _addr_name_map[addr_name]; }
+  int64_t get_addr_name_id(const std::string& addr_name) { return _addr_name_map[addr_name]; }
 
  private:
   void register_tile(std::shared_ptr<TileNode> tile_node);
@@ -148,8 +148,8 @@ class TileGraphParser {
   std::vector<Interval<unsigned long long, int>> _cache_plan;
   std::map<std::string, std::tuple<int, int, LoopType>> _loop_size_map;
   std::map<std::string, std::string> _tog_meta;
-  std::map<std::pair<std::string, std::vector<int>>, uint32_t> _tag_table;
-  std::unordered_map<std::string, int> _addr_name_map;
+  std::map<std::pair<std::string, std::vector<int64_t>>, uint32_t> _tag_table;
+  std::unordered_map<std::string, int64_t> _addr_name_map;
 };
 
 class TileComputeNode : public TileNode {
@@ -171,11 +171,11 @@ class TileMemoryNode : public TileNode {
  public:
   TileMemoryNode(onnx::NodeProto& node);
   std::string get_base_addr_name() { return _base_addr_name; }
-  size_t get_precision() { return _element_size; }
+  size_t get_elem_bits() const { return _elem_bits; }
   std::vector<size_t> get_tile_size() { return _tile_size; }
   std::vector<int>& get_tile_stride() { return _tile_stride; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
-  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
+  std::vector<int64_t>& get_tag_stride_list() { return _tag_stride_list; }
   std::vector<std::string>& get_loop_idx_list() { return _loop_idx_list; }
   std::vector<int>& get_loop_stride_list () { return _loop_stride_list; }
   bool is_async_node() { return _is_async; }
@@ -185,12 +185,12 @@ class TileMemoryNode : public TileNode {
  private:
   std::vector<size_t> _tile_size;
   std::vector<int> _tile_stride;
-  size_t _element_size;
+  size_t _elem_bits = 0;
   bool _is_async;
   bool _is_indirect;
   std::string _base_addr_name;
   std::vector<std::string> _tag_idx_list;
-  std::vector<int> _tag_stride_list;
+  std::vector<int64_t> _tag_stride_list;
   std::vector<std::string> _loop_idx_list;
   std::vector<int> _loop_stride_list;
 };
@@ -200,14 +200,14 @@ class TileMemoryWaitNode : public TileNode {
   TileMemoryWaitNode(onnx::NodeProto& node);
   std::string get_base_addr_name() { return _base_addr_name; }
   std::vector<std::string>& get_tag_idx_list() { return _tag_idx_list; }
-  std::vector<int>& get_tag_stride_list() { return _tag_stride_list; }
-  std::vector<int>& get_tag_divider_list() { return _tag_divider_list; }
+  std::vector<int64_t>& get_tag_stride_list() { return _tag_stride_list; }
+  std::vector<int64_t>& get_tag_divider_list() { return _tag_divider_list; }
   void print_node() override;
 
  private:
   std::vector<std::string> _tag_idx_list;
-  std::vector<int> _tag_stride_list;
-  std::vector<int> _tag_divider_list;
+  std::vector<int64_t> _tag_stride_list;
+  std::vector<int64_t> _tag_divider_list;
   std::string _base_addr_name;
 };
 
diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h
new file mode 100644
index 00000000..6c158099
--- /dev/null
+++ b/TOGSim/include/TraceLogTags.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <string>
+#include <string_view>
+
+/** Trace bracket tags: max 15 characters; use pad15() so logs show a fixed 15-char field (space-padded). */
+namespace TraceLogTag {
+
+/** Right-pad (or truncate) to exactly 15 characters for aligned log columns. */
+inline std::string pad15(std::string_view sv) {
+  if (sv.size() > 15) {
+    sv = sv.substr(0, 15);
+  }
+  std::string out(sv);
+  out.resize(15, ' ');
+  return out;
+}
+
+inline constexpr const char* kTileScheduled = "TILE_SCHEDULED";
+
+inline constexpr const char* kInstructionIssued = "INST_ISSUED";
+inline constexpr const char* kInstructionFinished = "INST_FINISHED";
+/** Async MOVIN skipped: same tag still in flight. */
+inline constexpr const char* kInstructionSkipped = "INST_SKIP";
+
+inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE";
+inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE";
+
+inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT";
+inline constexpr const char* kDmaNumaPlacement = "DRAM_NUMA";
+
+/** Field label for get_global_inst_id() in trace lines (≤15 chars). */
+inline constexpr const char* kGlobalInstIdKey = "INST_ID";
+}  // namespace TraceLogTag
diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc
index 1f831661..d9be4ca3 100644
--- a/TOGSim/src/Core.cc
+++ b/TOGSim/src/Core.cc
@@ -1,4 +1,7 @@
 #include "Core.h"
+#include "CoreTraceLog.h"
+#include <spdlog/spdlog.h>
+#include <algorithm>
 
 Core::Core(uint32_t id, SimulationConfig config)
     : _id(id),
@@ -6,7 +9,7 @@ Core::Core(uint32_t id, SimulationConfig config)
       _core_cycle(0),
       _stat_dma_cycle(0),
       _num_systolic_array_per_core(config.num_systolic_array_per_core),
-      _dma(id, config.dram_req_size) {
+      _dma(id, config.dram_req_size, config.l2d_type != L2CacheType::NOCACHE) {
   _sa_compute_pipeline.resize(_num_systolic_array_per_core);
   _stat_tot_sa_compute_cycle.resize(_num_systolic_array_per_core);
   _stat_sa_compute_cycle.resize(_num_systolic_array_per_core);
@@ -22,9 +25,9 @@ bool Core::can_issue(const std::shared_ptr<Tile>& op) {
 }
 
 void Core::issue(std::shared_ptr<Tile> op) {
-  if (op->get_instructions().size()){
-    spdlog::trace("[{}][Core {}][TILE_SCHEDULED]",
-      _core_cycle, _id);
+  if (op->get_instructions().size()) {
+    core_trace_log::trace_tile_scheduled(_core_cycle, _id,
+                                         TraceLogTag::pad15(TraceLogTag::kTileScheduled));
   }
   for (const auto& inst : op->get_instructions()) {
     if (inst->is_ready())
@@ -120,13 +123,16 @@ void Core::dma_cycle() {
     if (instruction->is_dma_read() && instruction->is_async_dma()) {
       auto& key = instruction->get_tag_id();
       assert(!_dma.get_tag_finish(instruction->subgraph_id, key));
+      spdlog::trace(
+          "[{}][Core {}] TOG async DMA response (table notify): tag_addr=0x{:016x} global_inst_id={} "
+          "subgraph_id={}",
+          _core_cycle,
+          _id,
+          static_cast<uint64_t>(static_cast<uintptr_t>(instruction->get_addr_id())),
+          instruction->get_global_inst_id(),
+          instruction->subgraph_id);
       _dma.set_tag_finish(instruction->subgraph_id, key);
-      spdlog::trace("[{}][Core {}] {} ASYNC FINISHED, subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}",
-                    _core_cycle, _id, opcode_to_string(instruction->get_opcode()),
-                    instruction->subgraph_id, instruction->get_addr_name(),
-                    fmt::format("[{}]", fmt::join(instruction->get_tag_id(), ", ")),
-                    fmt::format("[{}]", fmt::join(instruction->get_tag_idx_list(), ", ")),
-                    fmt::format("[{}]", fmt::join(instruction->get_tag_stride_list(), ", ")));
+      finish_instruction(instruction, InstFinishTraceTag::DmaRespComplete);
       for (auto & wait_inst : _dma.get_tag_waiter(instruction->subgraph_id, key)) {
         _dma.mark_tag_used(instruction->subgraph_id, key);
         finish_instruction(wait_inst);
@@ -143,18 +149,18 @@ void Core::dma_cycle() {
         /* Only DMA write operation is finished! */
         finish_instruction(finished_inst);
       } else if (finished_inst->is_dma_read() && finished_inst->is_async_dma()) {
-        /* Register tag table for async dma load */
-        _dma.register_tag(finished_inst->subgraph_id, finished_inst->get_tag_id());
-        finish_instruction(finished_inst);
+        /* Register tag table for async dma load; see TraceLogTag::kAsyncDmaAllRequestsIssued */
+        finish_instruction(finished_inst, InstFinishTraceTag::DmaIssueComplete);
       } else if(!finished_inst->is_dma_read()) {
-        spdlog::error("[{}][Core {}] DMA instruction in not valid", _core_cycle, _id);
+        core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id);
         exit(EXIT_FAILURE);
       } else if (finished_inst->get_opcode() == Opcode::BAR) {
-        spdlog::trace("[{}][Core {}] {} FINISHED, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id,
-                      opcode_to_string(finished_inst->get_opcode()), finished_inst->get_addr_name(),
-                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_id(), ", ")),
-                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_idx_list(), ", ")),
-                      fmt::format("[{}]", fmt::join(finished_inst->get_tag_stride_list(), ", ")));
+        core_trace_log::trace_instruction_line(_core_cycle,
+                                               _id,
+                                               TraceLogTag::pad15(TraceLogTag::kInstructionFinished),
+                                               finished_inst->get_global_inst_id(),
+                                               core_trace_log::format_instruction_detail_line(
+                                                   *finished_inst));
       }
       /*Pass to waiting queue */
       _dma_waiting_queue[finished_inst.get()] = std::move(finished_inst);
@@ -223,34 +229,37 @@ void Core::cycle() {
                 finish_instruction(inst);
               else
                 _dma.register_tag_waiter(inst->subgraph_id, key, inst);
-              spdlog::trace("[{}][Core {}][SIKIPPED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id,
-                            opcode_to_string(inst->get_opcode()),
-                            inst->get_addr_name(),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+              core_trace_log::trace_instruction_line(_core_cycle,
+                                                       _id,
+                                                       TraceLogTag::pad15(
+                                                           TraceLogTag::kInstructionSkipped),
+                                                       inst->get_global_inst_id(),
+                                                       core_trace_log::format_dma_inst_issued_trace_line(
+                                                           *inst));
               issued = true;
               _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               break;
             } else {
-              spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id,
-                            opcode_to_string(inst->get_opcode()),
-                            inst->get_addr_name(),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+              core_trace_log::trace_instruction_line(_core_cycle,
+                                                       _id,
+                                                       TraceLogTag::pad15(
+                                                           TraceLogTag::kInstructionIssued),
+                                                       inst->get_global_inst_id(),
+                                                       core_trace_log::format_dma_inst_issued_trace_line(
+                                                           *inst));
+              _dma.register_tag(inst->subgraph_id, inst->get_tag_id());
               _ld_inst_queue.push(inst);
               issued = true;
               break;
             }
           }
         case Opcode::MOVOUT:
-          spdlog::trace("[{}][Core {}][INST_ISSUED] {}, addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id,
-                        opcode_to_string(inst->get_opcode()),
-                        inst->get_addr_name(),
-                        fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
-                        fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
-                        fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+          core_trace_log::trace_instruction_line(_core_cycle,
+                                                   _id,
+                                                   TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
+                                                   inst->get_global_inst_id(),
+                                                   core_trace_log::format_dma_inst_issued_trace_line(
+                                                       *inst));
           _st_inst_queue.push(inst);
           issued = true;
           break;
@@ -273,8 +282,13 @@ void Core::cycle() {
               _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               instructions.erase(it);
             } else {
-              spdlog::trace("[{}][Core {}][INST_ISSUED][SA {}] {}-{}, finsh at {}", _core_cycle, _id, _systolic_array_rr,
-                            opcode_to_string(inst->get_opcode()), inst->get_compute_type(), inst->finish_cycle);
+              core_trace_log::trace_instruction_line(_core_cycle,
+                                                       _id,
+                                                       TraceLogTag::pad15(
+                                                           TraceLogTag::kInstructionIssued),
+                                                       inst->get_global_inst_id(),
+                                                       core_trace_log::format_instruction_detail_line(
+                                                           *inst));
               target_pipeline.push(inst);
               issued = true;
               if (inst->get_compute_type()) {
@@ -300,16 +314,18 @@ void Core::cycle() {
             } else {
               _dma.register_tag_waiter(inst->subgraph_id, key, inst);
             }
-            spdlog::trace("[{}][Core {}][INST_ISSUED] {},  addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}", _core_cycle, _id,
-                            opcode_to_string(inst->get_opcode()), inst->get_addr_name(),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_id(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
-                            fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
+            core_trace_log::trace_instruction_line(_core_cycle,
+                                                     _id,
+                                                     TraceLogTag::pad15(
+                                                         TraceLogTag::kInstructionIssued),
+                                                     inst->get_global_inst_id(),
+                                                     core_trace_log::format_instruction_detail_line(
+                                                         *inst));
             issued = true;
           }
           break;
         default:
-          spdlog::error("Undefined instruction opcode type");
+          core_trace_log::log_error_undefined_opcode();
           exit(EXIT_FAILURE);
       }
 
@@ -341,27 +357,34 @@ void Core::cycle() {
   }
 }
 
-void Core::finish_instruction(std::shared_ptr<Instruction>& inst) {
+void Core::finish_instruction(std::shared_ptr<Instruction>& inst, InstFinishTraceTag tag) {
+  if (tag == InstFinishTraceTag::DmaRespComplete) {
+    if (!inst->finished) {
+      core_trace_log::log_error_dram_responses_trace_not_finished(_core_cycle, _id);
+      exit(EXIT_FAILURE);
+    }
+    core_trace_log::trace_instruction_line(_core_cycle,
+                                             _id,
+                                             TraceLogTag::pad15(TraceLogTag::kAllDramResponsesReceived),
+                                             inst->get_global_inst_id(),
+                                             core_trace_log::format_instruction_detail_line(*inst));
+    return;
+  }
   if (inst->finished) {
-    spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", _core_cycle, _id,
-                  opcode_to_string(inst->get_opcode()));
+    core_trace_log::log_error_instruction_already_finished(_core_cycle, _id,
+                                                           opcode_to_string(inst->get_opcode()));
     exit(EXIT_FAILURE);
   }
   inst->finish_instruction();
   static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
-  if (inst->get_opcode() == Opcode::COMP) {
-    spdlog::trace("[{}][Core {}][INST_FINISHED] {}-{}",
-      _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->get_compute_type());
-  } else if (inst->get_opcode() != Opcode::BAR && inst->is_async_dma()){
-    spdlog::trace("[{}][Core {}][ASYNC] {} subgraph_id: {} addr_name: {} tag_id: {} tag_idx_list: {} tag_stride_list: {}",
-      _core_cycle, _id, opcode_to_string(inst->get_opcode()), inst->subgraph_id, inst->get_addr_name(),
-      inst->get_tag_id(),
-      fmt::format("[{}]", fmt::join(inst->get_tag_idx_list(), ", ")),
-      fmt::format("[{}]", fmt::join(inst->get_tag_stride_list(), ", ")));
-  } else if ((inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) && !inst->is_async_dma()) {
-    spdlog::trace("[{}][Core {}][INST_FINISHED] {} addr_name: {}", _core_cycle, _id,
-      opcode_to_string(inst->get_opcode()), inst->get_addr_name());
-  }
+  const char* trace_tag = (tag == InstFinishTraceTag::DmaIssueComplete)
+                              ? TraceLogTag::kAsyncDmaAllRequestsIssued
+                              : TraceLogTag::kInstructionFinished;
+  core_trace_log::trace_instruction_line(_core_cycle,
+                                           _id,
+                                           TraceLogTag::pad15(trace_tag),
+                                           inst->get_global_inst_id(),
+                                           core_trace_log::format_instruction_detail_line(*inst));
 }
 
 bool Core::running() {
diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc
new file mode 100644
index 00000000..ebc31de0
--- /dev/null
+++ b/TOGSim/src/CoreTraceLog.cc
@@ -0,0 +1,122 @@
+#include "CoreTraceLog.h"
+
+#include <algorithm>
+
+#include <fmt/format.h>
+#include <fmt/ranges.h>
+#include <spdlog/spdlog.h>
+
+namespace core_trace_log {
+
+std::string format_dma_inst_issued_detail(Instruction& inst) {
+  const auto& ts = inst.get_tile_size();
+  const int rank = static_cast<int>(std::max<size_t>(1, ts.size()));
+  if (inst.get_opcode() == Opcode::MOVIN) {
+    return fmt::format(
+        "addr_name={} dram=0x{:016x} rank={} size=[{}] stride=[{}] elem_bits={} async={} indirect={} tag_id=[{}]",
+        inst.get_addr_name(),
+        static_cast<uint64_t>(inst.get_base_dram_address()),
+        rank,
+        fmt::join(ts, ","),
+        fmt::join(inst.get_tile_stride(), ","),
+        inst.get_elem_bits(),
+        inst.is_async_dma(),
+        inst.is_indirect_mode(),
+        format_tag_key_list_hex(inst.get_tag_id()));
+  }
+  uint64_t tag_hex = 0;
+  const auto& tidx = inst.get_tag_idx_list();
+  if (!tidx.empty()) {
+    tag_hex = static_cast<uint64_t>(tidx[0]);
+  }
+  return fmt::format(
+      "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] "
+      "tag_idx=[{}]",
+      inst.get_addr_name(),
+      static_cast<uint64_t>(inst.get_base_dram_address()),
+      rank,
+      inst.get_elem_bits(),
+      inst.is_async_dma(),
+      inst.is_indirect_mode(),
+      tag_hex,
+      fmt::join(inst.get_tile_stride(), ","),
+      fmt::join(ts, ","),
+      fmt::join(tidx, ","));
+}
+
+std::string format_dma_inst_issued_trace_line(Instruction& inst) {
+  return fmt::format("{} ({})", opcode_to_string(inst.get_opcode()), format_dma_inst_issued_detail(inst));
+}
+
+std::string format_instruction_detail_line(Instruction& inst) {
+  const Opcode op = inst.get_opcode();
+  const std::string opname = opcode_to_string(op);
+  if (op == Opcode::COMP) {
+    return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})",
+                       opname,
+                       inst.get_compute_type(),
+                       inst.get_compute_cycle(),
+                       inst.get_overlapping_cycle());
+  }
+  if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) {
+    return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+                       opname,
+                       inst.subgraph_id,
+                       inst.get_addr_name(),
+                       format_tag_key_list_hex(inst.get_tag_id()),
+                       fmt::join(inst.get_tag_idx_list(), ","),
+                       fmt::join(inst.get_tag_stride_list(), ","));
+  }
+  if (op == Opcode::MOVIN || op == Opcode::MOVOUT) {
+    return fmt::format("{} (addr_name={})", opname, inst.get_addr_name());
+  }
+  if (op == Opcode::BAR) {
+    return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+                       opname,
+                       inst.get_addr_name(),
+                       format_tag_key_list_hex(inst.get_tag_id()),
+                       fmt::join(inst.get_tag_idx_list(), ","),
+                       fmt::join(inst.get_tag_stride_list(), ","));
+  }
+  return opname;
+}
+
+void trace_tile_scheduled(cycle_type core_cycle, uint32_t core_id, const std::string& tag15) {
+  spdlog::trace("[{}][Core {}][{}]", core_cycle, core_id, tag15);
+}
+
+void trace_instruction_line(cycle_type core_cycle,
+                            uint32_t core_id,
+                            const std::string& tag15,
+                            uint64_t global_inst_id,
+                            const std::string& message) {
+  spdlog::trace("[{}][Core {}][{}][{}={}] {}",
+                 core_cycle,
+                 core_id,
+                 tag15,
+                 TraceLogTag::kGlobalInstIdKey,
+                 global_inst_id,
+                 message);
+}
+
+void log_error_dma_instruction_invalid(cycle_type core_cycle, uint32_t core_id) {
+  spdlog::error("[{}][Core {}] DMA instruction in not valid", core_cycle, core_id);
+}
+
+void log_error_dram_responses_trace_not_finished(cycle_type core_cycle, uint32_t core_id) {
+  spdlog::error("[{}][Core {}][ERROR] ALL_DRAM_RESPONSES_RECEIVED trace but inst not finished yet",
+                core_cycle,
+                core_id);
+}
+
+void log_error_instruction_already_finished(cycle_type core_cycle,
+                                            uint32_t core_id,
+                                            const std::string& opcode_name) {
+  spdlog::error("[{}][Core {}][ERROR] {} inst already finished!!", core_cycle, core_id, opcode_name);
+}
+
+void log_error_undefined_opcode() {
+  spdlog::error("Undefined instruction opcode type");
+}
+
+}  // namespace core_trace_log
diff --git a/TOGSim/src/DMA.cc b/TOGSim/src/DMA.cc
index fefee6d2..5d509953 100644
--- a/TOGSim/src/DMA.cc
+++ b/TOGSim/src/DMA.cc
@@ -1,9 +1,11 @@
 #include "DMA.h"
 #include "TileGraph.h"
+#include "TraceLogTags.h"
 
-DMA::DMA(uint32_t id, uint32_t dram_req_size) {
+DMA::DMA(uint32_t id, uint32_t dram_req_size, bool l2_datacache_enabled) {
   _id = id;
   _dram_req_size = dram_req_size;
+  _l2_datacache_enabled = l2_datacache_enabled;
   _current_inst = nullptr;
   _finished = true;
 }
@@ -31,12 +33,27 @@ std::shared_ptr<std::vector<mem_fetch*>> DMA::get_memory_access(cycle_type core_
     bool is_cacheable =
       owner_subgraph->is_cacheable(base_daddr, base_daddr + _dram_req_size);
 
-    spdlog::trace("[{}][Core {}][SRAM] Address: 0x{:016x}, Is_cacheable: {}",
-                    core_cycle, _id, base_daddr, is_cacheable);
-    spdlog::trace("[{}][Core {}][NUMA] Subgraph id: {} , Numa id: {}, Arg: {} is_write: {}",
-                    core_cycle, _id, owner_subgraph->get_core_id(),
-                    _current_inst->get_numa_id(), _current_inst->get_addr_name(),
-                    _current_inst->is_dma_write());
+    if (_l2_datacache_enabled) {
+      spdlog::trace(
+          "[{}][Core {}][{}][INST_ID={}] dram=0x{:016x} cacheable={}",
+          core_cycle,
+          _id,
+          TraceLogTag::pad15(TraceLogTag::kL2CacheableStatusForAddress),
+          _current_inst->get_global_inst_id(),
+          base_daddr,
+          is_cacheable);
+    }
+    spdlog::trace(
+        "[{}][Core {}][{}][INST_ID={}] core_id={} subgraph_id={} numa_id={} addr_name={} is_write={}",
+        core_cycle,
+        _id,
+        TraceLogTag::pad15(TraceLogTag::kDmaNumaPlacement),
+        _current_inst->get_global_inst_id(),
+        owner_subgraph->get_core_id(),
+        _current_inst->subgraph_id,
+        _current_inst->get_numa_id(),
+        _current_inst->get_addr_name(),
+        _current_inst->is_dma_write());
     for (const auto& addr : *addr_set) {
       mem_access_type acc_type =
         _current_inst->is_dma_write() ? mem_access_type::GLOBAL_ACC_W
diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc
index aef9079c..1dc3ff42 100644
--- a/TOGSim/src/Instruction.cc
+++ b/TOGSim/src/Instruction.cc
@@ -1,5 +1,23 @@
 #include "Instruction.h"
 
+#include <fmt/format.h>
+
+uint64_t Instruction::_next_global_inst_id = 0;
+
+std::string format_tag_key_list_hex(const std::vector<int64_t>& tag_keys) {
+  if (tag_keys.empty()) {
+    return {};
+  }
+  std::string out;
+  for (size_t i = 0; i < tag_keys.size(); ++i) {
+    if (i > 0) {
+      out.push_back(',');
+    }
+    out += fmt::format("0x{:016x}", static_cast<uint64_t>(tag_keys[i]));
+  }
+  return out;
+}
+
 std::string opcode_to_string(Opcode opcode) {
     switch (opcode) {
         case Opcode::MOVIN:        return "MOVIN";
@@ -11,13 +29,14 @@ std::string opcode_to_string(Opcode opcode) {
 }
 
 Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_parents,
-            addr_type dram_addr, std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t precision,
-            std::vector<int> tag_idx_list, std::vector<int> tag_stride_list,
-            std::vector<int> accum_tag_idx_list)
+            addr_type dram_addr, std::vector<size_t> tile_size, std::vector<int> tile_stride, size_t elem_bits,
+            std::vector<int64_t> tag_idx_list, std::vector<int64_t> tag_stride_list,
+            std::vector<int64_t> accum_tag_idx_list)
   : opcode(opcode), compute_cycle(compute_cycle), ready_counter(num_parents), dram_addr(dram_addr),
-    tile_size(tile_size), tile_stride(tile_stride), _precision(precision),
+    tile_size(tile_size), tile_stride(tile_stride), _elem_bits(elem_bits),
     _tag_idx_list(tag_idx_list), _tag_stride_list(tag_stride_list),
     _accum_tag_idx_list(accum_tag_idx_list) {
+  _global_inst_id = _next_global_inst_id++;
   assert(_tag_idx_list.size()==_tag_stride_list.size());
   _tile_numel = 1;
   for (auto dim : tile_size)
@@ -26,6 +45,7 @@ Instruction::Instruction(Opcode opcode, cycle_type compute_cycle, size_t num_par
 
 Instruction::Instruction(Opcode opcode)
   : opcode(opcode) {
+  _global_inst_id = _next_global_inst_id++;
   _tile_numel = 1;
 }
 
@@ -51,9 +71,9 @@ void Instruction::dec_waiting_request() {
 
 void Instruction::prepare_tag_key() {
   /* Calculate tag key */
-  int key_offset = 0;
+  int64_t key_offset = 0;
   _tag_key.push_back(_addr_id);
-  for (int i=0; i<_tag_idx_list.size(); i++)
+  for (size_t i = 0; i < _tag_idx_list.size(); i++)
     key_offset += _tag_idx_list.at(i) * _tag_stride_list.at(i);
   for (auto accum_dim : _accum_tag_idx_list)
     _tag_key.push_back(accum_dim);
@@ -88,10 +108,10 @@ std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dra
                               dim1*tile_stride.at(tile_stride.size() - 3) + \
                               dim2*tile_stride.at(tile_stride.size() - 2) + \
                               dim3*tile_stride.at(tile_stride.size() - 1);
-          address = dram_addr + address * _precision;
+          address = dram_addr + (address * _elem_bits + 7) / 8;
           if (indirect_index != NULL) {
             uint64_t index_val = indirect_index[index_count++];
-            address += index_val * _precision;
+            address += (index_val * _elem_bits + 7) / 8;
           }
           address_set->insert(address - (address & dram_req_size-1));
         }
diff --git a/TOGSim/src/SparseCore.cc b/TOGSim/src/SparseCore.cc
index d5629b9c..1bf1163a 100644
--- a/TOGSim/src/SparseCore.cc
+++ b/TOGSim/src/SparseCore.cc
@@ -1,4 +1,5 @@
 #include "SparseCore.h"
+#include "TraceLogTags.h"
 
 SparseCore::SparseCore(uint32_t id, SimulationConfig config) : Core(id, config) {
   /* Init stonne cores*/
@@ -239,7 +240,11 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
         {
           auto acc_type = mem_access_type::GLOBAL_ACC_R;
           auto type = mf_type::READ_REQUEST;
-          spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id,
+          spdlog::trace("[{}][StonneCore {}/{}][{}] {}",
+                        _core_cycle,
+                        _id,
+                        subcore_id,
+                        TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
             addr = addr - (addr & _config.dram_req_size-1);
@@ -260,7 +265,11 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
         {
           auto acc_type = mem_access_type::GLOBAL_ACC_W;
           auto type = mf_type::WRITE_REQUEST;
-          spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}", _core_cycle, _id, subcore_id,
+          spdlog::trace("[{}][StonneCore {}/{}][{}] {}",
+                        _core_cycle,
+                        _id,
+                        subcore_id,
+                        TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
                         opcode_to_string(inst->get_opcode()));
           for (auto addr : inst->get_trace_address()) {
             addr = addr - (addr & _config.dram_req_size-1);
@@ -285,8 +294,13 @@ void SparseCore::subCoreCycle(uint32_t subcore_id) {
             inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
           else
             inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle();
-          spdlog::trace("[{}][StonneCore {}/{}][INST_ISSUED] {}, finsh at {}", _core_cycle, _id, subcore_id,
-                          opcode_to_string(inst->get_opcode()), inst->finish_cycle);
+          spdlog::trace("[{}][StonneCore {}/{}][{}] {}, finish_at={}",
+                          _core_cycle,
+                          _id,
+                          subcore_id,
+                          TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
+                          opcode_to_string(inst->get_opcode()),
+                          inst->finish_cycle);
           target_pipeline.push(inst);
           issued = true;
         }
@@ -397,7 +411,22 @@ std::shared_ptr<Tile> SparseCore::pop_finished_tile() {
   return result;
 }
 
-void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst) {
+void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst, InstFinishTraceTag tag) {
+  if (tag == InstFinishTraceTag::DmaRespComplete) {
+    if (!inst->finished) {
+      spdlog::error("[{}][StonneCore {}][Error] ALL_DRAM_RESPONSES_RECEIVED trace but inst not finished",
+                    _core_cycle,
+                    _id);
+      exit(EXIT_FAILURE);
+    }
+    spdlog::trace("[{}][StonneCore {}][{}][INST_ID={}] {}",
+                    _core_cycle,
+                    _id,
+                    TraceLogTag::pad15(TraceLogTag::kAllDramResponsesReceived),
+                    inst->get_global_inst_id(),
+                    opcode_to_string(inst->get_opcode()));
+    return;
+  }
   if (inst->finished) {
     spdlog::error("[{}][StonneCore {}][Error] {} inst already finished!!", _core_cycle, _id,
                   opcode_to_string(inst->get_opcode()));
@@ -405,12 +434,16 @@ void SparseCore::finish_instruction(std::shared_ptr<Instruction>& inst) {
   }
   inst->finish_instruction();
   static_cast<Tile*>(inst->get_owner())->inc_finished_inst();
+  const char* trace_tag = (tag == InstFinishTraceTag::DmaIssueComplete)
+                              ? TraceLogTag::kAsyncDmaAllRequestsIssued
+                              : TraceLogTag::kInstructionFinished;
+  const std::string tag15 = TraceLogTag::pad15(trace_tag);
   if (inst->get_opcode() == Opcode::COMP) {
-    spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}",
-      _core_cycle, _id, opcode_to_string(inst->get_opcode()));
+    spdlog::info("[{}][StonneCore {}][{}] {}", _core_cycle, _id, tag15,
+                 opcode_to_string(inst->get_opcode()));
   } else if (inst->get_opcode() == Opcode::MOVIN || inst->get_opcode() == Opcode::MOVOUT) {
-    spdlog::info("[{}][StonneCore {}][INST_FINISHED] {}", _core_cycle, _id,
-      opcode_to_string(inst->get_opcode()));
+    spdlog::info("[{}][StonneCore {}][{}] {}", _core_cycle, _id, tag15,
+                 opcode_to_string(inst->get_opcode()));
   }
 }
 
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 882aba6b..5060d336 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -192,7 +192,7 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
     if (attribute.name() == "torchsim_base_addr") {
       _base_addr_name = attribute.s();
     } else if (attribute.name() == "torchsim_element_size") {
-      _element_size = attribute.i();
+      _elem_bits = static_cast<size_t>(attribute.i());
     } else if (attribute.name() == "torchsim_tile_size") {
       for (int i = 0; i < attribute.ints_size(); i++)
         _tile_size.push_back(attribute.ints(i));
@@ -204,7 +204,7 @@ TileMemoryNode::TileMemoryNode(onnx::NodeProto& node) : TileNode(node) {
         _tag_idx_list.push_back(attribute.strings(i));
     } else if (attribute.name() == "torchsim_tag_stride_list") {
       for (int i = 0; i < attribute.ints_size(); i++)
-        _tag_stride_list.push_back(attribute.ints(i));
+        _tag_stride_list.push_back(static_cast<int64_t>(attribute.ints(i)));
     } else if (attribute.name() == "torchsim_loop_idx_list") {
       for (int i = 0; i < attribute.strings_size(); i++)
         _loop_idx_list.push_back(attribute.strings(i));
@@ -226,7 +226,7 @@ void TileMemoryNode::print_node() {
   TileNode::print_node();
   std::string spaces(get_depth(), '\t');
   spdlog::debug("{} base_addr_name: {}", spaces, _base_addr_name);
-  spdlog::debug("{} element_size: {}", spaces, _element_size);
+  spdlog::debug("{} elem_bits: {}", spaces, _elem_bits);
   spdlog::debug("{} loop_stride_list: {} ", spaces, _loop_stride_list);
   spdlog::debug("{} tile_size: {} ", spaces, _tile_size);
   spdlog::debug("{} tile_stride: {} ", spaces, _tile_stride);
@@ -243,10 +243,10 @@ TileMemoryWaitNode::TileMemoryWaitNode(onnx::NodeProto& node) : TileNode(node) {
         _tag_idx_list.push_back(attribute.strings(i));
     } else if (attribute.name() == "torchsim_tag_stride_list") {
       for (int i = 0; i < attribute.ints_size(); i++)
-        _tag_stride_list.push_back(attribute.ints(i));
+        _tag_stride_list.push_back(static_cast<int64_t>(attribute.ints(i)));
     } else if (attribute.name() == "torchsim_tag_divider_list") {
       for (int i = 0; i < attribute.ints_size(); i++)
-        _tag_divider_list.push_back(attribute.ints(i));
+        _tag_divider_list.push_back(static_cast<int64_t>(attribute.ints(i)));
     } else if (attribute.name() == "torchsim_base_addr") {
       _base_addr_name = attribute.s();
     }
@@ -352,12 +352,12 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
 
       /* Base address setting */
       std::string base_addr_name = mem_node->get_base_addr_name();
-      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
+      int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name);
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0);
 
-      std::vector<int> tag_list;
-      std::vector<int> accum_tag_list;
+      std::vector<int64_t> tag_list;
+      std::vector<int64_t> accum_tag_list;
       std::vector<uint32_t> outer_loop_idx;
       std::vector<uint32_t> outer_loop_size;
       /* Add accumulation loop info to accum_tag list */
@@ -406,8 +406,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       }
 
       /* Check need to make this memory node */
-      std::vector<int>& tag_stride_list = mem_node->get_tag_stride_list();
-      std::vector<int> key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list);
+      std::vector<int64_t>& tag_stride_list = mem_node->get_tag_stride_list();
+      std::vector<int64_t> key = tog_parser->calc_tag(accum_tag_list, tag_list, tag_stride_list);
       if (tog_parser->check_memory_tag(base_addr_name, key))
         continue;
       tog_parser->register_memory_tag(base_addr_name, key);
@@ -422,7 +422,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::MOVIN, 0,
         0, base_addr+offset,
-        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(),
+        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_elem_bits(),
         tag_list, tag_stride_list, accum_tag_list
       );
       inst->set_addr_name(base_addr_name, base_addr_id);
@@ -465,7 +465,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
 
       /* Lookup given name's address */
       std::string base_addr_name = mem_node->get_base_addr_name();
-      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
+      int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name);
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       addr_type offset = std::inner_product(iter_list.begin(), iter_list.end(), mem_node->get_loop_stride_list().begin(), 0);
 
@@ -482,8 +482,8 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::MOVOUT, 0,
         0, base_addr+offset,
-        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_precision(),
-        std::vector<int>(1), mem_node->get_tag_stride_list(), std::vector<int>()
+        mem_node->get_tile_size(), mem_node->get_tile_stride(), mem_node->get_elem_bits(),
+        std::vector<int64_t>(1, 0), mem_node->get_tag_stride_list(), std::vector<int64_t>()
       );
       inst->set_addr_name(base_addr_name, base_addr_id);
       inst->prepare_tag_key();
@@ -500,15 +500,15 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
       printIndexMap("[TOGParser] DMA Wait Node ", iter);
       std::shared_ptr<TileMemoryWaitNode> wait_node = std::static_pointer_cast<TileMemoryWaitNode>(tile_node);
       auto base_addr_name = wait_node->get_base_addr_name();
-      int base_addr_id = tog_parser->register_addr_name(base_addr_name);
+      int64_t base_addr_id = tog_parser->register_addr_name(base_addr_name);
       addr_type base_addr = tog_parser->lookup(base_addr_name);
       /* Lookup given name's address */
       std::vector<int> iter_list;
-      std::vector<int> tag_list;
-      std::vector<int>& tag_stride_list = wait_node->get_tag_stride_list();
-      std::vector<int>& tag_divider_list = wait_node->get_tag_divider_list();
-      std::vector<int> new_tag_stride_list;
-      std::vector<int> accum_tag_list;
+      std::vector<int64_t> tag_list;
+      std::vector<int64_t>& tag_stride_list = wait_node->get_tag_stride_list();
+      std::vector<int64_t>& tag_divider_list = wait_node->get_tag_divider_list();
+      std::vector<int64_t> new_tag_stride_list;
+      std::vector<int64_t> accum_tag_list;
       auto& wait_tag_list = wait_node->get_tag_idx_list();
 
       for (int i=0; i<wait_tag_list.size();i++) {
@@ -555,9 +555,9 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
     } else if (tile_node->get_type() == TileType::COMPUTE_NODE) {
       printIndexMap("[TOGParser] Compute Node ", iter);
       std::shared_ptr<TileComputeNode> compute_node = std::static_pointer_cast<TileComputeNode>(tile_node);
-      std::vector<int> tag_list = {0};
-      std::vector<int> tag_stride_list = {1};
-      std::vector<int> accum_tag_list;
+      std::vector<int64_t> tag_list = {0};
+      std::vector<int64_t> tag_stride_list = {1};
+      std::vector<int64_t> accum_tag_list;
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
         Opcode::COMP, compute_node->get_cycle(),
         0, 0,
@@ -587,9 +587,6 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
             inst->add_child(child_inst);
           }
         }
-        /* Add instruction to tile */
-        if (inst->get_opcode() == Opcode::MOVIN)
-          tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision());
       }
       link_map.clear();
       /* iterate nested loop */
@@ -668,9 +665,6 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
         inst->add_child(child_inst);
       }
     }
-    /* Add instruction to tile */
-    if (inst->get_opcode() == Opcode::MOVIN)
-      tile_vec.back()->inc_required_sram_size(inst->get_tile_numel() * inst->get_precision());
   }
 
   return tile_vec;
@@ -691,13 +685,13 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   _attribute_path = attribute_path;
 
   if (!std::filesystem::exists(onnx_path)) {
-    throw std::runtime_error("Error: ONNX file not found at path: " + onnx_path);
+    throw std::runtime_error("Error: TOG graph path not found: " + onnx_path);
   }
   /* Note: this parsing algorithm assume that all node are sorted in topological-order */
   std::ifstream model_istream(onnx_path);
   google::protobuf::io::IstreamInputStream zero_copy_input(&model_istream);
   onnx::ModelProto model_proto;
-
+ 
   /* Attribute parsing */
   if (_attribute_config["address_info"]) {
     const auto& address_info = _attribute_config["address_info"];
@@ -744,7 +738,7 @@ TileGraphParser::TileGraphParser(std::string onnx_path, std::string attribute_pa
   }
   load_sparse_meta_data();
 
-  /* ONNX file parsing */
+  /* TOG file parsing */
   _tog_path = onnx_path;
   model_proto.ParseFromZeroCopyStream(&zero_copy_input) && model_istream.eof();
 
@@ -904,10 +898,10 @@ void TileGraphParser::register_tile(std::shared_ptr<TileNode> tile_node) {
   }
 }
 
-std::vector<int> TileGraphParser::calc_tag(std::vector<int>& accum_tag, std::vector<int>& tag_idx, std::vector<int>& tag_stride) {
-  int key_offset = 0;
-  std::vector<int> tag_key;
-  for (int i=0; i<tag_idx.size(); i++)
+std::vector<int64_t> TileGraphParser::calc_tag(std::vector<int64_t>& accum_tag, std::vector<int64_t>& tag_idx, std::vector<int64_t>& tag_stride) {
+  int64_t key_offset = 0;
+  std::vector<int64_t> tag_key;
+  for (size_t i = 0; i < tag_idx.size(); i++)
     key_offset += tag_idx.at(i) * tag_stride.at(i);
   for (auto accum_dim : accum_tag)
     tag_key.push_back(accum_dim);
@@ -915,12 +909,12 @@ std::vector<int> TileGraphParser::calc_tag(std::vector<int>& accum_tag, std::vec
   return tag_key;
 }
 
-void TileGraphParser::register_memory_tag(std::string name, std::vector<int>& tag_key) {
+void TileGraphParser::register_memory_tag(std::string name, std::vector<int64_t>& tag_key) {
   assert(_tag_table.find(std::make_pair(name, tag_key))==_tag_table.end());
   _tag_table[std::make_pair(name, tag_key)] = true;
 }
 
-bool TileGraphParser::check_memory_tag(std::string name, std::vector<int>& tag_key) {
+bool TileGraphParser::check_memory_tag(std::string name, std::vector<int64_t>& tag_key) {
   return _tag_table.find(std::make_pair(name, tag_key))==_tag_table.end() ? false : true;
 }
 

From 352309a0266d7fc10fa402794314cf6abfa27769 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 10 Apr 2026 19:05:25 +0900
Subject: [PATCH 163/194] [TOGSim] Update DRAM stat printing

---
 TOGSim/src/Dram.cc | 97 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc
index 656e57f8..dcaf94bc 100644
--- a/TOGSim/src/Dram.cc
+++ b/TOGSim/src/Dram.cc
@@ -1,5 +1,32 @@
 #include "Dram.h"
 
+#include <iostream>
+
+namespace {
+
+/** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. */
+struct DramBwSnapshot {
+  double bandwidth_gbs = 0;
+  double util_avg_ch_pct = 0;
+};
+
+DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t window_cycles,
+                                     uint32_t n_ch, uint32_t req_size, uint32_t n_bl,
+                                     double dram_freq_mhz) {
+  DramBwSnapshot out;
+  if (window_cycles == 0 || n_ch == 0)
+    return out;
+  const double tx = static_cast<double>(total_rw_transactions);
+  const double w = static_cast<double>(window_cycles);
+  const double bytes_per_cycle = tx * static_cast<double>(req_size) / w;
+  out.bandwidth_gbs = bytes_per_cycle * dram_freq_mhz / 1000.0;
+  const double avg_per_ch = tx / static_cast<double>(n_ch);
+  out.util_avg_ch_pct = avg_per_ch * 100.0 * static_cast<double>(n_bl) / (2.0 * w);
+  return out;
+}
+
+}  // namespace
+
 uint32_t Dram::get_channel_id(mem_fetch* access) {
   uint32_t channel_id;
   if (_n_ch_per_partition >= 16)
@@ -87,6 +114,39 @@ void DramRamulator2::cycle() {
         _mem[ch]->return_queue_pop();
     }
   }
+
+  if (_n_ch == 0)
+    return;
+  const int iv = _config.dram_print_interval;
+  if (iv <= 0)
+    return;
+  const uint64_t cc = *_core_cycles;
+  if (cc % static_cast<uint64_t>(iv) != 0 || cc == 0)
+    return;
+
+  const double f_mhz = static_cast<double>(_config.dram_freq_mhz);
+  const uint64_t w = static_cast<uint64_t>(iv);
+  long long r_all = 0;
+  long long w_all = 0;
+  for (int ch = 0; ch < _n_ch; ch++) {
+    const long long r = _mem[ch]->interval_reads();
+    const long long wtxn = _mem[ch]->interval_writes();
+    r_all += r;
+    w_all += wtxn;
+    const DramBwSnapshot bw =
+        make_dram_bw_snapshot(r + wtxn, w, 1u, _req_size, _n_bl, f_mhz);
+    spdlog::trace(
+        "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)",
+        ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w);
+  }
+  const DramBwSnapshot bw_all =
+      make_dram_bw_snapshot(r_all + w_all, w, _n_ch, _req_size, _n_bl, f_mhz);
+  spdlog::info(
+      "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)",
+      _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w);
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem[ch]->reset_interval_bw_counters();
+  }
 }
 
 void DramRamulator2::cache_cycle()  {
@@ -120,9 +180,44 @@ void DramRamulator2::pop(uint32_t cid) {
 }
 
 void DramRamulator2::print_stat() {
+  spdlog::info("========= DRAM stat =========");
+  if (_n_ch == 0)
+    return;
+
+  for (int ch = 0; ch < _n_ch; ch++) {
+    _mem[ch]->finalize_once();
+  }
+
+  spdlog::trace("=== Ramulator2 stats (channels 0.. {}) ===", _n_ch - 1);
+  for (int ch = 0; ch < _n_ch; ch++) {
+    std::cout << "--- channel " << ch << " ---\n";
+    _mem[ch]->print_stats_yaml(std::cout);
+  }
+  std::cout.flush();
+
+  const uint64_t cycles = *_core_cycles;
+  if (cycles == 0)
+    return;
+  const double f_mhz = static_cast<double>(_config.dram_freq_mhz);
+  spdlog::info("[DRAM] per-channel avg BW ({} sim cycles):", cycles);
+  long long tr_all = 0;
+  long long tw_all = 0;
   for (int ch = 0; ch < _n_ch; ch++) {
-    _mem[ch]->print(stdout);
+    const long long tr = _mem[ch]->total_reads();
+    const long long tw = _mem[ch]->total_writes();
+    tr_all += tr;
+    tw_all += tw;
+    const DramBwSnapshot bw =
+        make_dram_bw_snapshot(tr + tw, cycles, 1u, _req_size, _n_bl, f_mhz);
+    spdlog::info(
+        "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes",
+        ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw);
   }
+  const DramBwSnapshot bw_all = make_dram_bw_snapshot(
+      tr_all + tw_all, cycles, _n_ch, _req_size, _n_bl, f_mhz);
+  spdlog::info(
+      "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes",
+      _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all);
 }
 
 void DramRamulator2::print_cache_stats() {

From d059a1930776756f43ae73d4f60d7ff63a2d70af Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 13 Apr 2026 16:05:44 +0900
Subject: [PATCH 164/194] [TOGSim] Fix conversion of global address to channel
 address

---
 TOGSim/include/Dram.h     |  5 +--
 TOGSim/src/Dram.cc        | 64 +++++++++++++++++++++++++++++++++------
 TOGSim/src/Instruction.cc |  4 +--
 3 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/TOGSim/include/Dram.h b/TOGSim/include/Dram.h
index d28ac25f..978bcdf9 100644
--- a/TOGSim/include/Dram.h
+++ b/TOGSim/include/Dram.h
@@ -29,6 +29,8 @@ class Dram {
   virtual void print_stat() {}
   virtual void print_cache_stats() {};
   uint32_t get_channels_per_partition() { return _n_ch_per_partition; }
+  new_addr_type partition_dram_address(new_addr_type raw_addr) const;
+
  protected:
   SimulationConfig _config;
   CacheConfig _m_cache_config;
@@ -37,6 +39,7 @@ class Dram {
   uint32_t _n_partitions;
   uint32_t _n_ch_per_partition;
   uint32_t _req_size;
+  int _tx_log2 = 0;
   cycle_type _cycles;
   cycle_type* _core_cycles;
   std::vector<DelayQueue<mem_fetch*>> m_cache_latency_queue;
@@ -83,8 +86,6 @@ class SimpleDRAM: public Dram {
   void print_cache_stats() override;
  private:
   int _latency = 1;
-  int _tx_ch_log2;
-  int _tx_log2;
   std::vector<std::unique_ptr<DelayQueue<mem_fetch*>>> _mem;
 };
 
diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc
index dcaf94bc..95a55ca3 100644
--- a/TOGSim/src/Dram.cc
+++ b/TOGSim/src/Dram.cc
@@ -4,6 +4,28 @@
 
 namespace {
 
+static bool is_power_of_2_u32(uint32_t n) { return n != 0 && (n & (n - 1)) == 0; }
+
+static uint32_t floor_log2_u32(uint32_t n) {
+  uint32_t r = 0;
+  while (n >>= 1)
+    ++r;
+  return r;
+}
+
+/** Smallest power of two >= n (n >= 1). */
+static uint32_t next_power_of_2_u32(uint32_t n) {
+  if (n <= 1)
+    return 1;
+  --n;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  return n + 1;
+}
+
 /** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. */
 struct DramBwSnapshot {
   double bandwidth_gbs = 0;
@@ -27,14 +49,38 @@ DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t w
 
 }  // namespace
 
+new_addr_type Dram::partition_dram_address(new_addr_type raw_addr) const {
+  if (_req_size == 0 || _n_ch_per_partition == 0)
+    return raw_addr;
+  const new_addr_type tx = raw_addr >> _tx_log2;
+  const new_addr_type q = tx / _n_ch_per_partition;
+  return static_cast<new_addr_type>(q << _tx_log2);
+}
+
 uint32_t Dram::get_channel_id(mem_fetch* access) {
-  uint32_t channel_id;
-  if (_n_ch_per_partition >= 16)
-    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, _n_ch_per_partition);
-  else
-    channel_id = ipoly_hash_function((new_addr_type)access->get_addr()/_req_size, 0, 16) % _n_ch_per_partition;
+  uint32_t channel_in_partition = 0;
+  if (_n_ch_per_partition > 1) {
+    const new_addr_type tx = static_cast<new_addr_type>(access->get_addr() >> _tx_log2);
+    new_addr_type rest_high;
+    unsigned init_index = 0;
+    if (is_power_of_2_u32(_n_ch_per_partition)) {
+      const unsigned lb = floor_log2_u32(_n_ch_per_partition);
+      rest_high = tx >> lb;
+      init_index = static_cast<unsigned>(tx & (_n_ch_per_partition - 1u));
+    } else {
+      /* gpgpu-sim "gap" channels: quotient / remainder split at txn granularity. */
+      rest_high = tx / _n_ch_per_partition;
+      init_index = static_cast<unsigned>(tx % _n_ch_per_partition);
+    }
+    /* ipoly_hash_function only implements 16/32/64 (see Hashing.cc); fold like addrdec IPOLY + mod when needed. */
+    const uint32_t poly_n = next_power_of_2_u32(std::max(16u, _n_ch_per_partition));
+    const uint32_t poly_use = std::min(poly_n, 64u);
+    channel_in_partition =
+        static_cast<uint32_t>(ipoly_hash_function(rest_high, init_index, poly_use)) % _n_ch_per_partition;
+  }
 
-  channel_id += ((access->get_numa_id() % _n_partitions)* _n_ch_per_partition);
+  const uint32_t channel_id =
+      channel_in_partition + static_cast<uint32_t>(access->get_numa_id() % _n_partitions) * _n_ch_per_partition;
   return channel_id;
 }
 
@@ -46,6 +92,7 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = config.dram_channels_per_partitions;
   _config = config;
+  _tx_log2 = static_cast<int>(std::log2(_req_size));
 
   spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size);
   /* Initialize DRAM Channels */
@@ -160,7 +207,8 @@ bool DramRamulator2::is_full(uint32_t cid, mem_fetch* request) {
 }
 
 void DramRamulator2::push(uint32_t cid, mem_fetch* request) {
-  addr_type target_addr = (request->get_addr() >> _tx_ch_log2) << _tx_log2;
+  const addr_type raw_addr = request->get_addr();
+  const addr_type target_addr = partition_dram_address(raw_addr);
   request->set_addr(target_addr);
   m_from_crossbar_queue[cid].push(request);
 }
@@ -233,8 +281,6 @@ SimpleDRAM::SimpleDRAM(SimulationConfig config, cycle_type* core_cycle) : Dram(c
     _mem.push_back(std::make_unique<DelayQueue<mem_fetch*>>("SimpleDRAM", true, -1));
   }
   _latency =  config.dram_latency;
-  _tx_log2 = log2(_req_size);
-  _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
 }
 
 bool SimpleDRAM::running() {
diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc
index 1dc3ff42..f236d160 100644
--- a/TOGSim/src/Instruction.cc
+++ b/TOGSim/src/Instruction.cc
@@ -108,10 +108,10 @@ std::shared_ptr<std::set<addr_type>> Instruction::get_dram_address(addr_type dra
                               dim1*tile_stride.at(tile_stride.size() - 3) + \
                               dim2*tile_stride.at(tile_stride.size() - 2) + \
                               dim3*tile_stride.at(tile_stride.size() - 1);
-          address = dram_addr + (address * _elem_bits + 7) / 8;
+          address = dram_addr + (address * _elem_bits + 7) >> 3;
           if (indirect_index != NULL) {
             uint64_t index_val = indirect_index[index_count++];
-            address += (index_val * _elem_bits + 7) / 8;
+            address += (index_val * _elem_bits + 7) >> 3;
           }
           address_set->insert(address - (address & dram_req_size-1));
         }

From b6805674fbfa07651bcd31076f56a97f491aebc8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 13 Apr 2026 22:16:58 +0900
Subject: [PATCH 165/194] [TOGSim] Adjust ramulator2.1 config

---
 TOGSim/extern/ramulator2                   |  2 +-
 configs/ramulator2_configs/DDR4.yaml       |  2 +-
 configs/ramulator2_configs/HBM2.yaml       |  2 +-
 configs/ramulator2_configs/HBM2_TPUv3.yaml |  6 ++---
 configs/ramulator2_configs/LPDDR5.yaml     |  4 ++--
 configs/ramulator2_configs/LPDDR5X.yaml    | 18 +++++++--------
 configs/ramulator2_configs/gen_configs.py  | 26 +++++++++++++++++-----
 7 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2
index 70e85563..ad6acd97 160000
--- a/TOGSim/extern/ramulator2
+++ b/TOGSim/extern/ramulator2
@@ -1 +1 @@
-Subproject commit 70e855630b7f582bc8fa7370bfd582dc71d8af63
+Subproject commit ad6acd97e9fc60c44ed96a49267b7c20ab76e4d3
diff --git a/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml
index 45799436..c4b16617 100644
--- a/configs/ramulator2_configs/DDR4.yaml
+++ b/configs/ramulator2_configs/DDR4.yaml
@@ -22,7 +22,7 @@
         },
         "refresh_manager": {
           "impl": "AllBank",
-          "scope": "Rank"
+          "scope": "Channel"
         },
         "row_policy": {
           "impl": "Open"
diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml
index 2bdd1705..3dda8abf 100644
--- a/configs/ramulator2_configs/HBM2.yaml
+++ b/configs/ramulator2_configs/HBM2.yaml
@@ -11,7 +11,7 @@
     },
     "controllers": [
       {
-        "impl": "GenericDDR",
+        "impl": "HBM",
         "wr_low_watermark": 0.2,
         "wr_high_watermark": 0.8,
         "read_buffer_size": 32,
diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml
index 2bdd1705..01cab613 100644
--- a/configs/ramulator2_configs/HBM2_TPUv3.yaml
+++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml
@@ -11,11 +11,11 @@
     },
     "controllers": [
       {
-        "impl": "GenericDDR",
+        "impl": "HBM",
         "wr_low_watermark": 0.2,
         "wr_high_watermark": 0.8,
-        "read_buffer_size": 32,
-        "write_buffer_size": 32,
+        "read_buffer_size": 64,
+        "write_buffer_size": 64,
         "priority_buffer_size": 1568,
         "scheduler": {
           "impl": "FRFCFS"
diff --git a/configs/ramulator2_configs/LPDDR5.yaml b/configs/ramulator2_configs/LPDDR5.yaml
index bf039f9f..cbb08b5e 100644
--- a/configs/ramulator2_configs/LPDDR5.yaml
+++ b/configs/ramulator2_configs/LPDDR5.yaml
@@ -11,7 +11,7 @@
     },
     "controllers": [
       {
-        "impl": "GenericDDR",
+        "impl": "LPDDR5",
         "wr_low_watermark": 0.2,
         "wr_high_watermark": 0.8,
         "read_buffer_size": 32,
@@ -22,7 +22,7 @@
         },
         "refresh_manager": {
           "impl": "AllBank",
-          "scope": "Rank"
+          "scope": "Channel"
         },
         "row_policy": {
           "impl": "Open"
diff --git a/configs/ramulator2_configs/LPDDR5X.yaml b/configs/ramulator2_configs/LPDDR5X.yaml
index 4309aa6c..a8f454c4 100644
--- a/configs/ramulator2_configs/LPDDR5X.yaml
+++ b/configs/ramulator2_configs/LPDDR5X.yaml
@@ -11,7 +11,7 @@
     },
     "controllers": [
       {
-        "impl": "GenericDDR",
+        "impl": "LPDDR5",
         "wr_low_watermark": 0.2,
         "wr_high_watermark": 0.8,
         "read_buffer_size": 32,
@@ -22,7 +22,7 @@
         },
         "refresh_manager": {
           "impl": "AllBank",
-          "scope": "Rank"
+          "scope": "Channel"
         },
         "row_policy": {
           "impl": "Open"
@@ -52,7 +52,7 @@
             23,
             46,
             65,
-            38,
+            37,
             11,
             12,
             2,
@@ -63,7 +63,7 @@
             6,
             6,
             7,
-            14,
+            13,
             22,
             224,
             128,
@@ -220,7 +220,7 @@
               [
                 3
               ],
-              52
+              51
             ],
             [
               1,
@@ -314,7 +314,7 @@
               [
                 10
               ],
-              72
+              71
             ],
             [
               1,
@@ -361,7 +361,7 @@
                 6,
                 8
               ],
-              28
+              27
             ],
             [
               2,
@@ -434,7 +434,7 @@
               [
                 2
               ],
-              52
+              51
             ],
             [
               3,
@@ -454,7 +454,7 @@
               [
                 0
               ],
-              72
+              71
             ],
             [
               3,
diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py
index 64eb62d2..d27cd6de 100644
--- a/configs/ramulator2_configs/gen_configs.py
+++ b/configs/ramulator2_configs/gen_configs.py
@@ -28,7 +28,12 @@
 import ramulator.memory_system
 
 
-def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"):
+def _dram_standard_name(dram_obj):
+    """DRAMStandard.name from class or instance (e.g. 'HBM2', 'DDR4')."""
+    return getattr(type(dram_obj), "name", None) or getattr(dram_obj, "name", None) or ""
+
+
+def make_config(dram_obj, clock_ratio=1, refresh_scope="Channel"):
     """Wrap a DRAM object in a single-channel GenericDRAM config for PyTorchSim.
 
     PyTorchSim creates one Ramulator2 instance per channel, so each config
@@ -36,10 +41,22 @@ def make_config(dram_obj, clock_ratio=1, refresh_scope="Rank"):
     The wrapper overrides 'frontend' to ExternalFrontEnd automatically.
 
     refresh_scope: level name for AllBank refresh.
-      - DDR4 / LPDDR5 / LPDDR5X → "Rank"
-      - HBM2 / HBM3              → "PseudoChannel"
+      - DDR4 / LPDDR5 / LPDDR5X -> "Channel"
+      - HBM2 / HBM3             -> "PseudoChannel"
+
+    Controller choice (matches C++ controller impls):
+      - HBM*      -> ramulator.controller.HBM
+      - LPDDR*    -> ramulator.controller.LPDDR5 (incl. LPDDR5X timing on the LPDDR5 DRAM model)
+      - otherwise -> GenericDDR
     """
-    ctrl = ramulator.controller.GenericDDR(
+    dram_name = str(_dram_standard_name(dram_obj)).upper()
+    if dram_name.startswith("HBM"):
+        ctrl_cls = ramulator.controller.HBM
+    elif dram_name.startswith("LPDDR"):
+        ctrl_cls = ramulator.controller.LPDDR5
+    else:
+        ctrl_cls = ramulator.controller.GenericDDR
+    ctrl = ctrl_cls(
         dram=dram_obj,
         scheduler=ramulator.scheduler.FRFCFS(),
         refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
@@ -70,7 +87,6 @@ def gen_hbm2_tpuv3():
     dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps")
     return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
 
-
 def gen_ddr4():
     # Available timing presets — check python/ramulator/dram/ddr4.py
     dram = ramulator.dram.DDR4(org_preset="DDR4_8Gb_x8", timing_preset="DDR4_3200AA")

From 8bbb3c20d6372503812ce4201a9c00877eb79b1a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 14 Apr 2026 11:40:50 +0900
Subject: [PATCH 166/194] [Version] Update a LLVM version dependecy

---
 thirdparty/github-releases.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json
index 25c220c9..34b63e54 100644
--- a/thirdparty/github-releases.json
+++ b/thirdparty/github-releases.json
@@ -8,7 +8,7 @@
   },
   "llvm_project": {
     "repository": "PSAL-POSTECH/llvm-project",
-    "release_tag": "v1.0.6",
+    "release_tag": "v1.0.7",
     "asset_name": "riscv-llvm-release.tar.gz"
   },
   "spike": {

From 93e8c7aad61f18286ce94f2e95a230007766450c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 14 Apr 2026 13:52:52 +0900
Subject: [PATCH 167/194] [MLIR] Update MLIR version

---
 PyTorchSimFrontend/extension_codecache.py | 2 +-
 thirdparty/github-releases.json           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 65c96f11..8da2d71c 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -95,7 +95,7 @@ def mlir_gem5_compile_command(filename, sample_filename, tog_file, vectorlane_si
             -dma-fine-grained='systolic-array-size={vectorlane_size}' \
             -global-idx='vlen={vlen}' \
             -test-pytorchsim-to-vcix='systolic-array-size={vectorlane_size} vlen={vlen}' \
-            -test-tile-operation-graph='vectorlane={vectorlane_size} tls_mode={extension_config.CONFIG_TLS_MODE}' \
+            -test-tile-operation-graph='vectorlane={vectorlane_size} sample-mode={extension_config.CONFIG_TLS_MODE}' \
             -test-memref-to-gemmini="vectorlane={vectorlane_size} timing=1" \
             -convert-linalg-to-loops \
             -convert-vector-to-scf='full-unroll' \
diff --git a/thirdparty/github-releases.json b/thirdparty/github-releases.json
index 34b63e54..ec89c24f 100644
--- a/thirdparty/github-releases.json
+++ b/thirdparty/github-releases.json
@@ -8,7 +8,7 @@
   },
   "llvm_project": {
     "repository": "PSAL-POSTECH/llvm-project",
-    "release_tag": "v1.0.7",
+    "release_tag": "v1.0.8",
     "asset_name": "riscv-llvm-release.tar.gz"
   },
   "spike": {

From 0993319df2e9b0144f2c6ffe6ee415063aa08672 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 14 Apr 2026 23:50:02 +0900
Subject: [PATCH 168/194] [Autotune] subprocess timeouts from first
 finite-cycle wall time

---
 PyTorchSimFrontend/extension_codecache.py     |  8 +++-
 PyTorchSimFrontend/extension_config.py        |  4 ++
 PyTorchSimFrontend/mlir/mlir_autotune.py      | 12 +++--
 .../mlir/mlir_codegen_backend.py              | 44 ++++++++++++++++---
 Simulator/simulator.py                        | 25 ++++++++++-
 5 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 8da2d71c..6192c47b 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -279,7 +279,7 @@ def task():
             return key
         future = self.submit(task)
 
-        def run_kernel_simulation(*args, **kwargs):
+        def run_kernel_simulation(*args, autotune_subprocess_timeout_sec=None, **kwargs):
             # Wait for compilation
             key = future.result()
             from filelock import FileLock
@@ -311,7 +311,11 @@ def run_kernel_simulation(*args, **kwargs):
                     result = None # No result for non-autotune mode
                 else:
                     result_path = TOGSimulator.run_standalone(
-                        onnx_path, kernel_attribute_path, autotune_mode=autotune)
+                        onnx_path,
+                        kernel_attribute_path,
+                        autotune_mode=autotune,
+                        timeout_sec=autotune_subprocess_timeout_sec,
+                    )
                     result = TOGSimulator.get_result_from_file(result_path)
                 return result
         return run_kernel_simulation
diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index 5dec8a4b..cf8d806e 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -89,6 +89,10 @@ def __getattr__(name):
         return config_yaml["codegen_autotune_max_retry"]
     if name == "codegen_autotune_template_topk":
         return config_yaml["codegen_autotune_template_topk"]
+    # Added to first candidate wall time for other candidates' TOGSim subprocess timeout (>= 1 s).
+    if name == "codegen_autotune_wall_slack_sec":
+        v = float(config_yaml.get("codegen_autotune_wall_slack_sec", 15))
+        return max(1.0, v)
 
     # Compiler Optimization
     if name == "codegen_compiler_optimization":
diff --git a/PyTorchSimFrontend/mlir/mlir_autotune.py b/PyTorchSimFrontend/mlir/mlir_autotune.py
index fe1f86a1..3489afbd 100644
--- a/PyTorchSimFrontend/mlir/mlir_autotune.py
+++ b/PyTorchSimFrontend/mlir/mlir_autotune.py
@@ -1,4 +1,3 @@
-import functools
 import torch
 import os
 import dataclasses
@@ -76,7 +75,7 @@ def make_run_fn(
                 latest_log_file = log_files_with_time[0][0]
                 result_path = os.path.join(result_dir, latest_log_file)
                 result = TOGSimulator.get_result_from_file(result_path)
-                def cached_run_fn(*args, **kwargs):
+                def cached_run_fn(*args, autotune_subprocess_timeout_sec=None, **kwargs):
                     return result
                 return cached_run_fn
 
@@ -93,11 +92,10 @@ def cached_run_fn(*args, **kwargs):
             for tensor in list(input_tensors) + list(output_tensors)
         ]
 
-        # Generate partial function.
-        return functools.partial(
-            run_method,
-            *args,
-        )
+        def schedule_run(autotune_subprocess_timeout_sec=None):
+            return run_method(*args, autotune_subprocess_timeout_sec=autotune_subprocess_timeout_sec)
+
+        return schedule_run
 
     def update_workspace_size(self) -> None:
         # FIXME: Not implemented yet. Checkout torch/_inductor/codegen/rocm/rocm_benchmark_request.py
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 58d6a70d..492b7416 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1,6 +1,7 @@
 import contextlib
 import sympy
 import sys
+import time
 import re
 import os
 from functools import reduce
@@ -1028,25 +1029,58 @@ def make_choices(self, nodes, kernel_name):
         return choices
 
     def autotune(self, *args):
-        def get_cycle(choice):
+        def get_cycle(choice, subprocess_timeout_sec=None):
             bench_runner = choice[0]
             for n_try in range(extension_config.codegen_autotune_max_retry): # TODO: make simple
                 try:
-                    out = bench_runner()
+                    if subprocess_timeout_sec is not None:
+                        out = bench_runner(
+                            autotune_subprocess_timeout_sec=subprocess_timeout_sec
+                        )
+                    else:
+                        out = bench_runner()
                     return out[-1]
-                except (extension_codecache.SpadOverflowError, RuntimeError) as e:
+                except (extension_codecache.SpadOverflowError, RuntimeError):
                     return float("inf")
             return float("inf") # Exceeded maximum number of autotuning attempts
         choices = self.make_choices(*args)
         if len(choices) == 0: # Can't autotune
             return [None, None, None]
 
+        slack_sec = float(extension_config.codegen_autotune_wall_slack_sec)
+
         # Get cycle time for each choice
         # Show progress bar only when CONFIG_DEBUG_MODE is off
         show_progress = not extension_config.CONFIG_DEBUG_MODE
         with ProgressBar("[Auto-tune] Running benchmarks", silent_mode=not show_progress) if show_progress else contextlib.nullcontext():
-            with ThreadPoolExecutor(max_workers=8) as executor:
-                results = list(executor.map(get_cycle, choices))
+            results = [float("inf")] * len(choices)
+            baseline_wall = None
+            parallel_from = 0
+
+            for idx, choice in enumerate(choices):
+                t0 = time.perf_counter()
+                c = get_cycle(choice, None)
+                elapsed = time.perf_counter() - t0
+                results[idx] = c
+                parallel_from = idx + 1
+                if c != float("inf"):
+                    baseline_wall = elapsed
+                    break
+
+            pending = choices[parallel_from:]
+            if baseline_wall is not None and pending:
+                timeout_sec = baseline_wall + slack_sec
+                workers = min(8, len(pending), os.cpu_count())
+                executor = ThreadPoolExecutor(max_workers=workers)
+                try:
+                    tail = list(
+                        executor.map(
+                            lambda ch: get_cycle(ch, timeout_sec), pending
+                        )
+                    )
+                finally:
+                    executor.shutdown(wait=True, cancel_futures=True)
+                results[parallel_from : parallel_from + len(tail)] = tail
 
         min_idx = results.index(min(results))
         if min(results) == float("inf"):
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 5b00d5d4..2b9f05be 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -509,7 +509,14 @@ def get_togsim_command(config_path, togsim_path=None):
         return cmd
 
     @staticmethod
-    def run_standalone(model_path, attribute_path="", autotune_mode=False, config_path=None, togsim_path=None):
+    def run_standalone(
+        model_path,
+        attribute_path="",
+        autotune_mode=False,
+        config_path=None,
+        togsim_path=None,
+        timeout_sec=None,
+    ):
         """
         Run a single kernel simulation in standalone mode.
         This method starts a new TOGSim process, runs the kernel, and waits for completion.
@@ -521,6 +528,8 @@ def run_standalone(model_path, attribute_path="", autotune_mode=False, config_pa
             autotune_mode: If True, run in autotune mode (silent)
             config_path: Path to TOGSim config file (required)
             togsim_path: Path to TOGSim directory (optional, defaults to CONFIG_TORCHSIM_DIR/TOGSim)
+            timeout_sec: If set, terminate the Simulator subprocess after this many seconds
+                (autotune uses this to skip very slow tile candidates).
 
         Returns:
             Path to the simulation result log file
@@ -559,7 +568,19 @@ def run_standalone(model_path, attribute_path="", autotune_mode=False, config_pa
                 logger.debug(f"[TOGSim] cmd> {cmd}")
                 logger.info("[TOGSim] TOGSim simulation started")
             with ProgressBar("[TOGSim] Running simulation", silent_mode=autotune_mode):
-                result = subprocess.check_output(shlex.split(cmd))
+                completed = subprocess.run(
+                    shlex.split(cmd),
+                    capture_output=True,
+                    check=True,
+                    timeout=timeout_sec,
+                )
+                result = completed.stdout
+        except subprocess.TimeoutExpired as e:
+            logger.warning(
+                "[TOGSim] Simulator subprocess exceeded timeout (%.1f s); terminating.",
+                float(timeout_sec) if timeout_sec is not None else -1.0,
+            )
+            raise RuntimeError("TOGSim subprocess timeout") from e
         except subprocess.CalledProcessError as e:
             logger.error(f"[TOGSim] Command failed with exit code {e.returncode}")
             logger.error(f"[TOGSim] Error output: {e.output.decode() if isinstance(e.output, bytes) else e.output}")

From 46c49541795d80eb4d62aa5ff88ac423c37f64b7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 15 Apr 2026 22:30:50 +0900
Subject: [PATCH 169/194] [Autotune] Add non-subtiling option in
 tile_candidates

---
 PyTorchSimFrontend/mlir/mlir_gemm_template.py                   | 1 +
 configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_gemm_template.py b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
index 9c61c3d9..8a8cd585 100644
--- a/PyTorchSimFrontend/mlir/mlir_gemm_template.py
+++ b/PyTorchSimFrontend/mlir/mlir_gemm_template.py
@@ -340,6 +340,7 @@ def select_tile(self, kernel, M, N, K, n_extra_node, n_extra_read, n_prologue_no
         for idx, (TILE_M, TILE_N, TILE_K) in enumerate(tile_candidates):
             # Case 1: calculate sub tile size for fine-grained DMA
             if extension_config.CONFIG_SUBTILE:
+                full_tile_candidates.append([TILE_M, TILE_N, TILE_K]*2)
                 SUB_TILE_M = TILE_M if (TILE_M < kernel.vector_lane or n_prologue_node) else kernel.vector_lane
                 if (TILE_M == M and TILE_N == N and TILE_N <= 512):
                     SUB_TILE_N = TILE_N if TILE_N < kernel.vector_lane else kernel.vector_lane
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
index f8ac0a54..a7607108 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
@@ -26,5 +26,5 @@ pytorchsim_timing_mode: 1
 codegen_mapping_strategy: autotune
 codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
-codegen_autotune_template_topk: 4
+codegen_autotune_template_topk: 8
 codegen_compiler_optimization: all

From fbe0bc0ea7afe507c43068e2b15089657d1aaf16 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 16 Apr 2026 21:04:12 +0900
Subject: [PATCH 170/194] [Lower] Add filter condition

---
 PyTorchSimFrontend/mlir/mlir_lowering.py | 134 +++++++++++++++++++----
 1 file changed, 111 insertions(+), 23 deletions(-)

diff --git a/PyTorchSimFrontend/mlir/mlir_lowering.py b/PyTorchSimFrontend/mlir/mlir_lowering.py
index b717089f..7f33d956 100644
--- a/PyTorchSimFrontend/mlir/mlir_lowering.py
+++ b/PyTorchSimFrontend/mlir/mlir_lowering.py
@@ -1,5 +1,5 @@
 import math
-from typing import List, Optional, Sequence
+from typing import Any, Callable, List, Optional, Sequence
 
 import torch
 from torch._inductor.lowering import lowerings, index_impl
@@ -29,26 +29,67 @@
 aten_spmm = MLIRExternKernelChoice(torch.sparse.mm, "custom_op::sparse_addmm")
 _orig_sort_values_stable_lowering = lowerings.get(aten.sort.values_stable)
 
-def tuned_mm(mat1, mat2, * ,layout=None):
+
+def _device_is_npu(device: Optional[torch.device]) -> bool:
+    return device is not None and device.type == "npu"
+
+
+def _tensor_args_all_npu(*roots, optional=()) -> bool:
+    """True only if every tensor-like IR node under roots/optional is on an NPU device."""
+    stack: list = list(roots) + list(optional)
+    while stack:
+        n = stack.pop()
+        if n is None:
+            continue
+        if isinstance(n, (list, tuple)):
+            stack.extend(n)
+            continue
+        get_dev = getattr(n, "get_device", None)
+        if get_dev is None:
+            continue
+        if not _device_is_npu(get_dev()):
+            return False
+    return True
+
+
+def _override_lowerings_npu(
+    aten_op: Any,
+    mlir_impl: Callable[..., Any],
+    npu_ok: Callable[..., bool],
+) -> None:
+    """Register mlir_impl for each overload; fall back to the prior lowering if npu_ok is false."""
+    for overload in aten_op.overloads():
+        op = getattr(aten_op, overload)
+        orig = lowerings.get(op)
+
+        def wrapped(*args, _orig=orig, **kwargs):
+            if not npu_ok(*args, **kwargs):
+                return _orig(*args, **kwargs)
+            return mlir_impl(*args, **kwargs)
+
+        lowerings[op] = wrapped
+
+
+def _mlir_tuned_mm(mat1, mat2, *, layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
     mlir_template = MLIRGemmTemplate([mat1, mat2], layout)
 
     return mlir_template.generate(input_nodes=[mat1, mat2], layout=layout).output_node()
 
-def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
+def _mlir_tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None):
     m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout)
     mlir_template = MLIRGemmTemplate([mat1, mat2, inp_expanded], layout)
 
     return mlir_template.generate().output_node()
 
-def tuned_bmm(mat1, mat2, *, layout=None):
+def _mlir_tuned_bmm(mat1, mat2, *, layout=None):
     m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout)
     mlir_template = MLIRBMMTemplate([mat1, mat2], layout)
 
     return mlir_template.generate().output_node()
 
 
-def tuned_flash_sdpa(
+def _mlir_tuned_flash_sdpa(
         query             : TensorBox,
         key               : TensorBox,
         value             : TensorBox,
@@ -69,7 +110,6 @@ def tuned_flash_sdpa(
     return (mlir_template.generate().output_node(), None, None, None, None, None, None, None, None)
 
 
-
 def conv_layout(
     x: TensorBox,
     weight: TensorBox,
@@ -104,7 +144,7 @@ def conv_layout(
         stride,
     )
 
-def convolution(
+def _mlir_convolution(
     x: TensorBox,
     weight: TensorBox,
     bias: TensorBox,
@@ -176,7 +216,7 @@ def maxpool_layout(
         stride,
     )
 
-def custom_maxpool(
+def _mlir_custom_maxpool(
     x: TensorBox,
     kernel_size: List[int],
     stride: List[int],
@@ -197,7 +237,7 @@ def custom_maxpool(
     template_node = mlir_template.generate().output_node()
     return template_node, x # FIXME: x is dummy IRNode, indices are not used in our case
 
-def sparse_addmm(*args, **kwargs):
+def _mlir_sparse_addmm(*args, **kwargs):
     _, sp_mat1, sp_mat2 = args
     mat1_layout = sp_mat1.layout
     out_range = args[0].data.data.data.ranges
@@ -207,7 +247,7 @@ def sparse_addmm(*args, **kwargs):
         )
     return aten_spmm.bind((sp_mat1, sp_mat2), layout).output_node()
 
-def custom_unsafe_index(x, indices):
+def _mlir_custom_unsafe_index(x, indices):
     # We can't fuse indirect access + indexed_expression + computation
     if isinstance(x, TensorBox):
         x.realize()
@@ -229,7 +269,7 @@ def _cat_layout(tensors: Sequence[TensorBox], dim: int) -> ir.Layout:
         stride,
     )
 
-def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
+def _mlir_custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
     if tensors and dim < 0:
         dim += len(tensors[0].get_size())
     copy_default_lowering = lowerings.get(aten.copy_.default)
@@ -255,7 +295,7 @@ def custom_cat_default(tensors: Sequence[TensorBox], dim: int = 0):
     mlir_template = MLIRCatTemplate(list(new_tensors), layout, dim=dim)
     return mlir_template.generate().output_node()
 
-def custom_sort_default(
+def _mlir_custom_sort_default(
     value: TensorBox,
     dim: int = -1,
     descending: bool = False,
@@ -303,15 +343,63 @@ def _sort_layouts(x: TensorBox, dim: int, descending: bool):
     index_layout = ir.FixedLayout(x.get_device(), torch.int64, i_sizes, i_stride)
     return value_layout, index_layout
 
-lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()})
-lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()})
-lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()})
-lowerings.update({getattr(aten.bmm, overload): tuned_bmm for overload in aten.bmm.overloads()})
-lowerings.update({getattr(aten._sparse_addmm, overload): sparse_addmm for overload in aten._sparse_addmm.overloads()})
-lowerings.update({getattr(aten._unsafe_index, overload): custom_unsafe_index for overload in aten._unsafe_index.overloads()})
-lowerings.update({getattr(aten.cat, overload): custom_cat_default for overload in aten.cat.overloads()})
-lowerings.update({getattr(aten.sort, overload): custom_sort_default for overload in aten.sort.overloads()})
-    
+_override_lowerings_npu(
+    aten.mm,
+    _mlir_tuned_mm,
+    lambda mat1, mat2, **_: _tensor_args_all_npu(mat1, mat2),
+)
+_override_lowerings_npu(
+    aten.addmm,
+    _mlir_tuned_addmm,
+    lambda inp, mat1, mat2, **_: _tensor_args_all_npu(inp, mat1, mat2),
+)
+_override_lowerings_npu(
+    aten.convolution,
+    _mlir_convolution,
+    lambda *a, **_: len(a) >= 2
+    and _tensor_args_all_npu(a[0], a[1], optional=(a[2] if len(a) > 2 else None,)),
+)
+_override_lowerings_npu(
+    aten.bmm,
+    _mlir_tuned_bmm,
+    lambda mat1, mat2, **_: _tensor_args_all_npu(mat1, mat2),
+)
+_override_lowerings_npu(
+    aten._sparse_addmm,
+    _mlir_sparse_addmm,
+    lambda *a, **_: len(a) >= 3 and _tensor_args_all_npu(a[1], a[2]),
+)
+_override_lowerings_npu(
+    aten._unsafe_index,
+    _mlir_custom_unsafe_index,
+    lambda x, indices, **_: _tensor_args_all_npu(x, indices),
+)
+_override_lowerings_npu(
+    aten.cat,
+    _mlir_custom_cat_default,
+    lambda *a, **_k: a and _tensor_args_all_npu(a[0]),
+)
+_override_lowerings_npu(
+    aten.sort,
+    _mlir_custom_sort_default,
+    lambda *a, **_k: a and _tensor_args_all_npu(a[0]),
+)
+
 if extension_config.CONFIG_USE_TIMING_POOLING:
-    lowerings.update({getattr(aten.max_pool2d_with_indices, overload): custom_maxpool for overload in aten.max_pool2d_with_indices.overloads()}) # FIXME: maxpool should be implemented as a template
-lowerings.update({getattr(aten._scaled_dot_product_fused_attention_overrideable, overload): tuned_flash_sdpa for overload in aten._scaled_dot_product_fused_attention_overrideable.overloads()})
+    _override_lowerings_npu(
+        aten.max_pool2d_with_indices,
+        _mlir_custom_maxpool,
+        lambda *a, **_: bool(a) and _tensor_args_all_npu(a[0]),
+    )
+
+_override_lowerings_npu(
+    aten._scaled_dot_product_fused_attention_overrideable,
+    _mlir_tuned_flash_sdpa,
+    lambda *a, **k: len(a) >= 3
+    and _tensor_args_all_npu(
+        a[0],
+        a[1],
+        a[2],
+        optional=(a[3] if len(a) > 3 else k.get("attn_bias"),),
+    ),
+)

From 901f93e0f3f5d1a797949d5d1a227c79836d22d6 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Mon, 20 Apr 2026 01:29:06 +0000
Subject: [PATCH 171/194] [Tutorial] ispass2026 session1

---
 PyTorchSimFrontend/extension_config.py        |  9 +-
 tutorial/session1/CompilerOptimization.ipynb  | 65 ++++++++++++--
 tutorial/session1/DNNServing.ipynb            | 87 +++++++++++-------
 tutorial/session1/ExecutionMode.ipynb         | 88 ++++++++++++++++---
 tutorial/session1/Inference.ipynb             | 31 ++++++-
 tutorial/session1/LogAnalysis.ipynb           | 32 ++++++-
 tutorial/session1/Mapping.ipynb               | 78 +++++++++++++---
 tutorial/session1/Training.ipynb              | 56 ++++++++++--
 .../session1/tutorial_external_mapping.json   |  2 +-
 9 files changed, 362 insertions(+), 86 deletions(-)

diff --git a/PyTorchSimFrontend/extension_config.py b/PyTorchSimFrontend/extension_config.py
index cf8d806e..d79ca390 100644
--- a/PyTorchSimFrontend/extension_config.py
+++ b/PyTorchSimFrontend/extension_config.py
@@ -41,9 +41,6 @@ def _default_tog_host_ldflags():
 
 CONFIG_TORCHSIM_DUMP_MLIR_IR = int(os.environ.get("TORCHSIM_DUMP_MLIR_IR", default=False))
 CONFIG_TORCHSIM_DUMP_LLVM_IR = int(os.environ.get("TORCHSIM_DUMP_LLVM_IR", default=False))
-CONFIG_TORCHSIM_DUMP_PATH = os.environ.get("TORCHSIM_DUMP_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "outputs"))
-CONFIG_TORCHSIM_LOG_PATH = os.environ.get("TORCHSIM_LOG_PATH", os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
-os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(CONFIG_TORCHSIM_DUMP_PATH, ".torchinductor")
 
 def __getattr__(name):
     # TOGSim config
@@ -137,6 +134,12 @@ def __getattr__(name):
 
     if name == "CONFIG_TOGSIM_DEBUG_LEVEL":
         return os.environ.get("TOGSIM_DEBUG_LEVEL", "")
+    if name == "CONFIG_TORCHSIM_DUMP_PATH":
+        dump_path = os.environ.get('TORCHSIM_DUMP_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "outputs"))
+        os.environ["TORCHINDUCTOR_CACHE_DIR"] = os.path.join(dump_path, ".torchinductor")
+        return dump_path
+    if name == "CONFIG_TORCHSIM_LOG_PATH":
+        return os.environ.get('TORCHSIM_LOG_PATH', default = os.path.join(CONFIG_TORCHSIM_DIR, "togsim_results"))
 
 # SRAM Buffer allocation plan
 def load_plan_from_module(module_path):
diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb
index d17a6b25..6c23bfec 100644
--- a/tutorial/session1/CompilerOptimization.ipynb
+++ b/tutorial/session1/CompilerOptimization.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:34:23.862488Z",
+     "iopub.status.busy": "2026-04-16T10:34:23.862221Z",
+     "iopub.status.idle": "2026-04-16T10:34:26.839597Z",
+     "shell.execute_reply": "2026-04-16T10:34:26.838615Z",
+     "shell.execute_reply.started": "2026-04-16T10:34:23.862467Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -31,7 +39,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:34:26.840859Z",
+     "iopub.status.busy": "2026-04-16T10:34:26.840581Z",
+     "iopub.status.idle": "2026-04-16T10:34:46.109858Z",
+     "shell.execute_reply": "2026-04-16T10:34:46.108862Z",
+     "shell.execute_reply.started": "2026-04-16T10:34:26.840841Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n",
@@ -50,10 +66,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:41:01.000313Z",
+     "iopub.status.busy": "2026-04-16T10:41:00.999980Z",
+     "iopub.status.idle": "2026-04-16T10:41:01.273172Z",
+     "shell.execute_reply": "2026-04-16T10:41:01.272081Z",
+     "shell.execute_reply.started": "2026-04-16T10:41:01.000290Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_060538/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_103442_5281e75b.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -66,7 +90,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:44:29.448759Z",
+     "iopub.status.busy": "2026-04-16T10:44:29.448400Z",
+     "iopub.status.idle": "2026-04-16T10:44:41.303261Z",
+     "shell.execute_reply": "2026-04-16T10:44:41.302462Z",
+     "shell.execute_reply.started": "2026-04-16T10:44:29.448732Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n",
@@ -85,12 +117,27 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:46:37.996794Z",
+     "iopub.status.busy": "2026-04-16T10:46:37.996476Z",
+     "iopub.status.idle": "2026-04-16T10:46:38.497173Z",
+     "shell.execute_reply": "2026-04-16T10:46:38.496104Z",
+     "shell.execute_reply.started": "2026-04-16T10:46:37.996776Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_055530/togsim_result.log | grep \"Total execution cycle\"\n",
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_055532/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_104436_000cb9bc.log | grep \"Total execution cycle\"\n",
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_104440_e50cdae1.log | grep \"Total execution cycle\""
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -109,7 +156,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb
index 741f463f..0b4e0837 100644
--- a/tutorial/session1/DNNServing.ipynb
+++ b/tutorial/session1/DNNServing.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T11:17:18.018872Z",
+     "iopub.status.busy": "2026-04-16T11:17:18.018643Z",
+     "iopub.status.idle": "2026-04-16T11:17:20.890421Z",
+     "shell.execute_reply": "2026-04-16T11:17:20.889693Z",
+     "shell.execute_reply.started": "2026-04-16T11:17:18.018853Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -30,29 +38,32 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T11:17:20.891167Z",
+     "iopub.status.busy": "2026-04-16T11:17:20.890953Z",
+     "iopub.status.idle": "2026-04-16T11:19:42.197046Z",
+     "shell.execute_reply": "2026-04-16T11:19:42.196023Z",
+     "shell.execute_reply.started": "2026-04-16T11:17:20.891152Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
     "from torchvision.models import resnet18\n",
-    "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request\n",
+    "from Simulator.simulator import TOGSimulator\n",
     "from PyTorchSimFrontend import extension_config\n",
     "\n",
-    "scheduler = Scheduler(num_request_queue=1, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n",
-    "device = scheduler.execution_engine.module.custom_device()\n",
+    "device = torch.device(\"npu:0\")\n",
+    "config = extension_config.CONFIG_TOGSIM_CONFIG\n",
     "\n",
     "model = resnet18().eval()\n",
     "input = torch.randn(1, 3, 224, 224).to(device=device)\n",
     "opt_fn = torch.compile(dynamic=False)(model.to(device, memory_format=torch.channels_last))\n",
     "\n",
-    "SchedulerDNNModel.register_model(\"resnet18\", opt_fn)\n",
-    "request = Request(\"resnet18\", [input], [], request_queue_idx=0)\n",
-    "scheduler.add_request(request, request_time=0)\n",
-    "\n",
     "# Run scheduler\n",
-    "while not scheduler.is_finished():\n",
-    "    with torch.no_grad():\n",
-    "        scheduler.schedule()\n",
+    "with TOGSimulator(config_path=config):\n",
+    "    torch.npu.launch_model(opt_fn, input, stream_index=0, timestamp=0)\n",
     "\n",
     "print(\"ResNet18 Simulation Done\")"
    ]
@@ -73,37 +84,45 @@
     "import os\n",
     "import torch\n",
     "from torchvision.models import resnet18\n",
-    "\n",
-    "from Scheduler.scheduler import Scheduler, SchedulerDNNModel, Request, poisson_request_generator\n",
+    "from Simulator.simulator import TOGSimulator\n",
+    "from PyTorchSimFrontend import extension_config\n",
+    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
+    "from Scheduler.scheduler import poisson_request_generator\n",
     "TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
     "\n",
-    "lambda_requests = 10\n",
-    "max_time = 30\n",
+    "model0_lambda = 5.0\n",
+    "max_time_msec = 1000.0\n",
     "\n",
     "target_model1 = resnet18().eval()\n",
     "\n",
-    "# Init scheduler\n",
-    "scheduler = Scheduler(num_request_queue=1, max_batch=32, engine_select=Scheduler.FIFO_ENGINE, togsim_config=extension_config.CONFIG_TOGSIM_CONFIG)\n",
-    "# Register compiled model\n",
-    "opt_model1 = torch.compile(target_model1.to(device=scheduler.execution_engine.module.custom_device(), memory_format=torch.channels_last), dynamic=False)\n",
-    "SchedulerDNNModel.register_model(\"resnet18\", opt_model1)\n",
+    "device = torch.device(\"npu:0\")\n",
+    "config = extension_config.CONFIG_TOGSIM_CONFIG\n",
+    "opt_model0 = torch.compile(target_model1.to(device=device, memory_format=torch.channels_last), dynamic=False)\n",
     "\n",
-    "# Generate time stamp\n",
-    "for request_time in poisson_request_generator(lambda_requests, max_time):\n",
-    "    # Init input data\n",
-    "    model_input1 = torch.randn(1, 3, 224, 224)\n",
+    "events = []\n",
+    "x = torch.randn(1, 3, 224, 224, device=device)\n",
+    "for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec):\n",
+    "    events.append((t, 0, opt_model0, (x,)))  # stream_index 0 → queue / partition 0\n",
     "\n",
-    "    # Init request\n",
-    "    new_request1 = Request(\"resnet18\", [model_input1], [], request_queue_idx=0)\n",
+    "events.sort(key=lambda e: e[0])\n",
     "\n",
-    "    # Add request to scheduler\n",
-    "    print(\"[Reqest] Resnet18 request time: \", request_time, flush=True)\n",
-    "    scheduler.add_request(new_request1, request_time=request_time)\n",
     "\n",
-    "# Run scheduler\n",
-    "while not scheduler.is_finished():\n",
-    "    scheduler.schedule()"
+    "with TOGSimulator(config_path=config):\n",
+    "    for t_msec, stream_index, model, args in events:\n",
+    "        torch.npu.launch_model(\n",
+    "            model,\n",
+    "            *args,\n",
+    "            stream_index=stream_index,\n",
+    "            timestamp=int(t_msec),\n",
+    "        )"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -122,7 +141,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb
index d94323db..bd7d7d73 100644
--- a/tutorial/session1/ExecutionMode.ipynb
+++ b/tutorial/session1/ExecutionMode.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:56:08.883802Z",
+     "iopub.status.busy": "2026-04-16T05:56:08.883406Z",
+     "iopub.status.idle": "2026-04-16T05:56:11.858647Z",
+     "shell.execute_reply": "2026-04-16T05:56:11.857788Z",
+     "shell.execute_reply.started": "2026-04-16T05:56:08.883784Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -30,7 +38,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:56:11.859394Z",
+     "iopub.status.busy": "2026-04-16T05:56:11.859139Z",
+     "iopub.status.idle": "2026-04-16T05:56:31.283787Z",
+     "shell.execute_reply": "2026-04-16T05:56:31.282907Z",
+     "shell.execute_reply.started": "2026-04-16T05:56:11.859372Z"
+    }
+   },
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
@@ -52,7 +68,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:56:37.980561Z",
+     "iopub.status.busy": "2026-04-16T05:56:37.980194Z",
+     "iopub.status.idle": "2026-04-16T05:56:46.194881Z",
+     "shell.execute_reply": "2026-04-16T05:56:46.194059Z",
+     "shell.execute_reply.started": "2026-04-16T05:56:37.980534Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n",
@@ -74,7 +98,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:56:46.195666Z",
+     "iopub.status.busy": "2026-04-16T05:56:46.195511Z",
+     "iopub.status.idle": "2026-04-16T05:56:49.736201Z",
+     "shell.execute_reply": "2026-04-16T05:56:49.735438Z",
+     "shell.execute_reply.started": "2026-04-16T05:56:46.195650Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
@@ -97,7 +129,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:59:18.661437Z",
+     "iopub.status.busy": "2026-04-16T05:59:18.661188Z",
+     "iopub.status.idle": "2026-04-16T05:59:53.388013Z",
+     "shell.execute_reply": "2026-04-16T05:59:53.387130Z",
+     "shell.execute_reply.started": "2026-04-16T05:59:18.661408Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
@@ -112,10 +152,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T06:00:06.720227Z",
+     "iopub.status.busy": "2026-04-16T06:00:06.719962Z",
+     "iopub.status.idle": "2026-04-16T06:00:06.979872Z",
+     "shell.execute_reply": "2026-04-16T06:00:06.978988Z",
+     "shell.execute_reply.started": "2026-04-16T06:00:06.720210Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_160520/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_055926_3c61ae14.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -128,7 +176,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T06:01:00.604737Z",
+     "iopub.status.busy": "2026-04-16T06:01:00.604494Z",
+     "iopub.status.idle": "2026-04-16T06:01:34.826968Z",
+     "shell.execute_reply": "2026-04-16T06:01:34.826043Z",
+     "shell.execute_reply.started": "2026-04-16T06:01:00.604717Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n",
@@ -143,10 +199,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T06:01:53.294075Z",
+     "iopub.status.busy": "2026-04-16T06:01:53.293728Z",
+     "iopub.status.idle": "2026-04-16T06:01:53.549156Z",
+     "shell.execute_reply": "2026-04-16T06:01:53.548315Z",
+     "shell.execute_reply.started": "2026-04-16T06:01:53.294047Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_160547/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_060100_05df9481.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -159,7 +223,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -173,7 +237,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb
index 6fd54aed..caa5924e 100644
--- a/tutorial/session1/Inference.ipynb
+++ b/tutorial/session1/Inference.ipynb
@@ -11,7 +11,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:42:44.479626Z",
+     "iopub.status.busy": "2026-04-16T05:42:44.479480Z",
+     "iopub.status.idle": "2026-04-16T05:42:47.646477Z",
+     "shell.execute_reply": "2026-04-16T05:42:47.645578Z",
+     "shell.execute_reply.started": "2026-04-16T05:42:44.479609Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -31,7 +39,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:42:47.968708Z",
+     "iopub.status.busy": "2026-04-16T05:42:47.968420Z",
+     "iopub.status.idle": "2026-04-16T05:42:49.772696Z",
+     "shell.execute_reply": "2026-04-16T05:42:49.771704Z",
+     "shell.execute_reply.started": "2026-04-16T05:42:47.968688Z"
+    }
+   },
    "outputs": [],
    "source": [
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
@@ -97,11 +113,18 @@
    "source": [
     "test_result(\"MatMul\", npu_out, cpu_out)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -115,7 +138,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index 24dae52b..5cd14f41 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:00:05.422374Z",
+     "iopub.status.busy": "2026-04-16T10:00:05.422205Z",
+     "iopub.status.idle": "2026-04-16T10:00:08.512084Z",
+     "shell.execute_reply": "2026-04-16T10:00:08.511285Z",
+     "shell.execute_reply.started": "2026-04-16T10:00:05.422359Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -32,7 +40,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:00:46.974212Z",
+     "iopub.status.busy": "2026-04-16T10:00:46.973814Z",
+     "iopub.status.idle": "2026-04-16T10:00:52.152064Z",
+     "shell.execute_reply": "2026-04-16T10:00:52.151231Z",
+     "shell.execute_reply.started": "2026-04-16T10:00:46.974195Z"
+    }
+   },
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
@@ -54,7 +70,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T10:25:36.625640Z",
+     "iopub.status.busy": "2026-04-16T10:25:36.625388Z",
+     "iopub.status.idle": "2026-04-16T10:25:40.123959Z",
+     "shell.execute_reply": "2026-04-16T10:25:40.123131Z",
+     "shell.execute_reply.started": "2026-04-16T10:25:36.625622Z"
+    }
+   },
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_DEBUG_LEVEL']=\"trace\"\n",
@@ -90,7 +114,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb
index 0b978bcb..92ddd5a8 100644
--- a/tutorial/session1/Mapping.ipynb
+++ b/tutorial/session1/Mapping.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:49:05.540163Z",
+     "iopub.status.busy": "2026-04-16T05:49:05.539948Z",
+     "iopub.status.idle": "2026-04-16T05:49:08.550103Z",
+     "shell.execute_reply": "2026-04-16T05:49:08.549146Z",
+     "shell.execute_reply.started": "2026-04-16T05:49:05.540146Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import torch\n",
@@ -30,7 +38,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:49:08.550908Z",
+     "iopub.status.busy": "2026-04-16T05:49:08.550691Z",
+     "iopub.status.idle": "2026-04-16T05:49:28.225867Z",
+     "shell.execute_reply": "2026-04-16T05:49:28.225051Z",
+     "shell.execute_reply.started": "2026-04-16T05:49:08.550893Z"
+    }
+   },
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
@@ -45,10 +61,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:49:44.788982Z",
+     "iopub.status.busy": "2026-04-16T05:49:44.788640Z",
+     "iopub.status.idle": "2026-04-16T05:49:45.048201Z",
+     "shell.execute_reply": "2026-04-16T05:49:45.047229Z",
+     "shell.execute_reply.started": "2026-04-16T05:49:44.788954Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_154524/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_054924_5e1428f9.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -62,7 +86,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:49:53.216985Z",
+     "iopub.status.busy": "2026-04-16T05:49:53.216635Z",
+     "iopub.status.idle": "2026-04-16T05:50:11.043854Z",
+     "shell.execute_reply": "2026-04-16T05:50:11.042989Z",
+     "shell.execute_reply.started": "2026-04-16T05:49:53.216960Z"
+    }
+   },
    "outputs": [],
    "source": [
     "torch._dynamo.reset()\n",
@@ -79,10 +111,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:50:18.200344Z",
+     "iopub.status.busy": "2026-04-16T05:50:18.200118Z",
+     "iopub.status.idle": "2026-04-16T05:50:18.456838Z",
+     "shell.execute_reply": "2026-04-16T05:50:18.455901Z",
+     "shell.execute_reply.started": "2026-04-16T05:50:18.200327Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_141933/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_055004_6ef0f564.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -95,7 +135,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T11:22:40.778257Z",
+     "iopub.status.busy": "2026-04-16T11:22:40.777947Z",
+     "iopub.status.idle": "2026-04-16T11:23:10.573193Z",
+     "shell.execute_reply": "2026-04-16T11:23:10.572225Z",
+     "shell.execute_reply.started": "2026-04-16T11:22:40.778230Z"
+    }
+   },
    "outputs": [],
    "source": [
     "torch._dynamo.reset()\n",
@@ -112,10 +160,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T11:56:35.774938Z",
+     "iopub.status.busy": "2026-04-16T11:56:35.774682Z",
+     "iopub.status.idle": "2026-04-16T11:56:36.022450Z",
+     "shell.execute_reply": "2026-04-16T11:56:36.020569Z",
+     "shell.execute_reply.started": "2026-04-16T11:56:35.774921Z"
+    }
+   },
    "outputs": [],
    "source": [
-    "!cat /root/workspace/PyTorchSim/outputs/20251202_141951/togsim_result.log | grep \"Total execution cycle\""
+    "!cat /workspace/PyTorchSim/togsim_results/20260416_112306_10ad96fd.log | grep \"Total execution cycle\""
    ]
   },
   {
@@ -142,7 +198,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb
index badf7ed7..1f86a5b8 100644
--- a/tutorial/session1/Training.ipynb
+++ b/tutorial/session1/Training.ipynb
@@ -10,7 +10,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:45:04.361593Z",
+     "iopub.status.busy": "2026-04-16T05:45:04.361471Z",
+     "iopub.status.idle": "2026-04-16T05:45:07.515245Z",
+     "shell.execute_reply": "2026-04-16T05:45:07.514397Z",
+     "shell.execute_reply.started": "2026-04-16T05:45:04.361578Z"
+    }
+   },
    "outputs": [],
    "source": [
     "import os\n",
@@ -33,7 +41,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:45:07.516141Z",
+     "iopub.status.busy": "2026-04-16T05:45:07.515901Z",
+     "iopub.status.idle": "2026-04-16T05:45:07.635695Z",
+     "shell.execute_reply": "2026-04-16T05:45:07.634872Z",
+     "shell.execute_reply.started": "2026-04-16T05:45:07.516123Z"
+    }
+   },
    "outputs": [],
    "source": [
     "torch.manual_seed(0)\n",
@@ -43,7 +59,7 @@
     "cpu_input.requires_grad = True\n",
     "cpu_weight.requires_grad = True\n",
     "\n",
-    "opt_fn = torch.matmul\n",
+    "opt_fn = torch.compile(torch.matmul)\n",
     "cpu_out = opt_fn(cpu_input, cpu_weight)\n",
     "\n",
     "loss_fn = torch.nn.CrossEntropyLoss()\n",
@@ -61,7 +77,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:45:07.636349Z",
+     "iopub.status.busy": "2026-04-16T05:45:07.636190Z",
+     "iopub.status.idle": "2026-04-16T05:45:13.350714Z",
+     "shell.execute_reply": "2026-04-16T05:45:13.349588Z",
+     "shell.execute_reply.started": "2026-04-16T05:45:07.636333Z"
+    }
+   },
    "outputs": [],
    "source": [
     "torch.manual_seed(0)\n",
@@ -82,7 +106,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:45:13.351955Z",
+     "iopub.status.busy": "2026-04-16T05:45:13.351757Z",
+     "iopub.status.idle": "2026-04-16T05:45:13.356589Z",
+     "shell.execute_reply": "2026-04-16T05:45:13.355757Z",
+     "shell.execute_reply.started": "2026-04-16T05:45:13.351935Z"
+    }
+   },
    "outputs": [],
    "source": [
     "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
@@ -104,7 +136,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-04-16T05:45:13.357014Z",
+     "iopub.status.busy": "2026-04-16T05:45:13.356871Z",
+     "iopub.status.idle": "2026-04-16T05:45:13.361392Z",
+     "shell.execute_reply": "2026-04-16T05:45:13.360681Z",
+     "shell.execute_reply.started": "2026-04-16T05:45:13.357000Z"
+    }
+   },
    "outputs": [],
    "source": [
     "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n",
@@ -121,7 +161,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -135,7 +175,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,
diff --git a/tutorial/session1/tutorial_external_mapping.json b/tutorial/session1/tutorial_external_mapping.json
index 3982d950..184a29da 100644
--- a/tutorial/session1/tutorial_external_mapping.json
+++ b/tutorial/session1/tutorial_external_mapping.json
@@ -2,6 +2,6 @@
     "1024_1024_1024" : {
         "TILE_M" : 512,
         "TILE_N" : 512,
-        "TILE_K" : 512
+        "TILE_K" : 256
     }
 }
\ No newline at end of file

From 174b3cc258bee77d5a5aaaab5437c3c9ac8acc4c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 20 Apr 2026 12:25:58 +0900
Subject: [PATCH 172/194] [Tutorial] Add guideline for a hands-on

---
 PyTorchSimFrontend/mlir/mlir_ops.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/PyTorchSimFrontend/mlir/mlir_ops.py b/PyTorchSimFrontend/mlir/mlir_ops.py
index 218f60a9..58e8b73b 100644
--- a/PyTorchSimFrontend/mlir/mlir_ops.py
+++ b/PyTorchSimFrontend/mlir/mlir_ops.py
@@ -331,6 +331,13 @@ def exp2(operand, *args, **kwargs):
         # Hands-on part: implement exp2 using math.exp2
         # V.kernel.var_info = {operand: [tile_size, dtype]}
         # Ex) V.kernel.var_info[operand] = [8, "f32"]
+        #
+        # tile_size, dtype = V.kernel.var_info[operand]
+        # if tile_size > 1:
+        #     shape = f"vector<{tile_size}x{dtype}>"
+        # else:
+        #     shape = dtype
+        # return f'math.exp2 %{operand} : {shape}', [tile_size, dtype]
 
         ln2 = math.log(2)
         coeff = ops.constant(ln2, "f32")

From 6d64afa6568998698d49f3444ea54684f1462669 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 20 Apr 2026 12:53:52 +0900
Subject: [PATCH 173/194] [Tutorial] Add CI for tutorial image

---
 .github/workflows/docker-tutorial-image.yml |  17 +++-
 tutorial/jupyterhub/Dockerfile.ksc2025      |  90 -----------------
 tutorial/jupyterhub/Dockerfile.tutorial     | 103 ++++++++++++++++++++
 tutorial/jupyterhub/jupyterhub_config.py    |   6 +-
 4 files changed, 118 insertions(+), 98 deletions(-)
 delete mode 100644 tutorial/jupyterhub/Dockerfile.ksc2025
 create mode 100644 tutorial/jupyterhub/Dockerfile.tutorial

diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml
index c0d8267d..e03bef22 100644
--- a/.github/workflows/docker-tutorial-image.yml
+++ b/.github/workflows/docker-tutorial-image.yml
@@ -2,7 +2,7 @@ name: Docker image for tutorial
 
 on:
   push:
-    branches: [ "tutorial" ]
+    branches: [ "ispass2026" ]
 
 jobs:
   build:
@@ -25,11 +25,18 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      # Step 3: Build and Push Docker Image
+      - name: PyTorch base image from manifest
+        run: |
+          PYTORCH_IMAGE=$(python3 -c "import json; from pathlib import Path; v=json.loads(Path('thirdparty/github-releases.json').read_text()).get('pytorch_image'); print(v or '')")
+          if [ -z "$PYTORCH_IMAGE" ]; then echo "thirdparty/github-releases.json: pytorch_image is required" >&2; exit 1; fi
+          echo "PYTORCH_IMAGE=$PYTORCH_IMAGE" >> "$GITHUB_ENV"
+
       - name: Build and Push Docker Image
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
         with:
           context: .
-          file: ./tutorial/jupyterhub/Dockerfile.ksc2025
+          file: ./tutorial/jupyterhub/Dockerfile.tutorial
           push: true
-          tags: ghcr.io/psal-postech/torchsim_ksc2025:latest
+          build-args: |
+            PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }}
+          tags: ghcr.io/psal-postech/torchsim-tutorial:ispass2026
diff --git a/tutorial/jupyterhub/Dockerfile.ksc2025 b/tutorial/jupyterhub/Dockerfile.ksc2025
deleted file mode 100644
index 7633c048..00000000
--- a/tutorial/jupyterhub/Dockerfile.ksc2025
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright (c) 2020 The Regents of the University of California
-# All Rights Reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met: redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer;
-# redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution;
-# neither the name of the copyright holders nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
-
-# Copied from Gem5 Docker file
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt -y update && apt -y upgrade && \
-    apt -y install build-essential git m4 scons zlib1g zlib1g-dev \
-    libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \
-    python3-dev python-is-python3 doxygen libboost-all-dev \
-    libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config pip \
-    python3-venv black libssl-dev libasan5 libubsan1
-RUN pip install mypy pre-commit jupyter pydot tabulate jupyterlab_execute_time
-
-# Pass Access Token securely
-ENV PATH=$PATH:/root/.local/bin
-ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
-
-# Build Gem5
-RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial
-RUN cd gem5 && scons build/RISCV/gem5.opt -j $(nproc) && git checkout TorchSim
-ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt
-
-# Build LLVM RISC-V
-RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch torchsim --depth 1
-RUN cd llvm-project && mkdir build && cd build && \
-    cmake -DLLVM_ENABLE_PROJECTS=mlir -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/riscv-llvm -DLLVM_TARGETS_TO_BUILD=RISCV -G "Unix Makefiles" ../llvm && \
-    make -j && make install
-
-# Store RISC-V LLVM for TorchSim
-ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin
-ENV TORCHSIM_DIR=/workspace/PyTorchSim
-
-# Download RISC-V tool chain
-RUN apt install -y wget && \
-    wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz && \
-    wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \
-    tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && tar -zxvf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz && \
-    rm *.tar.gz
-
-ENV RISCV=/workspace/riscv
-ENV PATH=$RISCV/bin:$PATH
-
-# Install Spike simulator
-RUN apt -y install device-tree-compiler
-RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \
-    ../configure --prefix=$RISCV && make -j && make install
-
-# Install Proxy kernel
-RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \
-     cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \
-    ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install
-
-# Install torchsim dependency
-RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0
-
-# Prepare PyTorchSim project
-RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch tutorial
-RUN cd PyTorchSim/TOGSim && \
-    git submodule update --recursive --init && \
-    mkdir -p build && \
-    cd build && \
-    conan install .. --build=missing && \
-    cmake .. && \
-    make -j$(nproc)
-
-RUN pip install jupyterhub jupyterlab
diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial
new file mode 100644
index 00000000..5a0e7458
--- /dev/null
+++ b/tutorial/jupyterhub/Dockerfile.tutorial
@@ -0,0 +1,103 @@
+# Copyright (c) 2020 The Regents of the University of California
+# All Rights Reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Base image: CI passes build-arg from ``thirdparty/github-releases.json`` (``pytorch_image``).
+# Default matches that manifest for local ``docker build``.
+ARG PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel
+FROM ${PYTORCH_IMAGE}
+
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /workspace
+
+# Build deps (Gem5 / LLVM / TorchSim); keep layer lean where possible.
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    build-essential git m4 scons zlib1g zlib1g-dev \
+    libprotobuf-dev protobuf-compiler libprotoc-dev libgoogle-perftools-dev \
+    python3-dev python-is-python3 doxygen libboost-all-dev \
+    libhdf5-serial-dev python3-pydot libpng-dev libelf-dev pkg-config \
+    python3-venv black libssl-dev libasan5 libubsan1 \
+    wget ca-certificates device-tree-compiler ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir mypy pre-commit jupyter pydot tabulate \
+    jupyterlab_execute_time onnx matplotlib conan==1.56.0
+
+ENV PATH=$PATH:/root/.local/bin
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
+
+# Gem5 (TorchSim branch before build)
+RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \
+    && cd gem5 && git checkout TorchSim \
+    && scons build/RISCV/gem5.opt -j"$(nproc)"
+ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt
+
+# LLVM MLIR (RISC-V)
+RUN git clone https://github.com/PSAL-POSTECH/llvm-project.git --branch ispass2026 --depth 1 \
+    && cd llvm-project && mkdir build && cd build \
+    && cmake -G Ninja \
+        -DLLVM_ENABLE_PROJECTS=mlir \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_INSTALL_PREFIX=/riscv-llvm \
+        -DLLVM_TARGETS_TO_BUILD=RISCV \
+        ../llvm \
+    && cmake --build . -j"$(nproc)" \
+    && cmake --install .
+ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin
+ENV TORCHSIM_DIR=/workspace/PyTorchSim
+
+# RISC-V GNU toolchains (glibc + bare-metal), Ubuntu release bundles
+RUN wget -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz \
+    && wget -q https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2023.12.14/riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz \
+    && tar -xzf riscv64-glibc-ubuntu-22.04-llvm-nightly-2023.12.14-nightly.tar.gz \
+    && tar -xzf riscv64-elf-ubuntu-20.04-llvm-nightly-2023.12.14-nightly.tar.gz \
+    && rm -f *.tar.gz
+
+ENV RISCV=/workspace/riscv
+ENV PATH=$RISCV/bin:$PATH
+
+# Spike
+RUN git clone https://github.com/PSAL-POSTECH/riscv-isa-sim.git --branch ispass2026 \
+    && cd riscv-isa-sim && mkdir build && cd build \
+    && ../configure --prefix="$RISCV" && make -j"$(nproc)" && make install
+
+# Proxy kernel
+RUN git clone https://github.com/riscv-software-src/riscv-pk.git \
+    && cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 \
+    && mkdir build && cd build \
+    && ../configure --prefix="$RISCV" --host=riscv64-unknown-elf \
+    && make -j"$(nproc)" && make install
+
+# PyTorchSim + TOGSim
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
+RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026 \
+    && cd PyTorchSim/TOGSim \
+    && git submodule update --recursive --init \
+    && mkdir -p build && cd build \
+    && conan install .. --build=missing \
+    && cmake -G Ninja .. \
+    && cmake --build . -j"$(nproc)"
+
+RUN pip install --no-cache-dir jupyterhub jupyterlab
diff --git a/tutorial/jupyterhub/jupyterhub_config.py b/tutorial/jupyterhub/jupyterhub_config.py
index a43c0543..36b03981 100644
--- a/tutorial/jupyterhub/jupyterhub_config.py
+++ b/tutorial/jupyterhub/jupyterhub_config.py
@@ -6,11 +6,11 @@
 # Spawner config
 # ------------------------------------------------------------------------------
 c.JupyterHub.spawner_class = 'dockerspawner.DockerSpawner'
-c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim_ksc2025:latest"
+c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim-tutorial:ispass2026"
 
 # Resource limit
-c.DockerSpawner.mem_limit = '16G'
-c.DockerSpawner.cpu_limit = 4.0
+c.DockerSpawner.mem_limit = '32G'
+c.DockerSpawner.cpu_limit = 8.0
 
 c.DockerSpawner.network_name = 'jupyterhub-network'
 c.Spawner.default_url = '/lab'

From 0043b0183162d2a8705af40b37aa65d3b8c0205b Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 20 Apr 2026 15:30:56 +0900
Subject: [PATCH 174/194] [Tutorial] Add missing script

---
 tutorial/jupyterhub/Dockerfile.tutorial | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial
index 5a0e7458..d10ed1bc 100644
--- a/tutorial/jupyterhub/Dockerfile.tutorial
+++ b/tutorial/jupyterhub/Dockerfile.tutorial
@@ -100,4 +100,5 @@ RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026
     && cmake -G Ninja .. \
     && cmake --build . -j"$(nproc)"
 
+RUN cd PyTorchSimDevice && python3 -m pip install --no-build-isolation -e .
 RUN pip install --no-cache-dir jupyterhub jupyterlab

From c83d3213a23358c65a4777a0ac7ba89446b58571 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 20 Apr 2026 16:16:41 +0900
Subject: [PATCH 175/194] [Tutorial] Fix paths in Dockerfile for gem5 and
 PyTorchSimDevice

---
 tutorial/jupyterhub/Dockerfile.tutorial | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial
index d10ed1bc..303bcfa9 100644
--- a/tutorial/jupyterhub/Dockerfile.tutorial
+++ b/tutorial/jupyterhub/Dockerfile.tutorial
@@ -50,7 +50,7 @@ ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/l
 
 # Gem5 (TorchSim branch before build)
 RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \
-    && cd gem5 && git checkout TorchSim \
+    && cd gem5 \
     && scons build/RISCV/gem5.opt -j"$(nproc)"
 ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt
 
@@ -100,5 +100,5 @@ RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026
     && cmake -G Ninja .. \
     && cmake --build . -j"$(nproc)"
 
-RUN cd PyTorchSimDevice && python3 -m pip install --no-build-isolation -e .
+RUN cd PyTorchSim/PyTorchSimDevice && python3 -m pip install --no-build-isolation -e .
 RUN pip install --no-cache-dir jupyterhub jupyterlab

From 50e210c0a769ca2e0291e4d8627a9f152fc5cbb5 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Mon, 20 Apr 2026 16:22:06 +0900
Subject: [PATCH 176/194] [Tutorial] fix

---
 tutorial/jupyterhub/Dockerfile.tutorial | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial
index 303bcfa9..6cb6d7d2 100644
--- a/tutorial/jupyterhub/Dockerfile.tutorial
+++ b/tutorial/jupyterhub/Dockerfile.tutorial
@@ -49,9 +49,10 @@ ENV PATH=$PATH:/root/.local/bin
 ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
 
 # Gem5 (TorchSim branch before build)
-RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch ispass2026 \
+RUN git clone https://github.com/PSAL-POSTECH/gem5.git --branch tutorial \
     && cd gem5 \
     && scons build/RISCV/gem5.opt -j"$(nproc)"
+RUN cd gem5 && git checkout ispass2026
 ENV GEM5_PATH=/workspace/gem5/build/RISCV/gem5.opt
 
 # LLVM MLIR (RISC-V)

From 24062d1d215d18dac0fb9c94d962b7a463690a8f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Apr 2026 22:57:18 +0900
Subject: [PATCH 177/194] [Config] derive req size, freq, peak BW from
 Ramulator2; simplify simple DRAM YAML

---
 README.md                                     |  19 +-
 TOGSim/extern/ramulator2                      |   2 +-
 TOGSim/extern/ramulator_custom/.gitignore     |   6 -
 TOGSim/extern/ramulator_custom/CMakeLists.txt |  11 -
 .../include/ramulator/Ramulator.hpp           |  57 --
 TOGSim/extern/ramulator_custom/src/Config.cpp |  68 --
 TOGSim/extern/ramulator_custom/src/Config.h   | 120 ---
 .../extern/ramulator_custom/src/Controller.h  | 667 -----------------
 TOGSim/extern/ramulator_custom/src/DDR4.cpp   | 418 -----------
 TOGSim/extern/ramulator_custom/src/DDR4.h     | 220 ------
 TOGSim/extern/ramulator_custom/src/DRAM.h     | 453 ------------
 TOGSim/extern/ramulator_custom/src/HBM.cpp    | 413 -----------
 TOGSim/extern/ramulator_custom/src/HBM.h      | 228 ------
 TOGSim/extern/ramulator_custom/src/Memory.h   | 684 ------------------
 .../ramulator_custom/src/MemoryFactory.cpp    |  80 --
 .../ramulator_custom/src/MemoryFactory.h      |  84 ---
 .../extern/ramulator_custom/src/Ramulator.cpp | 171 -----
 .../extern/ramulator_custom/src/Refresh.cpp   | 255 -------
 TOGSim/extern/ramulator_custom/src/Refresh.h  | 137 ----
 .../extern/ramulator_custom/src/Request.cpp   |  90 ---
 TOGSim/extern/ramulator_custom/src/Request.h  |  54 --
 .../extern/ramulator_custom/src/Scheduler.h   | 377 ----------
 .../ramulator_custom/src/SpeedyController.h   | 304 --------
 .../extern/ramulator_custom/src/StatType.cpp  | 153 ----
 TOGSim/extern/ramulator_custom/src/StatType.h | 669 -----------------
 .../extern/ramulator_custom/src/Statistics.h  | 236 ------
 TOGSim/include/Common.h                       |   3 +-
 TOGSim/include/Dram.h                         |  10 +-
 TOGSim/include/SimulationConfig.h             |  29 +-
 TOGSim/include/Simulator.h                    |   5 +-
 TOGSim/src/Common.cc                          |  25 +-
 TOGSim/src/Dram.cc                            | 232 +++++-
 TOGSim/src/Simulator.cc                       |   6 +-
 TOGSim/src/main.cc                            |  22 +-
 configs/heterogeneous_c2_simple_noc.yml       |   2 -
 configs/ramulator2_configs/HBM2_TPUv2.yaml    | 476 ++++++++++++
 configs/ramulator2_configs/HBM2_TPUv3.yaml    |  70 +-
 configs/ramulator2_configs/gen_configs.py     |   8 +-
 configs/ramulator_configs/ALDRAM-config.cfg   |  30 -
 configs/ramulator_configs/DDR3-config.cfg     |  31 -
 configs/ramulator_configs/DDR4-config.cfg     |  31 -
 configs/ramulator_configs/DSARP-config.cfg    |  31 -
 configs/ramulator_configs/GDDR5-config.cfg    |  30 -
 configs/ramulator_configs/HBM-config.cfg      |  32 -
 .../HBM-config_ChRaBaRoCo.cfg                 |  32 -
 configs/ramulator_configs/HBM-config_FCFS.cfg |  32 -
 .../ramulator_configs/HBM-config_FRFCFS.cfg   |  32 -
 .../HBM-config_FRFCFS_Cap.cfg                 |  32 -
 .../HBM-config_FRFCFS_PriorHit.cfg            |  32 -
 .../HBM-config_RoBaRaCoCh.cfg                 |  32 -
 .../HBM-config_RoCoBaRaCh.cfg                 |  32 -
 .../ramulator_configs/HBMx0.5ch-config.cfg    |  30 -
 configs/ramulator_configs/HBMx2ch-config.cfg  |  30 -
 configs/ramulator_configs/LPDDR3-config.cfg   |  30 -
 configs/ramulator_configs/LPDDR4-config.cfg   |  30 -
 configs/ramulator_configs/PCM-config.cfg      |  30 -
 configs/ramulator_configs/SALP-config.cfg     |  31 -
 configs/ramulator_configs/STTMRAM-config.cfg  |  30 -
 configs/ramulator_configs/TLDRAM-config.cfg   |  31 -
 configs/ramulator_configs/WideIO-config.cfg   |  30 -
 configs/ramulator_configs/WideIO2-config.cfg  |  30 -
 configs/stonne_big_c1_simple_noc.yml          |   2 -
 configs/stonne_single_c1_simple_noc.yml       |   2 -
 .../systolic_ws_128x128_c1_booksim_tpuv2.yml  |   2 -
 .../systolic_ws_128x128_c1_booksim_tpuv3.yml  |   2 -
 ...ystolic_ws_128x128_c1_simple_noc_tpuv2.yml |   2 -
 ...ystolic_ws_128x128_c1_simple_noc_tpuv3.yml |   2 -
 ...ic_ws_128x128_c1_simple_noc_tpuv3_half.yml |   2 -
 ...28x128_c1_simple_noc_tpuv3_timing_only.yml |   2 -
 ...ystolic_ws_128x128_c1_simple_noc_tpuv4.yml |   2 -
 .../systolic_ws_128x128_c2_booksim_tpuv3.yml  |   2 -
 ...ws_128x128_c2_booksim_tpuv3_bw_quarter.yml |   9 +-
 .../systolic_ws_128x128_c2_chiplet_tpuv3.yml  |   2 -
 ...olic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml |   2 -
 ...ystolic_ws_128x128_c2_simple_noc_tpuv2.yml |   2 -
 ...ystolic_ws_128x128_c2_simple_noc_tpuv3.yml |   2 -
 ...lic_ws_128x128_c2_simple_noc_tpuv3_ils.yml |   2 -
 ..._128x128_c2_simple_noc_tpuv3_partition.yml |   2 -
 ...ystolic_ws_128x128_c2_simple_noc_tpuv4.yml |   2 -
 configs/systolic_ws_8x8_c1_booksim.yml        |   2 -
 configs/systolic_ws_8x8_c1_simple_noc.yml     |   2 -
 .../session1/togsim_configs/togsim_config.yml |   2 -
 .../togsim_configs/togsim_config_2_cores.yml  |   2 -
 .../togsim_configs/togsim_config_autotune.yml |   2 -
 .../togsim_config_external_mapping.yml        |   2 -
 .../togsim_config_functional_only.yml         |   2 -
 ...togsim_config_no_compiler_optimization.yml |   2 -
 .../togsim_config_timing_only.yml             |   2 -
 88 files changed, 814 insertions(+), 6822 deletions(-)
 delete mode 100644 TOGSim/extern/ramulator_custom/.gitignore
 delete mode 100644 TOGSim/extern/ramulator_custom/CMakeLists.txt
 delete mode 100644 TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Config.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Config.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Controller.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/DDR4.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/DDR4.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/DRAM.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/HBM.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/HBM.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Memory.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/MemoryFactory.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Ramulator.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Refresh.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Refresh.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Request.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Request.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Scheduler.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/SpeedyController.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/StatType.cpp
 delete mode 100644 TOGSim/extern/ramulator_custom/src/StatType.h
 delete mode 100644 TOGSim/extern/ramulator_custom/src/Statistics.h
 create mode 100644 configs/ramulator2_configs/HBM2_TPUv2.yaml
 delete mode 100644 configs/ramulator_configs/ALDRAM-config.cfg
 delete mode 100644 configs/ramulator_configs/DDR3-config.cfg
 delete mode 100644 configs/ramulator_configs/DDR4-config.cfg
 delete mode 100644 configs/ramulator_configs/DSARP-config.cfg
 delete mode 100644 configs/ramulator_configs/GDDR5-config.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_FCFS.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_FRFCFS.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg
 delete mode 100644 configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg
 delete mode 100644 configs/ramulator_configs/HBMx0.5ch-config.cfg
 delete mode 100644 configs/ramulator_configs/HBMx2ch-config.cfg
 delete mode 100644 configs/ramulator_configs/LPDDR3-config.cfg
 delete mode 100644 configs/ramulator_configs/LPDDR4-config.cfg
 delete mode 100644 configs/ramulator_configs/PCM-config.cfg
 delete mode 100644 configs/ramulator_configs/SALP-config.cfg
 delete mode 100644 configs/ramulator_configs/STTMRAM-config.cfg
 delete mode 100644 configs/ramulator_configs/TLDRAM-config.cfg
 delete mode 100644 configs/ramulator_configs/WideIO-config.cfg
 delete mode 100644 configs/ramulator_configs/WideIO2-config.cfg

diff --git a/README.md b/README.md
index 03041355..a6dd399a 100644
--- a/README.md
+++ b/README.md
@@ -397,13 +397,18 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
   "vpu_spad_size_kb_per_lane" : 128, // Scratchpad memory size per lane (KB)
   "vpu_vector_length_bits" : 256,    // VPU vector register length (Bits)
 
-  "dram_type" : "ramulator2",        // DRAM type (ex. ramulator2, simple)
-  "dram_freq_mhz" : 940,             // DRAM frequency (MHz)
-  "dram_channels": 32,               // Number of DRAM channels
-  "dram_req_size": 32,               // DRAM request size (B)
-  "dram_latency" : 10,               // DRAM latency (cycle)
-  "dram_nbl" : 2,                    // DRAM burst length size
-  "dram_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml", // Ramulator2 config file path
+  "dram_type" : "ramulator2",        // DRAM type: ramulator2 | simple
+  "dram_channels": 32,               // Number of DRAM channels (topology; required for both types)
+  "dram_stats_print_period_cycles": 10000, // Optional DRAM stats interval
+  // ramulator2: per-request size (bytes), DRAM MHz, and per-channel peak GB/s are derived from ramulator_config_path
+  // (peak ≈ timing[0] as MT/s × channel_width × pseudo-channels for HBM2/3; MHz from Ramulator tCK).
+  // Optional: if you set dram_freq_mhz, it must exactly match that derived MHz or initialization fails
+  // (the error message includes tCK in ns and the derived MHz for debugging stale yml values).
+  // Do not set dram_bandwidth_gbps_* at top level.
+  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
+  // simple: dram_latency + dram_channels + optional dram_req_size_byte (default 32). Omit
+  // dram_bandwidth_gbps_* for latency-only; dram_freq_mhz defaults to core_freq_mhz.
+  // With dram_bandwidth_gbps_* set, dram_freq_mhz is required (credit refill per DRAM cycle).
 
   "l2d_type" : "datacache",
   "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32",
diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2
index ad6acd97..d33bf3ac 160000
--- a/TOGSim/extern/ramulator2
+++ b/TOGSim/extern/ramulator2
@@ -1 +1 @@
-Subproject commit ad6acd97e9fc60c44ed96a49267b7c20ab76e4d3
+Subproject commit d33bf3ac26f3e7f838386ff7923ea6bc3ba61c31
diff --git a/TOGSim/extern/ramulator_custom/.gitignore b/TOGSim/extern/ramulator_custom/.gitignore
deleted file mode 100644
index 65a99dc1..00000000
--- a/TOGSim/extern/ramulator_custom/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-*.swp
-
-# Compiled Object files
-obj/
-
-# Compiled target executable files
diff --git a/TOGSim/extern/ramulator_custom/CMakeLists.txt b/TOGSim/extern/ramulator_custom/CMakeLists.txt
deleted file mode 100644
index 371de8df..00000000
--- a/TOGSim/extern/ramulator_custom/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-cmake_minimum_required(VERSION 3.16)
-project(ramulator_project)
-
-file(GLOB_RECURSE RAMULATOR_SRCS CONFIGURE_DEPENDS src/*.cpp)
-add_library(ramulator1 STATIC ${RAMULATOR_SRCS})
-target_include_directories(ramulator1
-  PUBLIC include
-  PRIVATE include/ramulator
-  PRIVATE src
-)
-target_compile_options(ramulator1 PRIVATE -Wall -O3)
diff --git a/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp b/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp
deleted file mode 100644
index 4687b22b..00000000
--- a/TOGSim/extern/ramulator_custom/include/ramulator/Ramulator.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef __RAMULATOR_H
-#define __RAMULATOR_H
-#include <cstdint>
-#include <string>
-#include <vector>
-#include <queue>
-#include <memory>
-#include <unordered_map>
-#include <functional>
-#include <robin_hood.h>
-namespace ram {
-class MemoryBase;
-class Request;
-class Ramulator {
-public:
-  Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim = false);
-  ~Ramulator();
-  void tick();
-  bool isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) const;
-  bool isAvailable(uint64_t Addr, bool IsWrite) const;
-  void push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req);
-  void push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req);
-  bool isEmpty(int CtrlID) const;
-  const void* top(int CtrlID) const;
-  void pop(int CtrlID);
-  int getAtomicBytes() const;
-  int getNumChannels() const;
-  int getChannel(uint64_t Addr) const;
-  void print_stats();
-private:
-  std::unique_ptr<MemoryBase> MemBase;
-  class OutputPendingQueue;
-  std::vector<OutputPendingQueue> OutputPendingQueues;
-  using CallbackMap =
-    std::unordered_map<bool, std::function<void(const ram::Request&)>>;
-  CallbackMap Callbacks;
-  robin_hood::unordered_flat_set<int> hot_vids;
-  bool is_pim;
-  static std::unique_ptr<MemoryBase> createMemory(std::string ConfigFilePath, uint32_t num_core);
-};
-class Ramulator::OutputPendingQueue {
-public:
-  OutputPendingQueue(int Size);
-  bool isAvailable() const;
-  bool isAvailable(uint32_t count) const;
-  bool isEmpty() const;
-  void reserve();
-  void push(void* original_req);
-  const void* top() const;
-  void pop();
-private:
-  const int Size;
-  int NumReserved;
-  std::queue<void*> PendingQueue;
-};
-} // end namespace
-#endif
diff --git a/TOGSim/extern/ramulator_custom/src/Config.cpp b/TOGSim/extern/ramulator_custom/src/Config.cpp
deleted file mode 100644
index a82f6e95..00000000
--- a/TOGSim/extern/ramulator_custom/src/Config.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "Config.h"
-
-using namespace std;
-using namespace ram;
-
-RamulatorConfig::RamulatorConfig(const std::string& fname) {
-  options["mapping"] = "RoBaRaCoCh";
-  options["scheduler"] = "FRFCFS";
-  parse(fname);
-}
-
-void RamulatorConfig::parse(const string& fname)
-{
-    ifstream file(fname);
-    assert(file.good() && "Bad config file");
-    string line;
-    while (getline(file, line)) {
-        char delim[] = " \t=";
-        vector<string> tokens;
-
-        while (true) {
-            size_t start = line.find_first_not_of(delim);
-            if (start == string::npos) 
-                break;
-
-            size_t end = line.find_first_of(delim, start);
-            if (end == string::npos) {
-                tokens.push_back(line.substr(start));
-                break;
-            }
-
-            tokens.push_back(line.substr(start, end - start));
-            line = line.substr(end);
-        }
-
-        // empty line
-        if (!tokens.size())
-            continue;
-
-        // comment line
-        if (tokens[0][0] == '#')
-            continue;
-
-        // parameter line
-        assert(tokens.size() == 2 && "Only allow two tokens in one line");
-
-        options[tokens[0]] = tokens[1];
-
-        if (tokens[0] == "channels") {
-          channels = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "ranks") {
-          ranks = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "subarrays") {
-          subarrays = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "cpu_tick") {
-          cpu_tick = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "mem_tick") {
-          mem_tick = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "expected_limit_insts") {
-          expected_limit_insts = atoi(tokens[1].c_str());
-        } else if (tokens[0] == "warmup_insts") {
-          warmup_insts = atoi(tokens[1].c_str());
-        }
-    }
-    file.close();
-}
-
-
diff --git a/TOGSim/extern/ramulator_custom/src/Config.h b/TOGSim/extern/ramulator_custom/src/Config.h
deleted file mode 100644
index 2d8c12ce..00000000
--- a/TOGSim/extern/ramulator_custom/src/Config.h
+++ /dev/null
@@ -1,120 +0,0 @@
-#ifndef __CONFIG_H
-#define __CONFIG_H
-
-#include <string>
-#include <fstream>
-#include <vector>
-#include <map>
-#include <iostream>
-#include <cassert>
-
-namespace ram
-{
-
-class RamulatorConfig {
-
-private:
-    std::map<std::string, std::string> options;
-    int channels;
-    int ranks;
-    int subarrays;
-    int cpu_tick;
-    int mem_tick;
-    int core_num = 0;
-    long expected_limit_insts = 0;
-    long warmup_insts = 0;
-
-public:
-    RamulatorConfig() {}
-    RamulatorConfig(const std::string& fname);
-    void parse(const std::string& fname);
-    std::string operator [] (const std::string& name) const {
-      if (options.find(name) != options.end()) {
-        return (options.find(name))->second;
-      } else {
-        return "";
-      }
-    }
-
-    bool contains(const std::string& name) const {
-      if (options.find(name) != options.end()) {
-        return true;
-      } else {
-        return false;
-      }
-    }
-
-    void add (const std::string& name, const std::string& value) {
-      if (!contains(name)) {
-        options.insert(make_pair(name, value));
-      } else {
-        printf("ramulator::Config::add options[%s] already set.\n", name.c_str());
-      }
-    }
-
-    void set_core_num(int _core_num) {core_num = _core_num;}
-
-    int get_channels() const {return channels;}
-    int get_subarrays() const {return subarrays;}
-    int get_ranks() const {return ranks;}
-    int get_cpu_tick() const {return cpu_tick;}
-    int get_mem_tick() const {return mem_tick;}
-    int get_core_num() const {return core_num;}
-    long get_expected_limit_insts() const {return expected_limit_insts;}
-    long get_warmup_insts() const {return warmup_insts;}
-
-    bool has_l3_cache() const {
-      if (options.find("cache") != options.end()) {
-        const std::string& cache_option = (options.find("cache"))->second;
-        return (cache_option == "all") || (cache_option == "L3");
-      } else {
-        return false;
-      }
-    }
-    bool has_core_caches() const {
-      if (options.find("cache") != options.end()) {
-        const std::string& cache_option = (options.find("cache"))->second;
-        return (cache_option == "all" || cache_option == "L1L2");
-      } else {
-        return false;
-      }
-    }
-    bool is_early_exit() const {
-      // the default value is true
-      if (options.find("early_exit") != options.end()) {
-        if ((options.find("early_exit"))->second == "off") {
-          return false;
-        }
-        return true;
-      }
-      return true;
-    }
-    bool calc_weighted_speedup() const {
-      return (expected_limit_insts != 0);
-    }
-    bool record_cmd_trace() const {
-      // the default value is false
-      if (options.find("record_cmd_trace") != options.end()) {
-        if ((options.find("record_cmd_trace"))->second == "on") {
-          return true;
-        }
-        return false;
-      }
-      return false;
-    }
-    bool print_cmd_trace() const {
-      // the default value is false
-      if (options.find("print_cmd_trace") != options.end()) {
-        if ((options.find("print_cmd_trace"))->second == "on") {
-          return true;
-        }
-        return false;
-      }
-      return false;
-    }
-};
-
-
-} /* namespace ram */
-
-#endif /* _CONFIG_H */
diff --git a/TOGSim/extern/ramulator_custom/src/Controller.h b/TOGSim/extern/ramulator_custom/src/Controller.h
deleted file mode 100644
index 75ebba17..00000000
--- a/TOGSim/extern/ramulator_custom/src/Controller.h
+++ /dev/null
@@ -1,667 +0,0 @@
-#ifndef __CONTROLLER_H
-#define __CONTROLLER_H
-
-#include <cassert>
-#include <cstdio>
-#include <deque>
-#include <fstream>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "Config.h"
-#include "DRAM.h"
-#include "Refresh.h"
-#include "Request.h"
-#include "Scheduler.h"
-#include "Statistics.h"
-
-// #include "ALDRAM.h"
-// #include "SALP.h"
-// #include "TLDRAM.h"
-
-using namespace std;
-
-namespace ram
-{
-
-    extern bool warmup_complete;
-
-template <typename T>
-class Controller
-{
-protected:
-    // For counting bandwidth
-    ScalarStat read_transaction_bytes;
-    ScalarStat write_transaction_bytes;
-
-    ScalarStat row_hits;
-    ScalarStat row_misses;
-    ScalarStat row_conflicts;
-    VectorStat read_row_hits;
-    VectorStat read_row_misses;
-    VectorStat read_row_conflicts;
-    VectorStat write_row_hits;
-    VectorStat write_row_misses;
-    VectorStat write_row_conflicts;
-    ScalarStat useless_activates;
-
-    ScalarStat read_latency_avg;
-    ScalarStat read_latency_sum;
-
-    ScalarStat req_queue_length_avg;
-    ScalarStat req_queue_length_sum;
-    ScalarStat read_req_queue_length_avg;
-    ScalarStat read_req_queue_length_sum;
-    ScalarStat write_req_queue_length_avg;
-    ScalarStat write_req_queue_length_sum;
-
-    VectorStat record_read_hits;
-    VectorStat record_read_misses;
-    VectorStat record_read_conflicts;
-    VectorStat record_write_hits;
-    VectorStat record_write_misses;
-    VectorStat record_write_conflicts;
-
-public:
-    /* Member Variables */
-    unsigned long clk = 0;
-    DRAM<T>* channel;
-
-    Scheduler<T>* scheduler;  // determines the highest priority request whose commands will be issued
-    RowPolicy<T>* rowpolicy;  // determines the row-policy (e.g., closed-row vs. open-row)
-    RowTable<T>* rowtable;  // tracks metadata about rows (e.g., which are open and for how long)
-    Refresh<T>* refresh;
-
-    struct Queue {
-        list<Request> q;
-        unsigned int max = 64;
-        unsigned int size() const {return q.size();}
-    };
-
-    Queue readq;  // queue for read requests
-    Queue writeq;  // queue for write requests
-    Queue actq; // read and write requests for which activate was issued are moved to 
-                   // actq, which has higher priority than readq and writeq.
-                   // This is an optimization
-                   // for avoiding useless activations (i.e., PRECHARGE
-                   // after ACTIVATE w/o READ of WRITE command)
-    Queue otherq;  // queue for all "other" requests (e.g., refresh)
-
-    deque<Request> pending;  // read requests that are about to receive data from DRAM
-    bool write_mode = false;  // whether write requests should be prioritized over reads
-    float wr_high_watermark = 0.8f; // threshold for switching to write mode
-    float wr_low_watermark = 0.2f; // threshold for switching back to read mode
-    //long refreshed = 0;  // last time refresh requests were generated
-
-    /* Command trace for DRAMPower 3.1 */
-    string cmd_trace_prefix = "cmd-trace-";
-    vector<ofstream> cmd_trace_files;
-    bool record_cmd_trace = false;
-    /* Commands to stdout */
-    bool print_cmd_trace = false;
-    RamulatorConfig& configs;
-    /* Constructor */
-    Controller(RamulatorConfig& configs, DRAM<T>* channel) :
-        configs(configs),
-        channel(channel),
-        cmd_trace_files(channel->children.size())
-    {
-        scheduler = new Scheduler<T>(this);
-        rowpolicy = new RowPolicy<T>(this);
-        rowtable = new RowTable<T>(this);
-        refresh = new Refresh<T>(this);
-        
-        record_cmd_trace = configs.record_cmd_trace();
-        print_cmd_trace = configs.print_cmd_trace();
-        if (record_cmd_trace){
-            if (configs["cmd_trace_prefix"] != "") {
-              cmd_trace_prefix = configs["cmd_trace_prefix"];
-            }
-            string prefix = cmd_trace_prefix + "chan-" + to_string(channel->id) + "-rank-";
-            string suffix = ".cmdtrace";
-            for (unsigned int i = 0; i < channel->children.size(); i++)
-                cmd_trace_files[i].open(prefix + to_string(i) + suffix);
-        }
-
-        // regStats
-
-        row_hits
-            .name("row_hits_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row hits per channel per core")
-            .precision(0)
-            ;
-        row_misses
-            .name("row_misses_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row misses per channel per core")
-            .precision(0)
-            ;
-        row_conflicts
-            .name("row_conflicts_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row conflicts per channel per core")
-            .precision(0)
-            ;
-
-        read_row_hits
-            .init(configs.get_core_num())
-            .name("read_row_hits_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row hits for read requests per channel per core")
-            .precision(0)
-            ;
-        read_row_misses
-            .init(configs.get_core_num())
-            .name("read_row_misses_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row misses for read requests per channel per core")
-            .precision(0)
-            ;
-        read_row_conflicts
-            .init(configs.get_core_num())
-            .name("read_row_conflicts_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row conflicts for read requests per channel per core")
-            .precision(0)
-            ;
-
-        write_row_hits
-            .init(configs.get_core_num())
-            .name("write_row_hits_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row hits for write requests per channel per core")
-            .precision(0)
-            ;
-        write_row_misses
-            .init(configs.get_core_num())
-            .name("write_row_misses_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row misses for write requests per channel per core")
-            .precision(0)
-            ;
-        write_row_conflicts
-            .init(configs.get_core_num())
-            .name("write_row_conflicts_channel_"+to_string(channel->id) + "_core")
-            .desc("Number of row conflicts for write requests per channel per core")
-            .precision(0)
-            ;
-
-        useless_activates
-            .name("useless_activates_"+to_string(channel->id)+ "_core")
-            .desc("Number of useless activations. E.g, ACT -> PRE w/o RD or WR")
-            .precision(0)
-            ;
-
-        read_transaction_bytes
-            .name("read_transaction_bytes_"+to_string(channel->id))
-            .desc("The total byte of read transaction per channel")
-            .precision(0)
-            ;
-        write_transaction_bytes
-            .name("write_transaction_bytes_"+to_string(channel->id))
-            .desc("The total byte of write transaction per channel")
-            .precision(0)
-            ;
-
-        read_latency_sum
-            .name("read_latency_sum_"+to_string(channel->id))
-            .desc("The memory latency cycles (in memory time domain) sum for all read requests in this channel")
-            .precision(0)
-            ;
-        read_latency_avg
-            .name("read_latency_avg_"+to_string(channel->id))
-            .desc("The average memory latency cycles (in memory time domain) per request for all read requests in this channel")
-            .precision(6)
-            ;
-
-        req_queue_length_sum
-            .name("req_queue_length_sum_"+to_string(channel->id))
-            .desc("Sum of read and write queue length per memory cycle per channel.")
-            .precision(0)
-            ;
-        req_queue_length_avg
-            .name("req_queue_length_avg_"+to_string(channel->id))
-            .desc("Average of read and write queue length per memory cycle per channel.")
-            .precision(6)
-            ;
-
-        read_req_queue_length_sum
-            .name("read_req_queue_length_sum_"+to_string(channel->id))
-            .desc("Read queue length sum per memory cycle per channel.")
-            .precision(0)
-            ;
-        read_req_queue_length_avg
-            .name("read_req_queue_length_avg_"+to_string(channel->id))
-            .desc("Read queue length average per memory cycle per channel.")
-            .precision(6)
-            ;
-
-        write_req_queue_length_sum
-            .name("write_req_queue_length_sum_"+to_string(channel->id))
-            .desc("Write queue length sum per memory cycle per channel.")
-            .precision(0)
-            ;
-        write_req_queue_length_avg
-            .name("write_req_queue_length_avg_"+to_string(channel->id))
-            .desc("Write queue length average per memory cycle per channel.")
-            .precision(6)
-            ;
-
-        record_read_hits
-            .init(configs.get_core_num())
-            .name("record_read_hits")
-            .desc("record read hit count for this core when it reaches request limit or to the end")
-            ;
-
-        record_read_misses
-            .init(configs.get_core_num())
-            .name("record_read_misses")
-            .desc("record_read_miss count for this core when it reaches request limit or to the end")
-            ;
-
-        record_read_conflicts
-            .init(configs.get_core_num())
-            .name("record_read_conflicts")
-            .desc("record read conflict count for this core when it reaches request limit or to the end")
-            ;
-
-        record_write_hits
-            .init(configs.get_core_num())
-            .name("record_write_hits")
-            .desc("record write hit count for this core when it reaches request limit or to the end")
-            ;
-
-        record_write_misses
-            .init(configs.get_core_num())
-            .name("record_write_misses")
-            .desc("record write miss count for this core when it reaches request limit or to the end")
-            ;
-
-        record_write_conflicts
-            .init(configs.get_core_num())
-            .name("record_write_conflicts")
-            .desc("record write conflict for this core when it reaches request limit or to the end")
-            ;
-    }
-
-    ~Controller(){
-        delete scheduler;
-        delete rowpolicy;
-        delete rowtable;
-        delete channel;
-        delete refresh;
-        for (auto& file : cmd_trace_files)
-            file.close();
-        cmd_trace_files.clear();
-    }
-
-    void finish(long read_req, long dram_cycles) {
-      read_latency_avg = read_latency_sum.value() / read_req;
-      req_queue_length_avg = req_queue_length_sum.value() / dram_cycles;
-      read_req_queue_length_avg = read_req_queue_length_sum.value() / dram_cycles;
-      write_req_queue_length_avg = write_req_queue_length_sum.value() / dram_cycles;
-      // call finish function of each channel
-      channel->finish(dram_cycles);
-    }
-
-    /* Member Functions */
-    Queue& get_queue(Request::Type type)
-    {
-        switch (int(type)) {
-            case int(Request::Type::READ): return readq;
-            case int(Request::Type::WRITE): return writeq;
-            default: return otherq;
-        }
-    }
-
-    bool done() const {
-      return readq.size() == 0 && writeq.size() == 0;
-    }
-
-    bool is_full(bool is_write) {
-      Request::Type type = is_write ? Request::Type::WRITE : Request::Type::READ;
-      auto& queue = get_queue(type);
-      assert(queue.size() <= queue.max);
-      return queue.size() == queue.max;
-    }
-
-    bool enqueue(Request& req)
-    {
-        Queue& queue = get_queue(req.type);
-        if (queue.max == queue.size())
-            return false;
-
-        req.arrive = clk;
-        queue.q.push_back(req);
-        // shortcut for read requests, if a write to same addr exists
-        // necessary for coherence
-        // FIX: currently disable this because the write request of newfeature
-        // FIX: is same as read address
-        // if (req.type == Request::Type::READ && find_if(writeq.q.begin(), writeq.q.end(),
-        //         [req](Request& wreq){ return req.addr == wreq.addr;}) != writeq.q.end()){
-        //     req.depart = clk + 1;
-        //     pending.push_back(req);
-        //     readq.q.pop_back();
-        // }
-        return true;
-    }
-
-    void tick()
-    {
-        clk++;
-        req_queue_length_sum += readq.size() + writeq.size() + pending.size();
-        read_req_queue_length_sum += readq.size() + pending.size();
-        write_req_queue_length_sum += writeq.size();
-
-        /*** 1. Serve completed reads ***/
-        if (pending.size()) {
-            Request& req = pending[0];
-            assert(req.type == Request::Type::READ);
-            if (req.depart <= clk) {
-                if (req.depart - req.arrive > 1) { // this request really accessed a row
-                  read_latency_sum += req.depart - req.arrive;
-                  channel->update_serving_requests(
-                      req.addr_vec.data(), -1, clk);
-                }
-                req.callback(req);
-                pending.pop_front();
-            }
-        }
-
-        /*** 2. Refresh scheduler ***/
-        refresh->tick_ref();
-
-        /*** 3. Should we schedule writes? ***/
-        if (!write_mode) {
-            // yes -- write queue is almost full or read queue is empty
-            if (writeq.size() > int(wr_high_watermark * writeq.max) || readq.size() == 0)
-                write_mode = true;
-        }
-        else {
-            // no -- write queue is almost empty and read queue is not empty
-            if (writeq.size() < int(wr_low_watermark * writeq.max) && readq.size() != 0)
-                write_mode = false;
-        }
-
-        /*** 4. Find the best command to schedule, if any ***/
-
-        // First check the actq (which has higher priority) to see if there
-        // are requests available to service in this cycle
-        Queue* queue = &actq;
-        typename T::Command cmd;
-        auto req = scheduler->get_head(queue->q);
-
-        bool is_valid_req = (req != queue->q.end());
-
-        if(is_valid_req) {
-            cmd = get_first_cmd(req);
-            is_valid_req = is_ready(cmd, req->addr_vec);
-        }
-
-        if (!is_valid_req) {
-            queue = !write_mode ? &readq : &writeq;
-
-            if (otherq.size())
-                queue = &otherq;  // "other" requests are rare, so we give them precedence over reads/writes
-
-            req = scheduler->get_head(queue->q);
-
-            is_valid_req = (req != queue->q.end());
-
-            if(is_valid_req){
-                cmd = get_first_cmd(req);
-                is_valid_req = is_ready(cmd, req->addr_vec);
-            }
-        }
-
-        if (!is_valid_req) {
-            // we couldn't find a command to schedule -- let's try to be speculative
-            auto cmd = T::Command::PRE;
-            vector<int> victim = rowpolicy->get_victim(cmd);
-            if (!victim.empty()){
-                issue_cmd(cmd, victim);
-            }
-            return;  // nothing more to be done this cycle
-        }
-
-        if (req->is_first_command) {
-            req->is_first_command = false;
-            int coreid = req->coreid;
-            if (req->type == Request::Type::READ || req->type == Request::Type::WRITE) {
-              channel->update_serving_requests(req->addr_vec.data(), 1, clk);
-            }
-            int tx = (channel->spec->prefetch_size * channel->spec->channel_width / 8);
-            if (req->type == Request::Type::READ) {
-                if (is_row_hit(req)) {
-                    ++read_row_hits[coreid];
-                    ++row_hits;
-                } else if (is_row_open(req)) {
-                    ++read_row_conflicts[coreid];
-                    ++row_conflicts;
-                } else {
-                    ++read_row_misses[coreid];
-                    ++row_misses;
-                }
-              read_transaction_bytes += tx;
-            } else if (req->type == Request::Type::WRITE) {
-              if (is_row_hit(req)) {
-                  ++write_row_hits[coreid];
-                  ++row_hits;
-              } else if (is_row_open(req)) {
-                  ++write_row_conflicts[coreid];
-                  ++row_conflicts;
-              } else {
-                  ++write_row_misses[coreid];
-                  ++row_misses;
-              }
-              write_transaction_bytes += tx;
-            }
-        }
-
-        // issue command on behalf of request
-        issue_cmd(cmd, get_addr_vec(cmd, req));
-
-        // check whether this is the last command (which finishes the request)
-        //if (cmd != channel->spec->translate[int(req->type)]){
-        if (cmd != channel->spec->translate[int(req->type)]) {
-            if(channel->spec->is_opening(cmd)) {
-                // promote the request that caused issuing activation to actq
-                actq.q.push_back(*req);
-                queue->q.erase(req);
-            }
-
-            return;
-        }
-
-        // set a future completion time for read requests
-        if (req->type == Request::Type::READ) {
-            req->depart = clk + channel->spec->read_latency;
-            pending.push_back(*req);
-        }
-
-        if (req->type == Request::Type::WRITE || req->type == Request::Type::PIM_WRITE) {
-            channel->update_serving_requests(req->addr_vec.data(), -1, clk);
-            req->callback(*req);
-        }
-
-        // remove request from queue
-        queue->q.erase(req);
-    }
-
-    bool is_ready(list<Request>::iterator req)
-    {
-        typename T::Command cmd = get_first_cmd(req);
-        return channel->check(cmd, req->addr_vec.data(), clk);
-    }
-
-    bool is_ready(typename T::Command cmd, const vector<int>& addr_vec)
-    {
-        return channel->check(cmd, addr_vec.data(), clk);
-    }
-
-    bool is_row_hit(list<Request>::iterator req)
-    {
-        // cmd must be decided by the request type, not the first cmd
-        typename T::Command cmd = channel->spec->translate[int(req->type)];
-        return channel->check_row_hit(cmd, req->addr_vec.data());
-    }
-
-    bool is_row_hit(typename T::Command cmd, const vector<int>& addr_vec)
-    {
-        return channel->check_row_hit(cmd, addr_vec.data());
-    }
-
-    bool is_row_open(list<Request>::iterator req)
-    {
-        // cmd must be decided by the request type, not the first cmd
-        typename T::Command cmd = channel->spec->translate[int(req->type)];
-        return channel->check_row_open(cmd, req->addr_vec.data());
-    }
-
-    bool is_row_open(typename T::Command cmd, const vector<int>& addr_vec)
-    {
-        return channel->check_row_open(cmd, addr_vec.data());
-    }
-
-    // void update_temp(ALDRAM::Temp current_temperature)
-    // {
-    // }
-
-    // For telling whether this channel is busying in processing read or write
-    bool is_active() {
-      return (channel->cur_serving_requests > 0);
-    }
-
-    // For telling whether this channel is under refresh
-    bool is_refresh() {
-      return clk <= channel->end_of_refreshing;
-    }
-
-    void set_high_writeq_watermark(const float watermark) {
-       wr_high_watermark = watermark; 
-    }
-
-    void set_low_writeq_watermark(const float watermark) {
-       wr_low_watermark = watermark;
-    }
-
-    void record_core(int coreid) {
-      record_read_hits[coreid] = read_row_hits[coreid];
-      record_read_misses[coreid] = read_row_misses[coreid];
-      record_read_conflicts[coreid] = read_row_conflicts[coreid];
-      record_write_hits[coreid] = write_row_hits[coreid];
-      record_write_misses[coreid] = write_row_misses[coreid];
-      record_write_conflicts[coreid] = write_row_conflicts[coreid];
-    }
-
-private:
-    typename T::Command get_first_cmd(list<Request>::iterator req)
-    {
-        typename T::Command cmd = channel->spec->translate[int(req->type)];
-        return channel->decode(cmd, req->addr_vec.data());
-    }
-
-    // upgrade to an autoprecharge command
-    void cmd_issue_autoprecharge(typename T::Command& cmd,
-                                            const vector<int>& addr_vec) {
-
-        // currently, autoprecharge is only used with closed row policy
-        if(channel->spec->is_accessing(cmd) && rowpolicy->type == RowPolicy<T>::Type::ClosedAP) {
-            // check if it is the last request to the opened row
-            Queue* queue = write_mode ? &writeq : &readq;
-
-            auto begin = addr_vec.begin();
-            vector<int> rowgroup(begin, begin + int(T::Level::Row) + 1);
-
-			int num_row_hits = 0;
-
-            for (auto itr = queue->q.begin(); itr != queue->q.end(); ++itr) {
-                if (is_row_hit(itr)) { 
-                    auto begin2 = itr->addr_vec.begin();
-                    vector<int> rowgroup2(begin2, begin2 + int(T::Level::Row) + 1);
-                    if(rowgroup == rowgroup2)
-                        num_row_hits++;
-                }
-            }
-
-            if(num_row_hits == 0) {
-                Queue* queue = &actq;
-                for (auto itr = queue->q.begin(); itr != queue->q.end(); ++itr) {
-                    if (is_row_hit(itr)) {
-                        auto begin2 = itr->addr_vec.begin();
-                        vector<int> rowgroup2(begin2, begin2 + int(T::Level::Row) + 1);
-                        if(rowgroup == rowgroup2)
-                            num_row_hits++;
-                    }
-                }
-            }
-
-            assert(num_row_hits > 0); // The current request should be a hit, 
-                                      // so there should be at least one request 
-                                      // that hits in the current open row
-            if(num_row_hits == 1) {
-                if(cmd == T::Command::RD)
-                    cmd = T::Command::RDA;
-                else if (cmd == T::Command::WR)
-                    cmd = T::Command::WRA;
-                else
-                    assert(false && "Unimplemented command type.");
-            }
-        }
-
-    }
-
-    void issue_cmd(typename T::Command cmd, const vector<int>& addr_vec)
-    {
-        cmd_issue_autoprecharge(cmd, addr_vec);
-        assert(is_ready(cmd, addr_vec));
-        channel->update(cmd, addr_vec.data(), clk);
-
-        if(cmd == T::Command::PRE){
-            if(rowtable->get_hits(addr_vec, true) == 0){
-                useless_activates++;
-            }
-        }
- 
-        rowtable->update(cmd, addr_vec, clk);
-        if (record_cmd_trace){
-            // select rank
-            auto& file = cmd_trace_files[addr_vec[1]];
-            string& cmd_name = channel->spec->command_name[int(cmd)];
-            file<<clk<<','<<cmd_name;
-            // TODO bad coding here
-            if (cmd_name == "PREA" || cmd_name == "REF")
-                file<<endl;
-            else{
-                int bank_id = addr_vec[int(T::Level::Bank)];
-                if (channel->spec->standard_name == "DDR4" || channel->spec->standard_name == "GDDR5")
-                    bank_id += addr_vec[int(T::Level::Bank) - 1] * channel->spec->org_entry.count[int(T::Level::Bank)];
-                file<<','<<bank_id<<endl;
-            }
-        }
-        if (print_cmd_trace){
-            printf("%5s %10ld:", channel->spec->command_name[int(cmd)].c_str(), clk);
-            for (int lev = 0; lev < int(T::Level::MAX); lev++)
-                printf(" %5d", addr_vec[lev]);
-            printf("\n");
-        }
-    }
-    vector<int> get_addr_vec(typename T::Command cmd, list<Request>::iterator req){
-        return req->addr_vec;
-    }
-};
-
-// template <>
-// vector<int> Controller<SALP>::get_addr_vec(
-//     SALP::Command cmd, list<Request>::iterator req);
-//
-// template <>
-// bool Controller<SALP>::is_ready(list<Request>::iterator req);
-//
-// template <>
-// void Controller<ALDRAM>::update_temp(ALDRAM::Temp current_temperature);
-//
-// template <>
-// void Controller<TLDRAM>::tick();
-//
-// template <>
-// void Controller<TLDRAM>::cmd_issue_autoprecharge(typename TLDRAM::Command& cmd,
-//                                                     const vector<int>& addr_vec);
-//
-} /*namespace ram*/
-
-#endif /*__CONTROLLER_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/DDR4.cpp b/TOGSim/extern/ramulator_custom/src/DDR4.cpp
deleted file mode 100644
index 31064182..00000000
--- a/TOGSim/extern/ramulator_custom/src/DDR4.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-#include "DDR4.h"
-#include "DRAM.h"
-
-using namespace std;
-using namespace ram;
-
-string DDR4::standard_name = "DDR4";
-string DDR4::level_str [int(Level::MAX)] = {"Ch", "Ra", "Bg", "Ba", "Ro", "Co"};
-
-map<string, enum DDR4::Org> DDR4::org_map = {
-    {"DDR4_2Gb_x4", DDR4::Org::DDR4_2Gb_x4}, {"DDR4_2Gb_x8", DDR4::Org::DDR4_2Gb_x8}, {"DDR4_2Gb_x16", DDR4::Org::DDR4_2Gb_x16},
-    {"DDR4_4Gb_x4", DDR4::Org::DDR4_4Gb_x4}, {"DDR4_4Gb_x8", DDR4::Org::DDR4_4Gb_x8}, {"DDR4_4Gb_x16", DDR4::Org::DDR4_4Gb_x16},
-    {"DDR4_8Gb_x4", DDR4::Org::DDR4_8Gb_x4}, {"DDR4_8Gb_x8", DDR4::Org::DDR4_8Gb_x8}, {"DDR4_8Gb_x16", DDR4::Org::DDR4_8Gb_x16},
-};
-
-map<string, enum DDR4::Speed> DDR4::speed_map = {
-    {"DDR4_1600K", DDR4::Speed::DDR4_1600K}, {"DDR4_1600L", DDR4::Speed::DDR4_1600L},
-    {"DDR4_1866M", DDR4::Speed::DDR4_1866M}, {"DDR4_1866N", DDR4::Speed::DDR4_1866N},
-    {"DDR4_2133P", DDR4::Speed::DDR4_2133P}, {"DDR4_2133R", DDR4::Speed::DDR4_2133R},
-    {"DDR4_2400R", DDR4::Speed::DDR4_2400R}, {"DDR4_2400U", DDR4::Speed::DDR4_2400U},
-    {"DDR4_3200", DDR4::Speed::DDR4_3200},
-};
-
-
-DDR4::DDR4(Org org, Speed speed)
-    : org_entry(org_table[int(org)]),
-    speed_entry(speed_table[int(speed)]), 
-    read_latency(speed_entry.nCL + speed_entry.nBL)
-{
-    init_speed();
-    init_prereq();
-    init_rowhit(); // SAUGATA: added row hit function
-    init_rowopen();
-    init_lambda();
-    init_timing();
-}
-
-DDR4::DDR4(const string& org_str, const string& speed_str) :
-    DDR4(org_map[org_str], speed_map[speed_str]) 
-{
-}
-
-void DDR4::set_channel_number(int channel) {
-  org_entry.count[int(Level::Channel)] = channel;
-}
-
-void DDR4::set_rank_number(int rank) {
-  org_entry.count[int(Level::Rank)] = rank;
-}
-
-void DDR4::init_speed()
-{
-    const static int RRDS_TABLE[2][5] = {
-        {4, 4, 4, 4, 4},
-        {5, 5, 6, 7, 9}
-    };
-    const static int RRDL_TABLE[2][5] = {
-        {5, 5, 6, 6, 8},
-        {6, 6, 7, 8, 11}
-    };
-    const static int FAW_TABLE[3][5] = {
-        {16, 16, 16, 16, 16},
-        {20, 22, 23, 26, 34},
-        {28, 28, 32, 36, 48}
-    };
-    const static int RFC_TABLE[int(RefreshMode::MAX)][3][5] = {{   
-            {128, 150, 171, 192, 256},
-            {208, 243, 278, 312, 416},
-            {280, 327, 374, 420, 560}
-        },{
-            {88, 103, 118, 132,  176},
-            {128, 150, 171, 192, 256},
-            {208, 243, 278, 312, 416} 
-        },{
-            {72, 84, 96, 108, 144},
-            {88, 103, 118, 132, 176},
-            {128, 150, 171, 192, 256}  
-        }
-    };
-    const static int REFI_TABLE[5] = {
-        6240, 7280, 8320, 9360, 12480
-    };
-    const static int XS_TABLE[3][5] = {
-        {136, 159, 182, 204, 272},
-        {216, 252, 288, 324, 432},
-        {288, 336, 384, 432, 576}
-    };
-
-    int speed = 0, density = 0;
-    switch (speed_entry.rate) {
-        case 1600: speed = 0; break;
-        case 1866: speed = 1; break;
-        case 2133: speed = 2; break;
-        case 2400: speed = 3; break;
-        case 3200: speed = 4; break;
-        default: assert(false);
-    };
-    switch (org_entry.size >> 10){
-        case 2: density = 0; break;
-        case 4: density = 1; break;
-        case 8: density = 2; break;
-        default: assert(false);
-    }
-    speed_entry.nRRDS = RRDS_TABLE[org_entry.dq == 16? 1: 0][speed];
-    speed_entry.nRRDL = RRDL_TABLE[org_entry.dq == 16? 1: 0][speed];
-    speed_entry.nFAW = FAW_TABLE[org_entry.dq == 4? 0: org_entry.dq == 8? 1: 2][speed];
-    speed_entry.nRFC = RFC_TABLE[(int)refresh_mode][density][speed];
-    speed_entry.nREFI = (REFI_TABLE[speed] >> int(refresh_mode));
-    speed_entry.nXS = XS_TABLE[density][speed];
-}
-
-
-void DDR4::init_prereq()
-{
-    // RD
-    prereq[int(Level::Rank)][int(Command::RD)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::MAX;
-            case int(State::ActPowerDown): return Command::PDX;
-            case int(State::PrePowerDown): return Command::PDX;
-            case int(State::SelfRefresh): return Command::SRX;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-    prereq[int(Level::Bank)][int(Command::RD)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return Command::ACT;
-            case int(State::Opened):
-                if (node->row_state.find(id) != node->row_state.end())
-                    return cmd;
-                else return Command::PRE;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-
-    // WR
-    prereq[int(Level::Rank)][int(Command::WR)] = prereq[int(Level::Rank)][int(Command::RD)];
-    prereq[int(Level::Bank)][int(Command::WR)] = prereq[int(Level::Bank)][int(Command::RD)];
-
-    // REF
-    prereq[int(Level::Rank)][int(Command::REF)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        for (auto bg : node->children)
-            for (auto bank: bg->children) {
-                if (bank->state == State::Closed)
-                    continue;
-                return Command::PREA;
-            }
-        return Command::REF;};
-
-    // PD
-    prereq[int(Level::Rank)][int(Command::PDE)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::PDE;
-            case int(State::ActPowerDown): return Command::PDE;
-            case int(State::PrePowerDown): return Command::PDE;
-            case int(State::SelfRefresh): return Command::SRX;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-
-    // SR
-    prereq[int(Level::Rank)][int(Command::SRE)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::SRE;
-            case int(State::ActPowerDown): return Command::PDX;
-            case int(State::PrePowerDown): return Command::PDX;
-            case int(State::SelfRefresh): return Command::SRE;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-}
-
-// SAUGATA: added row hit check functions to see if the desired location is currently open
-void DDR4::init_rowhit()
-{
-    // RD
-    rowhit[int(Level::Bank)][int(Command::RD)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return false;
-            case int(State::Opened):
-                if (node->row_state.find(id) != node->row_state.end())
-                    return true;
-                return false;
-            default: {
-              assert(false);
-              return false;
-            }
-        }};
-
-    // WR
-    rowhit[int(Level::Bank)][int(Command::WR)] = rowhit[int(Level::Bank)][int(Command::RD)];
-}
-
-void DDR4::init_rowopen()
-{
-    // RD
-    rowopen[int(Level::Bank)][int(Command::RD)] = [] (DRAM<DDR4>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return false;
-            case int(State::Opened): return true;
-            default: {
-              assert(false);
-              return false;
-            }
-        }};
-
-    // WR
-    rowopen[int(Level::Bank)][int(Command::WR)] = rowopen[int(Level::Bank)][int(Command::RD)];
-}
-
-void DDR4::init_lambda()
-{
-    lambda[int(Level::Bank)][int(Command::ACT)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::Opened;
-        node->row_state[id] = State::Opened;};
-    lambda[int(Level::Bank)][int(Command::PRE)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Rank)][int(Command::PREA)] = [] (DRAM<DDR4>* node, int id) {
-        for (auto bg : node->children)
-            for (auto bank : bg->children) {
-                bank->state = State::Closed;
-                bank->row_state.clear();
-            }};
-    lambda[int(Level::Rank)][int(Command::REF)] = [] (DRAM<DDR4>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::RD)] = [] (DRAM<DDR4>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::WR)] = [] (DRAM<DDR4>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::RDA)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Bank)][int(Command::WRA)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Rank)][int(Command::PDE)] = [] (DRAM<DDR4>* node, int id) {
-        for (auto bg : node->children)
-            for (auto bank : bg->children) {
-                if (bank->state == State::Closed)
-                    continue;
-                node->state = State::ActPowerDown;
-                return;
-            }
-        node->state = State::PrePowerDown;};
-    lambda[int(Level::Rank)][int(Command::PDX)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::PowerUp;};
-    lambda[int(Level::Rank)][int(Command::SRE)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::SelfRefresh;};
-    lambda[int(Level::Rank)][int(Command::SRX)] = [] (DRAM<DDR4>* node, int id) {
-        node->state = State::PowerUp;};
-}
-
-
-void DDR4::init_timing()
-{
-    SpeedEntry& s = speed_entry;
-    vector<TimingEntry> *t;
-
-    /*** Channel ***/ 
-    t = timing[int(Level::Channel)];
-
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nBL});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL});
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nBL});
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nBL});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nBL});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nBL});
-
-
-    /*** Rank ***/ 
-    t = timing[int(Level::Rank)];
-
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDS});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDS});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDS});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDS});
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDS});
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDS});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDS});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDS});
-    t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nBL + 2 - s.nCWL});
-    t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nBL + 2 - s.nCWL});
-    t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nBL + 2 - s.nCWL});
-    t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nBL + 2 - s.nCWL});
-    t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS});
-
-    // CAS <-> CAS (between sibling ranks)
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RD)].push_back({Command::WR, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RD)].push_back({Command::WRA, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RDA)].push_back({Command::WR, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RDA)].push_back({Command::WRA, 1, s.nBL + s.nRTRS, true});
-    t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true});
-    t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true});
-    t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true});
-    t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nBL + s.nRTRS - s.nCWL, true});
-    t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true});
-    t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true});
-    t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true});
-    t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nRTRS - s.nCL, true});
-
-    t[int(Command::RD)].push_back({Command::PREA, 1, s.nRTP});
-    t[int(Command::WR)].push_back({Command::PREA, 1, s.nCWL + s.nBL + s.nWR});
-
-    // CAS <-> PD
-    t[int(Command::RD)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1});
-    t[int(Command::RDA)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1});
-    t[int(Command::WR)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR});
-    t[int(Command::WRA)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR + 1}); // +1 for pre
-    t[int(Command::PDX)].push_back({Command::RD, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::RDA, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::WR, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::WRA, 1, s.nXP});
-    
-    // CAS <-> SR: none (all banks have to be precharged)
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDS});
-    t[int(Command::ACT)].push_back({Command::ACT, 4, s.nFAW});
-    t[int(Command::ACT)].push_back({Command::PREA, 1, s.nRAS});
-    t[int(Command::PREA)].push_back({Command::ACT, 1, s.nRP});
-
-    // RAS <-> REF
-    t[int(Command::ACT)].push_back({Command::REF, 1, s.nRC});
-    t[int(Command::PRE)].push_back({Command::REF, 1, s.nRP});
-    t[int(Command::PREA)].push_back({Command::REF, 1, s.nRP});
-    t[int(Command::RDA)].push_back({Command::REF, 1, s.nRTP + s.nRP});
-    t[int(Command::WRA)].push_back({Command::REF, 1, s.nCWL + s.nBL + s.nWR + s.nRP});
-    t[int(Command::REF)].push_back({Command::ACT, 1, s.nRFC});
-
-    // RAS <-> PD
-    t[int(Command::ACT)].push_back({Command::PDE, 1, 1});
-    t[int(Command::PDX)].push_back({Command::ACT, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::PRE, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::PREA, 1, s.nXP});
-
-    // RAS <-> SR
-    t[int(Command::PRE)].push_back({Command::SRE, 1, s.nRP});
-    t[int(Command::PREA)].push_back({Command::SRE, 1, s.nRP});
-    t[int(Command::SRX)].push_back({Command::ACT, 1, s.nXS});
-
-    // REF <-> REF
-    t[int(Command::REF)].push_back({Command::REF, 1, s.nRFC});
-
-    // REF <-> PD
-    t[int(Command::REF)].push_back({Command::PDE, 1, 1});
-    t[int(Command::PDX)].push_back({Command::REF, 1, s.nXP});
-
-    // REF <-> SR
-    t[int(Command::SRX)].push_back({Command::REF, 1, s.nXS});
-    
-    // PD <-> PD
-    t[int(Command::PDE)].push_back({Command::PDX, 1, s.nPD});
-    t[int(Command::PDX)].push_back({Command::PDE, 1, s.nXP});
-
-    // PD <-> SR
-    t[int(Command::PDX)].push_back({Command::SRE, 1, s.nXP});
-    t[int(Command::SRX)].push_back({Command::PDE, 1, s.nXS});
-    
-    // SR <-> SR
-    t[int(Command::SRE)].push_back({Command::SRX, 1, s.nCKESR});
-    t[int(Command::SRX)].push_back({Command::SRE, 1, s.nXS});
-
-    /*** Bank Group ***/ 
-    t = timing[int(Level::BankGroup)];
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDL});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDL});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDL});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDL});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL});
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDL});
-
-    /*** Bank ***/ 
-    t = timing[int(Level::Bank)];
-
-    // CAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::RD, 1, s.nRCD});
-    t[int(Command::ACT)].push_back({Command::RDA, 1, s.nRCD});
-    t[int(Command::ACT)].push_back({Command::WR, 1, s.nRCD});
-    t[int(Command::ACT)].push_back({Command::WRA, 1, s.nRCD});
-
-    t[int(Command::RD)].push_back({Command::PRE, 1, s.nRTP});
-    t[int(Command::WR)].push_back({Command::PRE, 1, s.nCWL + s.nBL + s.nWR});
-
-    t[int(Command::RDA)].push_back({Command::ACT, 1, s.nRTP + s.nRP});
-    t[int(Command::WRA)].push_back({Command::ACT, 1, s.nCWL + s.nBL + s.nWR + s.nRP});
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRC});
-    t[int(Command::ACT)].push_back({Command::PRE, 1, s.nRAS});
-    t[int(Command::PRE)].push_back({Command::ACT, 1, s.nRP});
-}
diff --git a/TOGSim/extern/ramulator_custom/src/DDR4.h b/TOGSim/extern/ramulator_custom/src/DDR4.h
deleted file mode 100644
index 0808dc80..00000000
--- a/TOGSim/extern/ramulator_custom/src/DDR4.h
+++ /dev/null
@@ -1,220 +0,0 @@
-#ifndef __DDR4_H
-#define __DDR4_H
-
-#include <map>
-#include <string>
-#include <vector>
-#include <functional>
-
-#include "Request.h"
-
-using namespace std;
-
-namespace ram
-{
-template <typename T>
-class DRAM;
-
-class DDR4
-{
-public:
-    static string standard_name;
-    enum class Org;
-    enum class Speed;
-    DDR4(Org org, Speed speed);
-    DDR4(const string& org_str, const string& speed_str);
-    
-    static map<string, enum Org> org_map;
-    static map<string, enum Speed> speed_map;
-    /* Level */
-    enum class Level : int
-    { 
-        Channel, Rank, BankGroup, Bank, Row, Column, MAX
-    };
-    
-    static std::string level_str [int(Level::MAX)];
-
-    /* Command */
-    enum class Command : int
-    { 
-        ACT, PRE, PREA, 
-        RD,  WR,  RDA,  WRA, 
-        REF, PDE, PDX,  SRE, SRX, 
-        MAX
-    };
-
-    string command_name[int(Command::MAX)] = {
-        "ACT", "PRE", "PREA", 
-        "RD",  "WR",  "RDA",  "WRA", 
-        "REF", "PDE", "PDX",  "SRE", "SRX"
-    };
-
-    Level scope[int(Command::MAX)] = {
-        Level::Row,    Level::Bank,   Level::Rank,   
-        Level::Column, Level::Column, Level::Column, Level::Column,
-        Level::Rank,   Level::Rank,   Level::Rank,   Level::Rank,   Level::Rank
-    };
-
-    bool is_opening(Command cmd) 
-    {
-        switch(int(cmd)) {
-            case int(Command::ACT):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_accessing(Command cmd) 
-    {
-        switch(int(cmd)) {
-            case int(Command::RD):
-            case int(Command::WR):
-            case int(Command::RDA):
-            case int(Command::WRA):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_closing(Command cmd) 
-    {
-        switch(int(cmd)) {
-            case int(Command::RDA):
-            case int(Command::WRA):
-            case int(Command::PRE):
-            case int(Command::PREA):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_refreshing(Command cmd) 
-    {
-        switch(int(cmd)) {
-            case int(Command::REF):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    /* State */
-    enum class State : int
-    {
-        Opened, Closed, PowerUp, ActPowerDown, PrePowerDown, SelfRefresh, MAX
-    } start[int(Level::MAX)] = {
-        State::MAX, State::PowerUp, State::MAX, State::Closed, State::Closed, State::MAX
-    };
-
-    /* Translate */
-    Command translate[int(Request::Type::MAX)] = {
-        Command::RD,  Command::WR,
-        Command::REF, Command::PDE, Command::SRE
-    };
-
-    /* Prereq */
-    function<Command(DRAM<DDR4>*, Command cmd, int)> prereq[int(Level::MAX)][int(Command::MAX)];
-
-    // SAUGATA: added function object container for row hit status
-    /* Row hit */
-    function<bool(DRAM<DDR4>*, Command cmd, int)> rowhit[int(Level::MAX)][int(Command::MAX)];
-    function<bool(DRAM<DDR4>*, Command cmd, int)> rowopen[int(Level::MAX)][int(Command::MAX)];
-
-    /* Timing */
-    struct TimingEntry
-    {
-        Command cmd;
-        int dist;
-        int val;
-        bool sibling;
-    }; 
-    vector<TimingEntry> timing[int(Level::MAX)][int(Command::MAX)];
-
-    /* Lambda */
-    function<void(DRAM<DDR4>*, int)> lambda[int(Level::MAX)][int(Command::MAX)];
-
-    /* Organization */
-    enum class Org : int
-    {
-        DDR4_2Gb_x4,   DDR4_2Gb_x8,   DDR4_2Gb_x16,
-        DDR4_4Gb_x4,   DDR4_4Gb_x8,   DDR4_4Gb_x16,
-        DDR4_8Gb_x4,   DDR4_8Gb_x8,   DDR4_8Gb_x16,
-        MAX
-    };
-
-    struct OrgEntry {
-        int size;
-        int dq;
-        int count[int(Level::MAX)];
-    } org_table[int(Org::MAX)] = {
-        {2<<10,  4, {0, 0, 4, 4, 1<<15, 1<<10}}, {2<<10,  8, {0, 0, 4, 4, 1<<14, 1<<10}}, {2<<10, 16, {0, 0, 2, 4, 1<<14, 1<<10}},
-        {4<<10,  4, {0, 0, 4, 4, 1<<16, 1<<10}}, {4<<10,  8, {0, 0, 4, 4, 1<<15, 1<<10}}, {4<<10, 16, {0, 0, 2, 4, 1<<15, 1<<10}},
-        {8<<10,  4, {0, 0, 4, 4, 1<<17, 1<<10}}, {8<<10,  8, {0, 0, 4, 4, 1<<16, 1<<10}}, {8<<10, 16, {0, 0, 2, 4, 1<<16, 1<<10}}
-    }, org_entry;
-
-    void set_channel_number(int channel);
-    void set_rank_number(int rank);
-
-    /* Speed */
-    enum class Speed : int
-    {
-        DDR4_1600K, DDR4_1600L,
-        DDR4_1866M, DDR4_1866N,
-        DDR4_2133P, DDR4_2133R,
-        DDR4_2400R, DDR4_2400U,
-        DDR4_3200,
-        MAX
-    };
-
-    enum class RefreshMode : int
-    {
-        Refresh_1X,
-        Refresh_2X,
-        Refresh_4X,
-        MAX
-    } refresh_mode = RefreshMode::Refresh_1X;
-
-    int prefetch_size = 8; // 8n prefetch DDR
-    int channel_width = 64;
-
-    struct SpeedEntry {
-        int rate;
-        double freq, tCK;
-        int nBL, nCCDS, nCCDL, nRTRS;
-        int nCL, nRCD, nRP, nCWL;
-        int nRAS, nRC;
-        int nRTP, nWTRS, nWTRL, nWR;
-        int nRRDS, nRRDL, nFAW;
-        int nRFC, nREFI;
-        int nPD, nXP, nXPDLL; // XPDLL not found in DDR4??
-        int nCKESR, nXS, nXSDLL; // nXSDLL TBD (nDLLK), nXS = (tRFC+10ns)/tCK
-    } speed_table[int(Speed::MAX)] = {
-        {1600, (400.0/3)*6, (3/0.4)/6, 4, 4, 5, 2, 11, 11, 11,  9, 28, 39, 6, 2, 6, 12, 0, 0, 0, 0, 0, 4, 5, 0, 5, 0, 0},
-        {1600, (400.0/3)*6, (3/0.4)/6, 4, 4, 5, 2, 12, 12, 12,  9, 28, 40, 6, 2, 6, 12, 0, 0, 0, 0, 0, 4, 5, 0, 5, 0, 0},
-        {1866, (400.0/3)*7, (3/0.4)/7, 4, 4, 5, 2, 13, 13, 13, 10, 32, 45, 7, 3, 7, 14, 0, 0, 0, 0, 0, 5, 6, 0, 6, 0, 0},
-        {1866, (400.0/3)*7, (3/0.4)/7, 4, 4, 5, 2, 14, 14, 14, 10, 32, 46, 7, 3, 7, 14, 0, 0, 0, 0, 0, 5, 6, 0, 6, 0, 0},
-        {2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 15, 15, 15, 11, 36, 51, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0},
-        {2133, (400.0/3)*8, (3/0.4)/8, 4, 4, 6, 2, 16, 16, 16, 11, 36, 52, 8, 3, 8, 16, 0, 0, 0, 0, 0, 6, 7, 0, 7, 0, 0},
-        {2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 16, 16, 16, 12, 39, 55, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0},
-        {2400, (400.0/3)*9, (3/0.4)/9, 4, 4, 6, 2, 18, 18, 18, 12, 39, 57, 9, 3, 9, 18, 0, 0, 0, 0, 0, 6, 8, 0, 7, 0, 0},
-        {3200, 1600, 0.625, prefetch_size/2/*DDR*/, 4,     10,   2,    22, 22,  22, 16,  56,  78, 12,  4,    12,   24, 8,    10,   40,  0,   0,    8,  10, 0,     8,     0,  0}
-        //rate, freq, tCK,  nBL,           nCCDS  nCCDL nRTRS nCL nRCD nRP nCWL nRAS nRC nRTP nWTRS nWTRL nWR nRRDS nRRDL nFAW nRFC nREFI nPD nXP nXPDLL nCKESR nXS nXSDLL
-    }, speed_entry;
-
-    int read_latency;
-
-private:
-    void init_speed();
-    void init_lambda();
-    void init_prereq();
-    void init_rowhit();  // SAUGATA: added function to check for row hits
-    void init_rowopen();
-    void init_timing();
-};
-
-} /*namespace ram*/
-
-#endif /*__DDR4_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/DRAM.h b/TOGSim/extern/ramulator_custom/src/DRAM.h
deleted file mode 100644
index fe5405b6..00000000
--- a/TOGSim/extern/ramulator_custom/src/DRAM.h
+++ /dev/null
@@ -1,453 +0,0 @@
-#ifndef __DRAM_H
-#define __DRAM_H
-
-#include "Statistics.h"
-#include <iostream>
-#include <vector>
-#include <deque>
-#include <map>
-#include <functional>
-#include <algorithm>
-#include <cassert>
-#include <type_traits>
-
-#include <robin_hood.h>
-
-using namespace std;
-
-namespace ram
-{
-
-template <typename T>
-class DRAM
-{
-public:
-    ScalarStat active_cycles;
-    ScalarStat refresh_cycles;
-    ScalarStat busy_cycles;
-    ScalarStat active_refresh_overlap_cycles;
-
-    ScalarStat serving_requests;
-    ScalarStat average_serving_requests;
-
-    // Constructor
-    DRAM(T* spec, typename T::Level level);
-    ~DRAM();
-
-    // Specification (e.g., DDR3)
-    T* spec;
-
-    // Tree Organization (e.g., Channel->Rank->Bank->Row->Column)
-    typename T::Level level;
-    int id;
-    long size;
-    DRAM* parent;
-    vector<DRAM*> children;
-
-    // State (e.g., Opened, Closed)
-    typename T::State state;
-
-    // State of Rows:
-    // There are too many rows for them to be instantiated individually
-    // Instead, their bank (or an equivalent entity) tracks their state for them
-    robin_hood::unordered_flat_map<int, typename T::State> row_state;
-
-    // Insert a node as one of my child nodes
-    void insert(DRAM<T>* child);
-
-    // Decode a command into its "prerequisite" command (if any is needed)
-    typename T::Command decode(typename T::Command cmd, const int* addr);
-
-    // Check whether a command is ready to be scheduled
-    bool check(typename T::Command cmd, const int* addr, long clk);
-
-    // Check whether a command is a row hit
-    bool check_row_hit(typename T::Command cmd, const int* addr);
-
-    // Check whether a row is open
-    bool check_row_open(typename T::Command cmd, const int* addr);
-
-    // Return the earliest clock when a command is ready to be scheduled
-    long get_next(typename T::Command cmd, const int* addr);
-
-    // Update the timing/state of the tree, signifying that a command has been issued
-    void update(typename T::Command cmd, const int* addr, long clk);
-    // Update statistics:
-
-    // Update the number of requests it serves currently
-    void update_serving_requests(const int* addr, int delta, long clk);
-
-    // TIANSHI: current serving requests count
-    int cur_serving_requests = 0;
-    long begin_of_serving = -1;
-    long end_of_serving = -1;
-    long begin_of_cur_reqcnt = -1;
-    long begin_of_refreshing = -1;
-    long end_of_refreshing = -1;
-    std::vector<std::pair<long, long>> refresh_intervals;
-
-    // register statistics
-    void regStats(const std::string& identifier);
-
-    void finish(long dram_cycles);
-
-private:
-    // Constructor
-    DRAM(){}
-
-    // Timing
-    long cur_clk = 0;
-    long next[int(T::Command::MAX)]; // the earliest time in the future when a command could be ready
-    deque<long> prev[int(T::Command::MAX)]; // the most recent history of when commands were issued
-
-    // Lookup table for which commands must be preceded by which other commands (i.e., "prerequisite")
-    // E.g., a read command to a closed bank must be preceded by an activate command
-    function<typename T::Command(DRAM<T>*, typename T::Command cmd, int)>* prereq;
-
-    // SAUGATA: added table for row hits
-    // Lookup table for whether a command is a row hit
-    // E.g., a read command to a closed bank must be preceded by an activate command
-    function<bool(DRAM<T>*, typename T::Command cmd, int)>* rowhit;
-    function<bool(DRAM<T>*, typename T::Command cmd, int)>* rowopen;
-
-    // Lookup table between commands and the state transitions they trigger
-    // E.g., an activate command to a closed bank opens both the bank and the row
-    function<void(DRAM<T>*, int)>* lambda;
-
-    // Lookup table for timing parameters
-    // E.g., activate->precharge: tRAS@bank, activate->activate: tRC@bank
-    vector<typename T::TimingEntry>* timing;
-
-    // Helper Functions
-    void update_state(typename T::Command cmd, const int* addr);
-    void update_timing(typename T::Command cmd, const int* addr, long clk);
-}; /* class DRAM */
-
-
-// register statistics
-template <typename T>
-void DRAM<T>::regStats(const std::string& identifier) {
-    active_cycles
-        .name("active_cycles" + identifier + "_" + to_string(id))
-        .desc("Total active cycles for level " + identifier + "_" + to_string(id))
-        .precision(0)
-        ;
-    refresh_cycles
-        .name("refresh_cycles" + identifier + "_" + to_string(id))
-        .desc("(All-bank refresh only, only valid for rank level) The sum of cycles that is under refresh per memory cycle for level " + identifier + "_" + to_string(id))
-        .precision(0)
-        .flags(Stat::nozero)
-        ;
-    busy_cycles
-        .name("busy_cycles" + identifier + "_" + to_string(id))
-        .desc("(All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level " + identifier + "_" + to_string(id))
-        .precision(0)
-        ;
-    active_refresh_overlap_cycles
-        .name("active_refresh_overlap_cycles" + identifier + "_" + to_string(id))
-        .desc("(All-bank refresh only, only valid for rank level) The sum of cycles that are both active and under refresh per memory cycle for level " + identifier + "_" + to_string(id))
-        .precision(0)
-        .flags(Stat::nozero)
-        ;
-    serving_requests
-        .name("serving_requests" + identifier + "_" + to_string(id))
-        .desc("The sum of read and write requests that are served in this DRAM element per memory cycle for level " + identifier + "_" + to_string(id))
-        .precision(0)
-        ;
-    average_serving_requests
-        .name("average_serving_requests" + identifier + "_" + to_string(id))
-        .desc("The average of read and write requests that are served in this DRAM element per memory cycle for level " + identifier + "_" + to_string(id))
-        .precision(6)
-        ;
-
-    if (!children.size()) {
-      return;
-    }
-
-    // recursively register children statistics
-    for (auto child : children) {
-      child->regStats(identifier + "_" + to_string(id));
-    }
-}
-
-template <typename T>
-void DRAM<T>::finish(long dram_cycles) {
-  // finalize busy cycles
-  busy_cycles = active_cycles.value() + refresh_cycles.value() - active_refresh_overlap_cycles.value();
-
-  // finalize average serving requests
-  average_serving_requests = serving_requests.value() / dram_cycles;
-
-  if (!children.size()) {
-    return;
-  }
-
-  for (auto child : children) {
-    child->finish(dram_cycles);
-  }
-}
-
-// Constructor
-template <typename T>
-DRAM<T>::DRAM(T* spec, typename T::Level level) :
-    spec(spec), level(level), id(0), parent(NULL)
-{
-
-    state = spec->start[(int)level];
-    prereq = spec->prereq[int(level)];
-    rowhit = spec->rowhit[int(level)];
-    rowopen = spec->rowopen[int(level)];
-    lambda = spec->lambda[int(level)];
-    timing = spec->timing[int(level)];
-
-    fill_n(next, int(T::Command::MAX), -1); // initialize future
-    for (int cmd = 0; cmd < int(T::Command::MAX); cmd++) {
-        int dist = 0;
-        for (auto& t : timing[cmd])
-            dist = max(dist, t.dist);
-
-        if (dist)
-            prev[cmd].resize(dist, -1); // initialize history
-    }
-
-    // try to recursively construct my children
-    int child_level = int(level) + 1;
-    if (child_level == int(T::Level::Row))
-        return; // stop recursion: rows are not instantiated as nodes
-
-    int child_max = spec->org_entry.count[child_level];
-    if (!child_max)
-        return; // stop recursion: the number of children is unspecified
-
-    // recursively construct my children
-    for (int i = 0; i < child_max; i++) {
-        DRAM<T>* child = new DRAM<T>(spec, typename T::Level(child_level));
-        child->parent = this;
-        child->id = i;
-        children.push_back(child);
-    }
-
-}
-
-template <typename T>
-DRAM<T>::~DRAM()
-{
-    for (auto child: children)
-        delete child;
-}
-
-// Insert
-template <typename T>
-void DRAM<T>::insert(DRAM<T>* child)
-{
-    child->parent = this;
-    child->id = children.size();
-    children.push_back(child);
-}
-
-// Decode
-template <typename T>
-typename T::Command DRAM<T>::decode(typename T::Command cmd, const int* addr)
-{
-    int child_id = addr[int(level)+1];
-    if (prereq[int(cmd)]) {
-        typename T::Command prereq_cmd = prereq[int(cmd)](this, cmd, child_id);
-        if (prereq_cmd != T::Command::MAX)
-            return prereq_cmd; // stop recursion: there is a prerequisite at this level
-    }
-
-    if (child_id < 0 || !children.size())
-        return cmd; // stop recursion: there were no prequisites at any level
-
-    // recursively decode at my child
-    return children[child_id]->decode(cmd, addr);
-}
-
-
-// Check
-template <typename T>
-bool DRAM<T>::check(typename T::Command cmd, const int* addr, long clk)
-{
-    if (next[int(cmd)] != -1 && clk < next[int(cmd)])
-        return false; // stop recursion: the check failed at this level
-
-    int child_id = addr[int(level)+1];
-    if (child_id < 0 || level == spec->scope[int(cmd)] || !children.size())
-        return true; // stop recursion: the check passed at all levels
-
-    // recursively check my child
-    return children[child_id]->check(cmd, addr, clk);
-}
-
-// SAUGATA: added function to check whether a command is a row hit
-// Check row hits
-template <typename T>
-bool DRAM<T>::check_row_hit(typename T::Command cmd, const int* addr)
-{
-    int child_id = addr[int(level)+1];
-    if (rowhit[int(cmd)]) {
-        return rowhit[int(cmd)](this, cmd, child_id);  // stop recursion: there is a row hit at this level
-    }
-
-    if (child_id < 0 || !children.size())
-        return false; // stop recursion: there were no row hits at any level
-
-    // recursively check for row hits at my child
-    return children[child_id]->check_row_hit(cmd, addr);
-}
-
-template <typename T>
-bool DRAM<T>::check_row_open(typename T::Command cmd, const int* addr)
-{
-    int child_id = addr[int(level)+1];
-    if (rowopen[int(cmd)]) {
-        return rowopen[int(cmd)](this, cmd, child_id);  // stop recursion: there is a row hit at this level
-    }
-
-    if (child_id < 0 || !children.size())
-        return false; // stop recursion: there were no row hits at any level
-
-    // recursively check for row hits at my child
-    return children[child_id]->check_row_open(cmd, addr);
-}
-
-template <typename T>
-long DRAM<T>::get_next(typename T::Command cmd, const int* addr)
-{
-    long next_clk = max(cur_clk, next[int(cmd)]);
-    auto node = this;
-    for (int l = int(level); l < int(spec->scope[int(cmd)]) && node->children.size() && addr[l + 1] >= 0; l++){
-        node = node->children[addr[l + 1]];
-        next_clk = max(next_clk, node->next[int(cmd)]);
-    }
-    return next_clk;
-}
-
-// Update
-template <typename T>
-void DRAM<T>::update(typename T::Command cmd, const int* addr, long clk)
-{
-    cur_clk = clk;
-    update_state(cmd, addr);
-    update_timing(cmd, addr, clk);
-}
-
-
-// Update (State)
-template <typename T>
-void DRAM<T>::update_state(typename T::Command cmd, const int* addr)
-{
-    int child_id = addr[int(level)+1];
-    if (lambda[int(cmd)])
-        lambda[int(cmd)](this, child_id); // update this level
-
-    if (level == spec->scope[int(cmd)] || !children.size())
-        return; // stop recursion: updated all levels
-
-    // recursively update my child
-    children[child_id]->update_state(cmd, addr);
-}
-
-
-// Update (Timing)
-template <typename T>
-void DRAM<T>::update_timing(typename T::Command cmd, const int* addr, long clk)
-{
-    // I am not a target node: I am merely one of its siblings
-    if (id != addr[int(level)]) {
-        for (auto& t : timing[int(cmd)]) {
-            if (!t.sibling)
-                continue; // not an applicable timing parameter
-
-            assert (t.dist == 1);
-
-            long future = clk + t.val;
-            next[int(t.cmd)] = max(next[int(t.cmd)], future); // update future
-        }
-
-        return; // stop recursion: only target nodes should be recursed
-    }
-
-    // I am a target node
-    if (prev[int(cmd)].size()) {
-        prev[int(cmd)].pop_back();  // FIXME TIANSHI why pop back?
-        prev[int(cmd)].push_front(clk); // update history
-    }
-
-    for (auto& t : timing[int(cmd)]) {
-        if (t.sibling)
-            continue; // not an applicable timing parameter
-
-        long past = prev[int(cmd)][t.dist-1];
-        if (past < 0)
-            continue; // not enough history
-
-        long future = past + t.val;
-        next[int(t.cmd)] = max(next[int(t.cmd)], future); // update future
-        // TIANSHI: for refresh statistics
-        if (spec->is_refreshing(cmd) && spec->is_opening(t.cmd)) {
-          assert(past == clk);
-          begin_of_refreshing = clk;
-          end_of_refreshing = max(end_of_refreshing, next[int(t.cmd)]);
-          refresh_cycles += end_of_refreshing - clk;
-          if (cur_serving_requests > 0) {
-            refresh_intervals.push_back(make_pair(begin_of_refreshing, end_of_refreshing));
-          }
-        }
-    }
-
-    // Some commands have timings that are higher that their scope levels, thus
-    // we do not stop at the cmd's scope level
-    if (!children.size())
-        return; // stop recursion: updated all levels
-
-    // recursively update *all* of my children
-    for (auto child : children)
-        child->update_timing(cmd, addr, clk);
-
-}
-
-template <typename T>
-void DRAM<T>::update_serving_requests(const int* addr, int delta, long clk) {
-  assert(id == addr[int(level)]);
-  assert(delta == 1 || delta == -1);
-  // update total serving requests
-  if (begin_of_cur_reqcnt != -1 && cur_serving_requests > 0) {
-    serving_requests += (clk - begin_of_cur_reqcnt) * cur_serving_requests;
-    active_cycles += clk - begin_of_cur_reqcnt;
-  }
-  // update begin of current request number
-  begin_of_cur_reqcnt = clk;
-  cur_serving_requests += delta;
-  assert(cur_serving_requests >= 0);
-
-  if (delta == 1 && cur_serving_requests == 1) {
-    // transform from inactive to active
-    begin_of_serving = clk;
-    if (end_of_refreshing > begin_of_serving) {
-      active_refresh_overlap_cycles += end_of_refreshing - begin_of_serving;
-    }
-  } else if (cur_serving_requests == 0) {
-    // transform from active to inactive
-    assert(begin_of_serving != -1);
-    assert(delta == -1);
-    active_cycles += clk - begin_of_cur_reqcnt;
-    end_of_serving = clk;
-
-    for (const auto& ref: refresh_intervals) {
-      active_refresh_overlap_cycles += min(end_of_serving, ref.second) - ref.first;
-    }
-    refresh_intervals.clear();
-  }
-
-  int child_id = addr[int(level) + 1];
-  // We only count the level bank or the level higher than bank
-  if (child_id < 0 || !children.size() || (int(level) > int(T::Level::Bank)) ) {
-    return;
-  }
-  children[child_id]->update_serving_requests(addr, delta, clk);
-}
-
-} /* namespace ram */
-
-#endif /* __DRAM_H */
diff --git a/TOGSim/extern/ramulator_custom/src/HBM.cpp b/TOGSim/extern/ramulator_custom/src/HBM.cpp
deleted file mode 100644
index 00f8f704..00000000
--- a/TOGSim/extern/ramulator_custom/src/HBM.cpp
+++ /dev/null
@@ -1,413 +0,0 @@
-#include "HBM.h"
-#include "DRAM.h"
-
-#include <cassert>
-
-using namespace std;
-using namespace ram;
-
-string HBM::standard_name = "HBM";
-string HBM::level_str [int(Level::MAX)] = {"Ch", "Ra", "Bg", "Ba", "Ro", "Co"};
-
-map<string, enum HBM::Org> HBM::org_map = {
-    {"HBM_1Gb", HBM::Org::HBM_1Gb},
-    {"HBM_2Gb", HBM::Org::HBM_2Gb},
-    {"HBM_4Gb", HBM::Org::HBM_4Gb},
-};
-
-map<string, enum HBM::Speed> HBM::speed_map = {
-    {"HBM_1Gbps", HBM::Speed::HBM_1Gbps},
-    {"HBM_2Gbps", HBM::Speed::HBM_2Gbps},
-};
-
-HBM::HBM(Org org, Speed speed)
-    : org_entry(org_table[int(org)]),
-    speed_entry(speed_table[int(speed)]),
-    read_latency(speed_entry.nCL + speed_entry.nBL)
-{
-    init_speed();
-    init_prereq();
-    init_rowhit(); // SAUGATA: added row hit function
-    init_rowopen();
-    init_lambda();
-    init_timing();
-}
-
-HBM::HBM(const string& org_str, const string& speed_str) :
-    HBM(org_map[org_str], speed_map[speed_str])
-{
-}
-
-void HBM::set_channel_number(int channel) {
-  org_entry.count[int(Level::Channel)] = channel;
-}
-
-void HBM::set_rank_number(int rank) {
-  org_entry.count[int(Level::Rank)] = rank;
-}
-
-
-void HBM::init_speed()
-{
-    const static int RFC_TABLE[int(Speed::MAX)][int(Org::MAX)] = {
-        {55, 80, 130},
-        {110, 160, 260}
-    };
-    const static int REFI1B_TABLE[int(Speed::MAX)][int(Org::MAX)] = {
-        {64, 128, 256},
-        {128, 256, 512}
-    };
-    const static int XS_TABLE[int(Speed::MAX)][int(Org::MAX)] = {
-        {60, 85, 135},
-        {120, 170, 270}
-    };
-
-    int speed = 0, density = 0;
-    switch (speed_entry.rate) {
-        case 1000: speed = 0; break;
-        case 2000: speed = 1; break;
-        default: assert(false);
-    };
-    switch (org_entry.size >> 10){
-        case 1: density = 0; break;
-        case 2: density = 1; break;
-        case 4: density = 2; break;
-        default: assert(false);
-    }
-    speed_entry.nRFC = RFC_TABLE[speed][density];
-    speed_entry.nREFI1B = REFI1B_TABLE[speed][density];
-    speed_entry.nXS = XS_TABLE[speed][density];
-}
-
-
-void HBM::init_prereq()
-{
-    // RD
-    prereq[int(Level::Rank)][int(Command::RD)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::MAX;
-            case int(State::ActPowerDown): return Command::PDX;
-            case int(State::PrePowerDown): return Command::PDX;
-            case int(State::SelfRefresh): return Command::SRX;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-    prereq[int(Level::Bank)][int(Command::RD)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return Command::ACT;
-            case int(State::Opened):
-                if (node->row_state.find(id) != node->row_state.end())
-                    return cmd;
-                else return Command::PRE;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-
-    // WR
-    prereq[int(Level::Rank)][int(Command::WR)] = prereq[int(Level::Rank)][int(Command::RD)];
-    prereq[int(Level::Rank)][int(Command::PIM_WR)] = prereq[int(Level::Rank)][int(Command::RD)];
-
-    prereq[int(Level::Bank)][int(Command::WR)] = prereq[int(Level::Bank)][int(Command::RD)];
-
-    // REF
-    prereq[int(Level::Rank)][int(Command::REF)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        for (auto bg : node->children)
-            for (auto bank: bg->children) {
-                if (bank->state == State::Closed)
-                    continue;
-                return Command::PREA;
-            }
-        return Command::REF;};
-
-    // REFSB
-    prereq[int(Level::Bank)][int(Command::REFSB)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        if (node->state == State::Closed) return Command::REFSB;
-        return Command::PRE;};
-
-    // PD
-    prereq[int(Level::Rank)][int(Command::PDE)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::PDE;
-            case int(State::ActPowerDown): return Command::PDE;
-            case int(State::PrePowerDown): return Command::PDE;
-            case int(State::SelfRefresh): return Command::SRX;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-
-    // SR
-    prereq[int(Level::Rank)][int(Command::SRE)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::PowerUp): return Command::SRE;
-            case int(State::ActPowerDown): return Command::PDX;
-            case int(State::PrePowerDown): return Command::PDX;
-            case int(State::SelfRefresh): return Command::SRE;
-            default: {
-              assert(false);
-              return Command::MAX;
-            }
-        }};
-}
-
-// SAUGATA: added row hit check functions to see if the desired location is currently open
-void HBM::init_rowhit()
-{
-    // RD
-    rowhit[int(Level::Bank)][int(Command::RD)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return false;
-            case int(State::Opened):
-                if (node->row_state.find(id) != node->row_state.end())
-                    return true;
-                return false;
-            default: {
-              assert(false);
-              return false;
-            }
-        }};
-
-    // WR
-    rowhit[int(Level::Bank)][int(Command::WR)] = rowhit[int(Level::Bank)][int(Command::RD)];
-    rowhit[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-      return true;
-    };
-}
-
-void HBM::init_rowopen()
-{
-    // RD
-    rowopen[int(Level::Bank)][int(Command::RD)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-        switch (int(node->state)) {
-            case int(State::Closed): return false;
-            case int(State::Opened): return true;
-            default: {
-              assert(false);
-              return false;
-            }
-        }};
-
-    // WR
-    rowopen[int(Level::Bank)][int(Command::WR)] = rowopen[int(Level::Bank)][int(Command::RD)];
-    rowopen[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM<HBM>* node, Command cmd, int id) {
-      return true;
-    };
-}
-
-void HBM::init_lambda()
-{
-    lambda[int(Level::Bank)][int(Command::ACT)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::Opened;
-        node->row_state[id] = State::Opened;};
-    lambda[int(Level::Bank)][int(Command::PRE)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Rank)][int(Command::PREA)] = [] (DRAM<HBM>* node, int id) {
-        for (auto bg : node->children)
-            for (auto bank : bg->children) {
-                bank->state = State::Closed;
-                bank->row_state.clear();
-            }};
-    lambda[int(Level::Rank)][int(Command::REF)] = [] (DRAM<HBM>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::RD)] = [] (DRAM<HBM>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::WR)] = [] (DRAM<HBM>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::PIM_WR)] = [] (DRAM<HBM>* node, int id) {};
-    lambda[int(Level::Bank)][int(Command::RDA)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Bank)][int(Command::WRA)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::Closed;
-        node->row_state.clear();};
-    lambda[int(Level::Rank)][int(Command::PDE)] = [] (DRAM<HBM>* node, int id) {
-        for (auto bg : node->children)
-            for (auto bank : bg->children) {
-                if (bank->state == State::Closed)
-                    continue;
-                node->state = State::ActPowerDown;
-                return;
-            }
-        node->state = State::PrePowerDown;};
-    lambda[int(Level::Rank)][int(Command::PDX)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::PowerUp;};
-    lambda[int(Level::Rank)][int(Command::SRE)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::SelfRefresh;};
-    lambda[int(Level::Rank)][int(Command::SRX)] = [] (DRAM<HBM>* node, int id) {
-        node->state = State::PowerUp;};
-}
-
-
-void HBM::init_timing()
-{
-    SpeedEntry& s = speed_entry;
-    vector<TimingEntry> *t;
-
-    /*** Channel ***/
-    t = timing[int(Level::Channel)];
-
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nBL});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nBL});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nBL});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nBL});
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nBL});
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nBL});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nBL});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nBL});
-
-    // PIM_WR
-    t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nBL});
-    t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nBL});
-    t[int(Command::PIM_WR)].push_back({Command::PIM_WR, 1, s.nBL});
-
-    /*** Rank ***/
-    t = timing[int(Level::Rank)];
-
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDS});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDS});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDS});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDS});
-
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDS});
-    t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nCCDS});
-    t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDS});
-    t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDS});
-
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDS});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDS});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDS});
-    t[int(Command::RD)].push_back({Command::WR, 1, s.nCL + s.nCCDS + 2 - s.nCWL});
-    t[int(Command::RD)].push_back({Command::WRA, 1, s.nCL + s.nCCDS + 2 - s.nCWL});
-    t[int(Command::RDA)].push_back({Command::WR, 1, s.nCL + s.nCCDS + 2 - s.nCWL});
-    t[int(Command::RDA)].push_back({Command::WRA, 1, s.nCL + s.nCCDS + 2 - s.nCWL});
-
-    t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS});
-
-    t[int(Command::PIM_WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::PIM_WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS});
-
-    t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRS});
-    t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRS});
-
-    t[int(Command::RD)].push_back({Command::PREA, 1, s.nRTP});
-    t[int(Command::WR)].push_back({Command::PREA, 1, s.nCWL + s.nBL + s.nWR});
-
-    // CAS <-> PD
-    t[int(Command::RD)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1});
-    t[int(Command::RDA)].push_back({Command::PDE, 1, s.nCL + s.nBL + 1});
-    t[int(Command::WR)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR});
-    t[int(Command::WRA)].push_back({Command::PDE, 1, s.nCWL + s.nBL + s.nWR + 1}); // +1 for pre
-    t[int(Command::PDX)].push_back({Command::RD, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::RDA, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::WR, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::WRA, 1, s.nXP});
-
-    // CAS <-> SR: none (all banks have to be precharged)
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDS});
-    t[int(Command::ACT)].push_back({Command::ACT, 4, s.nFAW});
-    t[int(Command::ACT)].push_back({Command::PREA, 1, s.nRAS});
-    t[int(Command::PREA)].push_back({Command::ACT, 1, s.nRP});
-
-    // RAS <-> REF
-    t[int(Command::PRE)].push_back({Command::REF, 1, s.nRP});
-    t[int(Command::PREA)].push_back({Command::REF, 1, s.nRP});
-    t[int(Command::REF)].push_back({Command::ACT, 1, s.nRFC});
-
-    // RAS <-> PD
-    t[int(Command::ACT)].push_back({Command::PDE, 1, 1});
-    t[int(Command::PDX)].push_back({Command::ACT, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::PRE, 1, s.nXP});
-    t[int(Command::PDX)].push_back({Command::PREA, 1, s.nXP});
-
-    // RAS <-> SR
-    t[int(Command::PRE)].push_back({Command::SRE, 1, s.nRP});
-    t[int(Command::PREA)].push_back({Command::SRE, 1, s.nRP});
-    t[int(Command::SRX)].push_back({Command::ACT, 1, s.nXS});
-
-    // REF <-> REF
-    t[int(Command::REF)].push_back({Command::REF, 1, s.nRFC});
-
-    // REF <-> PD
-    t[int(Command::REF)].push_back({Command::PDE, 1, 1});
-    t[int(Command::PDX)].push_back({Command::REF, 1, s.nXP});
-
-    // REF <-> SR
-    t[int(Command::SRX)].push_back({Command::REF, 1, s.nXS});
-
-    // PD <-> PD
-    t[int(Command::PDE)].push_back({Command::PDX, 1, s.nPD});
-    t[int(Command::PDX)].push_back({Command::PDE, 1, s.nXP});
-
-    // PD <-> SR
-    t[int(Command::PDX)].push_back({Command::SRE, 1, s.nXP});
-    t[int(Command::SRX)].push_back({Command::PDE, 1, s.nXS});
-
-    // SR <-> SR
-    t[int(Command::SRE)].push_back({Command::SRX, 1, s.nCKESR});
-    t[int(Command::SRX)].push_back({Command::SRE, 1, s.nXS});
-
-    /*** Bank Group ***/
-    t = timing[int(Level::BankGroup)];
-    // CAS <-> CAS
-    t[int(Command::RD)].push_back({Command::RD, 1, s.nCCDL});
-    t[int(Command::RD)].push_back({Command::RDA, 1, s.nCCDL});
-    t[int(Command::RDA)].push_back({Command::RD, 1, s.nCCDL});
-    t[int(Command::RDA)].push_back({Command::RDA, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::WRA, 1, s.nCCDL});
-    t[int(Command::PIM_WR)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::PIM_WR)].push_back({Command::WRA, 1, s.nCCDL});
-
-    t[int(Command::WR)].push_back({Command::PIM_WR, 1, s.nCCDL});
-
-    t[int(Command::PIM_WR)].push_back({Command::PIM_WR, 1, s.nCCDL});
-
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL});
-    t[int(Command::WRA)].push_back({Command::WR, 1, s.nCCDL});
-    t[int(Command::WRA)].push_back({Command::WRA, 1, s.nCCDL});
-    t[int(Command::WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL});
-
-    t[int(Command::PIM_WR)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::PIM_WR)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL});
-
-    t[int(Command::WRA)].push_back({Command::RD, 1, s.nCWL + s.nBL + s.nWTRL});
-    t[int(Command::WRA)].push_back({Command::RDA, 1, s.nCWL + s.nBL + s.nWTRL});
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRRDL});
-
-    /*** Bank ***/
-    t = timing[int(Level::Bank)];
-
-    // CAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::RD, 1, s.nRCDR});
-    t[int(Command::ACT)].push_back({Command::RDA, 1, s.nRCDR});
-    t[int(Command::ACT)].push_back({Command::WR, 1, s.nRCDW});
-    t[int(Command::ACT)].push_back({Command::WRA, 1, s.nRCDW});
-
-    t[int(Command::RD)].push_back({Command::PRE, 1, s.nRTP});
-    t[int(Command::WR)].push_back({Command::PRE, 1, s.nCWL + s.nBL + s.nWR});
-
-    t[int(Command::RDA)].push_back({Command::ACT, 1, s.nRTP + s.nRP});
-    t[int(Command::WRA)].push_back({Command::ACT, 1, s.nCWL + s.nBL + s.nWR + s.nRP});
-
-    // RAS <-> RAS
-    t[int(Command::ACT)].push_back({Command::ACT, 1, s.nRC});
-    t[int(Command::ACT)].push_back({Command::PRE, 1, s.nRAS});
-    t[int(Command::PRE)].push_back({Command::ACT, 1, s.nRP});
-
-    // REFSB
-    t[int(Command::PRE)].push_back({Command::REFSB, 1, s.nRP});
-    t[int(Command::REFSB)].push_back({Command::REFSB, 1, s.nRFC});
-    t[int(Command::REFSB)].push_back({Command::ACT, 1, s.nRFC});
-}
diff --git a/TOGSim/extern/ramulator_custom/src/HBM.h b/TOGSim/extern/ramulator_custom/src/HBM.h
deleted file mode 100644
index b52f0500..00000000
--- a/TOGSim/extern/ramulator_custom/src/HBM.h
+++ /dev/null
@@ -1,228 +0,0 @@
-#ifndef __HBM_H
-#define __HBM_H
-
-#include <map>
-#include <vector>
-#include <string>
-#include <functional>
-
-#include "Request.h"
-
-using namespace std;
-
-namespace ram
-{
-template <typename T>
-class DRAM;
-
-class HBM
-{
-public:
-    static string standard_name;
-    enum class Org;
-    enum class Speed;
-    HBM(Org org, Speed speed);
-    HBM(const string& org_str, const string& speed_str);
-
-    static map<string, enum Org> org_map;
-    static map<string, enum Speed> speed_map;
-
-    /* Level */
-    enum class Level : int
-    {
-        Channel, Rank, BankGroup, Bank, Row, Column, MAX
-    };
-    
-    static std::string level_str [int(Level::MAX)];
-
-    /* Command */
-    enum class Command : int
-    {
-        ACT, PRE,   PREA,
-        RD,  WR,    PIM_WR, RDA, WRA,
-        REF, REFSB, PDE, PDX,  SRE, SRX,
-        MAX
-    };
-
-    // REFSB and REF is not compatible, choose one or the other.
-    // REFSB can be issued to banks in any order, as long as REFI1B
-    // is satisfied for all banks
-
-    string command_name[int(Command::MAX)] = {
-        "ACT", "PRE",   "PREA",
-        "RD",  "WR",    "PIM_WR", "RDA",  "WRA",
-        "REF", "REFSB", "PDE",  "PDX",  "SRE", "SRX"
-    };
-
-    Level scope[int(Command::MAX)] = {
-        Level::Row,    Level::Bank,   Level::Rank,
-        Level::Column, Level::Column, Level::Column, Level::Column, Level::Column,
-        Level::Rank,   Level::Bank,   Level::Rank,   Level::Rank,   Level::Rank,   Level::Rank
-    };
-
-    bool is_opening(Command cmd)
-    {
-        switch(int(cmd)) {
-            case int(Command::ACT):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_accessing(Command cmd)
-    {
-        switch(int(cmd)) {
-            case int(Command::RD):
-            case int(Command::WR):
-            case int(Command::RDA):
-            case int(Command::WRA):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_closing(Command cmd)
-    {
-        switch(int(cmd)) {
-            case int(Command::RDA):
-            case int(Command::WRA):
-            case int(Command::PRE):
-            case int(Command::PREA):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    bool is_refreshing(Command cmd)
-    {
-        switch(int(cmd)) {
-            case int(Command::REF):
-            case int(Command::REFSB):
-                return true;
-            default:
-                return false;
-        }
-    }
-
-    /* State */
-    enum class State : int
-    {
-        Opened, Closed, PowerUp, ActPowerDown, PrePowerDown, SelfRefresh, MAX
-    } start[int(Level::MAX)] = {
-        State::MAX, State::PowerUp, State::MAX, State::Closed, State::Closed, State::MAX
-    };
-
-    /* Translate */
-    Command translate[int(Request::Type::MAX)] = {
-        Command::RD,  Command::WR, Command::PIM_WR,
-        Command::REF, Command::PDE, Command::SRE
-    };
-
-    /* Prereq */
-    function<Command(DRAM<HBM>*, Command cmd, int)> prereq[int(Level::MAX)][int(Command::MAX)];
-
-    // SAUGATA: added function object container for row hit status
-    /* Row hit */
-    function<bool(DRAM<HBM>*, Command cmd, int)> rowhit[int(Level::MAX)][int(Command::MAX)];
-    function<bool(DRAM<HBM>*, Command cmd, int)> rowopen[int(Level::MAX)][int(Command::MAX)];
-
-    /* Timing */
-    struct TimingEntry
-    {
-        Command cmd;
-        int dist;
-        int val;
-        bool sibling;
-    };
-    vector<TimingEntry> timing[int(Level::MAX)][int(Command::MAX)];
-
-    /* Lambda */
-    function<void(DRAM<HBM>*, int)> lambda[int(Level::MAX)][int(Command::MAX)];
-
-    /* Organization */
-    enum class Org : int
-    { // per channel density here. Each stack comes with 8 channels
-        HBM_1Gb,
-        HBM_2Gb,
-        HBM_4Gb,
-        MAX
-    };
-
-    struct OrgEntry {
-        int size;
-        int dq;
-        int count[int(Level::MAX)];
-    } org_table[int(Org::MAX)] = {
-        {1<<10, 128, {0, 0, 4, 2, 1<<20, 1<<(6+1)}},
-        {2<<10, 128, {0, 0, 4, 2, 1<<20, 1<<(6+1)}},
-        {4<<10, 128, {0, 0, 4, 4, 1<<20, 1<<(6+1)}},
-    }, org_entry;
-
-    void set_channel_number(int channel);
-    void set_rank_number(int rank);
-
-    /* Speed */
-    enum class Speed : int
-    {
-        HBM_1Gbps,
-        HBM_2Gbps,
-        MAX
-    };
-
-    int prefetch_size = 2; // burst length could be 2 and 4 (choose 4 here), 2n prefetch
-    int channel_width = 128;
-
-    struct SpeedEntry {
-        int rate;
-        double freq, tCK;
-        int nBL, nCCDS, nCCDL;
-        int nCL, nRCDR, nRCDW, nRP, nCWL;
-        int nRAS, nRC;
-        int nRTP, nWTRS, nWTRL, nWR;
-        int nRRDS, nRRDL, nFAW;
-        int nRFC, nREFI, nREFI1B;
-        int nPD, nXP;
-        int nCKESR, nXS;
-    } speed_table[int(Speed::MAX)] = {
-        {1000, // rate
-         500, 2.0, // freq, tCK
-         // FIX: Why is nBL set to 2 instead of 1?
-         // FIX: It seems that this is because a single request corresponds to 64B,
-         // ,which means that `prefetch_size = 4`.
-         1, 1, 2, // nBL, nCCDS, nCCDL
-         7, 7, 6, 7, 4, // nCL, nRCDR, nRCDW, nRP, nCWL
-         17, 24, // nRAS, nRC
-         7, 2, 4, 8, // nRTP, nWTRS, nWTRL, nWR
-         4, 5, 20, // nRRDS, nRRDL, nFAW
-         0, 1950, 0, // nRFC, nREFI, nREFI1B
-         5, 5,  // nPD, nXP
-         5, 0 },  // nCKESR, nXS
-        {2000, 
-         1000, 1.0, 
-         1, 1, 2, 
-         14, 14, 12, 14, 8, 
-         34, 48, 
-         14, 4, 8, 16, 
-         8, 10, 40, 
-         0, 3900, 0, 
-         10, 10, 
-         10, 0},
-    }, speed_entry;
-
-    int read_latency;
-
-private:
-    void init_speed();
-    void init_lambda();
-    void init_prereq();
-    void init_rowhit();  // SAUGATA: added function to check for row hits
-    void init_rowopen();
-    void init_timing();
-};
-
-} /*namespace ram*/
-
-#endif /*__HBM_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/Memory.h b/TOGSim/extern/ramulator_custom/src/Memory.h
deleted file mode 100644
index 45f7cc8b..00000000
--- a/TOGSim/extern/ramulator_custom/src/Memory.h
+++ /dev/null
@@ -1,684 +0,0 @@
-#ifndef __RAM_MEMORY_H
-#define __RAM_MEMORY_H
-
-#include "DRAM.h"
-#include "Request.h"
-#include "Controller.h"
-//#include "SpeedyController.h"
-#include "Statistics.h"
-// #include "GDDR5.h"
-#include "HBM.h"
-#include "Config.h"
-// #include "LPDDR3.h"
-// #include "LPDDR4.h"
-// #include "WideIO2.h"
-// #include "DSARP.h"
-#include <vector>
-#include <functional>
-#include <cmath>
-#include <cassert>
-#include <tuple>
-
-using namespace std;
-
-typedef vector<unsigned int> MapSrcVector;
-typedef map<unsigned int, MapSrcVector > MapSchemeEntry;
-typedef map<unsigned int, MapSchemeEntry> MapScheme;
-
-namespace ram
-{
-class MemoryBase{
-public:
-    MemoryBase() {}
-    virtual ~MemoryBase() {}
-    virtual double clk_ns() = 0;
-    virtual void tick() = 0;
-    virtual bool send(Request req) = 0;
-    virtual int pending_requests() = 0;
-    virtual void finish(void) = 0;
-    virtual long page_allocator(long addr, int coreid) = 0;
-    virtual void record_core(int coreid) = 0;
-    virtual void set_high_writeq_watermark(const float watermark) = 0;
-    virtual void set_low_writeq_watermark(const float watermark) = 0;
-    virtual bool done() const = 0;
-    virtual int get_transaction_bytes() const = 0;
-    virtual int get_num_channels() const = 0;
-    virtual bool is_full(int ch, bool is_write) const = 0;
-    virtual std::vector<int> decode_mem_addr(uint64_t addr) = 0;
-};
-
-template <class T, template<typename> class Controller = Controller >
-class Memory : public MemoryBase
-{
-protected:
-  ScalarStat dram_capacity;
-  ScalarStat num_dram_cycles;
-  ScalarStat num_incoming_requests;
-  VectorStat num_read_requests;
-  VectorStat num_write_requests;
-  ScalarStat ramulator_active_cycles;
-  VectorStat incoming_requests_per_channel;
-  VectorStat incoming_read_reqs_per_channel;
-
-  ScalarStat physical_page_replacement;
-  ScalarStat maximum_bandwidth;
-  ScalarStat in_queue_req_num_sum;
-  ScalarStat in_queue_read_req_num_sum;
-  ScalarStat in_queue_write_req_num_sum;
-  ScalarStat in_queue_req_num_avg;
-  ScalarStat in_queue_read_req_num_avg;
-  ScalarStat in_queue_write_req_num_avg;
-
-  VectorStat record_read_requests;
-  VectorStat record_write_requests;
-
-  long max_address;
-  MapScheme mapping_scheme;
-  
-public:
-    enum class Type {
-        ChRaBaRoCo,
-        RoBaRaCoCh,
-        RoCoBaRaCh,
-        MAX,
-    // } type = Type::ChRaBaRoCo;
-    } type = Type::RoBaRaCoCh;
-
-    enum class Translation {
-      None,
-      Random,
-      MAX,
-    } translation = Translation::None;
-
-    std::map<string, Translation> name_to_translation = {
-      {"None", Translation::None},
-      {"Random", Translation::Random},
-    };
-
-    vector<int> free_physical_pages;
-    long free_physical_pages_remaining;
-    map<pair<int, long>, long> page_translation;
-
-    vector<Controller<T>*> ctrls;
-    T * spec;
-    vector<int> addr_bits;
-    string mapping_file;
-    bool use_mapping_file;
-    bool dump_mapping;
-    
-    int tx_bits;
-
-    Memory(RamulatorConfig& configs, vector<Controller<T>*> ctrls)
-        : ctrls(ctrls),
-          spec(ctrls[0]->channel->spec),
-          addr_bits(int(T::Level::MAX))
-    {
-        // make sure 2^N channels/ranks
-        // TODO support channel number that is not powers of 2
-        int *sz = spec->org_entry.count;
-        assert((sz[0] & (sz[0] - 1)) == 0);
-        assert((sz[1] & (sz[1] - 1)) == 0);
-        // validate size of one transaction
-        int tx = (spec->prefetch_size * spec->channel_width / 8);
-        tx_bits = calc_log2(tx);
-        assert((1<<tx_bits) == tx);
-        
-        // Parsing mapping file and initialize mapping table
-        use_mapping_file = false;
-        dump_mapping = false;
-        if (spec->standard_name.substr(0, 4) == "DDR3"){
-            if (configs["mapping"] != "defaultmapping"){
-              init_mapping_with_file(configs["mapping"]);
-              // dump_mapping = true;
-              use_mapping_file = true;
-            }
-        }
-        // If hi address bits will not be assigned to Rows
-        // then the chips must not be LPDDRx 6Gb, 12Gb etc.
-
-        if(configs["mapping"] == "RoBaRaCoCh") {
-            type = Type::RoBaRaCoCh;
-        }
-        else if(configs["mapping"] == "RoCoBaRaCh") {
-            type = Type::RoCoBaRaCh;
-        }
-        else if(configs["mapping"] == "ChRaBaRoCo") {
-            type = Type::ChRaBaRoCo;
-        }
-        else {
-            use_mapping_file = true;
-            init_mapping_with_file(configs["mapping"]);
-        }
-
-        if (type != Type::RoBaRaCoCh && spec->standard_name.substr(0, 5) == "LPDDR")
-            assert((sz[int(T::Level::Row)] & (sz[int(T::Level::Row)] - 1)) == 0);
-
-        max_address = spec->channel_width / 8;
-
-        for (unsigned int lev = 0; lev < addr_bits.size(); lev++) {
-          addr_bits[lev] = calc_log2(sz[lev]);
-            max_address *= sz[lev];
-        }
-
-        addr_bits[int(T::Level::MAX) - 1] -= calc_log2(spec->prefetch_size);
-
-        // Initiating translation
-        if (configs.contains("translation")) {
-          translation = name_to_translation[configs["translation"]];
-        }
-        if (translation != Translation::None) {
-          // construct a list of available pages
-          // TODO: this should not assume a 4KB page!
-          free_physical_pages_remaining = max_address >> 12;
-
-          free_physical_pages.resize(free_physical_pages_remaining, -1);
-        }
-
-        dram_capacity
-            .name("dram_capacity")
-            .desc("Number of bytes in simulated DRAM")
-            .precision(0)
-            ;
-        dram_capacity = max_address;
-
-        num_dram_cycles
-            .name("dram_cycles")
-            .desc("Number of DRAM cycles simulated")
-            .precision(0)
-            ;
-        num_incoming_requests
-            .name("incoming_requests")
-            .desc("Number of incoming requests to DRAM")
-            .precision(0)
-            ;
-        num_read_requests
-            .init(configs.get_core_num())
-            .name("read_requests")
-            .desc("Number of incoming read requests to DRAM per core")
-            .precision(0)
-            ;
-        num_write_requests
-            .init(configs.get_core_num())
-            .name("write_requests")
-            .desc("Number of incoming write requests to DRAM per core")
-            .precision(0)
-            ;
-        incoming_requests_per_channel
-            .init(sz[int(T::Level::Channel)])
-            .name("incoming_requests_per_channel")
-            .desc("Number of incoming requests to each DRAM channel")
-            ;
-        incoming_read_reqs_per_channel
-            .init(sz[int(T::Level::Channel)])
-            .name("incoming_read_reqs_per_channel")
-            .desc("Number of incoming read requests to each DRAM channel")
-            ;
-
-        ramulator_active_cycles
-            .name("ramulator_active_cycles")
-            .desc("The total number of cycles that the DRAM part is active (serving R/W)")
-            .precision(0)
-            ;
-        physical_page_replacement
-            .name("physical_page_replacement")
-            .desc("The number of times that physical page replacement happens.")
-            .precision(0)
-            ;
-        maximum_bandwidth
-            .name("maximum_bandwidth")
-            .desc("The theoretical maximum bandwidth (Bps)")
-            .precision(0)
-            ;
-        in_queue_req_num_sum
-            .name("in_queue_req_num_sum")
-            .desc("Sum of read/write queue length")
-            .precision(0)
-            ;
-        in_queue_read_req_num_sum
-            .name("in_queue_read_req_num_sum")
-            .desc("Sum of read queue length")
-            .precision(0)
-            ;
-        in_queue_write_req_num_sum
-            .name("in_queue_write_req_num_sum")
-            .desc("Sum of write queue length")
-            .precision(0)
-            ;
-        in_queue_req_num_avg
-            .name("in_queue_req_num_avg")
-            .desc("Average of read/write queue length per memory cycle")
-            .precision(6)
-            ;
-        in_queue_read_req_num_avg
-            .name("in_queue_read_req_num_avg")
-            .desc("Average of read queue length per memory cycle")
-            .precision(6)
-            ;
-        in_queue_write_req_num_avg
-            .name("in_queue_write_req_num_avg")
-            .desc("Average of write queue length per memory cycle")
-            .precision(6)
-            ;
-        record_read_requests
-            .init(configs.get_core_num())
-            .name("record_read_requests")
-            .desc("record read requests for this core when it reaches request limit or to the end")
-            ;
-
-        record_write_requests
-            .init(configs.get_core_num())
-            .name("record_write_requests")
-            .desc("record write requests for this core when it reaches request limit or to the end")
-            ;
-
-    }
-
-    ~Memory()
-    {
-        for (auto ctrl: ctrls)
-            delete ctrl;
-        delete spec;
-    }
-
-    double clk_ns()
-    {
-        return spec->speed_entry.tCK;
-    }
-
-    void record_core(int coreid) {
-      record_read_requests[coreid] = num_read_requests[coreid];
-      record_write_requests[coreid] = num_write_requests[coreid];
-      for (auto ctrl : ctrls) {
-        ctrl->record_core(coreid);
-      }
-    }
-
-    void tick()
-    {
-        ++num_dram_cycles;
-        int cur_que_req_num = 0;
-        int cur_que_readreq_num = 0;
-        int cur_que_writereq_num = 0;
-        for (auto ctrl : ctrls) {
-          cur_que_req_num += ctrl->readq.size() + ctrl->writeq.size() + ctrl->pending.size();
-          cur_que_readreq_num += ctrl->readq.size() + ctrl->pending.size();
-          cur_que_writereq_num += ctrl->writeq.size();
-        }
-        in_queue_req_num_sum += cur_que_req_num;
-        in_queue_read_req_num_sum += cur_que_readreq_num;
-        in_queue_write_req_num_sum += cur_que_writereq_num;
-
-        bool is_active = false;
-        for (auto ctrl : ctrls) {
-          is_active = is_active || ctrl->is_active();
-          ctrl->tick();
-        }
-        if (is_active) {
-          ramulator_active_cycles++;
-        }
-    }
-
-    bool is_full(int ch, bool is_write) const {
-      return ctrls[ch]->is_full(is_write);
-    }
-
-    int get_num_channels() const {
-      return ctrls.size();
-    }
-
-    int get_transaction_bytes() const {
-      return (spec->prefetch_size * (spec->channel_width / 8));
-    }
-    
-    std::vector<int> decode_mem_addr(uint64_t target_addr) {
-      std::vector<int> addr_vec(addr_bits.size(), 0);
-      uint64_t addr = target_addr;
-      // Each transaction size is 2^tx_bits, so first clear the lowest tx_bits bits
-      clear_lower_bits(addr, tx_bits);
-      if (use_mapping_file){
-        apply_mapping(addr, addr_vec);
-      } 
-      else {
-        switch(int(type)){
-            case int(Type::ChRaBaRoCo):
-            for (int i = addr_bits.size() - 1; i >= 0; i--)
-                addr_vec[i] = slice_lower_bits(addr, addr_bits[i]);
-            break;
-            case int(Type::RoBaRaCoCh):
-            addr_vec[0] = slice_lower_bits(addr, addr_bits[0]);
-            addr_vec[addr_bits.size() - 1] = 
-                slice_lower_bits(addr, addr_bits[addr_bits.size() - 1]);
-            for (int i = 1; i <= int(T::Level::Row); i++)
-                addr_vec[i] = slice_lower_bits(addr, addr_bits[i]);
-            break;
-            case int(Type::RoCoBaRaCh):
-            for (int i = 0; i <= int(T::Level::Bank); ++i) {
-                addr_vec[i] = slice_lower_bits(addr, addr_bits[i]);
-            }
-            addr_vec[int(T::Level::Column)] = 
-                slice_lower_bits(addr, addr_bits[int(T::Level::Column)]);
-            addr_vec[int(T::Level::Row)] =
-                slice_lower_bits(addr, addr_bits[int(T::Level::Row)]);
-            break;
-            default:
-                assert(false);
-        }
-      }
-      return addr_vec;
-    }
-
-    bool send(Request req)
-    {
-        // req.addr_vec.resize(addr_bits.size());
-        // long addr = req.addr;
-        // int coreid = req.coreid;
-        //
-        // // Each transaction size is 2^tx_bits, so first clear the lowest tx_bits bits
-        // clear_lower_bits(addr, tx_bits);
-        //
-        // if (use_mapping_file){
-        //     apply_mapping(addr, req.addr_vec);
-        // }
-        // else {
-        //     switch(int(type)){
-        //         case int(Type::ChRaBaRoCo):
-        //             for (int i = addr_bits.size() - 1; i >= 0; i--)
-        //                 req.addr_vec[i] = slice_lower_bits(addr, addr_bits[i]);
-        //             break;
-        //         case int(Type::RoBaRaCoCh):
-        //             req.addr_vec[0] = slice_lower_bits(addr, addr_bits[0]);
-        //             req.addr_vec[addr_bits.size() - 1] = slice_lower_bits(addr, addr_bits[addr_bits.size() - 1]);
-        //             for (int i = 1; i <= int(T::Level::Row); i++)
-        //                 req.addr_vec[i] = slice_lower_bits(addr, addr_bits[i]);
-        //             break;
-        //         default:
-        //             assert(false);
-        //     }
-        // }
-
-        if(ctrls[req.getChannelID()]->enqueue(req)) {
-            // tally stats here to avoid double counting for requests that aren't enqueued
-            ++num_incoming_requests;
-            if (req.type == Request::Type::READ) {
-              ++num_read_requests[req.coreid];
-              ++incoming_read_reqs_per_channel[req.addr_vec[int(T::Level::Channel)]];
-            }
-            if (req.type == Request::Type::WRITE) {
-              ++num_write_requests[req.coreid];
-            }
-            ++incoming_requests_per_channel[req.addr_vec[int(T::Level::Channel)]];
-            return true;
-        }
-
-        return false;
-    }
-    
-    void init_mapping_with_file(string filename){
-        ifstream file(filename);
-        assert(file.good() && "Bad mapping file");
-        // possible line types are:
-        // 0. Empty line
-        // 1. Direct bit assignment   : component N   = x
-        // 2. Direct range assignment : component N:M = x:y
-        // 3. XOR bit assignment      : component N   = x y z ...
-        // 4. Comment line            : # comment here
-        string line;
-        char delim[] = " \t";
-        while (getline(file, line)) {
-            short capture_flags = 0;
-            int level = -1;
-            int target_bit = -1, target_bit2 = -1;
-            int source_bit = -1, source_bit2 = -1;
-            // cout << "Processing: " << line << endl;
-            bool is_range = false;
-            while (true) { // process next word
-                size_t start = line.find_first_not_of(delim);
-                if (start == string::npos) // no more words
-                    break;
-                size_t end = line.find_first_of(delim, start);
-                string word = line.substr(start, end - start);
-                
-                if (word.at(0) == '#')// starting a comment
-                    break;
-                
-                size_t col_index;
-                int source_min, target_min, target_max;
-                switch (capture_flags){
-                    case 0: // capturing the component name
-                        // fetch component level from channel spec
-                        for (int i = 0; i < int(T::Level::MAX); i++)
-                            if (word.find(T::level_str[i]) != string::npos) {
-                                level = i;
-                                capture_flags ++;
-                            }
-                        break;
-
-                    case 1: // capturing target bit(s)
-                        col_index = word.find(":");
-                        if ( col_index != string::npos ){
-                            target_bit2 = stoi(word.substr(col_index+1));
-                            word = word.substr(0,col_index);
-                            is_range = true;
-                        }
-                        target_bit = stoi(word);
-                        capture_flags ++;
-                        break;
-
-                    case 2: //this should be the delimiter
-                        assert(word.find("=") != string::npos);
-                        capture_flags ++;
-                        break;
-
-                    case 3:
-                        if (is_range){
-                            col_index = word.find(":");
-                            source_bit  = stoi(word.substr(0,col_index));
-                            source_bit2 = stoi(word.substr(col_index+1));
-                            assert(source_bit2 - source_bit == target_bit2 - target_bit);
-                            source_min = min(source_bit, source_bit2);
-                            target_min = min(target_bit, target_bit2);
-                            target_max = max(target_bit, target_bit2);
-                            while (target_min <= target_max){
-                                mapping_scheme[level][target_min].push_back(source_min);
-                                // cout << target_min << " <- " << source_min << endl;
-                                source_min ++;
-                                target_min ++;
-                            }
-                        }
-                        else {
-                            source_bit = stoi(word);
-                            mapping_scheme[level][target_bit].push_back(source_bit);
-                        }
-                }
-                if (end == string::npos) { // this is the last word
-                    break;
-                }
-                line = line.substr(end);
-            }
-        }
-        if (dump_mapping)
-            dump_mapping_scheme();
-    }
-    
-    void dump_mapping_scheme(){
-        cout << "Mapping Scheme: " << endl;
-        for (MapScheme::iterator mapit = mapping_scheme.begin(); mapit != mapping_scheme.end(); mapit++)
-        {
-            int level = mapit->first;
-            for (MapSchemeEntry::iterator entit = mapit->second.begin(); entit != mapit->second.end(); entit++){
-                cout << T::level_str[level] << "[" << entit->first << "] := ";
-                cout << "PhysicalAddress[" << *(entit->second.begin()) << "]";
-                entit->second.erase(entit->second.begin());
-                for (MapSrcVector::iterator it = entit->second.begin() ; it != entit->second.end(); it ++)
-                    cout << " xor PhysicalAddress[" << *it << "]";
-                cout << endl;
-            }
-        }
-    }
-    
-    void apply_mapping(long addr, std::vector<int>& addr_vec){
-        int *sz = spec->org_entry.count;
-        int addr_total_bits = sizeof(addr_vec)*8;
-        int addr_bits [int(T::Level::MAX)];
-        for (int i = 0 ; i < int(T::Level::MAX) ; i ++)
-        {
-            if ( i != int(T::Level::Row))
-            {
-                addr_bits[i] = calc_log2(sz[i]);
-                addr_total_bits -= addr_bits[i];
-            }
-        }
-        // Row address is an integer.
-        addr_bits[int(T::Level::Row)] = min((int)sizeof(int)*8, max(addr_total_bits, calc_log2(sz[int(T::Level::Row)])));
-
-        // printf("Address: %lx => ",addr);
-        for (unsigned int lvl = 0; lvl < int(T::Level::MAX); lvl++)
-        {
-            unsigned int lvl_bits = addr_bits[lvl];
-            addr_vec[lvl] = 0;
-            for (unsigned int bitindex = 0 ; bitindex < lvl_bits ; bitindex++){
-                bool bitvalue = false;
-                for (MapSrcVector::iterator it = mapping_scheme[lvl][bitindex].begin() ;
-                    it != mapping_scheme[lvl][bitindex].end(); it ++)
-                {
-                    bitvalue = bitvalue xor get_bit_at(addr, *it);
-                }
-                addr_vec[lvl] |= (bitvalue << bitindex);
-            }
-            // printf("%s: %x, ",T::level_str[lvl].c_str(),addr_vec[lvl]);
-        }
-        // printf("\n");
-    }
-
-    int pending_requests()
-    {
-        int reqs = 0;
-        for (auto ctrl: ctrls)
-            reqs += ctrl->readq.size() + ctrl->writeq.size() + ctrl->otherq.size() + ctrl->actq.size() + ctrl->pending.size();
-        return reqs;
-    }
-
-    void set_high_writeq_watermark(const float watermark) {
-        for (auto ctrl: ctrls)
-            ctrl->set_high_writeq_watermark(watermark);
-    }
-
-    void set_low_writeq_watermark(const float watermark) {
-    for (auto ctrl: ctrls)
-        ctrl->set_low_writeq_watermark(watermark);
-    }
-
-    void finish(void) {
-      dram_capacity = max_address;
-      int *sz = spec->org_entry.count;
-      maximum_bandwidth = spec->speed_entry.rate * 1e6 * spec->channel_width * sz[int(T::Level::Channel)] / 8;
-      long dram_cycles = num_dram_cycles.value();
-      for (auto ctrl : ctrls) {
-        long read_req = long(incoming_read_reqs_per_channel[ctrl->channel->id].value());
-        ctrl->finish(read_req, dram_cycles);
-      }
-
-      // finalize average queueing requests
-      in_queue_req_num_avg = in_queue_req_num_sum.value() / dram_cycles;
-      in_queue_read_req_num_avg = in_queue_read_req_num_sum.value() / dram_cycles;
-      in_queue_write_req_num_avg = in_queue_write_req_num_sum.value() / dram_cycles;
-    }
-
-    bool done() const {
-      return std::all_of(
-          std::begin(ctrls), 
-          std::end(ctrls), 
-          [](const auto &ctrl) {
-            return ctrl->done();
-          });
-    }
-
-    long page_allocator(long addr, int coreid) {
-        long virtual_page_number = addr >> 12;
-
-        switch(int(translation)) {
-            case int(Translation::None): {
-              return addr;
-            }
-            case int(Translation::Random): {
-                auto target = make_pair(coreid, virtual_page_number);
-                if(page_translation.find(target) == page_translation.end()) {
-                    // page doesn't exist, so assign a new page
-                    // make sure there are physical pages left to be assigned
-
-                    // if physical page doesn't remain, replace a previous assigned
-                    // physical page.
-                    if (!free_physical_pages_remaining) {
-                      physical_page_replacement++;
-                      long phys_page_to_read = lrand() % free_physical_pages.size();
-                      assert(free_physical_pages[phys_page_to_read] != -1);
-                      page_translation[target] = phys_page_to_read;
-                    } else {
-                        // assign a new page
-                        long phys_page_to_read = lrand() % free_physical_pages.size();
-                        // if the randomly-selected page was already assigned
-                        if(free_physical_pages[phys_page_to_read] != -1) {
-                            long starting_page_of_search = phys_page_to_read;
-
-                            do {
-                                // iterate through the list until we find a free page
-                                // TODO: does this introduce serious non-randomness?
-                                ++phys_page_to_read;
-                                phys_page_to_read %= free_physical_pages.size();
-                            }
-                            while((phys_page_to_read != starting_page_of_search) && free_physical_pages[phys_page_to_read] != -1);
-                        }
-
-                        assert(free_physical_pages[phys_page_to_read] == -1);
-
-                        page_translation[target] = phys_page_to_read;
-                        free_physical_pages[phys_page_to_read] = coreid;
-                        --free_physical_pages_remaining;
-                    }
-                }
-
-                // SAUGATA TODO: page size should not always be fixed to 4KB
-                return (page_translation[target] << 12) | (addr & ((1 << 12) - 1));
-            }
-            default: {
-                assert(false);
-                return -1;
-            }
-        }
-
-    }
-
-private:
-
-    int calc_log2(int val){
-        int n = 0;
-        while ((val >>= 1))
-            n ++;
-        return n;
-    }
-    int slice_lower_bits(uint64_t & addr, int bits)
-    {
-        int lbits = addr & ((1<<bits) - 1);
-        addr >>= bits;
-        return lbits;
-    }
-    bool get_bit_at(uint64_t addr, int bit)
-    {
-        return (((addr >> bit) & 1) == 1);
-    }
-    void clear_lower_bits(uint64_t & addr, int bits)
-    {
-        addr >>= bits;
-    }
-    long lrand(void) {
-        if(sizeof(int) < sizeof(long)) {
-            return static_cast<long>(rand()) << (sizeof(int) * 8) | rand();
-        }
-
-        return rand();
-    }
-};
-
-} /*namespace ram*/
-
-#endif /*__MEMORY_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp b/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp
deleted file mode 100644
index 9a15f3d1..00000000
--- a/TOGSim/extern/ramulator_custom/src/MemoryFactory.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "MemoryFactory.h"
-// #include "LPDDR4.h"
-// #include "WideIO.h"
-// #include "WideIO2.h"
-#include "HBM.h"
-//#include "SALP.h"
-
-using namespace ram;
-
-namespace ram
-{
-//
-// template <>
-// void MemoryFactory<LPDDR4>::validate(int channels, int ranks, RamulatorConfig& configs) {
-//     assert(channels >= 2 && "LPDDR4 requires 2, 4, 8 ... channels");
-// }
-//
-// template <>
-// void MemoryFactory<WideIO>::validate(int channels, int ranks, RamulatorConfig& configs) {
-//     assert(channels == 4 && "WideIO comes with 4 channels");
-// }
-//
-// template <>
-// void MemoryFactory<WideIO2>::validate(int channels, int ranks, RamulatorConfig& configs) {
-//     assert((channels == 4 || channels == 8) && "WideIO2 comes with 4 or 8 channels");
-//     assert((ranks == 1 || ranks == 2) && "WideIO2 comes with 1 or 2 ranks");
-// }
-
-template <>
-void MemoryFactory<HBM>::validate(int channels, int ranks, RamulatorConfig& configs) {
-    assert(channels == 8 && "HBM comes with 8 channels");
-}
-
-// template <>
-// MemoryBase *MemoryFactory<WideIO2>::create(RamulatorConfig& configs, int cacheline) {
-//     int channels = stoi(configs["channels"], NULL, 0);
-//     int ranks = stoi(configs["ranks"], NULL, 0);
-//     validate(channels, ranks, configs);
-//
-//     const string& org_name = configs["org"];
-//     const string& speed_name = configs["speed"];
-//
-//     WideIO2 *spec = new WideIO2(org_name, speed_name, channels);
-//
-//     extend_channel_width(spec, cacheline);
-//
-//     return (MemoryBase *)populate_memory(configs, spec, channels, ranks);
-// }
-//
-//
-// template <>
-// MemoryBase *MemoryFactory<SALP>::create(RamulatorConfig& configs, int cacheline) {
-//     int channels = stoi(configs["channels"], NULL, 0);
-//     int ranks = stoi(configs["ranks"], NULL, 0);
-//     int subarrays = stoi(configs["subarrays"], NULL, 0);
-//     validate(channels, ranks, configs);
-//
-//     const string& std_name = configs["standard"];
-//     const string& org_name = configs["org"];
-//     const string& speed_name = configs["speed"];
-//
-//     SALP *spec = new SALP(org_name, speed_name, std_name, subarrays);
-//
-//     extend_channel_width(spec, cacheline);
-//
-//     return (MemoryBase *)populate_memory(configs, spec, channels, ranks);
-// }
-
-}
-
-// This function can be used by autoconf AC_CHECK_LIB since
-// apparently it can't detect C++ functions.
-// Basically just an entry in the symbol table
-// extern "C"
-// {
-//     void libramulator_is_present(void)
-//     {
-//         ;
-//     }
-// }
diff --git a/TOGSim/extern/ramulator_custom/src/MemoryFactory.h b/TOGSim/extern/ramulator_custom/src/MemoryFactory.h
deleted file mode 100644
index be10213b..00000000
--- a/TOGSim/extern/ramulator_custom/src/MemoryFactory.h
+++ /dev/null
@@ -1,84 +0,0 @@
-#ifndef __MEMORY_FACTORY_H
-#define __MEMORY_FACTORY_H
-
-#include <map>
-#include <string>
-#include <cassert>
-#include <memory>
-
-#include "Memory.h"
-#include "DRAM.h"
-#include "Controller.h"
-#include "Config.h"
-
-using namespace std;
-
-namespace ram
-{
-template <typename T>
-class MemoryFactory {
-public:
-    static void extend_channel_width(T* spec, int cacheline)
-    {
-        int channel_unit = spec->prefetch_size * spec->channel_width / 8;
-        int gang_number = cacheline / channel_unit;
-        
-        assert(gang_number >= 1 && 
-            "cacheline size must be greater or equal to minimum channel width");
-        
-        assert(cacheline == gang_number * channel_unit &&
-            "cacheline size must be a multiple of minimum channel width");
-        
-        spec->channel_width *= gang_number;
-    }
-
-    static std::unique_ptr<Memory<T>> populate_memory(RamulatorConfig& configs, 
-                                                      T *spec, 
-                                                      int channels, int ranks) {
-        int& default_ranks = spec->org_entry.count[int(T::Level::Rank)];
-        int& default_channels = spec->org_entry.count[int(T::Level::Channel)];
-
-        if (default_channels == 0) default_channels = channels;
-        if (default_ranks == 0) default_ranks = ranks;
-
-        vector<Controller<T> *> ctrls;
-        for (int c = 0; c < channels; c++){
-            DRAM<T>* channel = new DRAM<T>(spec, T::Level::Channel);
-            channel->id = c;
-            channel->regStats("");
-            ctrls.push_back(new Controller<T>(configs, channel));
-        }
-        return std::make_unique<Memory<T>>(configs, ctrls);
-    }
-
-    static void validate(int channels, int ranks, RamulatorConfig& configs) {
-        assert(channels > 0 && ranks > 0);
-    }
-
-    static std::unique_ptr<MemoryBase> create(RamulatorConfig& configs, 
-                                              int cacheline) {
-        int channels = stoi(configs["channels"], NULL, 0);
-        int ranks = stoi(configs["ranks"], NULL, 0);
-        
-        validate(channels, ranks, configs);
-
-        const string& org_name = configs["org"];
-        const string& speed_name = configs["speed"];
-
-        T *spec = new T(org_name, speed_name);
-
-        // Set channel width statically in the header file
-        //extend_channel_width(spec, cacheline);
-
-        return populate_memory(configs, spec, channels, ranks);
-    }
-};
-
-// template <>
-// MemoryBase *MemoryFactory<WideIO2>::create(RamulatorConfig& configs, int cacheline);
-// template <>
-// MemoryBase *MemoryFactory<SALP>::create(RamulatorConfig& configs, int cacheline);
-
-} /*namespace ram*/
-
-#endif /*__MEMORY_FACTORY_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/Ramulator.cpp b/TOGSim/extern/ramulator_custom/src/Ramulator.cpp
deleted file mode 100644
index 6d37f8b1..00000000
--- a/TOGSim/extern/ramulator_custom/src/Ramulator.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-#include "Ramulator.hpp"
-#include "Memory.h"
-#include "MemoryFactory.h"
-#include "DDR4.h"
-#include "HBM.h"
-#include "Request.h"
-
-namespace ram {
-  // TODO: init outputpendingqueue
-Ramulator::Ramulator(const std::string ConfigFilePath, uint32_t num_core, bool is_pim) 
-    : MemBase(createMemory(ConfigFilePath, num_core)), is_pim(is_pim) {
-  for (int ch = 0; ch < MemBase->get_num_channels(); ++ch) {
-    OutputPendingQueues.push_back(OutputPendingQueue(64));
-  }
-  Callbacks[false] = [&](const ram::Request& Req) {
-    int CtrlID = Req.getChannelID();
-    // TODO: check pending queue reservation logic
-    OutputPendingQueues[CtrlID].push(Req.orignal_request);
-  };
-  Callbacks[true] = [&](const ram::Request& Req) {
-    int CtrlID = Req.getChannelID();
-    // // TODO: check pending queue reservation logic
-    OutputPendingQueues[CtrlID].push(Req.orignal_request);
-  };
-
-  if (is_pim) {
-    int hot_vid = -1;
-    int in_degrees = -1;
-    int total_vid = 0;
-  }
-  Stat::statlist.output("./ramulator.stats");
-}
-
-void Ramulator::tick() {
-  MemBase->tick();
-}
-
-bool Ramulator::isAvailable(int CtrlID, uint64_t Addr, bool IsWrite) const {
-  std::vector<int> MemAddr = MemBase->decode_mem_addr(Addr);
-  assert(CtrlID == MemAddr[0]);
-  return  OutputPendingQueues[CtrlID].isAvailable(1) && !MemBase->is_full(CtrlID, IsWrite);
-}
-
-bool Ramulator::isAvailable(uint64_t Addr, bool IsWrite) const {
-  // TODO: need to avoid decoding memory addr whenever `isAvailable` is called
-  std::vector<int> MemAddr = MemBase->decode_mem_addr(Addr);
-  uint32_t CtrlID = MemAddr[0];
-    
-  bool result = OutputPendingQueues[CtrlID].isAvailable(1) && !MemBase->is_full(CtrlID, IsWrite);
-  
-  return result;
-}
-
-
-void Ramulator::push(int CtrlID, uint64_t Addr, bool IsWrite, uint32_t core_id, void* orignal_req) {
-  std::vector<int> MemAddr = MemBase->decode_mem_addr(Addr);
-  //Ensure CtrlID match with decoded address
-  assert(CtrlID == MemAddr[0]); 
-  if (IsWrite) {
-    Request req(Request::Type::WRITE, Addr, MemAddr, Callbacks[IsWrite], orignal_req);
-    req.coreid = core_id;
-    bool isSent = MemBase->send(req);
-    assert(isSent);
-  } else {
-    Request req(Request::Type::READ, Addr, MemAddr, Callbacks[IsWrite], orignal_req);
-    req.coreid = core_id;
-    bool isSent = MemBase->send(req);
-    assert(isSent);
-  }
-
-  OutputPendingQueues[CtrlID].reserve();
-}
-
-void Ramulator::push(uint64_t Addr, bool IsWrite, uint32_t core_id, void* original_req) {
-  std::vector<int> MemAddr = MemBase->decode_mem_addr(Addr);
-  const int CtrlID = MemAddr[0];
-  // TODO: vid check here
-  if (IsWrite) {
-    Request req(Request::Type::WRITE, Addr, MemAddr, Callbacks[IsWrite], original_req);
-    req.coreid = core_id;
-    bool isSent = MemBase->send(req);
-    assert(isSent);
-  } else {
-    Request req(Request::Type::READ, Addr, MemAddr, Callbacks[IsWrite], original_req);
-    req.coreid = core_id;
-    bool isSent = MemBase->send(req);
-    assert(isSent);
-  }
-
-  OutputPendingQueues[CtrlID].reserve();
-}
-
-bool Ramulator::isEmpty(int CtrlID) const {
-  return OutputPendingQueues[CtrlID].isEmpty();
-}
-const void* Ramulator::top(int CtrlID) const {
-  return OutputPendingQueues[CtrlID].top();
-}
-void Ramulator::pop(int CtrlID) {
-  OutputPendingQueues[CtrlID].pop();
-}
-
-int Ramulator::getAtomicBytes() const {
-  return MemBase->get_transaction_bytes();
-}
-
-int Ramulator::getNumChannels() const {
-  return MemBase->get_num_channels();
-}
-
-int Ramulator::getChannel(uint64_t Addr) const {
-  std::vector<int> MemAddr = MemBase->decode_mem_addr(Addr);
-  return MemAddr[0];
-}
-
-void Ramulator::print_stats() {
-  MemBase->finish();
-  Stat::statlist.printall();
-}
-
-std::unique_ptr<MemoryBase> 
-Ramulator::createMemory(const std::string ConfigFilePath, uint32_t num_core) {
-  RamulatorConfig Config(ConfigFilePath);
-  Config.set_core_num(num_core);
-  std::string MemType = Config["standard"];
-  if (MemType == "DDR4") {
-    return MemoryFactory<DDR4>::create(Config, 32);
-  } else if (MemType == "HBM") {
-    return MemoryFactory<HBM>::create(Config, 32);
-  } else {
-    assert(false);
-    return nullptr;
-  }
-}
-Ramulator::OutputPendingQueue::OutputPendingQueue(int Size)
-    : Size(Size),
-      NumReserved(0) {}
-
-bool Ramulator::OutputPendingQueue::isAvailable() const {
-  return NumReserved + PendingQueue.size() < Size;
-}
-
-bool Ramulator::OutputPendingQueue::isAvailable(uint32_t count) const {
-  return NumReserved + PendingQueue.size() + count - 1 < Size;
-}
-
-void Ramulator::OutputPendingQueue::reserve() {
-  assert(NumReserved < Size);
-  NumReserved++;
-}
-
-void Ramulator::OutputPendingQueue::push(void* Addr) {
-  PendingQueue.push(Addr);
-  assert(NumReserved > 0);
-  NumReserved--;
-}
-
-bool Ramulator::OutputPendingQueue::isEmpty() const {
-  return PendingQueue.empty();
-}
-
-void Ramulator::OutputPendingQueue::pop() {
-  PendingQueue.pop();
-}
-const void* Ramulator::OutputPendingQueue::top() const {
-  return PendingQueue.front();
-}
-
-Ramulator::~Ramulator() = default;
-
-}
diff --git a/TOGSim/extern/ramulator_custom/src/Refresh.cpp b/TOGSim/extern/ramulator_custom/src/Refresh.cpp
deleted file mode 100644
index 20281f64..00000000
--- a/TOGSim/extern/ramulator_custom/src/Refresh.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Refresh.cpp
- *
- * Mainly DSARP specialization at the moment.
- *
- *  Created on: Mar 17, 2015
- *      Author: kevincha
- */
-
-#include <stdlib.h>
-
-#include "Refresh.h"
-#include "Controller.h"
-#include "DRAM.h"
-// #include "DSARP.h"
-
-using namespace std;
-using namespace ram;
-
-namespace ram {
-
-/**** DSARP specialization ****/
-// template<>
-// Refresh<DSARP>::Refresh(Controller<DSARP>* ctrl) : ctrl(ctrl) {
-//   clk = refreshed = 0;
-//   max_rank_count = ctrl->channel->children.size();
-//   max_bank_count = ctrl->channel->spec->org_entry.count[(int)DSARP::Level::Bank];
-//   max_sa_count = ctrl->channel->spec->org_entry.count[(int)DSARP::Level::SubArray];
-//
-//   // Init refresh counters
-//   for (int r = 0; r < max_rank_count; r++) {
-//     bank_ref_counters.push_back(0);
-//     bank_refresh_backlog.push_back(new vector<int>(max_bank_count, 0));
-//     vector<int> sa_counters(ctrl->channel->spec->org_entry.count[(int)DSARP::Level::SubArray], 0);
-//     subarray_ref_counters.push_back(sa_counters);
-//   }
-//
-//   level_chan = (int)DSARP::Level::Channel;
-//   level_rank = (int)DSARP::Level::Rank;
-//   level_bank = (int)DSARP::Level::Bank;
-//   level_sa   = (int)DSARP::Level::SubArray;
-// }
-//
-// template<>
-// void Refresh<DSARP>::early_inject_refresh() {
-//   // Only enabled during reads
-//   if (ctrl->write_mode)
-//     return;
-//
-//   // OoO bank-level refresh
-//   vector<bool> is_bank_occupied(max_rank_count * max_bank_count, false);
-//   Controller<DSARP>::Queue& rdq = ctrl->readq;
-//
-//   // Figure out which banks are idle in order to refresh one of them
-//   for (auto req: rdq.q)
-//   {
-//     assert(req.addr_vec[level_chan] == ctrl->channel->id);
-//     int ridx = req.addr_vec[level_rank] * max_bank_count;
-//     int bidx = req.addr_vec[level_bank];
-//     is_bank_occupied[ridx+bidx] = true;
-//   }
-//
-//   // Try to pick an idle bank to refresh per rank
-//   for (int r = 0; r < max_rank_count; r++) {
-//     // Randomly pick a bank to examine
-//     int bidx_start = rand() % max_bank_count;
-//
-//     for (int b = 0; b < max_bank_count; b++)
-//     {
-//       int bidx = (bidx_start + b) % max_bank_count;
-//       // Idle cycle only
-//       if (is_bank_occupied[(r * max_bank_count) + bidx])
-//         continue;
-//
-//       // Pending refresh
-//       bool pending_ref = false;
-//       for (Request req : ctrl->otherq.q)
-//         if (req.type == Request::Type::REFRESH
-//             && req.addr_vec[level_chan] == ctrl->channel->id
-//             && req.addr_vec[level_rank] == r && req.addr_vec[level_bank] == bidx)
-//           pending_ref = true;
-//       if (pending_ref)
-//         continue;
-//
-//       // Only pull in refreshes when we are almost running out of credits
-//       if ((*(bank_refresh_backlog[r]))[bidx] >= backlog_early_pull_threshold ||
-//           ctrl->otherq.q.size() >= ctrl->otherq.max)
-//         continue;
-//
-//       // Refresh now
-//       refresh_target(ctrl, r, bidx, subarray_ref_counters[r][bidx]);
-//       // One credit for delaying a future ref
-//       (*(bank_refresh_backlog[r]))[bidx]++;
-//       subarray_ref_counters[r][bidx] = (subarray_ref_counters[r][bidx]+1) % max_sa_count;
-//       break;
-//     }
-//   }
-// }
-
-// template<>
-// void Refresh<DSARP>::inject_refresh(bool b_ref_rank) {
-//   // Rank-level refresh
-//   if (b_ref_rank)
-//     for (auto rank : ctrl->channel->children)
-//       refresh_target(ctrl, rank->id, -1, -1);
-//   // Bank-level refresh. Simultaneously issue to all ranks (better performance than staggered refreshes).
-//   else {
-//     for (auto rank : ctrl->channel->children) {
-//       int rid = rank->id;
-//       int bid = bank_ref_counters[rid];
-//
-//       // Behind refresh schedule by 1 ref
-//       (*(bank_refresh_backlog[rid]))[bid]--;
-//
-//       // Next time, refresh the next bank in the same bank
-//       bank_ref_counters[rid] = (bank_ref_counters[rid] + 1) % max_bank_count;
-//
-//       // Check to see if we can skip a refresh
-//       if (ctrl->channel->spec->type == DSARP::Type::DARP ||
-//         ctrl->channel->spec->type == DSARP::Type::DSARP) {
-//
-//         bool ref_now = false;
-//         // 1. Any pending refrehes?
-//         bool pending_ref = false;
-//         for (Request req : ctrl->otherq.q) {
-//           if (req.type == Request::Type::REFRESH) {
-//             pending_ref = true;
-//             break;
-//           }
-//         }
-//
-//         // 2. Track readq
-//         if (!pending_ref && ctrl->readq.size() == 0)
-//           ref_now = true;
-//
-//         // 3. Track log status. If we are too behind the schedule, then we need to refresh now.
-//         if ((*(bank_refresh_backlog[rid]))[bid] <= backlog_min)
-//           ref_now = true;
-//
-//         // Otherwise skip refresh
-//         if (!ref_now)
-//           continue;
-//       }
-//
-//       refresh_target(ctrl, rid, bid, subarray_ref_counters[rid][bid]);
-//       // Get 1 ref credit
-//       (*(bank_refresh_backlog[rid]))[bid]++;
-//       // Next time, refresh the next sa in the same bank
-//       subarray_ref_counters[rid][bid] = (subarray_ref_counters[rid][bid]+1) % max_sa_count;
-//     }
-//   }
-//   refreshed = clk;
-// }
-//
-// first = wrq.count; second = bank idx
-typedef pair<int, int> wrq_idx;
-bool wrq_comp (wrq_idx l, wrq_idx r)
-{
-  return l.first < r.first;
-}
-
-// WRP
-// template<>
-// void Refresh<DSARP>::wrp() {
-//   for (int ref_rid = 0; ref_rid < max_rank_count; ref_rid++)
-//   {
-//     // Pending refresh in the rank?
-//     bool pending_ref = false;
-//     for (Request req : ctrl->otherq.q) {
-//       if (req.type == Request::Type::REFRESH && req.addr_vec[level_rank] == ref_rid) {
-//         pending_ref = true;
-//         break;
-//       }
-//     }
-//     if (pending_ref)
-//       continue;
-//
-//     // Find the bank with the lowest number of writes+reads
-//     vector<wrq_idx> sorted_bank_demand;
-//     for (int b = 0; b < max_bank_count; b++)
-//       sorted_bank_demand.push_back(wrq_idx(0,b));
-//     // Filter out all the writes to this rank
-//     int total_wr = 0;
-//     for (auto req : ctrl->writeq.q) {
-//       if (req.addr_vec[level_rank] == ref_rid) {
-//         sorted_bank_demand[req.addr_vec[level_bank]].first++;
-//         total_wr++;
-//       }
-//     }
-//     // If there's no write, just skip.
-//     if (total_wr == 0)
-//       continue;
-//
-//     // Add read
-//     for (auto req : ctrl->readq.q)
-//       if (req.addr_vec[level_rank] == ref_rid)
-//         sorted_bank_demand[req.addr_vec[level_bank]].first++;
-//
-//     // Sort based on the entries
-//     std::sort(sorted_bank_demand.begin(), sorted_bank_demand.end(), wrq_comp);
-//
-//     // Randomly select an idle bank to refresh
-//     int top_idle_idx = 0;
-//     for (int i = 0; i < max_bank_count; i++) {
-//       if (sorted_bank_demand[i].second != 0) {
-//         top_idle_idx = i;
-//         break;
-//       }
-//     }
-//
-//     // Select a bank to ref
-//     int ref_bid_idx = (top_idle_idx == 0) ? 0 : rand() % top_idle_idx;
-//     int ref_bid = sorted_bank_demand[ref_bid_idx].second;
-//
-//     // Make sure we don't exceed the credit
-//     if ((*(bank_refresh_backlog[ref_rid]))[ref_bid] < backlog_max
-//         && ctrl->otherq.q.size() < ctrl->otherq.max) {
-//       refresh_target(ctrl, ref_rid, ref_bid, subarray_ref_counters[ref_rid][ref_bid]);
-//       // Get 1 ref credit
-//       (*(bank_refresh_backlog[ref_rid]))[ref_bid]++;
-//       subarray_ref_counters[ref_rid][ref_bid] = (subarray_ref_counters[ref_rid][ref_bid]+1) % max_sa_count;
-//     }
-//   }
-// }
-//
-// // OoO refresh of DSARP
-// template<>
-// void Refresh<DSARP>::tick_ref() {
-//   clk++;
-//
-//   bool b_ref_rank = ctrl->channel->spec->b_ref_rank;
-//   int refresh_interval =
-//       (b_ref_rank) ?
-//           ctrl->channel->spec->speed_entry.nREFI :
-//           ctrl->channel->spec->speed_entry.nREFIpb;
-//
-//   // DARP
-//   if (ctrl->channel->spec->type == DSARP::Type::DARP ||
-//     ctrl->channel->spec->type == DSARP::Type::DSARP) {
-//     // Write-Refresh Parallelization. Issue refreshes when the controller enters writeback mode
-//     if (!ctrl_write_mode && ctrl->write_mode)
-//       wrp();
-//     // Record write mode
-//     ctrl_write_mode = ctrl->write_mode;
-//     // Inject early to pull in some refreshes during read mode
-//     early_inject_refresh();
-//   }
-//
-//   // Time to schedule a refresh and also try to skip some refreshes
-//   if ((clk - refreshed) >= refresh_interval)
-//     inject_refresh(b_ref_rank);
-// }
-/**** End DSARP specialization ****/
-
-} /* namespace ram */
diff --git a/TOGSim/extern/ramulator_custom/src/Refresh.h b/TOGSim/extern/ramulator_custom/src/Refresh.h
deleted file mode 100644
index 36c08b55..00000000
--- a/TOGSim/extern/ramulator_custom/src/Refresh.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Refresh.h
- *
- * This is a refresh scheduler. A list of refresh policies implemented:
- *
- * 1. All-bank refresh
- * 2. Per-bank refresh (only DSARP memory module has been completed to work with REFpb).
- *     The other modules (LPDDRx) have not been updated to pass a knob to turn on/off REFpb.
- * 3. A re-implementation of DSARP from the refresh mechanisms proposed in Chang et al.,
- * "Improving DRAM Performance by Parallelizing Refreshes with Accesses", HPCA 2014.
- *
- *  Created on: Mar 17, 2015
- *      Author: kevincha
- */
-
-#ifndef __REFRESH_H_
-#define __REFRESH_H_
-
-#include <stddef.h>
-#include <cassert>
-#include <iostream>
-#include <vector>
-
-#include "Request.h"
-// #include "DSARP.h"
-// #include "ALDRAM.h"
-
-using namespace std;
-
-namespace ram {
-
-template <typename T>
-class Controller;
-
-template <typename T>
-class Refresh {
-public:
-  Controller<T>* ctrl;
-  long clk, refreshed;
-  // Per-bank refresh counter to track the refresh progress for each rank
-  vector<int> bank_ref_counters;
-  int max_rank_count, max_bank_count;
-  int level_chan, level_rank, level_bank, level_sa;
-
-  // ctor
-  Refresh(Controller<T>* ctrl) : ctrl(ctrl) {
-    clk = refreshed = 0;
-    max_rank_count = ctrl->channel->children.size();
-    max_bank_count = ctrl->channel->spec->org_entry.count[(int)T::Level::Bank];
-
-    // Init refresh counters
-    for (int r = 0; r < max_rank_count; r++) {
-      bank_ref_counters.push_back(0);
-      bank_refresh_backlog.push_back(new vector<int>(max_bank_count, 0));
-    }
-
-    level_chan = (int)T::Level::Channel;
-    level_rank = (int)T::Level::Rank;
-    level_bank = (int)T::Level::Bank;
-    level_sa   = -1; // Most DRAM doesn't have subarray level
-  }
-
-  // dtor
-  virtual ~Refresh() {
-    // Clean up backlog
-    for (unsigned int i = 0; i < bank_refresh_backlog.size(); i++)
-      delete bank_refresh_backlog[i];
-  }
-
-  // Basic refresh scheduling for all bank refresh that is applicable to all DRAM types
-  void tick_ref() {
-    clk++;
-
-    int refresh_interval = ctrl->channel->spec->speed_entry.nREFI;
-
-    // Time to schedule a refresh
-    if ((clk - refreshed) >= refresh_interval) {
-      inject_refresh(true);
-      // ALDRAM: update timing parameters based on temperatures
-      // ALDRAM::Temp current_temperature = ALDRAM::Temp::COLD;
-      // ctrl->update_temp(current_temperature);
-    }
-  }
-
-private:
-  // Keeping track of refresh status of every bank: + means ahead of schedule, - means behind schedule
-  vector<vector<int>*> bank_refresh_backlog;
-  // Keeping track of which subarray to refresh next
-  vector<vector<int>> subarray_ref_counters;
-  int max_sa_count = 0;
-  // As defined in the standards
-  int backlog_max = 8;
-  int backlog_min = -8;
-  int backlog_early_pull_threshold = -6;
-  bool ctrl_write_mode = false;
-
-  // Refresh based on the specified address
-  void refresh_target(Controller<T>* ctrl, int rank, int bank, int sa)
-  {
-    vector<int> addr_vec(int(T::Level::MAX), -1);
-    addr_vec[0] = ctrl->channel->id;
-    addr_vec[1] = rank;
-    addr_vec[2] = bank;
-    addr_vec[3] = sa;
-    Request req(addr_vec, Request::Type::REFRESH, NULL);
-    bool res = ctrl->enqueue(req);
-    assert(res);
-  }
-
-  // Inject refresh at either rank or bank level
-  void inject_refresh(bool b_ref_rank) {
-    // Rank-level refresh
-    if (b_ref_rank) {
-      for (auto rank : ctrl->channel->children)
-        refresh_target(ctrl, rank->id, -1, -1);
-    }
-    // Bank-level refresh. Simultaneously issue to all ranks (better performance than staggered refreshes).
-    else {
-      for (auto rank : ctrl->channel->children)
-        refresh_target(ctrl, rank->id, bank_ref_counters[rank->id], -1);
-    }
-    refreshed = clk;
-  }
-
-  // DSARP
-  void early_inject_refresh();
-  void wrp();
-};
-
-// Declaration of specialized constructor and tick_ref, so the compiler knows
-// where to look for these definitions when controller calls them!
-// template<> Refresh<DSARP>::Refresh(Controller<DSARP>* ctrl);
-// template<> void Refresh<DSARP>::tick_ref();
-
-} /* namespace ram */
-
-#endif /* SRC_REFRESH_H_ */
diff --git a/TOGSim/extern/ramulator_custom/src/Request.cpp b/TOGSim/extern/ramulator_custom/src/Request.cpp
deleted file mode 100644
index 7bbd90fe..00000000
--- a/TOGSim/extern/ramulator_custom/src/Request.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-#include "Request.h"
-
-namespace ram {
-
-Request::Request() {}
-
-Request::Request(Type Type, uint64_t Addr, std::vector<int> AddrVec,
-                 function<void(const Request&)> &cb)
-    : type(Type),
-      is_first_command(true),
-      addr(Addr),
-      addr_vec(AddrVec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      callback(cb) {}
-
-Request::Request(Type Type, uint64_t Addr, std::vector<int> AddrVec,
-                 function<void(const Request&)> &cb, void* original_req)
-    : type(Type),
-      is_first_command(true),
-      addr(Addr),
-      addr_vec(AddrVec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      callback(cb),
-      orignal_request(original_req) {}
-
-Request::Request(Type Type, uint64_t Addr, std::vector<int> AddrVec,
-                 function<void(const Request&)> &cb, int vid)
-    : type(Type),
-      is_first_command(true),
-      addr(Addr),
-      addr_vec(AddrVec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      vid(vid),
-      callback(cb) {}
-
-Request::Request(std::vector<int> addr_vec, Type type,
-                 function<void(Request&)> cb) 
-    : type(type),
-      is_first_command(true),
-      addr(-1),
-      BaseAddr(-1),
-      addr_vec(addr_vec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      callback(cb) {}
-      
-Request::Request(std::vector<int> addr_vec, Type type,
-                 function<void(Request&)> cb, void* original_req) 
-    : type(type),
-      is_first_command(true),
-      addr(-1),
-      BaseAddr(-1),
-      addr_vec(addr_vec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      callback(cb),
-      orignal_request(original_req) {}
-
-Request::Request(Type Type, uint64_t BaseAddr, uint64_t Addr, 
-                 std::vector<int> AddrVec, function<void(const Request&)> &cb)
-    : type(Type),
-      is_first_command(true),
-      addr(Addr),
-      BaseAddr(BaseAddr),
-      addr_vec(AddrVec),
-      coreid(0),
-      arrive(0),
-      depart(0),
-      callback(cb) {}
-
-bool Request::isRead() const {
-  return type == Type::READ;
-}
-bool Request::isWrite() const {
-  return type == Type::WRITE;
-}
-int Request::getChannelID() const {
-  return addr_vec[0];
-}
-
-} // end namespace
-
diff --git a/TOGSim/extern/ramulator_custom/src/Request.h b/TOGSim/extern/ramulator_custom/src/Request.h
deleted file mode 100644
index 8f70856e..00000000
--- a/TOGSim/extern/ramulator_custom/src/Request.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __REQUEST_H
-#define __REQUEST_H
-
-#include <vector>
-#include <functional>
-#include <cstdint>
-
-using namespace std;
-
-namespace ram {
-class Request {
-public:
-  enum class Type {
-    READ, WRITE, PIM_WRITE, REFRESH, POWERDOWN, SELFREFRESH, EXTENSION, MAX
-  };
-  Type type;
-  bool is_first_command;
-  uint64_t addr;
-  uint64_t BaseAddr;
-  //int HandlerID;
-
-  vector<int> addr_vec;
-  // specify which node this request sent from
-  int coreid;       // to remove compile errors
-
-  uint64_t arrive;
-  uint64_t depart;
-
-  int vid = -1;
-  void* orignal_request;
-  function<void(Request&)> callback; // call back with more info
-
-  bool isRead() const;
-  bool isWrite() const;
-  int getChannelID() const;
-
-  // Used to generate refresh request
-  Request();
-  Request(std::vector<int> addr_vec, Type type, function<void(Request&)> cb);
-  Request(std::vector<int> addr_vec, Type type, function<void(Request&)> cb, void* original_req);
-  Request(Type type, uint64_t Addr, 
-          std::vector<int> AddrVec, function<void(const Request&)> &cb);
-  Request(Type type, uint64_t Addr, 
-          std::vector<int> AddrVec, function<void(const Request&)> &cb, void* orignal_req);
-  Request(Type type, uint64_t Addr, 
-          std::vector<int> AddrVec, function<void(const Request&)> &cb, int vid);
-  Request(Type type, uint64_t BaseAddr, uint64_t Addr, 
-          std::vector<int> AddrVec, function<void(const Request&)> &cb);
-};
-
-} /*namespace ram*/
-
-#endif /*__REQUEST_H*/
-
diff --git a/TOGSim/extern/ramulator_custom/src/Scheduler.h b/TOGSim/extern/ramulator_custom/src/Scheduler.h
deleted file mode 100644
index 778bfcd4..00000000
--- a/TOGSim/extern/ramulator_custom/src/Scheduler.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/***************************** SCHEDULER.H ***********************************
-- SAFARI GROUP
-
-This file contains the different scheduling policies and row policies that the 
-memory controller can use to schedule requests.
-
-Current Memory Scheduling Policies:
-
-1) FCFS - First Come First Serve
-        This scheduling policy schedules memory requests chronologically
-
-2) FRFCFS - Frist Ready First Come First Serve
-        This scheduling policy first checks if a request is READY(meets all 
-        timing parameters), if yes then it is prioritized. If multiple requests
-        are ready, they they are scheduled chronologically. Otherwise, it 
-        behaves the same way as FCFS. 
-
-3) FRFCFS_Cap - First Ready First Come First Serve Cap
-       This scheduling policy behaves the same way as FRFCS, except that it has
-       a cap on the number of hits you can get in a certain row. The CAP VALUE
-       can be altered by changing the number for the "cap" variable in 
-       line number 76. 
-
-4) FRFCFS_PriorHit - First Ready First Come First Serve Prioritize Hits
-       This scheduling policy behaves the same way as FRFCFS, except that it
-       prioritizes row hits more than readiness. 
-
-You can select which scheduler you want to use by changing the value of 
-"type" variable on line number 74.
-
-                _______________________________________
-
-Current Row Policies:
-
-1) Closed   - Precharges a row as soon as there are no pending references to 
-              the active row.
-2) ClosedAP - Closed Auto Precharge
-3) Opened   - Precharges a row only if there are pending references to 
-              other rows.
-4) Timeout  - Precharges a row after X time if there are no pending references.
-              'X' time can be changed by changing the variable timeout 
-              on line number 221
-
-*****************************************************************************/
-
-#ifndef __SCHEDULER_H
-#define __SCHEDULER_H
-
-#include "DRAM.h"
-#include "Request.h"
-//#include "Controller.h"
-#include <vector>
-#include <map>
-#include <list>
-#include <functional>
-#include <cassert>
-
-using namespace std;
-
-namespace ram
-{
-
-template <typename T>
-class Controller;
-
-template <typename T>
-class Scheduler
-{
-public:
-    Controller<T>* ctrl;
-
-    enum class Type {
-        FCFS, FRFCFS, FRFCFS_Cap, FRFCFS_PriorHit, MAX
-    } type = Type::FRFCFS; //Change this line to change scheduling policy
-
-    long cap = 16; //Change this line to change cap
-
-    Scheduler(Controller<T>* ctrl) : ctrl(ctrl) {
-        std::cout << "DRAM Contorller scheduler : " << ctrl->configs["scheduler"] << std::endl;
-        if(ctrl->configs["scheduler"] == "FCFS") {
-            type = Type::FCFS;
-        }
-        else if(ctrl->configs["scheduler"] == "FRFCFS") {
-            type = Type::FRFCFS;
-        } 
-        else if(ctrl->configs["scheduler"] == "FRFCFS_Cap") {
-            type = Type::FRFCFS_Cap;
-        } 
-        else if(ctrl->configs["scheduler"] == "FRFCFS_PriorHit") {
-            type = Type::FRFCFS_PriorHit;
-        }
-    }
-
-    list<Request>::iterator get_head(list<Request>& q)
-    {
-        // TODO make the decision at compile time
-        if (type != Type::FRFCFS_PriorHit) {
-            //If queue is empty, return end of queue
-            if (!q.size())
-                return q.end();
-
-            //Else return based on the policy
-            auto head = q.begin();
-            for (auto itr = next(q.begin(), 1); itr != q.end(); itr++)
-                head = compare[int(type)](head, itr);
-
-            return head;
-        } 
-        else { //Code to get around edge cases for FRFCFS_PriorHit
-            
-       //If queue is empty, return end of queue
-            if (!q.size())
-                return q.end();
-
-       //Else return based on FRFCFS_PriorHit Scheduling Policy
-            auto head = q.begin();
-            for (auto itr = next(q.begin(), 1); itr != q.end(); itr++) {
-                head = compare[int(Type::FRFCFS_PriorHit)](head, itr);
-            }
-
-            if (this->ctrl->is_ready(head) && this->ctrl->is_row_hit(head)) {
-                return head;
-            }
-
-            // prepare a list of hit request
-            vector<vector<int>> hit_reqs;
-            for (auto itr = q.begin() ; itr != q.end() ; ++itr) {
-                if (this->ctrl->is_row_hit(itr)) {
-                    auto begin = itr->addr_vec.begin();
-                    // TODO Here it assumes all DRAM standards use PRE to close a row
-                    // It's better to make it more general.
-                    auto end = begin + int(ctrl->channel->spec->scope[int(T::Command::PRE)]) + 1;
-                    vector<int> rowgroup(begin, end); // bank or subarray
-                    hit_reqs.push_back(rowgroup);
-                }
-            }
-            // if we can't find proper request, we need to return q.end(),
-            // so that no command will be scheduled
-            head = q.end();
-            for (auto itr = q.begin(); itr != q.end(); itr++) {
-                bool violate_hit = false;
-                if ((!this->ctrl->is_row_hit(itr)) && this->ctrl->is_row_open(itr)) {
-                    // so the next instruction to be scheduled is PRE, might violate hit
-                    auto begin = itr->addr_vec.begin();
-                    // TODO Here it assumes all DRAM standards use PRE to close a row
-                    // It's better to make it more general.
-                    auto end = begin + int(ctrl->channel->spec->scope[int(T::Command::PRE)]) + 1;
-                    vector<int> rowgroup(begin, end); // bank or subarray
-                    for (const auto& hit_req_rowgroup : hit_reqs) {
-                        if (rowgroup == hit_req_rowgroup) {
-                            violate_hit = true;
-                            break;
-                        }  
-                    }
-                }
-                if (violate_hit) {
-                    continue;
-                }
-                // If it comes here, that means it won't violate any hit request
-                if (head == q.end()) {
-                    head = itr;
-                } else {
-                    head = compare[int(Type::FRFCFS)](head, itr);
-                }
-            }
-
-            return head;
-        }
-    }
-
-//Compare functions for each memory schedulers
-private:
-    typedef list<Request>::iterator ReqIter;
-    function<ReqIter(ReqIter, ReqIter)> compare[int(Type::MAX)] = {
-        // FCFS
-        [this] (ReqIter req1, ReqIter req2) {
-            if (req1->arrive <= req2->arrive) return req1;
-            return req2;},
-
-        // FRFCFS
-        [this] (ReqIter req1, ReqIter req2) {
-            bool ready1 = this->ctrl->is_ready(req1);
-            bool ready2 = this->ctrl->is_ready(req2);
-
-            if (ready1 ^ ready2) {
-                if (ready1) return req1;
-                return req2;
-            }
-
-            if (req1->arrive <= req2->arrive) return req1;
-            return req2;},
-
-        // FRFCFS_CAP
-        [this] (ReqIter req1, ReqIter req2) {
-            bool ready1 = this->ctrl->is_ready(req1);
-            bool ready2 = this->ctrl->is_ready(req2);
-
-            ready1 = ready1 && (this->ctrl->rowtable->get_hits(req1->addr_vec) <= this->cap);
-            ready2 = ready2 && (this->ctrl->rowtable->get_hits(req2->addr_vec) <= this->cap);
-
-            if (ready1 ^ ready2) {
-                if (ready1) return req1;
-                return req2;
-            }
-
-            if (req1->arrive <= req2->arrive) return req1;
-            return req2;},
-        // FRFCFS_PriorHit
-        [this] (ReqIter req1, ReqIter req2) {
-            bool ready1 = this->ctrl->is_ready(req1) && this->ctrl->is_row_hit(req1);
-            bool ready2 = this->ctrl->is_ready(req2) && this->ctrl->is_row_hit(req2);
-
-            if (ready1 ^ ready2) {
-                if (ready1) return req1;
-                return req2;
-            }
-
-            if (req1->arrive <= req2->arrive) return req1;
-            return req2;}
-    };
-};
-
-
-// Row Precharge Policy
-template <typename T>
-class RowPolicy
-{
-public:
-    Controller<T>* ctrl;
-
-    enum class Type {
-        Closed, ClosedAP, Opened, Timeout, MAX
-    } type = Type::Opened;
-
-    int timeout = 50;
-
-    RowPolicy(Controller<T>* ctrl) : ctrl(ctrl) {}
-
-    vector<int> get_victim(typename T::Command cmd)
-    {
-        return policy[int(type)](cmd);
-    }
-
-private:
-    function<vector<int>(typename T::Command)> policy[int(Type::MAX)] = {
-        // Closed
-        [this] (typename T::Command cmd) -> vector<int> {
-            for (auto& kv : this->ctrl->rowtable->table) {
-                if (!this->ctrl->is_ready(cmd, kv.first))
-                    continue;
-                return kv.first;
-            }
-            return vector<int>();},
-
-        // ClosedAP
-        [this] (typename T::Command cmd) -> vector<int> {
-            for (auto& kv : this->ctrl->rowtable->table) {
-                if (!this->ctrl->is_ready(cmd, kv.first))
-                    continue;
-                return kv.first;
-            }
-            return vector<int>();},
-
-        // Opened
-        [this] (typename T::Command cmd) {
-            return vector<int>();},
-
-        // Timeout
-        [this] (typename T::Command cmd) -> vector<int> {
-            for (auto& kv : this->ctrl->rowtable->table) {
-                auto& entry = kv.second;
-                if (this->ctrl->clk - entry.timestamp < timeout)
-                    continue;
-                if (!this->ctrl->is_ready(cmd, kv.first))
-                    continue;
-                return kv.first;
-            }
-            return vector<int>();}
-    };
-
-};
-
-
-template <typename T>
-class RowTable
-{
-public:
-    Controller<T>* ctrl;
-
-    struct Entry {
-        int row;
-        int hits;
-        long timestamp;
-    };
-
-    map<vector<int>, Entry> table;
-
-    RowTable(Controller<T>* ctrl) : ctrl(ctrl) {}
-
-    void update(typename T::Command cmd, const vector<int>& addr_vec, long clk)
-    {
-        auto begin = addr_vec.begin();
-        auto end = begin + int(T::Level::Row);
-        vector<int> rowgroup(begin, end); // bank or subarray
-        int row = *end;
-
-        T* spec = ctrl->channel->spec;
-
-        if (spec->is_opening(cmd))
-            table.insert({rowgroup, {row, 0, clk}});
-
-        if (spec->is_accessing(cmd)) {
-            // we are accessing a row -- update its entry
-            auto match = table.find(rowgroup);
-            assert(match != table.end());
-            assert(match->second.row == row);
-            match->second.hits++;
-            match->second.timestamp = clk;
-        } /* accessing */
-
-        if (spec->is_closing(cmd)) {
-          // we are closing one or more rows -- remove their entries
-          int n_rm = 0;
-          int scope;
-          if (spec->is_accessing(cmd))
-            scope = int(T::Level::Row) - 1; //special condition for RDA and WRA
-          else
-            scope = int(spec->scope[int(cmd)]);
-
-          for (auto it = table.begin(); it != table.end();) {
-            if (equal(begin, begin + scope + 1, it->first.begin())) {
-              n_rm++;
-              it = table.erase(it);
-            }
-            else
-              it++;
-          }
-
-          assert(n_rm > 0);
-        } /* closing */
-    }
-
-    int get_hits(const vector<int>& addr_vec, const bool to_opened_row = false)
-    {
-        auto begin = addr_vec.begin();
-        auto end = begin + int(T::Level::Row);
-
-        vector<int> rowgroup(begin, end);
-        int row = *end;
-
-        auto itr = table.find(rowgroup);
-        if (itr == table.end())
-            return 0;
-
-        if(!to_opened_row && (itr->second.row != row))
-            return 0;
-
-        return itr->second.hits;
-    }
-
-    int get_open_row(const vector<int>& addr_vec) {
-        auto begin = addr_vec.begin();
-        auto end = begin + int(T::Level::Row);
-
-        vector<int> rowgroup(begin, end);
-
-        auto itr = table.find(rowgroup);
-        if(itr == table.end())
-            return -1;
-
-        return itr->second.row;
-    }
-};
-
-} /*namespace ram*/
-
-#endif /*__SCHEDULER_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/SpeedyController.h b/TOGSim/extern/ramulator_custom/src/SpeedyController.h
deleted file mode 100644
index 981ce900..00000000
--- a/TOGSim/extern/ramulator_custom/src/SpeedyController.h
+++ /dev/null
@@ -1,304 +0,0 @@
-#ifndef __SPEEDYCONTROLLER_H
-#define __SPEEDYCONTROLLER_H
-
-#include "Config.h"
-#include "DRAM.h"
-#include "Request.h"
-#include "Statistics.h"
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <cassert>
-#include <utility>
-#include <queue>
-
-using namespace std;
-
-namespace ram
-{
-
-template <typename T>
-class SpeedyController
-// A FR-FCFS Open Row Controller, optimized for simulation speed.
-// Not For SALP-2
-{
-protected:
-  ScalarStat row_hits;
-  ScalarStat row_misses;
-private:
-    class compair_depart_clk{
-    public:
-        bool operator()(const Request& lhs, const Request& rhs) {
-            return lhs.depart > rhs.depart;
-        }
-    };
-public:
-    /* Command trace for DRAMPower 3.1 */
-    string cmd_trace_prefix = "cmd-trace-";
-    vector<ofstream> cmd_trace_files;
-    bool record_cmd_trace = false;
-    /* Commands to stdout */
-    bool print_cmd_trace = false;
-    /* Member Variables */
-    const unsigned int queue_capacity = 32;
-    long clk = 0;
-    DRAM<T>* channel;
-
-    double write_hi = 0.875;
-    double write_low = 0.5;
-
-    // request, first command, earliest clk
-    typedef tuple<Request, typename T::Command, long> request_info;
-    typedef vector<request_info> request_queue;
-    request_queue readq;   // queue for read requests
-    request_queue writeq;  // queue for write requests
-    request_queue otherq;  // queue for all "other" requests (e.g., refresh)
-
-    // read requests that are about to receive data from DRAM
-    priority_queue<Request, vector<Request>, compair_depart_clk> pending;
-
-    bool write_mode = false;  // whether write requests should be prioritized over reads
-    long refreshed = 0;  // last time refresh requests were generated
-
-    /* Constructor */
-    SpeedyController(RamulatorConfig& configs, DRAM<T>* channel) :
-        channel(channel)
-    {
-        record_cmd_trace = configs.record_cmd_trace();
-        print_cmd_trace = configs.print_cmd_trace();
-        if (record_cmd_trace){
-            string prefix = cmd_trace_prefix + "chan-" + to_string(channel->id) + "-rank-";
-            string suffix = ".cmdtrace";
-            for (unsigned int i = 0; i < channel->children.size(); i++)
-                cmd_trace_files.emplace_back(prefix + to_string(i) + suffix);
-        }
-        readq.reserve(queue_capacity);
-        writeq.reserve(queue_capacity);
-        otherq.reserve(queue_capacity);
-
-        // regStats
-
-        row_hits
-            .name("row_hits_channel_"+to_string(channel->id))
-            .desc("Number of row hits")
-            .precision(0)
-            ;
-        row_misses
-            .name("row_misses_channel_"+to_string(channel->id))
-            .desc("Number of row misses")
-            .precision(0)
-            ;
-    }
-
-    ~SpeedyController(){
-        delete channel;
-        for (auto& file : cmd_trace_files)
-            file.close();
-    }
-
-    /* Member Functions */
-
-    void finish(int read_req, int write_req, int dram_cycles) {
-      // call finish function of each channel
-      channel->finish(dram_cycles);
-    }
-
-    bool enqueue(Request& req)
-    {
-        request_queue& q =
-            req.type == Request::Type::READ? readq:
-            req.type == Request::Type::WRITE? writeq:
-                                             otherq;
-        if (queue_capacity == q.size())
-            return false;
-
-        req.arrive = clk;
-        if (req.type == Request::Type::READ){
-            for (auto& info : writeq)
-                if (req.addr == get<0>(info).addr){
-                    req.depart = clk + 1;
-                    pending.push(req);
-                    return true;
-                }
-        }
-        typename T::Command first_cmd = get_first_cmd(req);
-        long first_clk = channel->get_next(first_cmd, req.addr_vec.data());
-        q.emplace_back(req, first_cmd, first_clk);
-        push_heap(q.begin(), q.end(), compair_first_clk);;
-        return true;
-    }
-
-    void tick()
-    {
-        clk++;
-
-        /*** 1. Serve completed reads ***/
-        if (pending.size()) {
-            Request req = pending.top();
-            if (req.depart <= clk) {
-                req.depart = clk; // actual depart clk
-                req.callback(req);
-                pending.pop();
-            }
-        }
-
-        /*** 2. Should we schedule refreshes? ***/
-        int refresh_interval = channel->spec->speed_entry.nREFI;
-        if (clk - refreshed >= refresh_interval) {
-            auto req_type = Request::Type::REFRESH;
-            vector<int> addr_vec(int(T::Level::MAX), -1);
-            addr_vec[0] = channel->id;
-            for (auto child : channel->children) {
-                addr_vec[1] = child->id;
-                Request req(addr_vec, req_type, NULL);
-                bool res = enqueue(req);
-                assert(res);
-            }
-
-            refreshed = clk;
-        }
-
-        /*** 3. Should we schedule writes? ***/
-        if (!write_mode) {
-            // yes -- write queue is almost full or read queue is empty
-            if (writeq.size() >= (unsigned int)(write_hi * queue_capacity) || readq.size() == 0)
-                write_mode = true;
-        }
-        else {
-            // no -- write queue is almost empty and read queue is not empty
-            if (writeq.size() <= (unsigned int)(write_low * queue_capacity) && readq.size() != 0)
-                write_mode = false;
-        }
-
-        /*** 4. Find the best command to schedule, if any ***/
-        request_queue& q = otherq.size()? otherq: write_mode ? writeq : readq;
-
-        schedule(q);
-    }
-
-    bool is_row_hit(Request& req)
-    {
-        typename T::Command cmd = get_first_cmd(req);
-        return channel->check_row_hit(cmd, req.addr_vec.data());
-    }
-
-private:
-
-    static bool compair_first_clk(const request_info& lhs, const request_info& rhs) {
-        return (get<2>(lhs) > get<2>(rhs));
-    }
-
-    typename T::Command get_first_cmd(Request& req)
-    {
-        typename T::Command cmd = channel->spec->translate[int(req.type)];
-        switch (int(req.type)){
-            case int(Request::Type::READ):
-            case int(Request::Type::WRITE):{
-                auto node = channel;
-                for (int i = 1; i < int(T::Level::Row); i++)
-                    node = node->children[req.addr_vec[i]];
-                assert(int(node->level) == int(T::Level::Row) - 1);
-                if (node->state == T::State::Closed) return T::Command::ACT;
-                else if (node->row_state.find(req.addr_vec[int(T::Level::Row)]) != node->row_state.end()) return cmd;
-                else return T::Command::PRE;
-            }
-            case int(Request::Type::REFRESH):
-                return channel->decode(cmd, req.addr_vec.data());
-            default:
-                assert(false);
-        }
-        // return channel->decode(cmd, req.addr_vec.data());
-    }
-    void update(typename T::Command cmd, bool state_change, vector<int>::iterator& begin, vector<int>::iterator& end, request_queue& q){
-        if (q.empty()) return;
-
-        for (auto& info : q) {
-            bool addr_eq = equal(begin, end, get<0>(info).addr_vec.begin());
-            if (state_change && addr_eq)
-                get<1>(info) = get_first_cmd(get<0>(info));
-            if ((cmd == T::Command::RD || cmd == T::Command::WR)
-                && get<1>(info) == T::Command::ACT)
-                continue;
-            get<2>(info) = channel->get_next(get<1>(info), get<0>(info).addr_vec.data());
-        }
-        make_heap(q.begin(), q.end(), compair_first_clk);
-    }
-
-    void schedule(request_queue& q){
-        if (q.empty()) return;
-
-        Request& req = get<0>(q[0]);
-        typename T::Command& first_cmd = get<1>(q[0]);
-        long first_clk = get<2>(q[0]);
-
-        if (first_clk > clk) return;
-
-        if (req.is_first_command) {
-            req.is_first_command = false;
-            if (req.type == Request::Type::READ || req.type == Request::Type::WRITE) {
-                if (is_row_hit(req))
-                    ++row_hits;
-                else
-                    ++row_misses;
-            }
-        }
-
-        issue_cmd(first_cmd, req.addr_vec.data());
-
-        if (first_cmd == channel->spec->translate[int(req.type)]){
-            if (req.type == Request::Type::READ) {
-                req.depart = clk + channel->spec->read_latency;
-                pending.push(req);
-            }
-            pop_heap(q.begin(), q.end(), compair_first_clk);
-            q.pop_back();
-        }
-
-        bool state_change = channel->spec->is_opening(first_cmd)
-                        || channel->spec->is_closing(first_cmd)
-                        || channel->spec->is_refreshing(first_cmd);
-
-        auto begin = req.addr_vec.begin();
-        auto end = begin + 1;
-        for (; end < begin + int(T::Level::Row) && *end >= 0; end++);
-
-        update(first_cmd, state_change, begin, end, readq);
-        update(first_cmd, state_change, begin, end, writeq);
-        update(first_cmd, state_change, begin, end, otherq);
-    }
-
-    void issue_cmd(typename T::Command cmd, int* addr_vec)
-    {
-        // assert(channel->check(cmd, addr_vec, clk));
-        channel->update(cmd, addr_vec, clk);
-
-        if (record_cmd_trace){
-            // select rank
-            auto& file = cmd_trace_files[addr_vec[1]];
-            string& cmd_name = channel->spec->command_name[int(cmd)];
-            file<<clk<<','<<cmd_name;
-            // TODO bad coding here
-            if (cmd_name == "PREA" || cmd_name == "REF")
-                file<<endl;
-            else {
-                int bank_id = addr_vec[int(T::Level::Bank)];
-                if (channel->spec->standard_name == "DDR4" || channel->spec->standard_name == "GDDR5")
-                    bank_id += addr_vec[int(T::Level::Bank) - 1] *
-                        channel->spec->org_entry.count[int(T::Level::Bank)];
-                file<<','<<bank_id<<endl;
-            }
-        }
-        if (print_cmd_trace){
-            printf("%5s %10ld:", channel->spec->command_name[int(cmd)].c_str(), clk);
-            for (int lev = 0; lev < int(T::Level::MAX); lev++)
-                printf(" %5d", addr_vec[lev]);
-            printf("\n");
-        }
-    }
-};
-
-} /*namespace ram*/
-
-#endif /*__SPEEDYCONTROLLER_H*/
diff --git a/TOGSim/extern/ramulator_custom/src/StatType.cpp b/TOGSim/extern/ramulator_custom/src/StatType.cpp
deleted file mode 100644
index 843f76c8..00000000
--- a/TOGSim/extern/ramulator_custom/src/StatType.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-#include "StatType.h"
-
-namespace Stat {
-
-// Statistics list
-StatList statlist;
-
-// The smallest timing granularity.
-Tick curTick = 0;
-
-std::vector<StatBase*> all_stats;
-void reset_stats() {
-    for(auto s : all_stats)
-        s->reset();
-}
-
-void
-Histogram::grow_out()
-{
-    int size = cvec.size();
-    int zero = size / 2; // round down!
-    int top_half = zero + (size - zero + 1) / 2; // round up!
-    int bottom_half = (size - zero) / 2; // round down!
-
-    // grow down
-    int low_pair = zero - 1;
-    for (int i = zero - 1; i >= bottom_half; i--) {
-        cvec[i] = cvec[low_pair];
-        if (low_pair - 1 >= 0)
-            cvec[i] += cvec[low_pair - 1];
-        low_pair -= 2;
-    }
-    assert(low_pair == 0 || low_pair == -1 || low_pair == -2);
-
-    for (int i = bottom_half - 1; i >= 0; i--)
-        cvec[i] = Counter();
-
-    // grow up
-    int high_pair = zero;
-    for (int i = zero; i < top_half; i++) {
-        cvec[i] = cvec[high_pair];
-        if (high_pair + 1 < size)
-            cvec[i] += cvec[high_pair + 1];
-        high_pair += 2;
-    }
-    assert(high_pair == size || high_pair == size + 1);
-
-    for (int i = top_half; i < size; i++)
-        cvec[i] = Counter();
-
-    max_bucket *= 2;
-    min_bucket *= 2;
-    bucket_size *= 2;
-}
-
-void
-Histogram::grow_convert()
-{
-    int size = cvec.size();
-    int half = (size + 1) / 2; // round up!
-    //bool even = (size & 1) == 0;
-
-    int pair = size - 1;
-    for (int i = size - 1; i >= half; --i) {
-        cvec[i] = cvec[pair];
-        if (pair - 1 >= 0)
-            cvec[i] += cvec[pair - 1];
-        pair -= 2;
-    }
-
-    for (int i = half - 1; i >= 0; i--)
-        cvec[i] = Counter();
-
-    min_bucket = -max_bucket;// - (even ? bucket_size : 0);
-    bucket_size *= 2;
-}
-
-void
-Histogram::grow_up()
-{
-    int size = cvec.size();
-    int half = (size + 1) / 2; // round up!
-
-    int pair = 0;
-    for (int i = 0; i < half; i++) {
-        cvec[i] = cvec[pair];
-        if (pair + 1 < size)
-            cvec[i] += cvec[pair + 1];
-        pair += 2;
-    }
-    assert(pair == size || pair == size + 1);
-
-    for (int i = half; i < size; i++)
-        cvec[i] = Counter();
-
-    max_bucket *= 2;
-    bucket_size *= 2;
-}
-
-void
-Histogram::add(Histogram &hs)
-{
-    size_type b_size = hs.size();
-    assert(size() == b_size);
-    assert(min_bucket == hs.min_bucket);
-
-    sum += hs.sum;
-    logs += hs.logs;
-    squares += hs.squares;
-    samples += hs.samples;
-
-    while(bucket_size > hs.bucket_size)
-        hs.grow_up();
-    while(bucket_size < hs.bucket_size)
-        grow_up();
-
-    for (uint32_t i = 0; i < b_size; i++)
-        cvec[i] += hs.cvec[i];
-}
-
-void
-Histogram::sample(Counter val, int number)
-{
-    assert(min_bucket < max_bucket);
-    if (val < min_bucket) {
-        if (min_bucket == 0)
-            grow_convert();
-
-        while (val < min_bucket)
-            grow_out();
-    } else if (val >= max_bucket + bucket_size) {
-        if (min_bucket == 0) {
-            while (val >= max_bucket + bucket_size)
-                grow_up();
-        } else {
-            while (val >= max_bucket + bucket_size)
-                grow_out();
-        }
-    }
-
-    size_type index =
-        (int64_t)std::floor((val - min_bucket) / bucket_size);
-
-    assert(index >= 0 && index < size());
-    cvec[index] += number;
-
-    sum += val * number;
-    squares += val * val * number;
-    logs += log(val) * number;
-    samples += number;
-}
-
-} /* namespace Stats */
diff --git a/TOGSim/extern/ramulator_custom/src/StatType.h b/TOGSim/extern/ramulator_custom/src/StatType.h
deleted file mode 100644
index 1a7d5ca9..00000000
--- a/TOGSim/extern/ramulator_custom/src/StatType.h
+++ /dev/null
@@ -1,669 +0,0 @@
-#ifndef __STATTYPE_H
-#define __STATTYPE_H
-
-#include <limits>
-#include <fstream>
-#include <string>
-#include <vector>
-
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-
-namespace ram {
-
-class ScalarStat;
-class AverageStat;
-class VectorStat;
-class AverageVectorStat;
-} // namespace ram
-
-namespace Stat {
-
-const double eps = 1e-8;
-
-typedef unsigned int size_type;
-typedef unsigned int off_type;
-typedef double Counter;
-typedef double Result;
-typedef uint64_t Tick;
-typedef std::vector<Counter> VCounter;
-typedef std::vector<Result> VResult;
-typedef std::numeric_limits<Counter> CounterLimits;
-
-class StatBase;
-extern std::vector<StatBase*> all_stats;
-void reset_stats();
-
-// Flags
-const uint16_t init      = 0x00000001;
-const uint16_t display   = 0x00000002;
-const uint16_t total     = 0x00000010;
-const uint16_t pdf       = 0x00000020;
-const uint16_t cdf       = 0x00000040;
-const uint16_t dist      = 0x00000080;
-const uint16_t nozero    = 0x00000100;
-const uint16_t nonan     = 0x00000200;
-
-class Flags {
- protected:
-  uint16_t flags;
- public:
-  Flags(){}
-  Flags(uint16_t flags):flags(flags){}
-  void operator=(uint16_t _flags){flags = _flags;}
-  bool is_total() const {return flags & total;}
-  bool is_pdf() const {return flags & pdf;}
-  bool is_nozero() const {return flags & nozero;}
-  bool is_nonan() const {return flags & nonan;}
-  bool is_cdf() const {return flags & cdf;}
-  bool is_display() const {return flags & display;}
-};
-
-class StatBase {
- public:
-    StatBase() {
-        all_stats.push_back(this);
-    }
-
-
-  // TODO implement print for Distribution, Histogram,
-  // AverageDeviation, StandardDeviation
-  virtual void print(std::ofstream& file) = 0;
-
-  virtual size_type size() const = 0;
-  virtual bool zero() const = 0;
-  virtual void prepare() = 0;
-  virtual void reset() = 0;
-
-  virtual VResult vresult() const { return VResult(); };
-  virtual Result total() const { return Result(); };
-
-  virtual bool is_display() const  = 0;
-  virtual bool is_nozero() const = 0;
-};
-
-class StatList {
- protected:
-  std::vector<StatBase*> list;
-  std::ofstream stat_output;
- public:
-  void add(StatBase* stat) {
-    list.push_back(stat);
-  }
-  void output(std::string filename) {
-    stat_output.open(filename.c_str(), std::ios_base::out);
-    if (!stat_output.good()) {
-      assert(false && "!stat_output.good()");
-    }
-  }
-  void printall() {
-    for(off_type i = 0 ; i < list.size() ; ++i) {
-      if (!list[i]) {
-        continue;
-      }
-      if (list[i]->is_nozero() && list[i]->zero()) {
-        continue;
-      }
-      if (list[i]->is_display()) {
-        list[i]->prepare();
-        list[i]->print(stat_output);
-      }
-    }
-  }
-  ~StatList() {
-    stat_output.close();
-  }
-};
-
-extern StatList statlist;
-
-template<class Derived>
-class Stat : public StatBase {
- protected:
-  std::string _name;
-  std::string _desc;
-  int _precision = 1;
-  Flags _flags = display;
-  std::string separatorString;
- public:
-  Stat() {
-    statlist.add(selfptr());
-  }
-  Derived &self() {return *static_cast<Derived*>(this);}
-  Derived *selfptr() {return static_cast<Derived*>(this);}
-  Derived &name(const std::string &__name) {
-    _name = __name;
-    return self();
-  };
-  Derived &desc(const std::string &__desc) {
-    _desc = __desc;
-    return self();
-  };
-  Derived &precision(int __precision) {
-    _precision = __precision;
-    return self();
-  };
-  Derived &flags(Flags __flags) {
-    _flags = __flags;
-    return self();
-  };
-
-  template <class GenericStat>
-  Derived &prereq(const GenericStat & prereq) {
-    // TODO deal with prereq;
-    // only print the stat if the prereq is not zero.
-    return self();
-  }
-
-  Derived &setSeparator(std::string str) {
-    separatorString = str;
-    return self();
-  }
-  const std::string& setSeparator() const {return separatorString;}
-
-  size_type size() const { return 0; }
-
-  virtual void print(std::ofstream& file) {};
-  virtual void printname(std::ofstream& file) {
-    file.width(40);
-    file << _name;
-  }
-
-  virtual void printdesc(std::ofstream& file) {
-    file.width(40);
-    file << "# " << _desc << std::endl;
-  }
-
-  virtual bool is_display() const {
-    return _flags.is_display();
-  }
-
-  virtual bool is_nozero() const {
-    return _flags.is_nozero();
-  }
-};
-
-template <class ScalarType>
-class ScalarBase: public Stat<ScalarType> {
- public:
-  virtual Counter value() const = 0;
-  virtual Result result() const = 0;
-  virtual Result total() const = 0;
-
-  size_type size() const {return 1;}
-  VResult vresult() const {return VResult(1, result());}
-
-  virtual void print(std::ofstream& file) {
-    Stat<ScalarType>::printname(file);
-    // TODO deal with flag
-    file.precision(Stat<ScalarType>::_precision);
-    file.width(20);
-    Result res = Stat<ScalarType>::self().result();
-    file << std::fixed << res;
-    Stat<ScalarType>::printdesc(file);
-  }
-};
-
-class ConstValue: public ScalarBase<ConstValue> {
- private:
-  Counter _value;
- public:
-  ConstValue(Counter __value):_value(__value){}
-
-  void operator ++ () { ++_value; }
-  void operator -- () { --_value; }
-  void operator ++ (int) { _value++; }
-  void operator -- (int) { _value--; }
-
-  template <typename U>
-  void operator = (const U &v) { _value = v; }
-
-  template <typename U>
-  void operator += (const U &v) { _value += v;}
-
-  template <typename U>
-  void operator -= (const U &v) { _value -= v;}
-
-
-  Counter value() const {return _value;}
-  Result result() const {return (Result)_value;}
-  Result total() const {return result();}
-  bool zero() const {return (fabs(_value) < eps);}
-  void prepare() {}
-  void reset() {}
-};
-
-class Scalar: public ScalarBase<Scalar> {
- private:
-  Counter _value;
- public:
-  Scalar():_value(0) {}
-  Counter value() const {return _value;}
-  Result result() const {return (Result)_value;}
-  Result total() const {return (Result)_value;}
-
-  void operator ++ () { ++_value; }
-  void operator -- () { --_value; }
-  void operator ++ (int) { _value++; }
-  void operator -- (int) { _value--; }
-
-  template <typename U>
-  void operator = (const U &v) { _value = v; }
-
-  template <typename U>
-  void operator += (const U &v) { _value += v;}
-
-  template <typename U>
-  void operator -= (const U &v) { _value -= v;}
-
-
-  virtual bool zero() const {return (fabs(_value) < eps);}
-  void prepare() {}
-  void reset() {_value = Counter();}
-
-};
-
-extern Tick curTick;
-
-class Average: public ScalarBase<Average> {
- private:
-  Counter current;
-  Tick lastReset;
-  Result total_val;
-  Tick last;
- public:
-  Average():current(0), lastReset(0), total_val(0), last(0){}
-
-  void set(Counter val) {
-    total_val += current * (curTick - last);
-    last = curTick;
-    current = val;
-  }
-  void inc(Counter val) {
-    set(current + val);
-  }
-  void dec(Counter val) {
-    set(current - val);
-  }
-  void operator ++ () { inc(1); }
-  void operator -- () { dec(1); }
-  void operator ++ (int) { inc(1); }
-  void operator -- (int) { dec(1); }
-
-  template <typename U>
-  void operator = (const U &v) { set(v); }
-
-  template <typename U>
-  void operator += (const U &v) { inc(v);}
-
-  template <typename U>
-  void operator -= (const U &v) { dec(v);}
-
-
-  bool zero() const { return (fabs(total_val) < eps); }
-  void prepare() {
-    total_val += current * (curTick - last);
-    last = curTick;
-  }
-  void reset() {
-    total_val = 0.0;
-    last = curTick;
-    lastReset = curTick;
-  }
-
-  Counter value() const { return current; }
-  Result result() const {
-    assert(last == curTick);
-    return (Result)(total_val + current)/ (Result)(curTick - lastReset + 1);
-  }
-  Result total() const {return result();}
-};
-
-template<class Derived, class Element>
-class VectorBase: public Stat<Derived> {
- private:
-  size_type _size = 0;
-  std::vector<Element> data;
-
- public:
-  void init(size_type __size) {
-    _size = __size;
-    data.resize(size());
-    for (off_type i = 0 ; i < size() ; ++i) {
-      data[i].flags(0)
-             .name("[" + std::string(1, char(i + '0')) + "]");
-    }
-  }
-  size_type size() const {return _size;}
-  // Copy the values to a local vector and return a reference to it.
-  void value(VCounter& vec) const {
-    vec.resize(size());
-    for (off_type i = 0 ; i < size() ; ++i) {
-      vec[i] = data[i].value();
-    }
-  }
-  // Copy the results to a local vector and return a reference to it.
-  void result(VResult& vec) const {
-    vec.resize(size());
-    for (off_type i = 0 ; i < size() ; ++i) {
-      vec[i] = data[i].result();
-    }
-  }
-
-  Result total() const {
-    Result sum = 0.0;
-    for (off_type i = 0 ; i < size() ; ++i) {
-      sum += data[i].result();
-    }
-    return sum;
-  }
-
-  VResult vresult() const {
-    VResult vres;
-    for (off_type i = 0 ; i < size() ; ++i) {
-      vres[i] = data[i].result();
-    }
-    return vres;
-  }
-
-  bool check() const {
-    // We don't separate storage and access as gem5 does.
-    // So here is always true.
-    return true;
-  }
-
-  Element &operator[](off_type index) {
-    assert(index >= 0 && index < size());
-    return data[index];
-  }
-
-  bool zero() const {
-    return (fabs(total()) < eps);
-  }
-
-  void prepare() {
-    for (off_type i = 0 ; i < size() ; ++i) {
-      data[i].prepare();
-    }
-  }
-  void reset() {
-    for (off_type i = 0 ; i < size() ; ++i) {
-      data[i].reset();
-    }
-  }
-  void print(std::ofstream& file) {
-    Stat<Derived>::printname(file);
-    file.precision(Stat<Derived>::_precision);
-    file.width(20);
-    file << std::fixed << total();
-    Stat<Derived>::printdesc(file);
-    for (off_type i = 0 ; i < size() ; ++i) {
-      data[i].print(file);
-    }
-  }
-};
-
-class Vector: public VectorBase<Vector, Scalar> {
-};
-
-class AverageVector: public VectorBase<AverageVector, Average> {
-};
-
-class Distribution: public Stat<Distribution> {
- private:
-  // Parameter part:
-  Counter param_min;
-  Counter param_max;
-  Counter param_bucket_size;
-  Counter param_buckets;
-
-  // The minimum value to track
-  Counter min_track;
-  // The maximum value to track
-  Counter max_track;
-  // The number of entries in each bucket
-  Counter bucket_size;
-
-  Counter min_val;
-  Counter max_val;
-  // The number of values sampled less than min
-  Counter underflow;
-  // The number of values sampled more than max
-  Counter overflow;
-  // The current sum
-  Counter sum;
-  // The sum of squares
-  Counter squares;
-  // The number of samples
-  Counter samples;
-  // Counter for each bucket
-  VCounter cvec;
-
- public:
-  Distribution():param_min(Counter()), param_max(Counter()),
-      param_bucket_size(Counter()) { reset(); }
-  void init(Counter min, Counter max, Counter bkt) {
-    param_min = min;
-    param_max = max;
-    param_bucket_size = bkt;
-    param_buckets = (size_type)ceil((max - min + 1.0) / bkt);
-    cvec.resize(param_buckets);
-
-    reset();
-  }
-  void sample(Counter val, int number) {
-    if (val < min_track)
-      underflow += number;
-    else if (val > max_track)
-      overflow += number;
-    else {
-      size_type index =
-          (size_type)std::floor((val - min_track) / bucket_size);
-      assert(index < size());
-      cvec[index] += number;
-    }
-
-    if (val < min_val)
-      min_val = val;
-
-    if (val > max_val)
-      max_val = val;
-
-    sum += val * number;
-    squares += val * val * number;
-    samples += number;
-  }
-
-  size_type size() const {return cvec.size();}
-  bool zero() const {
-    return (fabs(samples) < eps);
-  }
-  void prepare() {};
-  void reset() {
-    min_track = param_min;
-    max_track = param_max;
-    bucket_size = param_bucket_size;
-
-    min_val = CounterLimits::max();
-    max_val = CounterLimits::min();
-    underflow = Counter();
-    overflow = Counter();
-
-    size_type _size = cvec.size();
-    for (off_type i = 0 ; i < _size ; ++i) {
-      cvec[i] = Counter();
-    }
-
-    sum = Counter();
-    squares = Counter();
-    samples = Counter();
-  };
-  void add(Distribution &d) {
-    size_type d_size = d.size();
-    assert(size() == d_size);
-    assert(min_track == d.min_track);
-    assert(max_track == d.max_track);
-
-    underflow += d.underflow;
-    overflow += d.overflow;
-
-    sum += d.sum;
-    squares += d.squares;
-    samples += d.samples;
-
-    if (d.min_val < min_val) {
-      min_val = d.min_val;
-    }
-
-    if (d.max_val > max_val) {
-      max_val = d.max_val;
-    }
-
-    for (off_type i = 0 ; i < d_size ; ++i) {
-      cvec[i] += d.cvec[i];
-    }
-  }
-};
-
-class Histogram: public Stat<Histogram> {
- private:
-  size_type param_buckets;
-
-  Counter min_bucket;
-  Counter max_bucket;
-  Counter bucket_size;
-
-  Counter sum;
-  Counter logs;
-  Counter squares;
-  Counter samples;
-  VCounter cvec;
-
- public:
-  Histogram():param_buckets(0) { reset(); }
-  Histogram(size_type __buckets):cvec(__buckets) {
-    init(__buckets);
-  }
-  void init(size_type __buckets) {
-    cvec.resize(__buckets);
-    param_buckets = __buckets;
-    reset();
-  }
-
-  void grow_up();
-  void grow_out();
-  void grow_convert();
-  void add(Histogram& hs);
-  void sample(Counter val, int number);
-
-  bool zero() const {
-    return (fabs(samples) < eps);
-  }
-  void prepare() {}
-  void reset() {
-    min_bucket = 0;
-    max_bucket = param_buckets - 1;
-    bucket_size = 1;
-
-    size_type size = param_buckets;
-    for (off_type i = 0 ; i < size ; ++i) {
-      cvec[i] = Counter();
-    }
-
-    sum = Counter();
-    squares = Counter();
-    samples = Counter();
-    logs = Counter();
-  }
-
-  size_type size() const {return param_buckets;}
-};
-
-class StandardDeviation: public Stat<StandardDeviation> {
- private:
-  Counter sum;
-  Counter squares;
-  Counter samples;
-
- public:
-  StandardDeviation():sum(Counter()), squares(Counter()),
-      samples(Counter()) {}
-  void sample(Counter val, int number) {
-    Counter value = val * number;
-    sum += value;
-    squares += value * value;
-    samples += number;
-  }
-  size_type size() const {return 1;}
-  bool zero() const {return (fabs(samples) < eps);}
-  void prepare() {}
-  void reset() {
-    sum = Counter();
-    squares = Counter();
-    samples = Counter();
-  }
-  void add(StandardDeviation& sd) {
-    sum += sd.sum;
-    squares += sd.squares;
-    samples += sd.samples;
-  }
-};
-
-class AverageDeviation: public Stat<AverageDeviation> {
- private:
-  Counter sum;
-  Counter squares;
-
- public:
-  AverageDeviation():sum(Counter()), squares(Counter()) {}
-  void sample(Counter val, int number) {
-    Counter value = val * number;
-    sum += value;
-    squares += value * value;
-  }
-  size_type size() const {return 1;}
-  bool zero() const {return (fabs(sum) < eps);}
-  void prepare() {}
-  void reset() {
-    sum = Counter();
-    squares = Counter();
-  }
-  void add(AverageDeviation& ad) {
-    sum += ad.sum;
-    squares += ad.squares;
-  }
-};
-
-class Op {
- private:
-  std::string opstring;
- public:
-  Op() {}
-  Op(std::string __opstring):opstring(__opstring){}
-  Result operator() (Result r) const {
-    if (opstring == "-") {
-      return -r;
-    } else {
-      assert("Unary operation can only be unary negation." && false);
-    }
-  }
-  Result operator() (Result l, Result r) const {
-    if (opstring == "+") {
-      return l + r;
-    } else if (opstring == "-") {
-      return l - r;
-    } else if (opstring == "*") {
-      return l * r;
-    } else if (opstring == "/") {
-      assert(fabs(r) > 1e-8 || "divide zero error");
-      return l / r;
-    } else {
-      assert("invalid binary opstring " && false);
-    }
-  }
-};
-
-} // namespace Stats
-
-#endif
diff --git a/TOGSim/extern/ramulator_custom/src/Statistics.h b/TOGSim/extern/ramulator_custom/src/Statistics.h
deleted file mode 100644
index 8cf555f7..00000000
--- a/TOGSim/extern/ramulator_custom/src/Statistics.h
+++ /dev/null
@@ -1,236 +0,0 @@
-#ifndef __STATISTICS_H
-#define __STATISTICS_H
-
-#include <string>
-
-// FIXME Find better way to decide where does it come from
-#include "StatType.h"
-
-/*
-  IMPORTANT NOTE - Read this first!
-
-  This version of the file provides wrappers to the gem5 statistics classes.
-  Feel free to go through this file, though it can be difficult to follow
-  with the degree of abstraction going on. In short, this file currently
-  provides the following mapping of stat classes. In almost all cases, the
-  wrapper provides identical and complete functionality to the gem5 stat
-  classes. All of our classes are defined in the ramulator namespace.
-
-  GEM5 CLASS --> RAMULATOR CLASS
-  ==============================
-  Stat::Scalar --> ScalarStat
-  Stat::Average --> AverageStat
-  Stat::Vector --> VectorStat
-  Stat::AverageVector --> AverageVectorStat
-  Stat::Distribution --> DistributionStat
-  Stat::Histogram --> HistogramStat
-  Stat::StandardDeviation --> StandardDeviationStat
-  Stat::AverageDeviation --> AverageDeviationStat
-
-  All of the stats that you create will be named "ramulator.<your name>"
-  automatically, and will be dumped at the end of simulation into the gem5
-  stats file.
-*/
-
-namespace ram {
-
-template<class StatType>
-class StatBase { // wrapper for Stat::DataWrap
-  protected:
-    StatType stat;
-    std::string statName;
-
-    StatBase<StatType> & self() { return *this; }
-  public:
-    StatBase() {}
-
-    StatBase(std::string _name) {
-      name(_name);
-    }
-
-    StatBase(std::string _name, std::string _desc) {
-      name(_name);
-      desc(_desc);
-    }
-
-    StatBase<StatType> & name(std::string _name) {
-      statName = _name;
-      stat.name("ramulator." + _name);
-
-      return self();
-    }
-
-    const std::string &name(void) const { return statName; }
-
-    StatBase<StatType> & setSeparator(const std::string & _sep) {
-      stat.setSeparator(_sep);
-      return self();
-    }
-
-    const std::string &setSeparator() const { return stat.setSeparator(); }
-
-    StatBase<StatType> & desc(std::string _desc) {
-      stat.desc(_desc);
-      return self();
-    }
-
-    StatBase<StatType> & precision(int _precision) {
-      stat.precision(_precision);
-      return self();
-    }
-
-    StatBase<StatType> & flags(Stat::Flags _flags) {
-      stat.flags(_flags);
-      return self();
-    }
-
-    template <class Stat>
-    StatBase<StatType> & prereq(const Stat & _prereq) {
-      stat.prereq(_prereq);
-      return self();
-    }
-
-    Stat::size_type size(void) const { return stat.size(); }
-    bool zero(void) const { return stat.zero(); }
-    void prepare(void) { stat.prepare(); }
-    void reset(void) { stat.reset(); }
-};
-
-template<class StatType>
-class StatBaseVec : public StatBase<StatType> { // wrapper for Stat::DataWrapVec
-  protected:
-    StatBaseVec<StatType> & self() { return *this; }
-
-  public:
-    StatBaseVec<StatType> & subname(Stat::off_type index, const std::string & name) {
-      StatBase<StatType>::stat.subname(index, name);
-      return self();
-    }
-
-    StatBaseVec<StatType> & subdesc(Stat::off_type index, const std::string & desc) {
-      StatBase<StatType>::stat.subdesc(index, desc);
-      return self();
-    }
-};
-
-template<class StatType>
-class ScalarStatBase : public StatBase<StatType> { // wrapper for Stat::ScalarBase
-  public:
-    Stat::Counter value() const { return StatBase<StatType>::stat.value(); };
-    void operator++() { ++StatBase<StatType>::stat; }
-    void operator--() { --StatBase<StatType>::stat; }
-
-    void operator++(int) { StatBase<StatType>::stat++; }
-    void operator--(int) { StatBase<StatType>::stat--; }
-
-    template <typename U>
-    void operator=(const U &v) { StatBase<StatType>::stat = v; }
-
-    template <typename U>
-    void operator+=(const U &v) { StatBase<StatType>::stat += v; }
-
-    template <typename U>
-    void operator-=(const U &v) { StatBase<StatType>::stat -= v; }
-};
-
-template<class StatType, class Element>
-class VectorStatBase : public StatBaseVec<StatType> { // wrapper for Stat::VectorBase
-  protected:
-    VectorStatBase<StatType, Element> & self() { return *this; }
-
-  public:
-    void value(Stat::VCounter & vec) const { StatBase<StatType>::stat.value(vec); }
-    void result(Stat::VResult & vec) const { StatBase<StatType>::stat.result(vec); }
-    Stat::Result total(void) const { return StatBase<StatType>::stat.total(); }
-
-    bool check(void) const { return StatBase<StatType>::stat.check(); }
-
-    VectorStatBase<StatType, Element> & init(Stat::size_type size) {
-      StatBase<StatType>::stat.init(size);
-      return self();
-    }
-
-    Element &operator[](Stat::off_type index) { return StatBase<StatType>::stat[index]; }
-};
-
-
-template<class StatType>
-class DistStatBase : public StatBase<StatType> { // wrapper for Stat::DistBase
-  public:
-    template<typename U>
-    void sample(const U &v, int n = 1) { StatBase<StatType>::stat.sample(v, n); }
-
-    void add(DistStatBase & d) { StatBase<StatType>::stat.add(d.StatBase<StatType>::stat); }
-};
-
-
-/*
-  nice wrappers for the gem5 stats classes used throughout the rest of the code
-*/
-
-class ScalarStat : public ScalarStatBase<Stat::Scalar> {
-  public:
-    using ScalarStatBase<Stat::Scalar>::operator=;
-};
-
-class IntervalScalarStat : public ScalarStatBase<Stat::Scalar> {
-  public:
-    using ScalarStatBase<Stat::Scalar>::operator=;
-};
-
-class AverageStat : public ScalarStatBase<Stat::Average> {
-  public:
-    using ScalarStatBase<Stat::Average>::operator=;
-};
-
-class VectorStat : public VectorStatBase<Stat::Vector, Stat::Scalar> {
-};
-
-class IntervalVectorStat : public VectorStatBase<Stat::Vector, Stat::Scalar> {
-};
-
-class AverageVectorStat : public VectorStatBase<Stat::AverageVector, Stat::Average> {
-};
-
-class DistributionStat : public DistStatBase<Stat::Distribution> {
-  protected:
-    DistributionStat & self() { return *this; }
-
-  public:
-    DistributionStat & init(Stat::Counter min, Stat::Counter max, Stat::Counter bkt) {
-      StatBase<Stat::Distribution>::stat.init(min, max, bkt);
-      return self();
-    }
-
-};
-
-class HistogramStat : public DistStatBase<Stat::Histogram> {
-  protected:
-    HistogramStat & self() { return *this; }
-
-  public:
-    HistogramStat & init(Stat::size_type size) {
-      StatBase<Stat::Histogram>::stat.init(size);
-      return self();
-    }
-};
-
-class StandardDeviationStat : public DistStatBase<Stat::StandardDeviation> {
-};
-
-class AverageDeviationStat : public DistStatBase<Stat::AverageDeviation> {
-};
-
-/*
-  Stats TODO
-  * Formula
-  * VectorDistribution
-  * VectorStandardDeviation
-  * VectorAverageDeviation
-  * Vector2d
-  * SparseHistogram
-*/
-
-} /* namespace ram */
-
-#endif
diff --git a/TOGSim/include/Common.h b/TOGSim/include/Common.h
index 2fd62681..b228fe45 100644
--- a/TOGSim/include/Common.h
+++ b/TOGSim/include/Common.h
@@ -28,4 +28,5 @@ typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
 
 bool loadConfig(const std::string& config_path, YAML::Node& config_yaml);
-SimulationConfig initialize_config(YAML::Node config);
\ No newline at end of file
+SimulationConfig initialize_config(const YAML::Node& config,
+                                     const std::string& config_file_path = {});
\ No newline at end of file
diff --git a/TOGSim/include/Dram.h b/TOGSim/include/Dram.h
index 978bcdf9..4a897559 100644
--- a/TOGSim/include/Dram.h
+++ b/TOGSim/include/Dram.h
@@ -1,5 +1,6 @@
 #ifndef DRAM_H
 #define DRAM_H
+#include <optional>
 #include <robin_hood.h>
 #include <cstdint>
 #include <queue>
@@ -35,7 +36,6 @@ class Dram {
   SimulationConfig _config;
   CacheConfig _m_cache_config;
   uint32_t _n_ch;
-  uint32_t _n_bl;
   uint32_t _n_partitions;
   uint32_t _n_ch_per_partition;
   uint32_t _req_size;
@@ -51,6 +51,10 @@ class Dram {
 
 class DramRamulator2 : public Dram {
  public:
+  static void apply_ramulator_config_to_simulation_config(
+      SimulationConfig& cfg, const std::string& ramulator_config_path,
+      std::optional<uint32_t> dram_freq_mhz_stated = std::nullopt);
+
   DramRamulator2(SimulationConfig config, cycle_type *core_cycle);
 
   virtual bool running() override;
@@ -72,6 +76,8 @@ class DramRamulator2 : public Dram {
 
 class SimpleDRAM: public Dram {
  public:
+  static void apply_yaml_to_simulation_config(const YAML::Node& config, SimulationConfig& cfg);
+
   SimpleDRAM(SimulationConfig config, cycle_type *core_cycle);
 
   virtual bool running() override;
@@ -87,6 +93,8 @@ class SimpleDRAM: public Dram {
  private:
   int _latency = 1;
   std::vector<std::unique_ptr<DelayQueue<mem_fetch*>>> _mem;
+  std::vector<double> _bw_credit_bytes;
+  double _bytes_per_dram_cycle = 0.;
 };
 
 #endif
\ No newline at end of file
diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h
index 090f5520..2ef08618 100644
--- a/TOGSim/include/SimulationConfig.h
+++ b/TOGSim/include/SimulationConfig.h
@@ -1,5 +1,8 @@
 #pragma once
 
+#include <cstdint>
+#include <filesystem>
+#include <map>
 #include <string>
 #include <yaml-cpp/yaml.h>
 
@@ -12,6 +15,9 @@ enum class IcntType { SIMPLE, BOOKSIM2 };
 enum class L2CacheType { NOCACHE, DATACACHE };
 
 struct SimulationConfig {
+  /* Path to the top-level hardware YAML passed to the simulator (empty if not from a file). */
+  std::string config_file_path;
+
   /* Core config */
   std::vector<CoreType> core_type;
   std::string stonne_config_path;
@@ -30,7 +36,7 @@ struct SimulationConfig {
   uint32_t dram_channels;
   uint32_t dram_req_size;
   uint32_t dram_latency;
-  uint32_t dram_nbl = 1;
+  float dram_bandwidth_gbps_per_channel = 0.f;
   uint32_t dram_print_interval;
   std::string dram_config_path;
 
@@ -61,7 +67,24 @@ struct SimulationConfig {
     return addr - (addr % dram_req_size);
   }
 
-  float max_dram_bandwidth() {
-    return dram_freq_mhz * dram_channels * dram_req_size * 2 / dram_nbl / 1000; // GB/s
+  float max_dram_bandwidth() const {
+    if (dram_bandwidth_gbps_per_channel > 0.f)
+      return dram_bandwidth_gbps_per_channel * static_cast<float>(dram_channels);
+    return 0.f;
+  }
+
+  /** Resolve `path` for opening on disk: absolute paths as-is; relative paths against top-level config dir. */
+  std::string resolve_against_simulation_config(const std::string& path) const {
+    namespace fs = std::filesystem;
+    if (path.empty())
+      return path;
+    fs::path p(path);
+    fs::path abs = p.is_absolute() ? fs::absolute(p)
+                 : !config_file_path.empty()
+                     ? fs::absolute(fs::path(config_file_path).parent_path() / p)
+                     : fs::absolute(p);
+    std::error_code ec;
+    fs::path canon = fs::weakly_canonical(abs, ec);
+    return (ec ? abs : canon).string();
   }
 };
\ No newline at end of file
diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h
index a0b8b9c5..e3542d51 100644
--- a/TOGSim/include/Simulator.h
+++ b/TOGSim/include/Simulator.h
@@ -23,7 +23,7 @@ namespace fs = std::filesystem;
 
 class Simulator {
  public:
-  Simulator(SimulationConfig config);
+  Simulator(SimulationConfig config, YAML::Node hardware_config_yaml);
   void enqueue_graph(int partion_id, std::unique_ptr<TileGraph> tile_graph) {
     if (partion_id < 0 || static_cast<uint32_t>(partion_id) >= _config.num_partition) {
       spdlog::error("[Enqueue_graph] Invalid partition_id: {} (valid range: 0 to {}). "
@@ -41,6 +41,8 @@ class Simulator {
   std::unique_ptr<Scheduler>& get_partition_scheduler(int core_id) { return _partition_scheduler.at(get_partition_id(core_id)); }
   void print_core_stat();
   void cycle();
+  const SimulationConfig& get_config() const { return _config; }
+  const YAML::Node& get_hardware_config_yaml() const { return _hardware_config_yaml; }
  private:
   void core_cycle();
   void dram_cycle();
@@ -49,6 +51,7 @@ class Simulator {
   void set_cycle_mask();
   uint32_t get_dest_node(mem_fetch *access);
   SimulationConfig _config;
+  YAML::Node _hardware_config_yaml;
   uint32_t _n_cores;
   uint32_t _n_sp_cores;
   uint32_t _noc_node_per_core;
diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index b15381a6..ede991c8 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -1,5 +1,9 @@
 #include "Common.h"
 
+#include "Dram.h"
+
+#include <optional>
+
 bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) {
   try {
     config_yaml = YAML::LoadFile(config_path);
@@ -26,8 +30,10 @@ T get_config_value(const YAML::Node& config, std::string key) {
   }
 }
 
-SimulationConfig initialize_config(YAML::Node config) {
+SimulationConfig initialize_config(const YAML::Node& config,
+                                     const std::string& config_file_path) {
   SimulationConfig parsed_config;
+  parsed_config.config_file_path = config_file_path;
   YAML::Emitter emitter;
   emitter << config;
   spdlog::info("PyTorchSim config:\n{}", emitter.c_str());
@@ -73,18 +79,25 @@ SimulationConfig initialize_config(YAML::Node config) {
 
   if (dram_type_str == "simple") {
     parsed_config.dram_type = DramType::SIMPLE;
-    parsed_config.dram_latency = get_config_value<uint32_t>(config, "dram_latency");
   } else if (dram_type_str == "ramulator2") {
     parsed_config.dram_type = DramType::RAMULATOR2;
-    parsed_config.dram_config_path = get_config_value<std::string>(config, "ramulator_config_path");
+    const std::string ramulator_config_rel =
+        get_config_value<std::string>(config, "ramulator_config_path");
+    parsed_config.dram_config_path =
+        parsed_config.resolve_against_simulation_config(ramulator_config_rel);
   } else {
     throw std::runtime_error(fmt::format("Not implemented dram type {} ", dram_type_str));
   }
 
-  parsed_config.dram_freq_mhz = get_config_value<uint32_t>(config, "dram_freq_mhz");
   parsed_config.dram_channels = get_config_value<uint32_t>(config, "dram_channels");
-  parsed_config.dram_req_size = get_config_value<uint32_t>(config, "dram_req_size_byte");
-  parsed_config.dram_nbl = get_config_value<uint32_t>(config, "dram_num_burst_length");
+
+  if (parsed_config.dram_type == DramType::RAMULATOR2) {
+    DramRamulator2::apply_ramulator_config_to_simulation_config(
+        parsed_config, parsed_config.dram_config_path,
+        config["dram_freq_mhz"] ? std::optional<uint32_t>(config["dram_freq_mhz"].as<uint32_t>()) : std::nullopt);
+  } else {
+    SimpleDRAM::apply_yaml_to_simulation_config(config, parsed_config);
+  }
 
   if (config["dram_stats_print_period_cycles"])
     parsed_config.dram_print_interval = config["dram_stats_print_period_cycles"].as<uint32_t>();
diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc
index 95a55ca3..798acb7b 100644
--- a/TOGSim/src/Dram.cc
+++ b/TOGSim/src/Dram.cc
@@ -1,6 +1,18 @@
 #include "Dram.h"
 
+#include <cmath>
+#include <filesystem>
 #include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <string>
+
+#include <spdlog/fmt/fmt.h>
+
+#include "ramulator/base/config.h"
+#include "ramulator/base/factory.h"
+#include "ramulator/frontend/i_frontend.h"
+#include "ramulator/memory_system/i_memory_system.h"
 
 namespace {
 
@@ -26,15 +38,15 @@ static uint32_t next_power_of_2_u32(uint32_t n) {
   return n + 1;
 }
 
-/** Bytes/s effective GB/s and avg-per-channel utilization % for a window of `window_cycles` DRAM ticks. */
+/** Bytes/s effective GB/s and utilization % vs `peak_gbps_per_channel` (x n_ch aggregate peak). */
 struct DramBwSnapshot {
   double bandwidth_gbs = 0;
   double util_avg_ch_pct = 0;
 };
 
 DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t window_cycles,
-                                     uint32_t n_ch, uint32_t req_size, uint32_t n_bl,
-                                     double dram_freq_mhz) {
+                                     uint32_t n_ch, uint32_t req_size, double dram_freq_mhz,
+                                     float peak_gbps_per_channel) {
   DramBwSnapshot out;
   if (window_cycles == 0 || n_ch == 0)
     return out;
@@ -42,13 +54,108 @@ DramBwSnapshot make_dram_bw_snapshot(long long total_rw_transactions, uint64_t w
   const double w = static_cast<double>(window_cycles);
   const double bytes_per_cycle = tx * static_cast<double>(req_size) / w;
   out.bandwidth_gbs = bytes_per_cycle * dram_freq_mhz / 1000.0;
-  const double avg_per_ch = tx / static_cast<double>(n_ch);
-  out.util_avg_ch_pct = avg_per_ch * 100.0 * static_cast<double>(n_bl) / (2.0 * w);
+  const double peak_total_gbs =
+      static_cast<double>(peak_gbps_per_channel) * static_cast<double>(n_ch);
+  if (peak_gbps_per_channel > 0.f && peak_total_gbs > 0.0)
+    out.util_avg_ch_pct = 100.0 * out.bandwidth_gbs / peak_total_gbs;
   return out;
 }
 
+static float peak_gbps_per_channel_from_ramulator_yaml(const Ramulator::ConfigNode& cfg) {
+  const Ramulator::ConfigNode controllers = cfg["memory_system"]["controllers"];
+  const auto& ctrls = controllers.seq();
+  if (ctrls.empty())
+    throw std::runtime_error("memory_system.controllers is empty");
+  const Ramulator::ConfigNode dram = ctrls[0]["dram"];
+  const int ch_width = dram["channel_width"].as<int>();
+  if (ch_width <= 0)
+    throw std::runtime_error("invalid channel_width");
+  const Ramulator::ConfigNode timing_node = dram["timing"];
+  const auto& timing = timing_node.seq();
+  if (timing.empty())
+    throw std::runtime_error("dram.timing is empty");
+  const int rate = timing[0].as<int>();
+  if (rate <= 0)
+    throw std::runtime_error("invalid dram.timing[0] (rate / MT/s)");
+
+  int pseudo_ch = 1;
+  const std::string impl = dram["impl"].as<std::string>("");
+  if (impl == "HBM2" || impl == "HBM3") {
+    const Ramulator::ConfigNode org = dram["org"];
+    const Ramulator::ConfigNode org_count = org["count"];
+    const auto& counts = org_count.seq();
+    if (counts.size() > 1)
+      pseudo_ch = std::max(1, counts[1].as<int>());
+  }
+
+  return static_cast<float>(static_cast<double>(rate) * static_cast<double>(pseudo_ch) *
+                             static_cast<double>(ch_width) / 8.0 / 1000.0);
+}
+
 }  // namespace
 
+void DramRamulator2::apply_ramulator_config_to_simulation_config(
+    SimulationConfig& cfg, const std::string& ramulator_config_path,
+    std::optional<uint32_t> dram_freq_mhz_stated) {
+  Ramulator::ConfigNode config = Ramulator::Config::parse_config_file(ramulator_config_path);
+  Ramulator::ConfigNode frontend_config;
+  frontend_config.set("impl", std::string("External"));
+  frontend_config.set("clock_ratio", 1u);
+  config.set("frontend", frontend_config);
+
+  float peak_gbps = 0.f;
+  try {
+    peak_gbps = peak_gbps_per_channel_from_ramulator_yaml(config);
+  } catch (const std::exception& e) {
+    throw std::runtime_error(std::string("[Config/DRAM] Ramulator peak GB/s from yaml: ") + e.what() + " (" +
+                             ramulator_config_path + ")");
+  }
+
+  Ramulator::IFrontEnd* fe = Ramulator::Factory::create_frontend(config);
+  Ramulator::IMemorySystem* mem = Ramulator::Factory::create_memory_system(config);
+  fe->connect_memory_system(mem);
+  mem->connect_frontend(fe);
+
+  const float tck_ns = mem->get_tCK();
+  if (tck_ns <= 0.f) {
+    fe->finalize();
+    mem->finalize();
+    delete fe;
+    delete mem;
+    throw std::runtime_error("[Config/DRAM] Ramulator probe: invalid get_tCK() for " + ramulator_config_path);
+  }
+
+  const int tx_bytes = mem->get_tx_bytes();
+  if (tx_bytes <= 0) {
+    fe->finalize();
+    mem->finalize();
+    delete fe;
+    delete mem;
+    throw std::runtime_error("[Config/DRAM] Ramulator probe: invalid get_tx_bytes() for " + ramulator_config_path);
+  }
+
+  fe->finalize();
+  mem->finalize();
+  delete fe;
+  delete mem;
+
+  cfg.dram_req_size = static_cast<uint32_t>(tx_bytes);
+  cfg.dram_freq_mhz = static_cast<uint32_t>(std::lround(1000.0f / tck_ns));
+  cfg.dram_bandwidth_gbps_per_channel = peak_gbps;
+
+  if (dram_freq_mhz_stated.has_value()) {
+    if (*dram_freq_mhz_stated != cfg.dram_freq_mhz) {
+      throw std::runtime_error(fmt::format(
+          "[Config/DRAM] ramulator2: top-level dram_freq_mhz {} does not match Ramulator timing "
+          "(DRAM clock {} MHz from tCK={:.6g} ns, i.e. round(1000/tCK)); remove dram_freq_mhz to use the derived "
+          "value, or align the Ramulator YAML with the top-level yml. ramulator_config_path={}",
+          *dram_freq_mhz_stated, cfg.dram_freq_mhz, static_cast<double>(tck_ns), ramulator_config_path));
+    }
+    spdlog::info("[Config/DRAM] ramulator2: dram_freq_mhz {} matches Ramulator-derived DRAM clock (tCK={:.6g} ns)",
+                 *dram_freq_mhz_stated, static_cast<double>(tck_ns));
+  }
+}
+
 new_addr_type Dram::partition_dram_address(new_addr_type raw_addr) const {
   if (_req_size == 0 || _n_ch_per_partition == 0)
     return raw_addr;
@@ -87,7 +194,6 @@ uint32_t Dram::get_channel_id(mem_fetch* access) {
 Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
   _core_cycles = core_cycle;
   _n_ch = config.dram_channels;
-  _n_bl = config.dram_nbl;
   _req_size = config.dram_req_size;
   _n_partitions = config.dram_num_partitions;
   _n_ch_per_partition = config.dram_channels_per_partitions;
@@ -127,9 +233,8 @@ DramRamulator2::DramRamulator2(SimulationConfig config, cycle_type* core_cycle)
   /* Initialize DRAM Channels */
   _mem.resize(_n_ch);
   for (int ch = 0; ch < _n_ch; ch++) {
-    _mem[ch] = std::make_unique<Ramulator2>(
-      ch, _n_ch, config.dram_config_path, "Ramulator2", _config.dram_print_interval, _n_bl,
-      _req_size, config.dram_freq_mhz);
+    _mem[ch] = std::make_unique<Ramulator2>(ch, _n_ch, config.dram_config_path, "Ramulator2",
+                                            _config.dram_print_interval, _req_size, config.dram_freq_mhz);
   }
   _tx_log2 = log2(_req_size);
   _tx_ch_log2 = log2(_n_ch_per_partition) + _tx_log2;
@@ -180,14 +285,14 @@ void DramRamulator2::cycle() {
     const long long wtxn = _mem[ch]->interval_writes();
     r_all += r;
     w_all += wtxn;
-    const DramBwSnapshot bw =
-        make_dram_bw_snapshot(r + wtxn, w, 1u, _req_size, _n_bl, f_mhz);
+    const DramBwSnapshot bw = make_dram_bw_snapshot(
+        r + wtxn, w, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
     spdlog::trace(
         "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)",
         ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w);
   }
-  const DramBwSnapshot bw_all =
-      make_dram_bw_snapshot(r_all + w_all, w, _n_ch, _req_size, _n_bl, f_mhz);
+  const DramBwSnapshot bw_all = make_dram_bw_snapshot(
+      r_all + w_all, w, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
   spdlog::info(
       "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)",
       _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w);
@@ -247,7 +352,7 @@ void DramRamulator2::print_stat() {
   if (cycles == 0)
     return;
   const double f_mhz = static_cast<double>(_config.dram_freq_mhz);
-  spdlog::info("[DRAM] per-channel avg BW ({} sim cycles):", cycles);
+  spdlog::info("[DRAM] per-channel avg BW");
   long long tr_all = 0;
   long long tw_all = 0;
   for (int ch = 0; ch < _n_ch; ch++) {
@@ -255,14 +360,14 @@ void DramRamulator2::print_stat() {
     const long long tw = _mem[ch]->total_writes();
     tr_all += tr;
     tw_all += tw;
-    const DramBwSnapshot bw =
-        make_dram_bw_snapshot(tr + tw, cycles, 1u, _req_size, _n_bl, f_mhz);
+    const DramBwSnapshot bw = make_dram_bw_snapshot(
+        tr + tw, cycles, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
     spdlog::info(
         "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes",
         ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw);
   }
   const DramBwSnapshot bw_all = make_dram_bw_snapshot(
-      tr_all + tw_all, cycles, _n_ch, _req_size, _n_bl, f_mhz);
+      tr_all + tw_all, cycles, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
   spdlog::info(
       "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes",
       _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all);
@@ -274,13 +379,78 @@ void DramRamulator2::print_cache_stats() {
   }
 }
 
+void SimpleDRAM::apply_yaml_to_simulation_config(const YAML::Node& config, SimulationConfig& cfg) {
+  if (!config["dram_latency"])
+    throw std::runtime_error("[Config/DRAM] simple: dram_latency is required");
+  cfg.dram_latency = config["dram_latency"].as<uint32_t>();
+
+  auto yaml_get_u32 = [](const YAML::Node& n, const char* key, uint32_t def) -> uint32_t {
+    if (n[key])
+      return n[key].as<uint32_t>();
+    return def;
+  };
+
+  cfg.dram_req_size = yaml_get_u32(config, "dram_req_size_byte", 32u);
+  if (cfg.dram_req_size == 0)
+    throw std::runtime_error("[Config/DRAM] simple: dram_req_size_byte must be > 0");
+
+  const bool has_per_ch_bw = static_cast<bool>(config["dram_bandwidth_gbps_per_channel"]);
+  const bool has_total_bw = static_cast<bool>(config["dram_bandwidth_gbps_total"]);
+  if (has_per_ch_bw && has_total_bw)
+    throw std::runtime_error(
+        "[Config/DRAM] simple: set only one of dram_bandwidth_gbps_per_channel or dram_bandwidth_gbps_total");
+
+  const bool has_bw_cap = has_per_ch_bw || has_total_bw;
+  if (has_bw_cap) {
+    float per_ch = 0.f;
+    if (has_total_bw) {
+      const float tot = config["dram_bandwidth_gbps_total"].as<float>();
+      if (cfg.dram_channels == 0)
+        throw std::runtime_error("[Config/DRAM] dram_channels must be > 0 for dram_bandwidth_gbps_total");
+      per_ch = tot / static_cast<float>(cfg.dram_channels);
+    } else {
+      per_ch = config["dram_bandwidth_gbps_per_channel"].as<float>();
+    }
+    if (per_ch <= 0.f)
+      throw std::runtime_error("[Config/DRAM] simple: dram_bandwidth_gbps_* must be > 0");
+    cfg.dram_bandwidth_gbps_per_channel = per_ch;
+  } else {
+    cfg.dram_bandwidth_gbps_per_channel = 0.f;
+  }
+
+  if (has_bw_cap && !config["dram_freq_mhz"])
+    throw std::runtime_error(
+        "[Config/DRAM] simple: dram_freq_mhz is required when dram_bandwidth_gbps_per_channel or "
+        "dram_bandwidth_gbps_total is set (credit refill is per simulated DRAM cycle)");
+  cfg.dram_freq_mhz = yaml_get_u32(config, "dram_freq_mhz", cfg.core_freq_mhz);
+
+  if (cfg.dram_freq_mhz == 0) {
+    throw std::runtime_error("[Config/DRAM] simple: dram_freq_mhz must be > 0");
+  }
+}
+
 SimpleDRAM::SimpleDRAM(SimulationConfig config, cycle_type* core_cycle) : Dram(config, core_cycle) {
-  /* Initialize DRAM Channels */
-  spdlog::info("[SimpleDRAM] DRAM latecny: {}", config.dram_latency);
+  spdlog::info("[SimpleDRAM] DRAM latency: {}", config.dram_latency);
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem.push_back(std::make_unique<DelayQueue<mem_fetch*>>("SimpleDRAM", true, -1));
   }
-  _latency =  config.dram_latency;
+  _latency = config.dram_latency;
+  _bw_credit_bytes.assign(static_cast<size_t>(_n_ch), static_cast<double>(_req_size) * 2.0);
+  if (config.dram_freq_mhz > 0 && config.dram_bandwidth_gbps_per_channel > 0.f) {
+    _bytes_per_dram_cycle =
+        static_cast<double>(config.dram_bandwidth_gbps_per_channel) * 1000.0 /
+        static_cast<double>(config.dram_freq_mhz);
+  } else {
+    _bytes_per_dram_cycle = 0.;
+  }
+  if (config.dram_bandwidth_gbps_per_channel > 0.f)
+    spdlog::info("[SimpleDRAM] peak {:.2f} GB/s total, {:.2f} GB/s per channel, {:.4f} B/cycle per channel",
+                 config.max_dram_bandwidth(), config.dram_bandwidth_gbps_per_channel, _bytes_per_dram_cycle);
+  else
+    spdlog::info(
+        "[SimpleDRAM] no bandwidth cap (latency-only); dram_latency {} cycles, dram_freq_mhz {} for tick "
+        "alignment",
+        config.dram_latency, config.dram_freq_mhz);
 }
 
 bool SimpleDRAM::running() {
@@ -297,20 +467,30 @@ void SimpleDRAM::cycle() {
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch]->cycle();
 
+    if (_bytes_per_dram_cycle > 0.0)
+      _bw_credit_bytes[static_cast<size_t>(ch)] += _bytes_per_dram_cycle;
+
     // From Cache to DRAM
     if (mem_fetch* req = _m_caches[ch]->top()) {
-      //spdlog::info("[Cache->DRAM] mem_fetch: addr={:#x}", req->get_addr());
-
-      _mem[ch]->push(req, _latency);
-      _m_caches[ch]->pop();
+      const double need = static_cast<double>(_req_size);
+      bool admit = true;
+      if (_bytes_per_dram_cycle > 0.0) {
+        if (_bw_credit_bytes[static_cast<size_t>(ch)] < need)
+          admit = false;
+        else
+          _bw_credit_bytes[static_cast<size_t>(ch)] -= need;
+      }
+      if (admit) {
+        _mem[ch]->push(req, _latency);
+        _m_caches[ch]->pop();
+      }
     }
 
     // From DRAM to Cache
     if (_mem[ch]->arrived()) {
       mem_fetch* req = _mem[ch]->top();
       req->set_reply();
-      //spdlog::info("[DRAM->Cache] mem_fetch: addr={:#x}", req->get_addr());
-      if(_m_caches[ch]->push(req))
+      if (_m_caches[ch]->push(req))
         _mem[ch]->pop();
     }
   }
diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index d7fe9f1b..9bd3407f 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -1,7 +1,9 @@
 #include "Simulator.h"
 
-Simulator::Simulator(SimulationConfig config)
-    : _config(config), _core_cycles(0) {
+Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
+    : _config(config),
+      _hardware_config_yaml(std::move(hardware_config_yaml)),
+      _core_cycles(0) {
   // Create dram object
   _core_period = 1000000 / (config.core_freq_mhz);
   _icnt_period = 1000000 / (config.icnt_freq_mhz);
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 57e0e696..f985bdf4 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -84,11 +84,13 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const
   simulator->cycle();
 }
 
-Simulator* create_simulator(const YAML::Node& config_yaml) {
-  SimulationConfig config = initialize_config(config_yaml);
-
-  auto simulator = new Simulator(config);
-  return simulator;
+Simulator* create_simulator(const std::string& config_path) {
+  YAML::Node config_yaml;
+  if (!loadConfig(config_path, config_yaml)) {
+    return nullptr;
+  }
+  SimulationConfig config = initialize_config(config_yaml, config_path);
+  return new Simulator(config, std::move(config_yaml));
 }
 
 int main(int argc, char** argv) {
@@ -138,21 +140,19 @@ int main(int argc, char** argv) {
   /* Create simulator */
   cmd_parser.set_if_defined("config", &config_path);
 
-  // Load config once for reuse
-  YAML::Node config_yaml;
-  if (!loadConfig(config_path, config_yaml)) {
+  auto simulator = create_simulator(config_path);
+  if (!simulator) {
     spdlog::error("[TOGSim] Failed to load config file: {}", config_path);
     exit(1);
   }
 
-  auto simulator = create_simulator(config_yaml);
-
   // Get trace file path
   cmd_parser.set_if_defined("models_list", &trace_file_path);
 
   if (!trace_file_path.empty()) {
     // Process trace file (unified mode: supports both FIFO and regular file)
-    process_trace_file(simulator, trace_file_path, config_yaml);
+    process_trace_file(simulator, trace_file_path,
+                       simulator->get_hardware_config_yaml());
     spdlog::info("Simulation finished");
     simulator->print_core_stat();
   } else {
diff --git a/configs/heterogeneous_c2_simple_noc.yml b/configs/heterogeneous_c2_simple_noc.yml
index 9c596d85..8a3401fe 100644
--- a/configs/heterogeneous_c2_simple_noc.yml
+++ b/configs/heterogeneous_c2_simple_noc.yml
@@ -16,8 +16,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/ramulator2_configs/HBM2_TPUv2.yaml b/configs/ramulator2_configs/HBM2_TPUv2.yaml
new file mode 100644
index 00000000..88c1adf3
--- /dev/null
+++ b/configs/ramulator2_configs/HBM2_TPUv2.yaml
@@ -0,0 +1,476 @@
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [
+      {
+        "impl": "HBM",
+        "wr_low_watermark": 0.2,
+        "wr_high_watermark": 0.8,
+        "read_buffer_size": 64,
+        "write_buffer_size": 64,
+        "priority_buffer_size": 1568,
+        "scheduler": {
+          "impl": "FRFCFS"
+        },
+        "refresh_manager": {
+          "impl": "AllBank",
+          "scope": "PseudoChannel"
+        },
+        "row_policy": {
+          "impl": "Open"
+        },
+        "addr_mapper": {
+          "impl": "RoBaRaCoCh"
+        },
+        "dram": {
+          "impl": "HBM2",
+          "org": {
+            "dq": 64,
+            "count": [
+              1,
+              2,
+              4,
+              4,
+              65536,
+              32
+            ]
+          },
+          "timing": [
+            1400,
+            2,
+            9,
+            9,
+            7,
+            9,
+            22,
+            31,
+            11,
+            4,
+            4,
+            2,
+            4,
+            4,
+            4,
+            5,
+            6,
+            11,
+            245,
+            112,
+            6,
+            2730,
+            86,
+            1429
+          ],
+          "channel_width": 64,
+          "read_latency": 11,
+          "timing_constraints": [
+            [
+              0,
+              [
+                0
+              ],
+              [
+                0,
+                1,
+                2,
+                7,
+                8
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              2
+            ],
+            [
+              1,
+              [
+                3,
+                5
+              ],
+              [
+                4,
+                6
+              ],
+              9
+            ],
+            [
+              1,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              11
+            ],
+            [
+              1,
+              [
+                3
+              ],
+              [
+                2
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                4
+              ],
+              [
+                2
+              ],
+              17
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              11,
+              4
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                2
+              ],
+              23
+            ],
+            [
+              1,
+              [
+                2
+              ],
+              [
+                0
+              ],
+              8
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                7
+              ],
+              32
+            ],
+            [
+              1,
+              [
+                1,
+                2
+              ],
+              [
+                7
+              ],
+              9
+            ],
+            [
+              1,
+              [
+                5
+              ],
+              [
+                7
+              ],
+              13
+            ],
+            [
+              1,
+              [
+                6
+              ],
+              [
+                7
+              ],
+              26
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                0
+              ],
+              244
+            ],
+            [
+              1,
+              [
+                7
+              ],
+              [
+                2
+              ],
+              245
+            ],
+            [
+              1,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              5
+            ],
+            [
+              1,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              5
+            ],
+            [
+              2,
+              [
+                3,
+                5
+              ],
+              [
+                3,
+                5
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                4,
+                6
+              ],
+              4
+            ],
+            [
+              2,
+              [
+                4,
+                6
+              ],
+              [
+                3,
+                5
+              ],
+              12
+            ],
+            [
+              2,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              4
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                0
+              ],
+              31
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                3,
+                5
+              ],
+              10
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                4,
+                6
+              ],
+              8
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                1
+              ],
+              23
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                0
+              ],
+              8
+            ],
+            [
+              3,
+              [
+                3
+              ],
+              [
+                1
+              ],
+              4
+            ],
+            [
+              3,
+              [
+                4
+              ],
+              [
+                1
+              ],
+              17
+            ],
+            [
+              3,
+              [
+                5
+              ],
+              [
+                0
+              ],
+              12
+            ],
+            [
+              3,
+              [
+                6
+              ],
+              [
+                0
+              ],
+              25
+            ],
+            [
+              3,
+              [
+                8
+              ],
+              [
+                0
+              ],
+              111
+            ],
+            [
+              3,
+              [
+                0
+              ],
+              [
+                8
+              ],
+              32
+            ],
+            [
+              3,
+              [
+                1
+              ],
+              [
+                8
+              ],
+              9
+            ]
+          ]
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml
index 01cab613..50a3ea3b 100644
--- a/configs/ramulator2_configs/HBM2_TPUv3.yaml
+++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml
@@ -44,15 +44,15 @@
             ]
           },
           "timing": [
-            2000,
+            1880,
             2,
-            14,
-            14,
-            12,
-            14,
-            34,
-            48,
-            16,
+            13,
+            13,
+            11,
+            13,
+            31,
+            44,
+            15,
             5,
             5,
             2,
@@ -62,15 +62,15 @@
             6,
             8,
             15,
-            350,
-            160,
+            329,
+            151,
             8,
-            3900,
-            122,
-            1000
+            3666,
+            115,
+            1064
           ],
           "channel_width": 64,
-          "read_latency": 16,
+          "read_latency": 15,
           "timing_constraints": [
             [
               0,
@@ -144,7 +144,7 @@
                 4,
                 6
               ],
-              13
+              12
             ],
             [
               1,
@@ -176,7 +176,7 @@
               [
                 2
               ],
-              23
+              22
             ],
             [
               1,
@@ -207,7 +207,7 @@
               [
                 2
               ],
-              35
+              32
             ],
             [
               1,
@@ -217,7 +217,7 @@
               [
                 0
               ],
-              13
+              12
             ],
             [
               1,
@@ -227,7 +227,7 @@
               [
                 7
               ],
-              49
+              45
             ],
             [
               1,
@@ -238,7 +238,7 @@
               [
                 7
               ],
-              14
+              13
             ],
             [
               1,
@@ -248,7 +248,7 @@
               [
                 7
               ],
-              19
+              18
             ],
             [
               1,
@@ -258,7 +258,7 @@
               [
                 7
               ],
-              37
+              35
             ],
             [
               1,
@@ -268,7 +268,7 @@
               [
                 0
               ],
-              349
+              328
             ],
             [
               1,
@@ -278,7 +278,7 @@
               [
                 2
               ],
-              350
+              329
             ],
             [
               1,
@@ -354,7 +354,7 @@
               [
                 0
               ],
-              48
+              44
             ],
             [
               3,
@@ -365,7 +365,7 @@
                 3,
                 5
               ],
-              15
+              14
             ],
             [
               3,
@@ -376,7 +376,7 @@
                 4,
                 6
               ],
-              13
+              12
             ],
             [
               3,
@@ -386,7 +386,7 @@
               [
                 1
               ],
-              35
+              32
             ],
             [
               3,
@@ -396,7 +396,7 @@
               [
                 0
               ],
-              13
+              12
             ],
             [
               3,
@@ -416,7 +416,7 @@
               [
                 1
               ],
-              23
+              22
             ],
             [
               3,
@@ -426,7 +426,7 @@
               [
                 0
               ],
-              18
+              17
             ],
             [
               3,
@@ -436,7 +436,7 @@
               [
                 0
               ],
-              36
+              34
             ],
             [
               3,
@@ -446,7 +446,7 @@
               [
                 0
               ],
-              159
+              150
             ],
             [
               3,
@@ -456,7 +456,7 @@
               [
                 8
               ],
-              49
+              45
             ],
             [
               3,
@@ -466,7 +466,7 @@
               [
                 8
               ],
-              14
+              13
             ]
           ]
         }
diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py
index d27cd6de..1c630e5c 100644
--- a/configs/ramulator2_configs/gen_configs.py
+++ b/configs/ramulator2_configs/gen_configs.py
@@ -83,8 +83,11 @@ def gen_hbm2():
 
 
 def gen_hbm2_tpuv3():
-    # TPUv3 HBM2: 900MHz → ~1.8 Gbps. Closest available preset: HBM2_2000Mbps
-    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps")
+    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_1880Mbps")
+    return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
+
+def gen_hbm2_tpuv2():
+    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_1400Mbps")
     return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
 
 def gen_ddr4():
@@ -107,6 +110,7 @@ def gen_lpddr5x():
 CONFIGS = {
     "HBM2.yaml":        gen_hbm2,
     "HBM2_TPUv3.yaml":  gen_hbm2_tpuv3,
+    "HBM2_TPUv2.yaml":  gen_hbm2_tpuv2,
     "DDR4.yaml":        gen_ddr4,
     "LPDDR5.yaml":      gen_lpddr5,
     "LPDDR5X.yaml":     gen_lpddr5x,
diff --git a/configs/ramulator_configs/ALDRAM-config.cfg b/configs/ramulator_configs/ALDRAM-config.cfg
deleted file mode 100644
index 91cef49c..00000000
--- a/configs/ramulator_configs/ALDRAM-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = ALDRAM
- channels = 1
- ranks = 1
- speed = ALDRAM_1600K
- org = ALDRAM_4Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/DDR3-config.cfg b/configs/ramulator_configs/DDR3-config.cfg
deleted file mode 100644
index 777f6b58..00000000
--- a/configs/ramulator_configs/DDR3-config.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = DDR3
- channels = 1
- ranks = 1
- speed = DDR3_1600K
- org = DDR3_2Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
-# warmup_insts = 100000000
- warmup_insts = 0
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/DDR4-config.cfg b/configs/ramulator_configs/DDR4-config.cfg
deleted file mode 100644
index 3f2cd4fd..00000000
--- a/configs/ramulator_configs/DDR4-config.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = DDR4
- channels = 2
- ranks = 1
- speed = DDR4_3200
- org = DDR4_4Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 8
- mem_tick = 3
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
-#  warmup_insts = 100000000
- warmup_insts = 0
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/DSARP-config.cfg b/configs/ramulator_configs/DSARP-config.cfg
deleted file mode 100644
index b67c067c..00000000
--- a/configs/ramulator_configs/DSARP-config.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = DSARP
- subarrays = 8
- channels = 1
- ranks = 1
- speed = DSARP_1333
- org = DSARP_8Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/GDDR5-config.cfg b/configs/ramulator_configs/GDDR5-config.cfg
deleted file mode 100644
index 96006841..00000000
--- a/configs/ramulator_configs/GDDR5-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = GDDR5
- channels = 1
- ranks = 1
- speed = GDDR5_6000
- org = GDDR5_8Gb_x16
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 2
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config.cfg b/configs/ramulator_configs/HBM-config.cfg
deleted file mode 100644
index 9e1dcb9e..00000000
--- a/configs/ramulator_configs/HBM-config.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 32
- ranks = 1
- speed = HBM_2Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FRFCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg b/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg
deleted file mode 100644
index b8318c23..00000000
--- a/configs/ramulator_configs/HBM-config_ChRaBaRoCo.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = ChRaBaRoCo
- scheduler = FRFCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_FCFS.cfg b/configs/ramulator_configs/HBM-config_FCFS.cfg
deleted file mode 100644
index cd9aa1e5..00000000
--- a/configs/ramulator_configs/HBM-config_FCFS.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_FRFCFS.cfg b/configs/ramulator_configs/HBM-config_FRFCFS.cfg
deleted file mode 100644
index f08d705f..00000000
--- a/configs/ramulator_configs/HBM-config_FRFCFS.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FRFCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg
deleted file mode 100644
index 52a68486..00000000
--- a/configs/ramulator_configs/HBM-config_FRFCFS_Cap.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FRFCFS_Cap
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg b/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg
deleted file mode 100644
index 55d9f4e7..00000000
--- a/configs/ramulator_configs/HBM-config_FRFCFS_PriorHit.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FRFCFS_PriorHit
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg b/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg
deleted file mode 100644
index f08d705f..00000000
--- a/configs/ramulator_configs/HBM-config_RoBaRaCoCh.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoBaRaCoCh
- scheduler = FRFCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg b/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg
deleted file mode 100644
index 648e9ab4..00000000
--- a/configs/ramulator_configs/HBM-config_RoCoBaRaCh.cfg
+++ /dev/null
@@ -1,32 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 8
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
- mapping = RoCoBaRaCh
- scheduler = FRFCFS
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBMx0.5ch-config.cfg b/configs/ramulator_configs/HBMx0.5ch-config.cfg
deleted file mode 100644
index 064c8291..00000000
--- a/configs/ramulator_configs/HBMx0.5ch-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 4
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/HBMx2ch-config.cfg b/configs/ramulator_configs/HBMx2ch-config.cfg
deleted file mode 100644
index 17635ad0..00000000
--- a/configs/ramulator_configs/HBMx2ch-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = HBM
- channels = 16
- ranks = 1
- speed = HBM_1Gbps
- org = HBM_4Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 32
- mem_tick = 5
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/LPDDR3-config.cfg b/configs/ramulator_configs/LPDDR3-config.cfg
deleted file mode 100644
index b5618bc3..00000000
--- a/configs/ramulator_configs/LPDDR3-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = LPDDR3
- channels = 1
- ranks = 1
- speed = LPDDR3_1600
- org = LPDDR3_8Gb_x16
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/LPDDR4-config.cfg b/configs/ramulator_configs/LPDDR4-config.cfg
deleted file mode 100644
index b74512c9..00000000
--- a/configs/ramulator_configs/LPDDR4-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = LPDDR4
- channels = 2
- ranks = 1
- speed = LPDDR4_2400
- org = LPDDR4_8Gb_x16
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 8
- mem_tick = 3
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/PCM-config.cfg b/configs/ramulator_configs/PCM-config.cfg
deleted file mode 100644
index 1bd7fcce..00000000
--- a/configs/ramulator_configs/PCM-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = PCM
- channels = 1
- ranks = 1
- speed = PCM_800D
- org = PCM_2Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/SALP-config.cfg b/configs/ramulator_configs/SALP-config.cfg
deleted file mode 100644
index 0e5a809a..00000000
--- a/configs/ramulator_configs/SALP-config.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = SALP-MASA
- subarrays = 8
- channels = 1
- ranks = 1
- speed = SALP_1600K
- org = SALP_4Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/STTMRAM-config.cfg b/configs/ramulator_configs/STTMRAM-config.cfg
deleted file mode 100644
index b689e514..00000000
--- a/configs/ramulator_configs/STTMRAM-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = STTMRAM
- channels = 4
- ranks = 1
- speed = STT_1600_1_2
- org = STTMRAM_2Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/TLDRAM-config.cfg b/configs/ramulator_configs/TLDRAM-config.cfg
deleted file mode 100644
index 0f7e06e9..00000000
--- a/configs/ramulator_configs/TLDRAM-config.cfg
+++ /dev/null
@@ -1,31 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = TLDRAM
- subarrays = 16
- channels = 1
- ranks = 1
- speed = TLDRAM_1600K
- org = TLDRAM_4Gb_x8
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/WideIO-config.cfg b/configs/ramulator_configs/WideIO-config.cfg
deleted file mode 100644
index 5270d3cb..00000000
--- a/configs/ramulator_configs/WideIO-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = WideIO
- channels = 4
- ranks = 1
- speed = WideIO_266
- org = WideIO_8Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 4
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translation = None, Random (default value is None)
-#
-########################
diff --git a/configs/ramulator_configs/WideIO2-config.cfg b/configs/ramulator_configs/WideIO2-config.cfg
deleted file mode 100644
index 324b78fe..00000000
--- a/configs/ramulator_configs/WideIO2-config.cfg
+++ /dev/null
@@ -1,30 +0,0 @@
-########################
-# Example config file
-# Comments start with #
-# There are restrictions for valid channel/rank numbers
- standard = WideIO2
- channels = 8
- ranks = 1
- speed = WideIO2_1066
- org = WideIO2_8Gb
-# record_cmd_trace: (default is off): on, off
- record_cmd_trace = off
-# print_cmd_trace: (default is off): on, off
- print_cmd_trace = off
-
-### Below are parameters only for CPU trace
- cpu_tick = 6
- mem_tick = 1
-### Below are parameters only for multicore mode
-# When early_exit is on, all cores will be terminated when the earliest one finishes.
- early_exit = on
-# early_exit = on, off (default value is on)
-# If expected_limit_insts is set, some per-core statistics will be recorded when this limit (or the end of the whole trace if it's shorter than specified limit) is reached. The simulation won't stop and will roll back automatically until the last one reaches the limit.
- expected_limit_insts = 200000000
- warmup_insts = 100000000
- cache = no
-# cache = no, L1L2, L3, all (default value is no)
- translation = None
-# translatino = None, Random (default value is None)
-#
-########################
diff --git a/configs/stonne_big_c1_simple_noc.yml b/configs/stonne_big_c1_simple_noc.yml
index b14838c8..9bbfd6df 100644
--- a/configs/stonne_big_c1_simple_noc.yml
+++ b/configs/stonne_big_c1_simple_noc.yml
@@ -10,8 +10,6 @@ num_stonne_port: 64
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 8
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycless: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/stonne_single_c1_simple_noc.yml b/configs/stonne_single_c1_simple_noc.yml
index 0ed7962c..d1087301 100644
--- a/configs/stonne_single_c1_simple_noc.yml
+++ b/configs/stonne_single_c1_simple_noc.yml
@@ -10,8 +10,6 @@ num_stonne_port: 8
 dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 8
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
index 08149005..fb07eb6a 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
@@ -9,8 +9,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
index 12304ce2..f830419b 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
index aec29ff8..6277cc39 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
@@ -9,8 +9,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycless: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
index 72873f1c..ff976784 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
index c2e962e3..2ed1bb12 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 8
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
index a7607108..1bcc9bb3 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
index 0415876d..3328cf77 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 1200
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 l2d_type: datacache
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
index e411c0f3..bf01913b 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
index f164b108..8c71c528 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
@@ -9,13 +9,10 @@ vpu_spad_size_kb_per_lane: 128
 vpu_vector_length_bits: 256
 
 dram_type: ramulator2
-dram_freq: 940
+dram_freq_mhz: 940
 dram_channels: 8
-dram_req_size: 32
-dram_latency: 10
-dram_nbl: 2
-dram_print_interval: 10000
-dram_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: booksim2
 icnt_latency_cycles: 10
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
index e38f091f..d058f188 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 dram_num_partitions: 2
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
index 57696243..019a0f0f 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 dram_num_partitions: 1
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
index f0686055..918510d8 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
@@ -9,8 +9,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
index 511a5a09..a0985aec 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
index ce2d932d..166e2e25 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
@@ -13,8 +13,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
index 499ad823..6119e83d 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
index da40f01e..9e87511f 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 1200
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
 l2d_type: datacache
diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml
index 6fd305f9..f46d380e 100644
--- a/configs/systolic_ws_8x8_c1_booksim.yml
+++ b/configs/systolic_ws_8x8_c1_booksim.yml
@@ -9,8 +9,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 800
 dram_channels: 1
-dram_req_size_byte: 64
-dram_num_burst_length: 4
 dram_stats_print_period_cycles: 100000
 ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml
 
diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml
index 274f633c..1be24b85 100644
--- a/configs/systolic_ws_8x8_c1_simple_noc.yml
+++ b/configs/systolic_ws_8x8_c1_simple_noc.yml
@@ -9,8 +9,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 800
 dram_channels: 1
-dram_req_size_byte: 64
-dram_num_burst_length: 4
 dram_stats_print_period_cycles: 100000
 ramulator_config_path: ../configs/ramulator2_configs/DDR4.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml
index 72873f1c..ff976784 100644
--- a/tutorial/session1/togsim_configs/togsim_config.yml
+++ b/tutorial/session1/togsim_configs/togsim_config.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
index 3b9b8fc8..a3a4ab93 100644
--- a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
index 2726736a..1ec99521 100644
--- a/tutorial/session1/togsim_configs/togsim_config_autotune.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
index 468a0b44..58c8165d 100644
--- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
index a1f1b432..b53ca4e0 100644
--- a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
index 62d627a6..e47b63eb 100644
--- a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 
diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
index 0024c073..24017861 100644
--- a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
@@ -10,8 +10,6 @@ vpu_vector_length_bits: 256
 dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
-dram_req_size_byte: 32
-dram_num_burst_length: 2
 dram_stats_print_period_cycles: 10000
 ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
 

From 67d87ce3e87825164521823e4228058bc55b75da Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Apr 2026 23:30:34 +0900
Subject: [PATCH 178/194] [TOGSim/Log] Improve simulator log clarity and
 wording

---
 TOGSim/src/Common.cc    |  8 ++++----
 TOGSim/src/Core.cc      | 24 ++++++++++++------------
 TOGSim/src/Dram.cc      | 20 +++++++++++---------
 TOGSim/src/Simulator.cc | 15 +++++++--------
 TOGSim/src/main.cc      | 11 ++++++-----
 5 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index ede991c8..ccb30760 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -7,7 +7,7 @@
 bool loadConfig(const std::string& config_path, YAML::Node& config_yaml) {
   try {
     config_yaml = YAML::LoadFile(config_path);
-    spdlog::info("[LoadConfig] Success to open \"{}\"", config_path);
+    spdlog::info("[LoadConfig] Loaded configuration file \"{}\"", config_path);
     return true;
   } catch (const YAML::BadFile& e) {
     spdlog::error("[LoadConfig] Failed to open \"{}\" (File not found or inaccessible)", config_path);
@@ -159,16 +159,16 @@ SimulationConfig initialize_config(const YAML::Node& config,
       if (config["partition"][core_partition]) {
           uint32_t partition_id = config["partition"][core_partition].as<uint32_t>();
           parsed_config.partiton_map[i] = partition_id;
-          spdlog::info("[Config/Core] CPU {}: Partition {}", i, partition_id);
+          spdlog::info("[Config/Core] core_id: {}, partition_id: {}", i, partition_id);
       } else {
-          spdlog::warn("[Config/Core] CPU {}: Partition key not found, defaulting to 0", i);
+          spdlog::warn("[Config/Core] core_id: {}, partition: missing in config, using partition_id 0", i);
           parsed_config.partiton_map[i] = 0;
       }
     }
   } else {
     for (int i=0; i<parsed_config.num_cores; i++) {
       parsed_config.partiton_map[i] = 0;
-      spdlog::info("[Config/Core] CPU {}: Partition {}", i, 0);
+      spdlog::info("[Config/Core] core_id: {}, partition_id: 0 (no partition section)", i);
     }
   }
   return parsed_config;
diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc
index d9be4ca3..9dad8597 100644
--- a/TOGSim/src/Core.cc
+++ b/TOGSim/src/Core.cc
@@ -445,18 +445,18 @@ void Core::print_stats() {
       auto gemm   = _stat_gemm_inst;
       auto vector = inst - gemm;
       if (skipped)
-        spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {}), skipped inst_count {}",
+        spdlog::info("Core [{}] : {:8} inst_count: {} (GEMM: {}, Vector: {}), skipped inst_count {}",
             _id, name, inst, gemm, vector, skipped);
       else
-        spdlog::info("Core [{}] : {:8} inst_count {} (GEMM: {}, Vector: {})",
+        spdlog::info("Core [{}] : {:8} inst_count: {} (GEMM: {}, Vector: {})",
             _id, name, inst, gemm, vector);
     }
     else {
       if (skipped)
-        spdlog::info("Core [{}] : {:8} inst_count {}, skipped inst_count {}",
+        spdlog::info("Core [{}] : {:8} inst_count: {}, skipped inst_count: {}",
             _id, name, inst, skipped);
       else
-        spdlog::info("Core [{}] : {:8} inst_count {}",
+        spdlog::info("Core [{}] : {:8} inst_count: {}",
             _id, name, inst);
     }
   }
@@ -464,14 +464,14 @@ void Core::print_stats() {
   for (int i=0; i<_num_systolic_array_per_core; i++)
     sa_utilization.push_back(static_cast<float>(_stat_tot_sa_compute_cycle.at(i) * 100) / _core_cycle);
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i),
+    spdlog::info("Core [{}] : Systolic array [{}] utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id, i, sa_utilization.at(i),
       _stat_tot_sa_compute_cycle.at(i), _stat_tot_sa_compute_idle_cycle.at(i));
   float dram_bw = _config.dram_req_size * _stat_tot_mem_response * _config.core_freq_mhz / (_core_cycle * 1000); // B/cycle
-  spdlog::info("Core [{}] : DMA active_cycles, {} DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response);
-  spdlog::info("Core [{}] : Vector unit utilization(%) {:.2f}, active cycle {}, idle_cycle {}", _id,
+  spdlog::info("Core [{}] : DMA active_cycles: {}, DMA idle_cycles: {}, DRAM BW: {:.3f} GB/s ({} responses)", _id, _stat_tot_dma_cycle, _stat_tot_dma_idle_cycle, dram_bw, _stat_tot_mem_response);
+  spdlog::info("Core [{}] : Vector unit utilization(%): {:.2f}, active cycle: {}, idle_cycle: {}", _id,
     static_cast<float>(_stat_tot_vu_compute_cycle * 100) / _core_cycle, _stat_tot_vu_compute_cycle, _stat_tot_vu_compute_idle_cycle);
   spdlog::info("Core [{}] : NUMA local memory: {} requests, remote memory: {} requests", _id, _stat_numa_local_access, _stat_numa_remote_access);
-  spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle);
+  spdlog::info("Core [{}] : Total_cycles: {}", _id, _core_cycle);
 }
 
 void Core::print_current_stats() {
@@ -485,12 +485,12 @@ void Core::print_current_stats() {
 
   spdlog::info("========= Core stat =========");
   for (int i=0; i<_num_systolic_array_per_core; i++)
-    spdlog::info("Core [{}] : Systolic array [{}] utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id, i, sa_utilization.at(i),
+    spdlog::info("Core [{}] : Systolic array [{}] utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id, i, sa_utilization.at(i),
       _stat_sa_compute_cycle.at(i), _stat_sa_compute_idle_cycle.at(i));
-  spdlog::info("Core [{}] : DMA active_cycles {}, DMA idle_cycles {}, DRAM BW {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response);
-  spdlog::info("Core [{}] : Vector unit Utilization(%) {:.2f}, active_cycles {}, idle_cycles {}", _id,
+  spdlog::info("Core [{}] : DMA active_cycles: {}, DMA idle_cycles: {}, DRAM BW: {:.3f} GB/s ({} responses)", _id, _stat_dma_cycle, _stat_dma_idle_cycle, dram_bw, _stat_mem_response);
+  spdlog::info("Core [{}] : Vector unit Utilization(%): {:.2f}, active_cycles: {}, idle_cycles: {}", _id,
     static_cast<float>(_stat_vu_compute_cycle * 100) / _config.core_print_interval, _stat_vu_compute_cycle, _stat_vu_compute_idle_cycle);
-  spdlog::info("Core [{}] : Total_cycles {}", _id, _core_cycle);
+  spdlog::info("Core [{}] : Total_cycles: {}", _id, _core_cycle);
   update_stats();
 }
 
diff --git a/TOGSim/src/Dram.cc b/TOGSim/src/Dram.cc
index 798acb7b..5211ef47 100644
--- a/TOGSim/src/Dram.cc
+++ b/TOGSim/src/Dram.cc
@@ -151,8 +151,6 @@ void DramRamulator2::apply_ramulator_config_to_simulation_config(
           "value, or align the Ramulator YAML with the top-level yml. ramulator_config_path={}",
           *dram_freq_mhz_stated, cfg.dram_freq_mhz, static_cast<double>(tck_ns), ramulator_config_path));
     }
-    spdlog::info("[Config/DRAM] ramulator2: dram_freq_mhz {} matches Ramulator-derived DRAM clock (tCK={:.6g} ns)",
-                 *dram_freq_mhz_stated, static_cast<double>(tck_ns));
   }
 }
 
@@ -200,7 +198,8 @@ Dram::Dram(SimulationConfig config, cycle_type* core_cycle) {
   _config = config;
   _tx_log2 = static_cast<int>(std::log2(_req_size));
 
-  spdlog::info("[Config/DRAM] DRAM Bandwidth {} GB/s, Freq: {} MHz, Channels: {}, Request_size: {}B", config.max_dram_bandwidth(), config.dram_freq_mhz, _n_ch, _req_size);
+  spdlog::info("[Config/DRAM] Total bandwidth {:.2f} GB/s, {} MHz, {} channels, {} bytes per request",
+               static_cast<double>(config.max_dram_bandwidth()), config.dram_freq_mhz, _n_ch, _req_size);
   /* Initialize DRAM Channels */
   for (int ch = 0; ch < _n_ch; ch++) {
     m_to_crossbar_queue.push_back(std::queue<mem_fetch*>());
@@ -288,13 +287,15 @@ void DramRamulator2::cycle() {
     const DramBwSnapshot bw = make_dram_bw_snapshot(
         r + wtxn, w, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
     spdlog::trace(
-        "[DRAM] ch {} | BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes (interval {} cycles)",
+        "[DRAM] channel {} | {:.2f} GB/s avg., {:.2f}% of utilization | {} reads, {} writes "
+        "(interval {} cycles)",
         ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, r, wtxn, w);
   }
   const DramBwSnapshot bw_all = make_dram_bw_snapshot(
       r_all + w_all, w, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
   spdlog::info(
-      "[DRAM] all {} ch | BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes (interval {} cycles)",
+      "[DRAM] all {} channels combined | {:.2f} GB/s aggregate, {:.2f}% of utilization (avg. per channel) | "
+      "{} reads, {} writes (interval {} cycles)",
       _n_ch, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, r_all, w_all, w);
   for (int ch = 0; ch < _n_ch; ch++) {
     _mem[ch]->reset_interval_bw_counters();
@@ -333,7 +334,7 @@ void DramRamulator2::pop(uint32_t cid) {
 }
 
 void DramRamulator2::print_stat() {
-  spdlog::info("========= DRAM stat =========");
+  spdlog::info("=== DRAM statistics ===");
   if (_n_ch == 0)
     return;
 
@@ -352,7 +353,7 @@ void DramRamulator2::print_stat() {
   if (cycles == 0)
     return;
   const double f_mhz = static_cast<double>(_config.dram_freq_mhz);
-  spdlog::info("[DRAM] per-channel avg BW");
+  spdlog::info("[DRAM] Per-channel average bandwidth");
   long long tr_all = 0;
   long long tw_all = 0;
   for (int ch = 0; ch < _n_ch; ch++) {
@@ -363,13 +364,14 @@ void DramRamulator2::print_stat() {
     const DramBwSnapshot bw = make_dram_bw_snapshot(
         tr + tw, cycles, 1u, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
     spdlog::info(
-        "[DRAM] ch {} | avg BW {:.2f} GB/s, {:.2f}% util | {} reads, {} writes",
+        "[DRAM] channel {} | {:.2f} GB/s avg., {:.2f}% of utilization | {} reads, {} writes",
         ch, bw.bandwidth_gbs, bw.util_avg_ch_pct, tr, tw);
   }
   const DramBwSnapshot bw_all = make_dram_bw_snapshot(
       tr_all + tw_all, cycles, _n_ch, _req_size, f_mhz, _config.dram_bandwidth_gbps_per_channel);
   spdlog::info(
-      "[DRAM] all ch 0..{} | avg BW {:.2f} GB/s, {:.2f}% util (avg/ch) | {} reads, {} writes",
+      "[DRAM] channels 0..{} combined | {:.2f} GB/s aggregate, {:.2f}% of utilization (avg. per channel) | "
+      "{} reads, {} writes",
       _n_ch - 1, bw_all.bandwidth_gbs, bw_all.util_avg_ch_pct, tr_all, tw_all);
 }
 
diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index 9bd3407f..eb3b8670 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -25,11 +25,11 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
   _cores.resize(_n_cores);
   for (int core_index = 0; core_index < _n_cores; core_index++) {
     if (config.core_type[core_index] == CoreType::WS_MESH) {
-      spdlog::info("[Config/Core] Core {}: {} MHz, Systolic array per core: {}",
-        core_index, config.core_freq_mhz, config.num_systolic_array_per_core);
+      spdlog::info("[Config/Core] Core {}: core_freq_mhz: {}, systolic_arrays_per_core: {}",
+                   core_index, config.core_freq_mhz, config.num_systolic_array_per_core);
       _cores.at(core_index) = std::make_unique<Core>(core_index, _config);
     } else if(config.core_type[core_index] == CoreType::STONNE) {
-      spdlog::info("[Config/Core] Core {}: {} MHz, Stonne Core selected", core_index, config.core_freq_mhz);
+      spdlog::info("[Config/Core] Core {}: core_freq_mhz: {}, core_type: Stonne", core_index, config.core_freq_mhz);
       _cores.at(core_index) = std::make_unique<SparseCore>(core_index, _config);
     } else {
       throw std::runtime_error(fmt::format("Not implemented Core type {} ",
@@ -46,8 +46,7 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
                                        .string();
     spdlog::info("[Config/DRAM] Ramulator2 config path: {}", ramulator_config);
     YAML::Node dram_config = YAML::LoadFile(ramulator_config);
-    spdlog::info("Ramulator2 config: ");
-    std::cout << dram_config << std::endl;
+    spdlog::info("[Config/DRAM] Ramulator2 configuration:\n{}", YAML::Dump(dram_config));
     config.dram_config_path = ramulator_config;
     _dram = std::make_unique<DramRamulator2>(config, &_core_cycles);
   } else {
@@ -56,12 +55,12 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
   }
 
   // Create interconnect object
-  spdlog::info("[Config/Interconnect] Interconnect freq: {} MHz", config.icnt_freq_mhz);
+  spdlog::info("[Config/Interconnect] interconnect_freq_mhz: {}", config.icnt_freq_mhz);
   if (config.icnt_type == IcntType::SIMPLE) {
-    spdlog::info("[Config/Interconnect] SimpleInerconnect selected");
+    spdlog::info("[Config/Interconnect] Simple interconnect selected");
     _icnt = std::make_unique<SimpleInterconnect>(config);
   } else if (config.icnt_type == IcntType::BOOKSIM2) {
-    spdlog::info("[Config/Interconnect] BookSim2 selected");
+    spdlog::info("[Config/Interconnect] BookSim2 interconnect selected");
     _icnt = std::make_unique<Booksim2Interconnect>(config);
   } else {
     spdlog::error("[Configuration] Invalid interconnect type...!");
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index f985bdf4..010826ef 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -13,13 +13,14 @@ namespace fs = std::filesystem;
 namespace po = boost::program_options;
 
 
-void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partiton_id=0, int device_id=0) {
+void launchKernel(Simulator* simulator, unsigned int kernel_id, std::string onnx_path, std::string attribute_path, const YAML::Node& config_yaml, cycle_type request_time=0, int partition_id=0, int device_id=0) {
   auto graph_praser = TileGraphParser(onnx_path, attribute_path, config_yaml);
   std::unique_ptr<TileGraph>& tile_graph = graph_praser.get_tile_graph();
   tile_graph->set_arrival_time(request_time ? request_time : simulator->get_core_cycle());
   tile_graph->set_kernel_id(kernel_id);
-  spdlog::info("[Scheduler {}] Enqueued kernel id: {}, tog_path: {}, operation: {}, request_time: {}", partiton_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
-  simulator->enqueue_graph(partiton_id, std::move(tile_graph));
+  spdlog::info("[Scheduler {}] Enqueued kernel_id: {}, tog_path: {}, operation: {}, request_time_cycles: {}",
+               partition_id, kernel_id, onnx_path, tile_graph->get_name(), request_time);
+  simulator->enqueue_graph(partition_id, std::move(tile_graph));
 }
 
 void process_trace_file(Simulator* simulator, std::string trace_file_path, const YAML::Node& config_yaml) {
@@ -30,7 +31,7 @@ void process_trace_file(Simulator* simulator, std::string trace_file_path, const
     spdlog::error("[TOGSim] Failed to open trace file: {}", trace_file_path);
     return;
   }
-  spdlog::info("[TOGSim] Reading from trace file: {}", trace_file_path);
+  spdlog::info("[TOGSim] Reading trace file: {}", trace_file_path);
 
   // Read all available commands and process them
   std::string line;
@@ -123,7 +124,7 @@ int main(int argc, char** argv) {
     if (i > 0) cmd_oss << " ";
     cmd_oss << argv[i];
   }
-  spdlog::info("[TOGSim] Run command: {}", cmd_oss.str());
+  spdlog::info("[TOGSim] Command line: {}", cmd_oss.str());
 
   std::string level = "info";
   cmd_parser.set_if_defined("log_level", &level);

From 28745d641c55a1ed9c72991d9c1241712f9e68d8 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Apr 2026 23:41:12 +0900
Subject: [PATCH 179/194] [Tutorial] Update session2 jupyter notebook

---
 tutorial/session2/Hands_on.ipynb | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb
index 9a7c35e3..a2e6899f 100644
--- a/tutorial/session2/Hands_on.ipynb
+++ b/tutorial/session2/Hands_on.ipynb
@@ -37,18 +37,20 @@
     "\n",
     "device = torch.device(\"npu:0\")\n",
     "\n",
-    "def test_result(name, out, cpu_out, rtol=1e-4, atol=1e-4):\n",
-    "    if torch.allclose(out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
+    "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
+    "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
     "        message = f\"|{name} Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
+    "        print(\"npu out: \", npu_out.cpu().reshape(-1)[:5])\n",
+    "        print(\"cpu out: \", cpu_out.reshape(-1)[:5])\n",
     "    else:\n",
     "        message = f\"|{name} Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"custom out: \", out.cpu())\n",
+    "        print(\"npu out: \", npu_out.cpu())\n",
     "        print(\"cpu out: \", cpu_out)\n",
     "        exit(1)\n",
     "\n",
@@ -91,6 +93,8 @@
     }
    ],
    "source": [
+    "# os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\" \n",
+    "\n",
     "input = torch.randn(16, 16)\n",
     "npu_x = input.to(device=device)\n",
     "cpu_x = input.to(\"cpu\")\n",

From 3cdfb7c352b4d53ba99efe872daaf0dca39dbf0c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Tue, 21 Apr 2026 23:56:02 +0900
Subject: [PATCH 180/194] [Tutorial] Fix ramulator config path

---
 tutorial/session1/togsim_configs/togsim_config.yml              | 2 +-
 tutorial/session1/togsim_configs/togsim_config_2_cores.yml      | 2 +-
 tutorial/session1/togsim_configs/togsim_config_autotune.yml     | 2 +-
 .../session1/togsim_configs/togsim_config_external_mapping.yml  | 2 +-
 .../session1/togsim_configs/togsim_config_functional_only.yml   | 2 +-
 .../togsim_configs/togsim_config_no_compiler_optimization.yml   | 2 +-
 tutorial/session1/togsim_configs/togsim_config_timing_only.yml  | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tutorial/session1/togsim_configs/togsim_config.yml b/tutorial/session1/togsim_configs/togsim_config.yml
index ff976784..eb23c833 100644
--- a/tutorial/session1/togsim_configs/togsim_config.yml
+++ b/tutorial/session1/togsim_configs/togsim_config.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
index a3a4ab93..09be00fe 100644
--- a/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_2_cores.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 32
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_autotune.yml b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
index 1ec99521..669c592f 100644
--- a/tutorial/session1/togsim_configs/togsim_config_autotune.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_autotune.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
index 58c8165d..485956bb 100644
--- a/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
index b53ca4e0..990b955c 100644
--- a/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_functional_only.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
index e47b63eb..f56ab6f1 100644
--- a/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
index 24017861..ad4fb90e 100644
--- a/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
+++ b/tutorial/session1/togsim_configs/togsim_config_timing_only.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 940
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml
+ramulator_config_path: /workspace/PyTorchSim/configs/ramulator2_configs/HBM2_TPUv3.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10

From 46b8e3df8a96cd7b9dca6169814925f4b3d27c32 Mon Sep 17 00:00:00 2001
From: Yunseon Shin <yunseon0518@naver.com>
Date: Wed, 22 Apr 2026 03:13:48 +0000
Subject: [PATCH 181/194] [Tutorial] Clean up session1 notebooks

- Remove cell execution timestamps from metadata
- Simplify path setup: remove base_dir/sys.path.append, use absolute paths
- Replace extension_config.CONFIG_TOGSIM_CONFIG with direct config paths
- Update log file paths to latest run timestamps
- Adjust tensor sizes and minor wording fixes
---
 tutorial/session1/CompilerOptimization.ipynb |  68 ++------
 tutorial/session1/DNNServing.ipynb           |  39 +----
 tutorial/session1/ExecutionMode.ipynb        | 158 ++-----------------
 tutorial/session1/Inference.ipynb            |  35 ++--
 tutorial/session1/LogAnalysis.ipynb          |  35 +---
 tutorial/session1/Mapping.ipynb              |  94 +++--------
 tutorial/session1/TOGSimConfig.ipynb         |  97 ++++++++++++
 tutorial/session1/Training.ipynb             |  87 +++-------
 8 files changed, 183 insertions(+), 430 deletions(-)
 create mode 100644 tutorial/session1/TOGSimConfig.ipynb

diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb
index 6c23bfec..f8eea728 100644
--- a/tutorial/session1/CompilerOptimization.ipynb
+++ b/tutorial/session1/CompilerOptimization.ipynb
@@ -10,23 +10,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:34:23.862488Z",
-     "iopub.status.busy": "2026-04-16T10:34:23.862221Z",
-     "iopub.status.idle": "2026-04-16T10:34:26.839597Z",
-     "shell.execute_reply": "2026-04-16T10:34:26.838615Z",
-     "shell.execute_reply.started": "2026-04-16T10:34:23.862467Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
     "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\""
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\""
    ]
   },
   {
@@ -39,15 +28,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:34:26.840859Z",
-     "iopub.status.busy": "2026-04-16T10:34:26.840581Z",
-     "iopub.status.idle": "2026-04-16T10:34:46.109858Z",
-     "shell.execute_reply": "2026-04-16T10:34:46.108862Z",
-     "shell.execute_reply.started": "2026-04-16T10:34:26.840841Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"fused\")\n",
@@ -66,43 +47,28 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:41:01.000313Z",
-     "iopub.status.busy": "2026-04-16T10:41:00.999980Z",
-     "iopub.status.idle": "2026-04-16T10:41:01.273172Z",
-     "shell.execute_reply": "2026-04-16T10:41:01.272081Z",
-     "shell.execute_reply.started": "2026-04-16T10:41:01.000290Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_103442_5281e75b.log | grep \"Total execution cycle\""
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Disable fusion"
+    "### Disabling fusion"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:44:29.448759Z",
-     "iopub.status.busy": "2026-04-16T10:44:29.448400Z",
-     "iopub.status.idle": "2026-04-16T10:44:41.303261Z",
-     "shell.execute_reply": "2026-04-16T10:44:41.302462Z",
-     "shell.execute_reply.started": "2026-04-16T10:44:29.448732Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "os.environ['TORCHSIM_DUMP_PATH']=os.path.join(os.getcwd(), \"non_fused\")\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n",
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_no_compiler_optimization.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -117,19 +83,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:46:37.996794Z",
-     "iopub.status.busy": "2026-04-16T10:46:37.996476Z",
-     "iopub.status.idle": "2026-04-16T10:46:38.497173Z",
-     "shell.execute_reply": "2026-04-16T10:46:38.496104Z",
-     "shell.execute_reply.started": "2026-04-16T10:46:37.996776Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_104436_000cb9bc.log | grep \"Total execution cycle\"\n",
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_104440_e50cdae1.log | grep \"Total execution cycle\""
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\"\n",
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
    ]
   },
   {
diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb
index 0b4e0837..f7f2ea4d 100644
--- a/tutorial/session1/DNNServing.ipynb
+++ b/tutorial/session1/DNNServing.ipynb
@@ -10,22 +10,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T11:17:18.018872Z",
-     "iopub.status.busy": "2026-04-16T11:17:18.018643Z",
-     "iopub.status.idle": "2026-04-16T11:17:20.890421Z",
-     "shell.execute_reply": "2026-04-16T11:17:20.889693Z",
-     "shell.execute_reply.started": "2026-04-16T11:17:18.018853Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
-    "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)"
+    "import os"
    ]
   },
   {
@@ -38,15 +27,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T11:17:20.891167Z",
-     "iopub.status.busy": "2026-04-16T11:17:20.890953Z",
-     "iopub.status.idle": "2026-04-16T11:19:42.197046Z",
-     "shell.execute_reply": "2026-04-16T11:19:42.196023Z",
-     "shell.execute_reply.started": "2026-04-16T11:17:20.891152Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
@@ -55,7 +36,7 @@
     "from PyTorchSimFrontend import extension_config\n",
     "\n",
     "device = torch.device(\"npu:0\")\n",
-    "config = extension_config.CONFIG_TOGSIM_CONFIG\n",
+    "config = \"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "\n",
     "model = resnet18().eval()\n",
     "input = torch.randn(1, 3, 224, 224).to(device=device)\n",
@@ -81,14 +62,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "import torch\n",
-    "from torchvision.models import resnet18\n",
-    "from Simulator.simulator import TOGSimulator\n",
-    "from PyTorchSimFrontend import extension_config\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "from Scheduler.scheduler import poisson_request_generator\n",
-    "TORCHSIM_DIR = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
     "\n",
     "model0_lambda = 5.0\n",
     "max_time_msec = 1000.0\n",
@@ -96,7 +70,7 @@
     "target_model1 = resnet18().eval()\n",
     "\n",
     "device = torch.device(\"npu:0\")\n",
-    "config = extension_config.CONFIG_TOGSIM_CONFIG\n",
+    "config = \"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "opt_model0 = torch.compile(target_model1.to(device=device, memory_format=torch.channels_last), dynamic=False)\n",
     "\n",
     "events = []\n",
@@ -104,9 +78,6 @@
     "for t in poisson_request_generator(model0_lambda, max_msec_time=max_time_msec):\n",
     "    events.append((t, 0, opt_model0, (x,)))  # stream_index 0 → queue / partition 0\n",
     "\n",
-    "events.sort(key=lambda e: e[0])\n",
-    "\n",
-    "\n",
     "with TOGSimulator(config_path=config):\n",
     "    for t_msec, stream_index, model, args in events:\n",
     "        torch.npu.launch_model(\n",
diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb
index bd7d7d73..9d0b051f 100644
--- a/tutorial/session1/ExecutionMode.ipynb
+++ b/tutorial/session1/ExecutionMode.ipynb
@@ -10,22 +10,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:56:08.883802Z",
-     "iopub.status.busy": "2026-04-16T05:56:08.883406Z",
-     "iopub.status.idle": "2026-04-16T05:56:11.858647Z",
-     "shell.execute_reply": "2026-04-16T05:56:11.857788Z",
-     "shell.execute_reply.started": "2026-04-16T05:56:08.883784Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
-    "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)"
+    "import os"
    ]
   },
   {
@@ -38,21 +27,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:56:11.859394Z",
-     "iopub.status.busy": "2026-04-16T05:56:11.859139Z",
-     "iopub.status.idle": "2026-04-16T05:56:31.283787Z",
-     "shell.execute_reply": "2026-04-16T05:56:31.282907Z",
-     "shell.execute_reply.started": "2026-04-16T05:56:11.859372Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
     "\n",
-    "input = torch.randn(1024, 1024).to(device=device)\n",
-    "weight = torch.randn(1024, 1024).to(device=device)\n",
+    "input = torch.randn(512, 512).to(device=device)\n",
+    "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
     "npu_out = opt_fn(input, weight)"
@@ -62,57 +43,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Functional only mode"
+    "### Functional-only mode"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:56:37.980561Z",
-     "iopub.status.busy": "2026-04-16T05:56:37.980194Z",
-     "iopub.status.idle": "2026-04-16T05:56:46.194881Z",
-     "shell.execute_reply": "2026-04-16T05:56:46.194059Z",
-     "shell.execute_reply.started": "2026-04-16T05:56:37.980534Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n",
-    "\n",
-    "input = torch.randn(1024, 1024).to(device=device)\n",
-    "weight = torch.randn(1024, 1024).to(device=device)\n",
-    "\n",
-    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "### Timing only mode"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:56:46.195666Z",
-     "iopub.status.busy": "2026-04-16T05:56:46.195511Z",
-     "iopub.status.idle": "2026-04-16T05:56:49.736201Z",
-     "shell.execute_reply": "2026-04-16T05:56:49.735438Z",
-     "shell.execute_reply.started": "2026-04-16T05:56:46.195650Z"
-    }
-   },
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_functional_only.yml\"\n",
     "\n",
-    "input = torch.randn(1024, 1024).to(device=device)\n",
-    "weight = torch.randn(1024, 1024).to(device=device)\n",
+    "input = torch.randn(512, 512).to(device=device)\n",
+    "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
     "npu_out = opt_fn(input, weight)"
@@ -122,97 +65,24 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## TOGSim Configuration\n",
-    "### Single Core"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:59:18.661437Z",
-     "iopub.status.busy": "2026-04-16T05:59:18.661188Z",
-     "iopub.status.idle": "2026-04-16T05:59:53.388013Z",
-     "shell.execute_reply": "2026-04-16T05:59:53.387130Z",
-     "shell.execute_reply.started": "2026-04-16T05:59:18.661408Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
-    "\n",
-    "input = torch.randn(2048, 2048).to(device=device)\n",
-    "weight = torch.randn(2048, 2048).to(device=device)\n",
-    "\n",
-    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "### Timing-only mode"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T06:00:06.720227Z",
-     "iopub.status.busy": "2026-04-16T06:00:06.719962Z",
-     "iopub.status.idle": "2026-04-16T06:00:06.979872Z",
-     "shell.execute_reply": "2026-04-16T06:00:06.978988Z",
-     "shell.execute_reply.started": "2026-04-16T06:00:06.720210Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_055926_3c61ae14.log | grep \"Total execution cycle\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
    "metadata": {},
-   "source": [
-    "### Multi-Core"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T06:01:00.604737Z",
-     "iopub.status.busy": "2026-04-16T06:01:00.604494Z",
-     "iopub.status.idle": "2026-04-16T06:01:34.826968Z",
-     "shell.execute_reply": "2026-04-16T06:01:34.826043Z",
-     "shell.execute_reply.started": "2026-04-16T06:01:00.604717Z"
-    }
-   },
    "outputs": [],
    "source": [
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n",
+    "os.environ['TOGSIM_CONFIG']=f\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "\n",
-    "input = torch.randn(2048, 2048).to(device=device)\n",
-    "weight = torch.randn(2048, 2048).to(device=device)\n",
+    "input = torch.randn(512, 512).to(device=device)\n",
+    "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
     "npu_out = opt_fn(input, weight)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T06:01:53.294075Z",
-     "iopub.status.busy": "2026-04-16T06:01:53.293728Z",
-     "iopub.status.idle": "2026-04-16T06:01:53.549156Z",
-     "shell.execute_reply": "2026-04-16T06:01:53.548315Z",
-     "shell.execute_reply.started": "2026-04-16T06:01:53.294047Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_060100_05df9481.log | grep \"Total execution cycle\""
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -223,7 +93,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb
index caa5924e..18325d80 100644
--- a/tutorial/session1/Inference.ipynb
+++ b/tutorial/session1/Inference.ipynb
@@ -11,22 +11,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:42:44.479626Z",
-     "iopub.status.busy": "2026-04-16T05:42:44.479480Z",
-     "iopub.status.idle": "2026-04-16T05:42:47.646477Z",
-     "shell.execute_reply": "2026-04-16T05:42:47.645578Z",
-     "shell.execute_reply.started": "2026-04-16T05:42:44.479609Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "import torch\n",
-    "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)"
+    "import torch"
    ]
   },
   {
@@ -39,15 +27,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:42:47.968708Z",
-     "iopub.status.busy": "2026-04-16T05:42:47.968420Z",
-     "iopub.status.idle": "2026-04-16T05:42:49.772696Z",
-     "shell.execute_reply": "2026-04-16T05:42:49.771704Z",
-     "shell.execute_reply.started": "2026-04-16T05:42:47.968688Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
@@ -90,13 +70,16 @@
    "outputs": [],
    "source": [
     "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
+    "    torch.set_printoptions(edgeitems=3)\n",
     "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
-    "        message = f\"|{name} Test Passed|\"\n",
+    "        message = f\"|{name} Functionality Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
+    "        print(\"npu out: \", npu_out.cpu()[0, :5])\n",
+    "        print(\"cpu out: \", cpu_out[0, :5])\n",
     "    else:\n",
-    "        message = f\"|{name} Test Failed|\"\n",
+    "        message = f\"|{name} Functionality Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
@@ -124,7 +107,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index 5cd14f41..9b393384 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -10,23 +10,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:00:05.422374Z",
-     "iopub.status.busy": "2026-04-16T10:00:05.422205Z",
-     "iopub.status.idle": "2026-04-16T10:00:08.512084Z",
-     "shell.execute_reply": "2026-04-16T10:00:08.511285Z",
-     "shell.execute_reply.started": "2026-04-16T10:00:05.422359Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
     "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
     "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")"
    ]
   },
@@ -40,15 +29,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:00:46.974212Z",
-     "iopub.status.busy": "2026-04-16T10:00:46.973814Z",
-     "iopub.status.idle": "2026-04-16T10:00:52.152064Z",
-     "shell.execute_reply": "2026-04-16T10:00:52.151231Z",
-     "shell.execute_reply.started": "2026-04-16T10:00:46.974195Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
@@ -70,15 +51,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T10:25:36.625640Z",
-     "iopub.status.busy": "2026-04-16T10:25:36.625388Z",
-     "iopub.status.idle": "2026-04-16T10:25:40.123959Z",
-     "shell.execute_reply": "2026-04-16T10:25:40.123131Z",
-     "shell.execute_reply.started": "2026-04-16T10:25:36.625622Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_DEBUG_LEVEL']=\"trace\"\n",
diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb
index 92ddd5a8..d463c287 100644
--- a/tutorial/session1/Mapping.ipynb
+++ b/tutorial/session1/Mapping.ipynb
@@ -10,22 +10,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:49:05.540163Z",
-     "iopub.status.busy": "2026-04-16T05:49:05.539948Z",
-     "iopub.status.idle": "2026-04-16T05:49:08.550103Z",
-     "shell.execute_reply": "2026-04-16T05:49:08.549146Z",
-     "shell.execute_reply.started": "2026-04-16T05:49:05.540146Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import torch\n",
-    "import os\n",
-    "import sys\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)"
+    "import os"
    ]
   },
   {
@@ -38,15 +27,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:49:08.550908Z",
-     "iopub.status.busy": "2026-04-16T05:49:08.550691Z",
-     "iopub.status.idle": "2026-04-16T05:49:28.225867Z",
-     "shell.execute_reply": "2026-04-16T05:49:28.225051Z",
-     "shell.execute_reply.started": "2026-04-16T05:49:08.550893Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "device = torch.device(\"npu:0\")\n",
@@ -61,45 +42,30 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:49:44.788982Z",
-     "iopub.status.busy": "2026-04-16T05:49:44.788640Z",
-     "iopub.status.idle": "2026-04-16T05:49:45.048201Z",
-     "shell.execute_reply": "2026-04-16T05:49:45.047229Z",
-     "shell.execute_reply.started": "2026-04-16T05:49:44.788954Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_054924_5e1428f9.log | grep \"Total execution cycle\""
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Manual Mapping\n",
-    "User can set tile size manually."
+    "### External Mapping\n",
+    "User can set tile size manually from external file."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:49:53.216985Z",
-     "iopub.status.busy": "2026-04-16T05:49:53.216635Z",
-     "iopub.status.idle": "2026-04-16T05:50:11.043854Z",
-     "shell.execute_reply": "2026-04-16T05:50:11.042989Z",
-     "shell.execute_reply.started": "2026-04-16T05:49:53.216960Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "torch._dynamo.reset()\n",
     "\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n",
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_external_mapping.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -111,18 +77,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:50:18.200344Z",
-     "iopub.status.busy": "2026-04-16T05:50:18.200118Z",
-     "iopub.status.idle": "2026-04-16T05:50:18.456838Z",
-     "shell.execute_reply": "2026-04-16T05:50:18.455901Z",
-     "shell.execute_reply.started": "2026-04-16T05:50:18.200327Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_055004_6ef0f564.log | grep \"Total execution cycle\""
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
    ]
   },
   {
@@ -135,20 +94,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T11:22:40.778257Z",
-     "iopub.status.busy": "2026-04-16T11:22:40.777947Z",
-     "iopub.status.idle": "2026-04-16T11:23:10.573193Z",
-     "shell.execute_reply": "2026-04-16T11:23:10.572225Z",
-     "shell.execute_reply.started": "2026-04-16T11:22:40.778230Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "torch._dynamo.reset()\n",
-    "\n",
-    "os.environ['TOGSIM_CONFIG']=f\"{base_dir}/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n",
+    "os.environ[\"TORCHINDUCTOR_CACHE_DIR\"]=os.path.join(os.getcwd(), \"autotune\")\n",
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_autotune.yml\"\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",
@@ -160,18 +111,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T11:56:35.774938Z",
-     "iopub.status.busy": "2026-04-16T11:56:35.774682Z",
-     "iopub.status.idle": "2026-04-16T11:56:36.022450Z",
-     "shell.execute_reply": "2026-04-16T11:56:36.020569Z",
-     "shell.execute_reply.started": "2026-04-16T11:56:35.774921Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "!cat /workspace/PyTorchSim/togsim_results/20260416_112306_10ad96fd.log | grep \"Total execution cycle\""
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
    ]
   },
   {
diff --git a/tutorial/session1/TOGSimConfig.ipynb b/tutorial/session1/TOGSimConfig.ipynb
new file mode 100644
index 00000000..a8c1bb6e
--- /dev/null
+++ b/tutorial/session1/TOGSimConfig.ipynb
@@ -0,0 +1,97 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TOGSim Configuration\n",
+    "### Single Core"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
+    "\n",
+    "input = torch.randn(2048, 2048).to(device=device)\n",
+    "weight = torch.randn(2048, 2048).to(device=device)\n",
+    "\n",
+    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
+    "npu_out = opt_fn(input, weight)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Multi-Core"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_2_cores.yml\"\n",
+    "\n",
+    "input = torch.randn(2048, 2048).to(device=device)\n",
+    "weight = torch.randn(2048, 2048).to(device=device)\n",
+    "\n",
+    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
+    "npu_out = opt_fn(input, weight)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "log_path = \"\"\n",
+    "!cat $log_path | grep \"Total execution cycle\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb
index 1f86a5b8..0ec85a3d 100644
--- a/tutorial/session1/Training.ipynb
+++ b/tutorial/session1/Training.ipynb
@@ -10,25 +10,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:45:04.361593Z",
-     "iopub.status.busy": "2026-04-16T05:45:04.361471Z",
-     "iopub.status.idle": "2026-04-16T05:45:07.515245Z",
-     "shell.execute_reply": "2026-04-16T05:45:07.514397Z",
-     "shell.execute_reply.started": "2026-04-16T05:45:04.361578Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "import sys\n",
-    "import torch\n",
-    "base_dir = os.environ.get('TORCHSIM_DIR', default='/workspace/PyTorchSim')\n",
-    "sys.path.append(base_dir)\n",
-    "\n",
-    "cpu_device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "npu_device = torch.device(\"npu:0\")"
+    "import torch"
    ]
   },
   {
@@ -41,23 +26,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:45:07.516141Z",
-     "iopub.status.busy": "2026-04-16T05:45:07.515901Z",
-     "iopub.status.idle": "2026-04-16T05:45:07.635695Z",
-     "shell.execute_reply": "2026-04-16T05:45:07.634872Z",
-     "shell.execute_reply.started": "2026-04-16T05:45:07.516123Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
     "torch.manual_seed(0)\n",
-    "cpu_input = torch.randn(128, 128).to(cpu_device)\n",
-    "cpu_weight = torch.randn(128, 128).to(cpu_device)\n",
-    "cpu_target = torch.randn(128, 128).to(cpu_device)\n",
-    "cpu_input.requires_grad = True\n",
-    "cpu_weight.requires_grad = True\n",
+    "cpu_input = torch.randn(128, 128).to(device).requires_grad_()\n",
+    "cpu_weight = torch.randn(128, 128).to(device).requires_grad_()\n",
+    "cpu_target = torch.randn(128, 128).to(device).requires_grad_()\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
     "cpu_out = opt_fn(cpu_input, cpu_weight)\n",
@@ -77,23 +54,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:45:07.636349Z",
-     "iopub.status.busy": "2026-04-16T05:45:07.636190Z",
-     "iopub.status.idle": "2026-04-16T05:45:13.350714Z",
-     "shell.execute_reply": "2026-04-16T05:45:13.349588Z",
-     "shell.execute_reply.started": "2026-04-16T05:45:07.636333Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
+    "device = torch.device(\"npu:0\")\n",
+    "\n",
     "torch.manual_seed(0)\n",
-    "npu_input = torch.randn(128, 128).to(npu_device)\n",
-    "npu_weight = torch.randn(128, 128).to(npu_device)\n",
-    "npu_target = torch.randn(128, 128).to(npu_device)\n",
-    "npu_input.requires_grad = True\n",
-    "npu_weight.requires_grad = True\n",
+    "npu_input = torch.randn(128, 128).to(device).requires_grad_()\n",
+    "npu_weight = torch.randn(128, 128).to(device).requires_grad_()\n",
+    "npu_target = torch.randn(128, 128).to(device).requires_grad_()\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
     "npu_out = opt_fn(npu_input, npu_weight)\n",
@@ -106,25 +75,19 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:45:13.351955Z",
-     "iopub.status.busy": "2026-04-16T05:45:13.351757Z",
-     "iopub.status.idle": "2026-04-16T05:45:13.356589Z",
-     "shell.execute_reply": "2026-04-16T05:45:13.355757Z",
-     "shell.execute_reply.started": "2026-04-16T05:45:13.351935Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
     "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
-    "        message = f\"|{name} Test Passed|\"\n",
+    "        message = f\"|{name} Functionality Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
+    "        print(\"npu out: \", npu_out.cpu()[0, :5])\n",
+    "        print(\"cpu out: \", cpu_out[0, :5])\n",
     "    else:\n",
-    "        message = f\"|{name} Test Failed|\"\n",
+    "        message = f\"|{name} Functionality Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
@@ -136,15 +99,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2026-04-16T05:45:13.357014Z",
-     "iopub.status.busy": "2026-04-16T05:45:13.356871Z",
-     "iopub.status.idle": "2026-04-16T05:45:13.361392Z",
-     "shell.execute_reply": "2026-04-16T05:45:13.360681Z",
-     "shell.execute_reply.started": "2026-04-16T05:45:13.357000Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n",
@@ -161,7 +116,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "base",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },

From 9df9b078ac2d55463590e2ada08334e856a6e0db Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Apr 2026 17:53:08 +0900
Subject: [PATCH 182/194] [Doc] update README for v1.1.0 release

---
 README.md | 296 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 159 insertions(+), 137 deletions(-)

diff --git a/README.md b/README.md
index a6dd399a..6f6a6abc 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ PyTorchSim is a comprehensive, high-speed, cycle-accurate NPU simulation framewo
 
 
 For more details, please refer to our [paper](https://doi.org/10.1145/3725843.3756045)!
+> **Disclaimer.** PyTorchSim is an independent project. It is neither part of the official [PyTorch](https://pytorch.org/) distribution nor affiliated with or endorsed by the PyTorch Foundation. The name reflects that this work builds on the open-source PyTorch compiler stack as its front-end for research purposes.
 
 ## Navigation
 [Overview](#pytorchsim-framework-overview) | [Model Zoo](#model-zoo) | [Getting Started](#getting-started)
@@ -22,12 +23,13 @@ For more details, please refer to our [paper](https://doi.org/10.1145/3725843.37
 ## PyTorchSim Framework Overview
 ![Overview](/docs/overview.jpg)
 PyTorchSim consists of **two main** components:
-- **Compiler**: Integrated of [PyTorch2](https://github.com/pytorch/pytorch) compiler stack and generates NPU machine code and TOG for existing PyTorch models.
+- **Compiler**: Integrated with the [PyTorch2](https://github.com/pytorch/pytorch) compiler stack; it generates NPU machine code and TOG for existing PyTorch models.
 - **TOGSim**: Executes TOG for high-speed simulation and accurately models shared resources (DRAM, NoC) through integrated cycle-accurate simulators ([BookSim](https://github.com/booksim/booksim2) and [Ramulator2](https://github.com/CMU-SAFARI/ramulator2)).
 
 PyTorchSim **supports**:
 - DNN inference and [training](#training)
 - Data-dependent timing modeling (e.g. sparsity)
+- [One continuous TOGSim session](#one-togsim-session-one-continuous-log) (single log across multiple forwards)
 - [Multi-tenancy](#multi-tenancy)
 - [Compiler optimizations](#compiler-optimizations)
 - [Mapping](#mapping)
@@ -38,13 +40,16 @@ PyTorchSim **supports**:
 |---|:-:|:-:|---|
 | ResNet-18 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | channel last format |
 | ResNet-50 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | channel last format |
+| MobileNet-v2 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/MobileNet/` (torchvision) |
+| YOLOv5 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/Yolov5/` |
 | BERT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ |  |
 | GPT-2 | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ |  |
-| ViT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ |  |
+| ViT | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | `tests/test_vit.py` |
 | Mistral | <img src="https://avatars.githubusercontent.com/u/21003710?s=48&v=4" width="20"/> | ✅ | |
-| Diffusion | 🤗 | ✅ |  |
-| Llama-4 | 🤗 | ⏳ | Under Development |
-| DeepSeek v1 | 🤗 | ⏳ | Under Development |
+| Stable-diffusion v1 | 🤗 | ✅ |  |
+| Llama 2/3 | 🤗 | ✅ | `tests/Llama/` (blocks & decode-style paths) |
+| DeepSeek-V3 (base) | 🤗 | ✅ | `tests/DeepSeek/` — several ops(e.g., gate ops) are not cycle-modeled |
+| Llama-4 | 🤗 | ⏳ | Under development |
 <!-- ## Requirements
 
 ### OS Distribution
@@ -58,7 +63,7 @@ cmake == 3.26.4
 conan == 1.56.0
 python >= 3.10
 pytorch == 2.2.0
-risc-v64-unknown-elf-gcc == 13.2.0
+riscv64-unknown-elf-gcc == 13.2.0
 ```
 Our provided Docker environment resolves software dependencies.
 
@@ -90,16 +95,16 @@ To download the latest Docker image and set up the environment, use the followin
 docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:v1.0.1 bash
 ```
 ### Manual Setting (Optional)
-This script provides building [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) simulator from source code for specific experts.
+This script builds [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) from source for advanced users.
 ```bash
-bash script/build_from_source.sh
+bash scripts/build_from_source.sh
 ```
 ### Run Examples
-The `tests` directory contains several AI workloads examples.
+The `tests` directory contains several AI workload examples.
 ```bash
 python tests/test_matmul.py 
 ```
-The result is stored to `TORCHSIM_LOG_PATH/hash/togsim_result/`. The log file contains detailed core, memory, and interconnect stats.
+The result is written to `${TORCHSIM_LOG_PATH}/togsim_result/XXX.log`. The log file contains detailed core, memory, and interconnect stats.
 
 ### Run Your Own Model on PyTorchSim
 You can run your own PyTorch model on PyTorchSim by setting up a custom NPU device.  
@@ -109,7 +114,7 @@ import torch
 
 device = torch.device("npu:0")
 
-# Declare you own model (e.g. resnet18 from torchvision)
+# Declare your own model (e.g. resnet18 from torchvision)
 from torchvision.models import resnet18
 model = resnet18().eval()
 x = torch.randn(1, 3, 224, 224, dtype=torch.float32)
@@ -128,11 +133,11 @@ PyTorchSim automatically generates a Tile-Operation Graph (TOG), and runs it thr
 ### Result
 Running log in CLI
 ```bash
-Wrapper Codegen Path = /tmp/torchinductor_root/fo/cfofsp5nwmpqxctouan2v2t5y7qp5vwrgvw4swssx4ca4us3c5tx.py
-[Gem5] Gem5 is running.
-[Spike] Running Spike simulator
-[TOGSim] TOGSim is running..
-[TOGSim] Simulation log is stored to "/workspace/PyTorchSim/togsim_results/20251205_080553.log"
+[2026-04-22 11:29:20.139] [INFO] [pytorchsimfrontend.mlir.generated_wrapper] Wrapper Codegen Path = /workspace/PyTorchSim/outputs/.torchinductor/ru/cruz5mvhqeci3avet3ebv6outo6rbo7uiv477tj7u2zjlvfp6k5k.py
+[2026-04-22 11:29:20.638] [INFO] [simulator.simulator] [Gem5] Gem5 simulation started
+[2026-04-22 11:29:26.138] [INFO] [simulator.simulator] [Spike] Running Spike simulator
+[2026-04-22 11:29:27.609] [INFO] [simulator.simulator] [TOGSim] TOGSim simulation started
+[2026-04-22 11:29:28.217] [INFO] [simulator.simulator] [TOGSim] Simulation log is stored to "/workspace/PyTorchSim/togsim_results/20260422_112927_6fb9d704.log"
 ----------------------------
 |Matmul Forward Test Passed|
 ----------------------------
@@ -140,61 +145,44 @@ Wrapper Codegen Path = /tmp/torchinductor_root/fo/cfofsp5nwmpqxctouan2v2t5y7qp5v
 
 Simulation consists of three steps
 
-1. `Gem5` obatins compute latency for TOG.
+1. `Gem5` obtains compute latency for TOG.
 2. `Spike` verifies the output code.
-3. `TOGSim` simulates a NPU architecture.
+3. `TOGSim` simulates an NPU architecture.
 
-If you want to turn off the `SpikeSimulator` for fast simulation, you can set as below.
+The log contains memory & core stats.
 ```bash
-export pytorchsim_functional_mode=False
-```
-Log contains memory & core stats.
-```bash
-[2025-12-05 08:05:52.538] [info] HBM2-CH_0: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
-[2025-12-05 08:05:52.538] [info] HBM2-CH_1: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
-[2025-12-05 08:05:52.538] [info] HBM2-CH_2: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_3: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
-[2025-12-05 08:05:52.538] [info] HBM2-CH_4: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_5: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_6: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 956, Row misses: 32, Row conflicts: 36
-[2025-12-05 08:05:52.538] [info] HBM2-CH_7: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
-[2025-12-05 08:05:52.538] [info] HBM2-CH_8: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_9: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_10: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
-[2025-12-05 08:05:52.538] [info] HBM2-CH_11: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_12: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
-[2025-12-05 08:05:52.538] [info] HBM2-CH_13: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 958, Row misses: 32, Row conflicts: 34
-[2025-12-05 08:05:52.538] [info] HBM2-CH_14: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] Row hits: 959, Row misses: 32, Row conflicts: 33
-[2025-12-05 08:05:52.538] [info] HBM2-CH_15: avg BW utilization 49% (768 reads, 256 writes)
-[2025-12-05 08:05:52.538] [info] ===== Instructions count =====
-[2025-12-05 08:05:52.538] [info] Core [0] : MOVIN    inst_count 3
-[2025-12-05 08:05:52.538] [info] Core [0] : MOVOUT   inst_count 1
-[2025-12-05 08:05:52.538] [info] Core [0] : COMP     inst_count 10 (GEMM: 8, Vector: 2)
-[2025-12-05 08:05:52.538] [info] Core [0] : BAR      inst_count 8
-[2025-12-05 08:05:52.538] [info] ========= Core stat =========
-[2025-12-05 08:05:52.538] [info] Core [0] : Systolic array [0] utilization(%) 12.40, active_cycles 256, idle_cycles 1809
-[2025-12-05 08:05:52.538] [info] Core [0] : Systolic array [1] utilization(%) 12.40, active_cycles 256, idle_cycles 1809
-[2025-12-05 08:05:52.538] [info] Core [0] : DMA active_cycles, 1024 DMA idle_cycles 1041, DRAM BW 238.000 GB/s (16384 responses)
-[2025-12-05 08:05:52.538] [info] Core [0] : Vector unit utilization(%) 2.42, active cycle 50, idle_cycle 0
-[2025-12-05 08:05:52.538] [info] Core [0] : NUMA local memory: 16384 requests, remote memory: 0 requests
-[2025-12-05 08:05:52.538] [info] Core [0] : Total_cycles 2065
-[2025-12-05 08:05:52.538] [info] Total execution cycles: 2065
-[2025-12-05 08:05:52.538] [info] Wall-clock time for simulation: 0.147463 seconds
+[2026-04-22 11:29:28.215] [info] [DRAM] Per-channel average bandwidth
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 0 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 1 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 2 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 3 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 4 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 5 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 6 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 7 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 8 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 9 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 10 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 11 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 12 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 13 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 14 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channel 15 | 15.51 GB/s avg., 51.56% of utilization | 4096 reads, 2048 writes
+[2026-04-22 11:29:28.215] [info] [DRAM] channels 0..15 combined | 248.13 GB/s aggregate, 51.56% of utilization (avg. per channel) | 65536 reads, 32768 writes
+[2026-04-22 11:29:28.215] [info] ===== Instructions count =====
+[2026-04-22 11:29:28.215] [info] Core [0] : MOVIN    inst_count: 2
+[2026-04-22 11:29:28.215] [info] Core [0] : MOVOUT   inst_count: 1
+[2026-04-22 11:29:28.215] [info] Core [0] : COMP     inst_count: 81 (GEMM: 80, Vector: 1)
+[2026-04-22 11:29:28.215] [info] Core [0] : BAR      inst_count: 80
+[2026-04-22 11:29:28.215] [info] ========= Core stat =========
+[2026-04-22 11:29:28.215] [info] Core [0] : Systolic array [0] utilization(%): 34.37, active_cycles: 4096, idle_cycles: 7821
+[2026-04-22 11:29:28.215] [info] Core [0] : Systolic array [1] utilization(%): 34.37, active_cycles: 4096, idle_cycles: 7821
+[2026-04-22 11:29:28.215] [info] Core [0] : DMA active_cycles: 6144, DMA idle_cycles: 5773, DRAM BW: 248.000 GB/s (98304 responses)
+[2026-04-22 11:29:28.215] [info] Core [0] : Vector unit utilization(%): 2.55, active cycle: 304, idle_cycle: 0
+[2026-04-22 11:29:28.215] [info] Core [0] : NUMA local memory: 98304 requests, remote memory: 0 requests
+[2026-04-22 11:29:28.215] [info] Core [0] : Total_cycles: 11917
+[2026-04-22 11:29:28.215] [info] Total execution cycles: 11917
+[2026-04-22 11:29:28.215] [info] Wall-clock time for simulation: 0.602899 seconds
 ```
 The log is dumped in `TORCHSIM_LOG_PATH` and you can set the path as below.
 ```bash
@@ -209,17 +197,17 @@ compiled_step = torch.compile(dynamic=False)(optimizer.step)
 
 optimizer.zero_grad()
 loss.backward()
-opt_step()
+compiled_step()
 ```
 `tests/test_mlp.py` provides an example of MLP training.
 
-## Multi-tenancy
+## One TOGSim session, one continuous log
 
-While the **`with TOGSimulator(config_path=...)`** block is active, **`TOGSIM_CONFIG`** is set to that YAML so **compilation and TOGSim use the same** hardware description.
+By default, **each compiled operation** can run TOGSim in a **standalone** way—typically **one simulator process and one log file per kernel**. That matches single-kernel workflows but splits traces when you run many forwards in a row.
 
-### 1. One TOGSim session, one continuous log
+**`with TOGSimulator(config_path=...)`** keeps **one TOGSim session** open for the block: successive calls (e.g. multiple **`compiled_model(...)`** forwards) run **in sequence in the same process**, so the timeline and shared resources **continue in a single log** instead of restarting for every op. **`TOGSIM_CONFIG`** is set to the given YAML for the block so **codegen and TOGSim** still share one hardware file.
 
-If you want **one** log where kernels are simulated **in sequence** as a single run, wrap the code you already use to execute the compiled model with **`with TOGSimulator(config_path=...)`**. No other API is required; every forward inside the block shares that session.
+Use the same API you already use; only wrap the region you want co-simulated:
 
 ```python
 import torch
@@ -231,7 +219,9 @@ with TOGSimulator(config_path=config):
     y = compiled_model(x)
 ```
 
-### 2. Multi-tenancy and explicit scheduling (`launch_model`)
+<a id="multi-tenancy"></a>
+
+## Multi-tenancy and explicit scheduling (`launch_model`)
 
 For **multi-tenant** or **interleaved** execution, you usually need to attach a **timestamp** and a **`stream_index`** to each launch so the simulator can order work correctly. Use **`torch.npu.launch_model(compiled_model, *inputs, stream_index=..., timestamp=...)`** for that; plain `compiled_model(x)` does not carry those parameters.
 
@@ -289,8 +279,6 @@ for t in poisson_request_generator(model1_lambda, max_msec_time=max_time_msec):
     x = torch.randn(128, 768, device=device)
     events.append((t, 1, opt_model1, (x,)))  # stream_index 1 → queue / partition 1
 
-events.sort(key=lambda e: e[0])
-
 with TOGSimulator(config_path=config):
     for t_msec, stream_index, model, args in events:
         torch.npu.launch_model(
@@ -317,16 +305,16 @@ Depending on tensor shape, use different convolution template:
 ## Mapping
 PyTorchSim provides three mapping strategies.
 ### Heuristic-based mapping
-We adopt and modified heuristic-based mapping of [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory.
+We adopted and modified heuristic-based mapping from [GEMMINI](https://github.com/ucb-bar/gemmini) by default, which maximizes the utilization of scratchpad memory.
 ### Auto-tuning
-Heuristic method may not be optimal for all cases. PyTorchSim provides auto-tuning to find the best mapping for GEMM, CONV, and vector operations. It reduces the search space by sorting candidates based on scratchpad memory utilization and picking the top-k candidates. Search parameters include tile shape and vector lane stride.
+The heuristic method may not be optimal for all cases. PyTorchSim provides auto-tuning to find the best mapping for GEMM, CONV, and vector operations. It reduces the search space by sorting candidates based on scratchpad memory utilization and picking the top-k candidates. Search parameters include tile shape and vector lane stride.
 
 To enable this, update your configuration file as follows:
 ```bash
 "codegen_mapping_strategy" : "autotune"
 ```
-### Manunal setting
-Users can utilizing third-party mapping tools (e.g., Timeloop). You can explicitly set the mapping file path in the configuration file to apply your own mapping strategies.
+### Manual setting
+Users can use third-party mapping tools (e.g., Timeloop). You can explicitly set the mapping file path in the configuration file to apply your own mapping strategies.
 ```bash
 "codegen_mapping_strategy" : "external",
 "codegen_external_mapping_file" : "path/to/mapping_file.json",
@@ -355,7 +343,7 @@ Key: "M_N_K" for GEMM
 ## L2 Cache
 It supports L2 cache as persistent cache. User can provide software-managed allocation/eviction strategy for tensors with persistent cache.
 
-Common Memory (CMEM) is a new feature introduced in the latest TPUs (newer than TPUv3). Multiple cores share this memory, which provides high bandwidth. Reusable tensors are stored and loaded from CMEM to avoid off-chip traffic. Our L2 cache can work like as CMEM
+Common Memory (CMEM) is a new feature introduced in the latest TPUs (newer than TPUv3). Multiple cores share this memory, which provides high bandwidth. Reusable tensors are stored and loaded from CMEM to avoid off-chip traffic. Our L2 cache can work like CMEM.
 
 To allocate a tensor in L2 cache, set the environment variable as shown below. The `tpuv4` directory provides example plans for L2 cache obtained from TPUv4 profiling.
 ```bash
@@ -378,69 +366,103 @@ You can configure these options using environment variables.
 ```bash
 export TORCHSIM_DIR=/workspace/PyTorchSim # home directory
 
-# Plan which tensor allocated in TPUv4's CMEM
+# Plan which tensors are allocated in TPUv4's CMEM
 export SRAM_BUFFER_PLAN_PATH=/workspace/PyTorchSim/tpuv4/gemm_plan.py
-
-export TORCHSIM_TLS_MODE=1 # User can choose TLS or ILS mode
 export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 ```
 ## TOGSim Configuration
 ![NPU_Core](./docs/npu_core.jpg)
 
-`configs` directory contains example NPU configuration files in the JSON format.
-```
-  "num_cores" : 2,                   // Number of NPU cores
-  "core_freq_mhz" : 940,             // Core's frequency (MHz)
-  "num_systolic_array_per_core" : 2, // Number of systolic array per core
-
-  "vpu_num_lanes" : 128,             // Number of VPU lanes
-  "vpu_spad_size_kb_per_lane" : 128, // Scratchpad memory size per lane (KB)
-  "vpu_vector_length_bits" : 256,    // VPU vector register length (Bits)
-
-  "dram_type" : "ramulator2",        // DRAM type: ramulator2 | simple
-  "dram_channels": 32,               // Number of DRAM channels (topology; required for both types)
-  "dram_stats_print_period_cycles": 10000, // Optional DRAM stats interval
-  // ramulator2: per-request size (bytes), DRAM MHz, and per-channel peak GB/s are derived from ramulator_config_path
-  // (peak ≈ timing[0] as MT/s × channel_width × pseudo-channels for HBM2/3; MHz from Ramulator tCK).
-  // Optional: if you set dram_freq_mhz, it must exactly match that derived MHz or initialization fails
-  // (the error message includes tCK in ns and the derived MHz for debugging stale yml values).
-  // Do not set dram_bandwidth_gbps_* at top level.
-  "ramulator_config_path" : "../configs/ramulator2_configs/HBM2_TPUv3.yaml",
-  // simple: dram_latency + dram_channels + optional dram_req_size_byte (default 32). Omit
-  // dram_bandwidth_gbps_* for latency-only; dram_freq_mhz defaults to core_freq_mhz.
-  // With dram_bandwidth_gbps_* set, dram_freq_mhz is required (credit refill per DRAM cycle).
-
-  "l2d_type" : "datacache",
-  "l2d_config" : "S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32",
-
-  "icnt_type" : "simple",              // Interconnect type (ex. booksim, simple)
-  "icnt_latency" : 7,                  // Interconnect latency (cycle)
-  "icnt_freq_mhz" : 940,               // Interconnect frequency (MHz)
-  "icnt_injection_ports_per_core" : 16 // Interconnect injection ports per core
-  "icnt_config_path" : "../configs/booksim2_configs/fly_c4_m32.icnt", // Booksim2 config file path
-
-  "scheduler" : "simple",            // Scheduler type (Now, only support simple scheduler)
-  "num_partition" : 2,               // Multi-core Partitioning
-  "partition": {                     // allocate request queue index
-    "core_0":0,
-    "core_1":1
-  },
-
-  "codegen_mapping_strategy" : "heuristic", // Compiler mapping strategy (ex. "heuristic", "autotune", "external-then-heuristic", "external-then-autotune")
-  "codegen_external_mapping_file" : "",     // Path to external mapping file
-  "codegen_autotune_max_retry": 10,         // Maximum retries for autotuning
-  "codegen_autotune_template_topk": 4,      // Top-K templates to consider during autotuning
-  // Compiler optimization level/options.
-  // Value can be "all", "none", or a list of specific optimizations:
-  // ["fusion", "reduction_epilogue", "reduction_reduction", "prologue", "single_batch_conv", "multi_tile_conv", "subtile"]
-  "codegen_compiler_optimization" : "all"
-```
-You can set TOGSim config path as below.
-```bash
-export TOGSIM_CONFIG=/workspace/PyTorchSim/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
-```
+The `configs/` directory holds **YAML** (`.yml`) hardware descriptions. Set `TOGSIM_CONFIG` to one of these files. The **same file** is read by the **compiler** (`PyTorchSimFrontend/extension_config.py`) for `vpu_*`, `pytorchsim_*`, and `codegen_*` fields, and by **TOGSim** (`TOGSim/src/Common.cc`) for the simulator-specific keys below.
+
+### Reference layout (matches `configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml`)
+
+```yaml
+# --- Core (TOGSim) ---
+num_cores: 1
+core_freq_mhz: 940
+core_stats_print_period_cycles: 10000
+num_systolic_array_per_core: 2
+# Optional: one entry per core, default ws_mesh
+# core_type: [ws_mesh, ws_mesh]
+# Optional STONNE cores: stonne_config_path, num_stonne_per_core, num_stonne_port
+
+# --- VPU / scratchpad (compiler codegen; same YAML) ---
+vpu_num_lanes: 128
+vpu_spad_size_kb_per_lane: 128
+vpu_vector_length_bits: 256
+
+# --- DRAM config ---
+dram_type: ramulator2          # ramulator2 | simple
+dram_freq_mhz: 940
+dram_channels: 16
+dram_stats_print_period_cycles: 10000
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv3.yaml  # resolved relative to this YAML’s directory
+# For ramulator2: request size, DRAM MHz, and per-channel peak GB/s are derived from the Ramulator YAML.
+# dram_freq_mhz must exactly match MHz derived from Ramulator tCK or startup fails.
+
+# simple DRAM (alternative to ramulator2):
+# dram_type: simple
+# dram_latency: 100
+# dram_req_size_byte: 32   # optional, default 32
+# dram_freq_mhz: <MHz>     # defaults to core_freq_mhz if omitted
+# Optional bandwidth cap (set only one of the two):
+# dram_bandwidth_gbps_per_channel: ...
+# dram_bandwidth_gbps_total: ...
+# If either bandwidth key is set, dram_freq_mhz is required.
+
+# Optional: NUMA-style DRAM partitions (channels must divide evenly)
+# dram_num_partitions: 2
+
+# --- Interconnect (TOGSim) ---
+icnt_type: simple              # simple | booksim2
+icnt_latency_cycles: 10        # used when icnt_type is simple
+icnt_freq_mhz: 940
+icnt_injection_ports_per_core: 16
+# icnt_stats_print_period_cycles: 0   # optional
+# For icnt_type: booksim2, use booksim_config_path (not icnt_config_path):
+# booksim_config_path: ../configs/booksim2_configs/fly_c16_m16.icnt
+
+# --- Functional / timing flags (compiler; same YAML) ---
+pytorchsim_functional_mode: 1  # 1 = run Spike validation, 0 = skip for faster runs
+pytorchsim_timing_mode: 1
+
+# --- Compiler mapping / optimizations (same YAML) ---
+codegen_mapping_strategy: heuristic   # heuristic | autotune | external-then-heuristic | external-then-autotune
+codegen_external_mapping_file: ''
+codegen_autotune_max_retry: 10
+codegen_autotune_template_topk: 4
+codegen_compiler_optimization: all    # all | none | list of option names
+
+# --- Optional L2 (TOGSim) ---
+# l2d_type: nocache            # default if omitted
+# l2d_type: datacache
+# l2d_config: "S:64:128:512,32,..."   # required when l2d_type is datacache (AccelSim-style string)
+
+# --- Optional scheduler / partitions (TOGSim; multi-queue) ---
+# scheduler: simple
+# num_partition: 2
+# partition:
+#   core_0: 0
+#   core_1: 1
+```
+
+### Key fields (quick reference)
+
+One-line meaning for each group (details in the YAML block above).
+
+- **Core (`num_cores`, `core_freq_mhz`, `core_stats_print_period_cycles`, `num_systolic_array_per_core`, optional `core_type`, STONNE keys)**: how many cores, their clock, stats cadence, systolic count per core, and optional non-default mesh vs STONNE mix.
+- **VPU (`vpu_*`)**: vector lane count, per-lane scratchpad (KB), and vector register width—**compiler** uses these for tiling/codegen.
+- **DRAM (`dram_type`, `dram_channels`, …)**: `ramulator2` uses `ramulator_config_path`; `simple` uses fixed latency and optional bandwidth caps (`dram_bandwidth_gbps_*`, `dram_freq_mhz` when capped). `dram_num_partitions` splits channels for NUMA-style addressing.
+- **Interconnect (`icnt_*`, `booksim_config_path`)**: `simple` adds fixed hop latency (`icnt_latency_cycles`); `booksim2` points at a BookSim2 topology file.
+- **Codegen (`codegen_*`)**: mapping strategy (heuristic / autotune / external-hybrid), external JSON path, autotune search limits, and fusion/optimization set for the PyTorch compiler path.
+- **L2 (`l2d_type`, `l2d_config`, optional `l2d_hit_latency`)**: optional data cache between cores and DRAM; `l2d_config` uses AccelSim-style cache geometry strings.
+- **Scheduler (`scheduler`, `num_partition`, `partition`)**: request queues per partition and `core_i` → queue index mapping for multi-tenant / `launch_model` routing.
+- **`pytorchsim_functional_mode`**: **`1`** runs **Spike** on generated code; **`0`** skips it for faster iteration.
+- **`pytorchsim_timing_mode`**: **`1`** keeps the cycle-aware tile-graph path that feeds **TOGSim**; **`0`** turns that timing path off (functional-style runs; often paired with `pytorchsim_functional_mode` in tutorial configs).
+
 ## Future Works
-Currently, PyTorchSim supports PyTorch 2.2. Support for newer versions will be added soon.
+We plan to broaden **model coverage** (more architectures and workloads), improve **dynamic-shape** support in the compiler and simulator path, and extend **eager-mode** integration so a wider range of PyTorch programs can be exercised without relying solely on `torch.compile`-style flows.
 
 ## Artifact Evaluation
 Artifact evaluation is being prepared for v1.0.0.

From 69ce680472883756eb0a39f761b9b8189ecc3d52 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Apr 2026 19:17:22 +0900
Subject: [PATCH 183/194] [Config] Tighten Ramulator2 config output and log raw
 on-disk file - gen_configs: use JSONEncoder to emit more compact JSON
 (regenerated yaml files) - Simulator: read Ramulator2 config with ifstream
 and log text instead of YAML::Dump

---
 TOGSim/src/Simulator.cc                    |  17 +-
 configs/ramulator2_configs/DDR4.yaml       | 436 ++----------------
 configs/ramulator2_configs/HBM2.yaml       | 491 ++------------------
 configs/ramulator2_configs/HBM2_TPUv2.yaml | 491 ++------------------
 configs/ramulator2_configs/HBM2_TPUv3.yaml | 491 ++------------------
 configs/ramulator2_configs/LPDDR5.yaml     | 507 ++-------------------
 configs/ramulator2_configs/LPDDR5X.yaml    | 507 ++-------------------
 configs/ramulator2_configs/gen_configs.py  |  15 +-
 8 files changed, 206 insertions(+), 2749 deletions(-)

diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index eb3b8670..d987d787 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -1,5 +1,9 @@
 #include "Simulator.h"
 
+#include <fstream>
+#include <sstream>
+#include <string>
+
 Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
     : _config(config),
       _hardware_config_yaml(std::move(hardware_config_yaml)),
@@ -45,8 +49,17 @@ Simulator::Simulator(SimulationConfig config, YAML::Node hardware_config_yaml)
                                        .append(config.dram_config_path)
                                        .string();
     spdlog::info("[Config/DRAM] Ramulator2 config path: {}", ramulator_config);
-    YAML::Node dram_config = YAML::LoadFile(ramulator_config);
-    spdlog::info("[Config/DRAM] Ramulator2 configuration:\n{}", YAML::Dump(dram_config));
+    {
+      std::ifstream in(ramulator_config);
+      if (!in) {
+        spdlog::warn("[Config/DRAM] Could not open Ramulator2 config: {}", ramulator_config);
+      } else {
+        std::ostringstream ss;
+        ss << in.rdbuf();
+        const std::string raw = ss.str();
+        spdlog::info("[Config/DRAM] Ramulator2 configuration :\n{}", raw);
+      }
+    }
     config.dram_config_path = ramulator_config;
     _dram = std::make_unique<DramRamulator2>(config, &_core_cycles);
   } else {
diff --git a/configs/ramulator2_configs/DDR4.yaml b/configs/ramulator2_configs/DDR4.yaml
index c4b16617..6e8bc4ba 100644
--- a/configs/ramulator2_configs/DDR4.yaml
+++ b/configs/ramulator2_configs/DDR4.yaml
@@ -9,413 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
-        "impl": "GenericDDR",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 32,
-        "write_buffer_size": 32,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+    "controllers": [{
+      "impl": "GenericDDR",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 32,
+      "write_buffer_size": 32,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "Channel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
+        "impl": "DDR4",
+        "org": {
+          "dq": 8,
+          "count": [1, 1, 4, 4, 65536, 1024]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "Channel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "DDR4",
-          "org": {
-            "dq": 8,
-            "count": [
-              1,
-              1,
-              4,
-              4,
-              65536,
-              1024
-            ]
-          },
-          "timing": [
-            3200,
-            4,
-            22,
-            22,
-            22,
-            52,
-            74,
-            24,
-            12,
-            16,
-            4,
-            8,
-            4,
-            8,
-            4,
-            12,
-            34,
-            576,
-            12480,
-            2,
-            625
-          ],
-          "channel_width": 64,
-          "read_latency": 26,
-          "timing_constraints": [
-            [
-              0,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              4
-            ],
-            [
-              0,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                4,
-                6
-              ],
-              12
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              24
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                4,
-                5,
-                6
-              ],
-              6,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              0,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                2
-              ],
-              12
-            ],
-            [
-              1,
-              [
-                4
-              ],
-              [
-                2
-              ],
-              44
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              34,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              52
-            ],
-            [
-              1,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              22
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                7
-              ],
-              74
-            ],
-            [
-              1,
-              [
-                1,
-                2
-              ],
-              [
-                7
-              ],
-              22
-            ],
-            [
-              1,
-              [
-                5
-              ],
-              [
-                7
-              ],
-              34
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                7
-              ],
-              66
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                0,
-                2
-              ],
-              576
-            ],
-            [
-              2,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              8
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              8
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              32
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              8
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              74
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                3,
-                4,
-                5,
-                6
-              ],
-              22
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                1
-              ],
-              52
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                0
-              ],
-              22
-            ],
-            [
-              3,
-              [
-                3
-              ],
-              [
-                1
-              ],
-              12
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                1
-              ],
-              44
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                0
-              ],
-              34
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                0
-              ],
-              66
-            ]
-          ]
-        }
+        "timing": [3200, 4, 22, 22, 22, 52, 74, 24, 12, 16, 4, 8, 4, 8, 4, 12, 34, 576, 12480, 2, 625],
+        "channel_width": 64,
+        "read_latency": 26,
+        "timing_constraints": [[0, [3, 5], [3, 5], 4], [0, [4, 6], [4, 6], 4], [1, [3, 5], [3, 5], 4], [1, [4, 6], [4, 6], 4], [1, [3, 5], [4, 6], 12], [1, [4, 6], [3, 5], 24], [1, [3, 5], [3, 4, 5, 6], 6, 1, true], [1, [4, 6], [3, 5], 0, 1, true], [1, [3], [2], 12], [1, [4], [2], 44], [1, [0], [0], 4], [1, [0], [0], 34, 4], [1, [0], [2], 52], [1, [2], [0], 22], [1, [0], [7], 74], [1, [1, 2], [7], 22], [1, [5], [7], 34], [1, [6], [7], 66], [1, [7], [0, 2], 576], [2, [3, 5], [3, 5], 8], [2, [4, 6], [4, 6], 8], [2, [4, 6], [3, 5], 32], [2, [0], [0], 8], [3, [0], [0], 74], [3, [0], [3, 4, 5, 6], 22], [3, [0], [1], 52], [3, [1], [0], 22], [3, [3], [1], 12], [3, [4], [1], 44], [3, [5], [0], 34], [3, [6], [0], 66]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml
index 3dda8abf..3c14bedb 100644
--- a/configs/ramulator2_configs/HBM2.yaml
+++ b/configs/ramulator2_configs/HBM2.yaml
@@ -9,468 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
-        "impl": "HBM",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 32,
-        "write_buffer_size": 32,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+    "controllers": [{
+      "impl": "HBM",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 32,
+      "write_buffer_size": 32,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "PseudoChannel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
+        "impl": "HBM2",
+        "org": {
+          "dq": 64,
+          "count": [1, 2, 4, 4, 65536, 32]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "PseudoChannel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "HBM2",
-          "org": {
-            "dq": 64,
-            "count": [
-              1,
-              2,
-              4,
-              4,
-              65536,
-              32
-            ]
-          },
-          "timing": [
-            2000,
-            2,
-            14,
-            14,
-            12,
-            14,
-            34,
-            48,
-            16,
-            5,
-            5,
-            2,
-            4,
-            4,
-            4,
-            6,
-            8,
-            15,
-            350,
-            160,
-            8,
-            3900,
-            122,
-            1000
-          ],
-          "channel_width": 64,
-          "read_latency": 16,
-          "timing_constraints": [
-            [
-              0,
-              [
-                0
-              ],
-              [
-                0,
-                1,
-                2,
-                7,
-                8
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                4,
-                6
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                2
-              ],
-              5
-            ],
-            [
-              1,
-              [
-                4
-              ],
-              [
-                2
-              ],
-              23
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              15,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              35
-            ],
-            [
-              1,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                7
-              ],
-              49
-            ],
-            [
-              1,
-              [
-                1,
-                2
-              ],
-              [
-                7
-              ],
-              14
-            ],
-            [
-              1,
-              [
-                5
-              ],
-              [
-                7
-              ],
-              19
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                7
-              ],
-              37
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                0
-              ],
-              349
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                2
-              ],
-              350
-            ],
-            [
-              1,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              7
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              5
-            ],
-            [
-              2,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              15
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              48
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                3,
-                5
-              ],
-              15
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                4,
-                6
-              ],
-              13
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                1
-              ],
-              35
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                0
-              ],
-              13
-            ],
-            [
-              3,
-              [
-                3
-              ],
-              [
-                1
-              ],
-              5
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                1
-              ],
-              23
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                0
-              ],
-              18
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                0
-              ],
-              36
-            ],
-            [
-              3,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              159
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              49
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                8
-              ],
-              14
-            ]
-          ]
-        }
+        "timing": [2000, 2, 14, 14, 12, 14, 34, 48, 16, 5, 5, 2, 4, 4, 4, 6, 8, 15, 350, 160, 8, 3900, 122, 1000],
+        "channel_width": 64,
+        "read_latency": 16,
+        "timing_constraints": [[0, [0], [0, 1, 2, 7, 8], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [4, 6], 13], [1, [4, 6], [3, 5], 13], [1, [3], [2], 5], [1, [4], [2], 23], [1, [0], [0], 4], [1, [0], [0], 15, 4], [1, [0], [2], 35], [1, [2], [0], 13], [1, [0], [7], 49], [1, [1, 2], [7], 14], [1, [5], [7], 19], [1, [6], [7], 37], [1, [7], [0], 349], [1, [7], [2], 350], [1, [8], [0], 7], [1, [0], [8], 5], [2, [3, 5], [3, 5], 4], [2, [4, 6], [4, 6], 4], [2, [4, 6], [3, 5], 15], [2, [0], [0], 4], [3, [0], [0], 48], [3, [0], [3, 5], 15], [3, [0], [4, 6], 13], [3, [0], [1], 35], [3, [1], [0], 13], [3, [3], [1], 5], [3, [4], [1], 23], [3, [5], [0], 18], [3, [6], [0], 36], [3, [8], [0], 159], [3, [0], [8], 49], [3, [1], [8], 14]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2_TPUv2.yaml b/configs/ramulator2_configs/HBM2_TPUv2.yaml
index 88c1adf3..0d7313fb 100644
--- a/configs/ramulator2_configs/HBM2_TPUv2.yaml
+++ b/configs/ramulator2_configs/HBM2_TPUv2.yaml
@@ -9,468 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
-        "impl": "HBM",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 64,
-        "write_buffer_size": 64,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+    "controllers": [{
+      "impl": "HBM",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 64,
+      "write_buffer_size": 64,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "PseudoChannel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
+        "impl": "HBM2",
+        "org": {
+          "dq": 64,
+          "count": [1, 2, 4, 4, 65536, 32]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "PseudoChannel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "HBM2",
-          "org": {
-            "dq": 64,
-            "count": [
-              1,
-              2,
-              4,
-              4,
-              65536,
-              32
-            ]
-          },
-          "timing": [
-            1400,
-            2,
-            9,
-            9,
-            7,
-            9,
-            22,
-            31,
-            11,
-            4,
-            4,
-            2,
-            4,
-            4,
-            4,
-            5,
-            6,
-            11,
-            245,
-            112,
-            6,
-            2730,
-            86,
-            1429
-          ],
-          "channel_width": 64,
-          "read_latency": 11,
-          "timing_constraints": [
-            [
-              0,
-              [
-                0
-              ],
-              [
-                0,
-                1,
-                2,
-                7,
-                8
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                4,
-                6
-              ],
-              9
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              11
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                2
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                4
-              ],
-              [
-                2
-              ],
-              17
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              11,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              23
-            ],
-            [
-              1,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              8
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                7
-              ],
-              32
-            ],
-            [
-              1,
-              [
-                1,
-                2
-              ],
-              [
-                7
-              ],
-              9
-            ],
-            [
-              1,
-              [
-                5
-              ],
-              [
-                7
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                7
-              ],
-              26
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                0
-              ],
-              244
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                2
-              ],
-              245
-            ],
-            [
-              1,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              5
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              5
-            ],
-            [
-              2,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              12
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              31
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                3,
-                5
-              ],
-              10
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                4,
-                6
-              ],
-              8
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                1
-              ],
-              23
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                0
-              ],
-              8
-            ],
-            [
-              3,
-              [
-                3
-              ],
-              [
-                1
-              ],
-              4
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                1
-              ],
-              17
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                0
-              ],
-              12
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                0
-              ],
-              25
-            ],
-            [
-              3,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              111
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              32
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                8
-              ],
-              9
-            ]
-          ]
-        }
+        "timing": [1400, 2, 9, 9, 7, 9, 22, 31, 11, 4, 4, 2, 4, 4, 4, 5, 6, 11, 245, 112, 6, 2730, 86, 1429],
+        "channel_width": 64,
+        "read_latency": 11,
+        "timing_constraints": [[0, [0], [0, 1, 2, 7, 8], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [4, 6], 9], [1, [4, 6], [3, 5], 11], [1, [3], [2], 4], [1, [4], [2], 17], [1, [0], [0], 4], [1, [0], [0], 11, 4], [1, [0], [2], 23], [1, [2], [0], 8], [1, [0], [7], 32], [1, [1, 2], [7], 9], [1, [5], [7], 13], [1, [6], [7], 26], [1, [7], [0], 244], [1, [7], [2], 245], [1, [8], [0], 5], [1, [0], [8], 5], [2, [3, 5], [3, 5], 4], [2, [4, 6], [4, 6], 4], [2, [4, 6], [3, 5], 12], [2, [0], [0], 4], [3, [0], [0], 31], [3, [0], [3, 5], 10], [3, [0], [4, 6], 8], [3, [0], [1], 23], [3, [1], [0], 8], [3, [3], [1], 4], [3, [4], [1], 17], [3, [5], [0], 12], [3, [6], [0], 25], [3, [8], [0], 111], [3, [0], [8], 32], [3, [1], [8], 9]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/HBM2_TPUv3.yaml b/configs/ramulator2_configs/HBM2_TPUv3.yaml
index 50a3ea3b..9f5e66ff 100644
--- a/configs/ramulator2_configs/HBM2_TPUv3.yaml
+++ b/configs/ramulator2_configs/HBM2_TPUv3.yaml
@@ -9,468 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
-        "impl": "HBM",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 64,
-        "write_buffer_size": 64,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+    "controllers": [{
+      "impl": "HBM",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 64,
+      "write_buffer_size": 64,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "PseudoChannel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
+        "impl": "HBM2",
+        "org": {
+          "dq": 64,
+          "count": [1, 2, 4, 4, 65536, 32]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "PseudoChannel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "HBM2",
-          "org": {
-            "dq": 64,
-            "count": [
-              1,
-              2,
-              4,
-              4,
-              65536,
-              32
-            ]
-          },
-          "timing": [
-            1880,
-            2,
-            13,
-            13,
-            11,
-            13,
-            31,
-            44,
-            15,
-            5,
-            5,
-            2,
-            4,
-            4,
-            4,
-            6,
-            8,
-            15,
-            329,
-            151,
-            8,
-            3666,
-            115,
-            1064
-          ],
-          "channel_width": 64,
-          "read_latency": 15,
-          "timing_constraints": [
-            [
-              0,
-              [
-                0
-              ],
-              [
-                0,
-                1,
-                2,
-                7,
-                8
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                3,
-                5
-              ],
-              [
-                4,
-                6
-              ],
-              12
-            ],
-            [
-              1,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                2
-              ],
-              5
-            ],
-            [
-              1,
-              [
-                4
-              ],
-              [
-                2
-              ],
-              22
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              15,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              32
-            ],
-            [
-              1,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              12
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                7
-              ],
-              45
-            ],
-            [
-              1,
-              [
-                1,
-                2
-              ],
-              [
-                7
-              ],
-              13
-            ],
-            [
-              1,
-              [
-                5
-              ],
-              [
-                7
-              ],
-              18
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                7
-              ],
-              35
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                0
-              ],
-              328
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                2
-              ],
-              329
-            ],
-            [
-              1,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              7
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              5
-            ],
-            [
-              2,
-              [
-                3,
-                5
-              ],
-              [
-                3,
-                5
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                4,
-                6
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                4,
-                6
-              ],
-              [
-                3,
-                5
-              ],
-              15
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              44
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                3,
-                5
-              ],
-              14
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                4,
-                6
-              ],
-              12
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                1
-              ],
-              32
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                0
-              ],
-              12
-            ],
-            [
-              3,
-              [
-                3
-              ],
-              [
-                1
-              ],
-              5
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                1
-              ],
-              22
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                0
-              ],
-              17
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                0
-              ],
-              34
-            ],
-            [
-              3,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              150
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                8
-              ],
-              45
-            ],
-            [
-              3,
-              [
-                1
-              ],
-              [
-                8
-              ],
-              13
-            ]
-          ]
-        }
+        "timing": [1880, 2, 13, 13, 11, 13, 31, 44, 15, 5, 5, 2, 4, 4, 4, 6, 8, 15, 329, 151, 8, 3666, 115, 1064],
+        "channel_width": 64,
+        "read_latency": 15,
+        "timing_constraints": [[0, [0], [0, 1, 2, 7, 8], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [4, 6], 12], [1, [4, 6], [3, 5], 13], [1, [3], [2], 5], [1, [4], [2], 22], [1, [0], [0], 4], [1, [0], [0], 15, 4], [1, [0], [2], 32], [1, [2], [0], 12], [1, [0], [7], 45], [1, [1, 2], [7], 13], [1, [5], [7], 18], [1, [6], [7], 35], [1, [7], [0], 328], [1, [7], [2], 329], [1, [8], [0], 7], [1, [0], [8], 5], [2, [3, 5], [3, 5], 4], [2, [4, 6], [4, 6], 4], [2, [4, 6], [3, 5], 15], [2, [0], [0], 4], [3, [0], [0], 44], [3, [0], [3, 5], 14], [3, [0], [4, 6], 12], [3, [0], [1], 32], [3, [1], [0], 12], [3, [3], [1], 5], [3, [4], [1], 22], [3, [5], [0], 17], [3, [6], [0], 34], [3, [8], [0], 150], [3, [0], [8], 45], [3, [1], [8], 13]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/LPDDR5.yaml b/configs/ramulator2_configs/LPDDR5.yaml
index cbb08b5e..13c43738 100644
--- a/configs/ramulator2_configs/LPDDR5.yaml
+++ b/configs/ramulator2_configs/LPDDR5.yaml
@@ -9,486 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
+    "controllers": [{
+      "impl": "LPDDR5",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 32,
+      "write_buffer_size": 32,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "Channel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
         "impl": "LPDDR5",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 32,
-        "write_buffer_size": 32,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+        "org": {
+          "dq": 16,
+          "count": [1, 1, 4, 4, 32768, 1024]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "Channel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "LPDDR5",
-          "org": {
-            "dq": 16,
-            "count": [
-              1,
-              1,
-              4,
-              4,
-              32768,
-              1024
-            ]
-          },
-          "timing": [
-            6400,
-            2,
-            17,
-            15,
-            15,
-            17,
-            34,
-            49,
-            28,
-            8,
-            9,
-            2,
-            2,
-            4,
-            2,
-            4,
-            4,
-            4,
-            5,
-            10,
-            16,
-            168,
-            96,
-            3125,
-            391,
-            1,
-            0,
-            8,
-            2,
-            1250
-          ],
-          "channel_width": 16,
-          "read_latency": 19,
-          "timing_constraints": [
-            [
-              0,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              2
-            ],
-            [
-              0,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              2
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                6,
-                8
-              ],
-              0
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                7,
-                9
-              ],
-              0
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                7,
-                9
-              ],
-              12
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              16
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                7,
-                8,
-                9
-              ],
-              4,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              12,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                3
-              ],
-              8
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                3
-              ],
-              39
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              16,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                3
-              ],
-              34
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                0
-              ],
-              17
-            ],
-            [
-              1,
-              [
-                2,
-                3
-              ],
-              [
-                2,
-                3
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                10
-              ],
-              49
-            ],
-            [
-              1,
-              [
-                2,
-                3
-              ],
-              [
-                10
-              ],
-              15
-            ],
-            [
-              1,
-              [
-                8
-              ],
-              [
-                10
-              ],
-              23
-            ],
-            [
-              1,
-              [
-                9
-              ],
-              [
-                10
-              ],
-              54
-            ],
-            [
-              1,
-              [
-                10
-              ],
-              [
-                0,
-                3
-              ],
-              168
-            ],
-            [
-              2,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              21
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              4
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              49
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                6,
-                7,
-                8,
-                9
-              ],
-              15
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              34
-            ],
-            [
-              3,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              15
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                2
-              ],
-              8
-            ],
-            [
-              3,
-              [
-                7
-              ],
-              [
-                2
-              ],
-              39
-            ],
-            [
-              3,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              23
-            ],
-            [
-              3,
-              [
-                9
-              ],
-              [
-                0
-              ],
-              54
-            ],
-            [
-              3,
-              [
-                11
-              ],
-              [
-                0
-              ],
-              96
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                11
-              ],
-              49
-            ],
-            [
-              3,
-              [
-                2
-              ],
-              [
-                11
-              ],
-              15
-            ]
-          ]
-        }
+        "timing": [6400, 2, 17, 15, 15, 17, 34, 49, 28, 8, 9, 2, 2, 4, 2, 4, 4, 4, 5, 10, 16, 168, 96, 3125, 391, 1, 0, 8, 2, 1250],
+        "channel_width": 16,
+        "read_latency": 19,
+        "timing_constraints": [[0, [6, 8], [6, 8], 2], [0, [7, 9], [7, 9], 2], [3, [4], [6, 8], 0], [3, [5], [7, 9], 0], [1, [6, 8], [6, 8], 2], [1, [7, 9], [7, 9], 2], [1, [6, 8], [7, 9], 12], [1, [7, 9], [6, 8], 16], [1, [6, 8], [6, 7, 8, 9], 4, 1, true], [1, [7, 9], [6, 8], 12, 1, true], [1, [6], [3], 8], [1, [7], [3], 39], [1, [0], [0], 4], [1, [0], [0], 16, 4], [1, [0], [3], 34], [1, [3], [0], 17], [1, [2, 3], [2, 3], 2], [1, [0], [10], 49], [1, [2, 3], [10], 15], [1, [8], [10], 23], [1, [9], [10], 54], [1, [10], [0, 3], 168], [2, [6, 8], [6, 8], 4], [2, [7, 9], [7, 9], 4], [2, [7, 9], [6, 8], 21], [2, [0], [0], 4], [3, [0], [0], 49], [3, [0], [6, 7, 8, 9], 15], [3, [0], [2], 34], [3, [2], [0], 15], [3, [6], [2], 8], [3, [7], [2], 39], [3, [8], [0], 23], [3, [9], [0], 54], [3, [11], [0], 96], [3, [0], [11], 49], [3, [2], [11], 15]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/LPDDR5X.yaml b/configs/ramulator2_configs/LPDDR5X.yaml
index a8f454c4..d4b0a4b4 100644
--- a/configs/ramulator2_configs/LPDDR5X.yaml
+++ b/configs/ramulator2_configs/LPDDR5X.yaml
@@ -9,486 +9,37 @@
     "channel_mapper": {
       "impl": "PassThroughChannelMapper"
     },
-    "controllers": [
-      {
+    "controllers": [{
+      "impl": "LPDDR5",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 32,
+      "write_buffer_size": 32,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "Channel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
         "impl": "LPDDR5",
-        "wr_low_watermark": 0.2,
-        "wr_high_watermark": 0.8,
-        "read_buffer_size": 32,
-        "write_buffer_size": 32,
-        "priority_buffer_size": 1568,
-        "scheduler": {
-          "impl": "FRFCFS"
+        "org": {
+          "dq": 16,
+          "count": [1, 1, 4, 4, 32768, 1024]
         },
-        "refresh_manager": {
-          "impl": "AllBank",
-          "scope": "Channel"
-        },
-        "row_policy": {
-          "impl": "Open"
-        },
-        "addr_mapper": {
-          "impl": "RoBaRaCoCh"
-        },
-        "dram": {
-          "impl": "LPDDR5",
-          "org": {
-            "dq": 16,
-            "count": [
-              1,
-              1,
-              4,
-              4,
-              32768,
-              1024
-            ]
-          },
-          "timing": [
-            8533,
-            2,
-            23,
-            20,
-            20,
-            23,
-            46,
-            65,
-            37,
-            11,
-            12,
-            2,
-            2,
-            4,
-            2,
-            4,
-            6,
-            6,
-            7,
-            13,
-            22,
-            224,
-            128,
-            4165,
-            521,
-            1,
-            0,
-            8,
-            2,
-            938
-          ],
-          "channel_width": 16,
-          "read_latency": 25,
-          "timing_constraints": [
-            [
-              0,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              2
-            ],
-            [
-              0,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              2
-            ],
-            [
-              3,
-              [
-                4
-              ],
-              [
-                6,
-                8
-              ],
-              0
-            ],
-            [
-              3,
-              [
-                5
-              ],
-              [
-                7,
-                9
-              ],
-              0
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                7,
-                9
-              ],
-              15
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              21
-            ],
-            [
-              1,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                7,
-                8,
-                9
-              ],
-              4,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              15,
-              1,
-              true
-            ],
-            [
-              1,
-              [
-                6
-              ],
-              [
-                3
-              ],
-              11
-            ],
-            [
-              1,
-              [
-                7
-              ],
-              [
-                3
-              ],
-              51
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              6
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              22,
-              4
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                3
-              ],
-              46
-            ],
-            [
-              1,
-              [
-                3
-              ],
-              [
-                0
-              ],
-              23
-            ],
-            [
-              1,
-              [
-                2,
-                3
-              ],
-              [
-                2,
-                3
-              ],
-              2
-            ],
-            [
-              1,
-              [
-                0
-              ],
-              [
-                10
-              ],
-              65
-            ],
-            [
-              1,
-              [
-                2,
-                3
-              ],
-              [
-                10
-              ],
-              20
-            ],
-            [
-              1,
-              [
-                8
-              ],
-              [
-                10
-              ],
-              31
-            ],
-            [
-              1,
-              [
-                9
-              ],
-              [
-                10
-              ],
-              71
-            ],
-            [
-              1,
-              [
-                10
-              ],
-              [
-                0,
-                3
-              ],
-              224
-            ],
-            [
-              2,
-              [
-                6,
-                8
-              ],
-              [
-                6,
-                8
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                7,
-                9
-              ],
-              [
-                7,
-                9
-              ],
-              4
-            ],
-            [
-              2,
-              [
-                7,
-                9
-              ],
-              [
-                6,
-                8
-              ],
-              27
-            ],
-            [
-              2,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              6
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                0
-              ],
-              65
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                6,
-                7,
-                8,
-                9
-              ],
-              20
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                2
-              ],
-              46
-            ],
-            [
-              3,
-              [
-                2
-              ],
-              [
-                0
-              ],
-              20
-            ],
-            [
-              3,
-              [
-                6
-              ],
-              [
-                2
-              ],
-              11
-            ],
-            [
-              3,
-              [
-                7
-              ],
-              [
-                2
-              ],
-              51
-            ],
-            [
-              3,
-              [
-                8
-              ],
-              [
-                0
-              ],
-              31
-            ],
-            [
-              3,
-              [
-                9
-              ],
-              [
-                0
-              ],
-              71
-            ],
-            [
-              3,
-              [
-                11
-              ],
-              [
-                0
-              ],
-              128
-            ],
-            [
-              3,
-              [
-                0
-              ],
-              [
-                11
-              ],
-              65
-            ],
-            [
-              3,
-              [
-                2
-              ],
-              [
-                11
-              ],
-              20
-            ]
-          ]
-        }
+        "timing": [8533, 2, 23, 20, 20, 23, 46, 65, 37, 11, 12, 2, 2, 4, 2, 4, 6, 6, 7, 13, 22, 224, 128, 4165, 521, 1, 0, 8, 2, 938],
+        "channel_width": 16,
+        "read_latency": 25,
+        "timing_constraints": [[0, [6, 8], [6, 8], 2], [0, [7, 9], [7, 9], 2], [3, [4], [6, 8], 0], [3, [5], [7, 9], 0], [1, [6, 8], [6, 8], 2], [1, [7, 9], [7, 9], 2], [1, [6, 8], [7, 9], 15], [1, [7, 9], [6, 8], 21], [1, [6, 8], [6, 7, 8, 9], 4, 1, true], [1, [7, 9], [6, 8], 15, 1, true], [1, [6], [3], 11], [1, [7], [3], 51], [1, [0], [0], 6], [1, [0], [0], 22, 4], [1, [0], [3], 46], [1, [3], [0], 23], [1, [2, 3], [2, 3], 2], [1, [0], [10], 65], [1, [2, 3], [10], 20], [1, [8], [10], 31], [1, [9], [10], 71], [1, [10], [0, 3], 224], [2, [6, 8], [6, 8], 4], [2, [7, 9], [7, 9], 4], [2, [7, 9], [6, 8], 27], [2, [0], [0], 6], [3, [0], [0], 65], [3, [0], [6, 7, 8, 9], 20], [3, [0], [2], 46], [3, [2], [0], 20], [3, [6], [2], 11], [3, [7], [2], 51], [3, [8], [0], 31], [3, [9], [0], 71], [3, [11], [0], 128], [3, [0], [11], 65], [3, [2], [11], 20]]
       }
-    ]
+    }]
   }
 }
\ No newline at end of file
diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py
index 1c630e5c..32d6a0bd 100644
--- a/configs/ramulator2_configs/gen_configs.py
+++ b/configs/ramulator2_configs/gen_configs.py
@@ -116,6 +116,18 @@ def gen_lpddr5x():
     "LPDDR5X.yaml":     gen_lpddr5x,
 }
 
+class CompactJSONEncoder(json.JSONEncoder):
+    def encode(self, o, level=0):
+        indent = '  ' * level
+        if isinstance(o, list):
+            return '[' + ', '.join(self.encode(i, level) for i in o) + ']'
+        if isinstance(o, dict):
+            items = ',\n'.join(
+                f'{indent}  {json.dumps(k)}: {self.encode(v, level + 1)}'
+                for k, v in o.items()
+            )
+            return '{\n' + items + '\n' + indent + '}'
+        return super().encode(o)
 
 if __name__ == "__main__":
     out_dir = os.path.dirname(os.path.abspath(__file__))
@@ -123,7 +135,6 @@ def gen_lpddr5x():
         cfg = gen_fn()
         out_path = os.path.join(out_dir, filename)
         with open(out_path, "w") as f:
-            # json is valid yaml — C++ parse_config_file reads either
-            json.dump(cfg, f, indent=2)
+            f.write(CompactJSONEncoder().encode(cfg))
         print(f"Generated {out_path}")
 

From 9bfd11b4942e1df8c02f9e199a8cccbc438058b7 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Apr 2026 19:43:27 +0900
Subject: [PATCH 184/194] [CI] run cycle + speedup in one job

---
 .github/workflows/pytorchsim_test.yml       | 35 ++++++++++++++++++---
 experiments/artifact/speedup/run_speedup.sh | 19 +++++++----
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml
index a7613b6e..4b4fab80 100644
--- a/.github/workflows/pytorchsim_test.yml
+++ b/.github/workflows/pytorchsim_test.yml
@@ -689,7 +689,7 @@ jobs:
             ${{ inputs.image_name }} python3 PyTorchSim/tests/DeepSeek/test_deepseek_v3_base.py
 
   test_accuracy:
-    name: Run test_accuracy
+    name: Run test_accuracy and test_speedup
     runs-on: self-hosted
     if: inputs.vector_lane == 128
     steps:
@@ -700,14 +700,30 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Run run_cycle.sh
+      - name: run_cycle, accuracy summary, run_speedup
+        env:
+          ART_DIR: ${{ runner.temp }}/pt_sim_cycle_and_speedup
         run: |
-          echo "Running run_cycle.sh"
+          set -e
+          set -u
+          set -o pipefail
+          mkdir -p "$ART_DIR"
           docker run --rm \
             -e vpu_num_lanes="${{ inputs.vector_lane }}" \
             -e vpu_spad_size_kb_per_lane="${{ inputs.spad_size }}" \
-            ${{ inputs.image_name }} bash -c \
-            "cd /workspace && PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh >/dev/null 2>&1 && cat PyTorchSim/experiments/artifact/cycle_validation/summary_cycle.out" > summary_cycle.out
+            -e SKIP_ILS=1 \
+            -e SPEEDUP_ITERS=2 \
+            -v "$ART_DIR:/artifacts" \
+            ${{ inputs.image_name }} bash -c 'set -eu -o pipefail; cd /workspace; export TORCHSIM_DIR="${TORCHSIM_DIR:-/workspace/PyTorchSim}"; \
+            PyTorchSim/experiments/artifact/cycle_validation/run_cycle.sh >/dev/null 2>&1; \
+            cp "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out" /artifacts/summary_cycle.out; \
+            PyTorchSim/experiments/artifact/speedup/run_speedup.sh; \
+            bash PyTorchSim/experiments/artifact/speedup/check_speedup_smoke.sh; \
+            cp "$TORCHSIM_DIR/experiments/artifact/speedup/summary_speedup.log" /artifacts/summary_speedup.log' \
+            2>&1 | tee speedup_smoke.log
+          # Same layout as the old split job (root for accuracy artifact, both logs for speedup)
+          cp -f "$ART_DIR/summary_cycle.out" summary_cycle.out
+          cp -f "$ART_DIR/summary_speedup.log" summary_speedup.log
 
       - name: Upload Accuracy Report Artifact
         uses: actions/upload-artifact@v4
@@ -715,3 +731,12 @@ jobs:
           name: accuracy-report
           path: summary_cycle.out
           if-no-files-found: error
+
+      - name: Upload speedup smoke log
+        uses: actions/upload-artifact@v4
+        with:
+          name: speedup-smoke
+          path: |
+            speedup_smoke.log
+            summary_speedup.log
+          if-no-files-found: error
diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh
index cb5ee511..a8018ffe 100755
--- a/experiments/artifact/speedup/run_speedup.sh
+++ b/experiments/artifact/speedup/run_speedup.sh
@@ -5,6 +5,9 @@ LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 CONFIG_DIR="$TORCHSIM_DIR/configs"
 EXTRACT_TRACE="$TORCHSIM_DIR/experiments/artifact/speedup/scripts/extract_trace_from_log.py"
 TRACE_CACHE_DIR="$TORCHSIM_DIR/experiments/artifact/speedup/trace_cache"
+# CI: e.g. SKIP_ILS=1, SPEEDUP_ITERS=1 (shorter, no ILS re-runs)
+: "${SKIP_ILS:=0}"
+: "${SPEEDUP_ITERS:=5}"
 mkdir -p "$TRACE_CACHE_DIR"
 
 configs=(
@@ -63,7 +66,7 @@ for log_file in "$LOG_DIR"/*.log; do
     sum_all_iters=0.0
     iter_count=0
 
-    for iter in {1..5}; do
+    for iter in $(seq 1 "${SPEEDUP_ITERS}"); do
       echo "[Iter $iter] Running simulation for workload=$workload config=$config"
       # Build command: replace --config and --models_list in base_cmd with our config and trace
       cmd=$(echo "$base_cmd" | sed -E "s|--config [^ ]+|--config $CONFIG_DIR/$config|" | sed -E "s|--models_list [^ ]+|--models_list $trace_file|")
@@ -92,10 +95,14 @@ for log_file in "$LOG_DIR"/*.log; do
   done
 done
 
-# ILS mode should be run separately
-$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
-$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
-$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
-$TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+# ILS: optional (skip in CI; slow and separate from simple-noc / booksim re-sims)
+if [[ "$SKIP_ILS" != "1" ]]; then
+  $TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_matmul.sh
+  $TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_conv.sh
+  $TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_bert.sh
+  $TORCHSIM_DIR/experiments/artifact/speedup/scripts/run_speed_ils_resnet.sh
+else
+  echo "[*] SKIP_ILS=1 — skipping ILS matmul/conv/bert/resnet."
+fi
 
 python3 $TORCHSIM_DIR/experiments/artifact/speedup/summary_speedup.py | tee "$TORCHSIM_DIR/experiments/artifact/speedup/summary_speedup.log"
\ No newline at end of file

From 300b1cd7fd4a86cbe344512da4551f38abec129d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Apr 2026 20:05:02 +0900
Subject: [PATCH 185/194] [Doc] Update docker image tag

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ac52511f..43954af7 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,8 @@ PyTorchSim **supports**:
 | Stable-diffusion v1 | 🤗 | ✅ |  |
 | Llama 2/3 | 🤗 | ✅ | `tests/Llama/` (blocks & decode-style paths) |
 | DeepSeek-V3 (base) | 🤗 | ✅ | `tests/DeepSeek/` — several ops(e.g., gate ops) are not cycle-modeled |
-| Llama-4 | 🤗 | ⏳ | Under development |
+| Llama-4 | 🤗 | ⏳ | In development |
+| Broader model support | — | ⏳ | In development |
 <!-- ## Requirements
 
 ### OS Distribution
@@ -92,7 +93,7 @@ To download the latest Docker image and set up the environment, use the followin
 
 ```bash
 # Run the Docker container
-docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:v1.0.1 bash
+docker run -it --ipc=host --name torchsim -w /workspace/PyTorchSim ghcr.io/psal-postech/torchsim-ci:v1.1.0 bash
 ```
 ### Manual Setting (Optional)
 This script builds [Gem5](https://github.com/PSAL-POSTECH/gem5.git), [LLVM](https://github.com/PSAL-POSTECH/llvm-project.git), and [Spike](https://github.com/PSAL-POSTECH/riscv-isa-sim.git) from source for advanced users.

From 73467df6209b6afc24ba34276f5a750630cb5219 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Wed, 22 Apr 2026 19:48:24 +0900
Subject: [PATCH 186/194] [Tutorial] build image with PyTorchSim at triggering
 commit

---
 .github/workflows/docker-tutorial-image.yml |  1 +
 tutorial/jupyterhub/Dockerfile.tutorial     | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/docker-tutorial-image.yml b/.github/workflows/docker-tutorial-image.yml
index e03bef22..9f3f0b59 100644
--- a/.github/workflows/docker-tutorial-image.yml
+++ b/.github/workflows/docker-tutorial-image.yml
@@ -39,4 +39,5 @@ jobs:
           push: true
           build-args: |
             PYTORCH_IMAGE=${{ env.PYTORCH_IMAGE }}
+            PYTORCHSIM_GIT_REF=${{ github.sha }}
           tags: ghcr.io/psal-postech/torchsim-tutorial:ispass2026
diff --git a/tutorial/jupyterhub/Dockerfile.tutorial b/tutorial/jupyterhub/Dockerfile.tutorial
index 6cb6d7d2..f0a355a2 100644
--- a/tutorial/jupyterhub/Dockerfile.tutorial
+++ b/tutorial/jupyterhub/Dockerfile.tutorial
@@ -29,6 +29,9 @@
 ARG PYTORCH_IMAGE=pytorch/pytorch:2.8.0-cuda12.6-cudnn9-devel
 FROM ${PYTORCH_IMAGE}
 
+# CI passes ``github.sha``; local builds can use a branch (default) or a tag/SHA.
+ARG PYTORCHSIM_GIT_REF=ispass2026
+
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /workspace
 
@@ -91,11 +94,13 @@ RUN git clone https://github.com/riscv-software-src/riscv-pk.git \
     && ../configure --prefix="$RISCV" --host=riscv64-unknown-elf \
     && make -j"$(nproc)" && make install
 
-# PyTorchSim + TOGSim
+# PyTorchSim + TOGSim (CI checks out the triggering commit; use PYTORCHSIM_GIT_REF for local builds)
 ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
-RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git --branch ispass2026 \
-    && cd PyTorchSim/TOGSim \
+RUN git clone https://github.com/PSAL-POSTECH/PyTorchSim.git \
+    && cd PyTorchSim \
+    && git checkout "${PYTORCHSIM_GIT_REF}" \
     && git submodule update --recursive --init \
+    && cd TOGSim \
     && mkdir -p build && cd build \
     && conan install .. --build=missing \
     && cmake -G Ninja .. \

From edf8b4486c3ca85a7ab7d2837888e692033b456f Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 23 Apr 2026 13:36:41 +0900
Subject: [PATCH 187/194] [TOGSim] Fix booksim config path

---
 TOGSim/src/Common.cc       | 5 ++++-
 TOGSim/src/Interconnect.cc | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index ccb30760..3f84d885 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -139,7 +139,10 @@ SimulationConfig initialize_config(const YAML::Node& config,
       parsed_config.icnt_latency = config["icnt_latency_cycles"].as<uint32_t>();
   } else if (icnt_type_str == "booksim2") {
     parsed_config.icnt_type = IcntType::BOOKSIM2;
-    parsed_config.icnt_config_path = get_config_value<std::string>(config, "booksim_config_path");
+    const std::string booksim_config_rel =
+        get_config_value<std::string>(config, "booksim_config_path");
+    parsed_config.icnt_config_path =
+        parsed_config.resolve_against_simulation_config(booksim_config_rel);
   } else
     throw std::runtime_error(fmt::format("Not implemented icnt type {} ", icnt_type_str));
 
diff --git a/TOGSim/src/Interconnect.cc b/TOGSim/src/Interconnect.cc
index 096efe3d..dc699d7b 100644
--- a/TOGSim/src/Interconnect.cc
+++ b/TOGSim/src/Interconnect.cc
@@ -78,11 +78,13 @@ Booksim2Interconnect::Booksim2Interconnect(SimulationConfig config) {
   _config = config;
   _n_nodes = config.num_cores * _config.icnt_injection_ports_per_core + config.dram_channels;
   spdlog::info("Initialize Booksim2");
+  /* Same base as Simulator.cc DRAM: TORCHSIM_DIR + configs + path; absolute icnt_config_path replaces prefix. */
   char* onnxim_path_env = std::getenv("TORCHSIM_DIR");
-  std::string onnxim_path = onnxim_path_env != NULL?
-    std::string(onnxim_path_env) + "/TOGSim" : std::string("./");
+  std::string onnxim_path =
+      onnxim_path_env != NULL ? std::string(onnxim_path_env) : std::string("./");
 
-  _config_path = fs::path(onnxim_path).append("configs").append((std::string)config.icnt_config_path).string();
+  _config_path =
+      fs::path(onnxim_path).append("configs").append(config.icnt_config_path).string();
   spdlog::info("Booksim 2 config path : {}", _config_path);
   print_config(_config_path);
   _booksim = std::make_unique<booksim2::Interconnect>(_config_path, _n_nodes);

From a60cacc850e30de6ce4856edf460bd892af40143 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 23 Apr 2026 13:57:20 +0900
Subject: [PATCH 188/194] [Config] Fix ramulator config

---
 TOGSim/extern/ramulator2                      |  2 +-
 configs/ramulator2_configs/HBM2.yaml          |  4 +-
 configs/ramulator2_configs/HBM2_TPUv4.yaml    | 45 +++++++++++++++++++
 configs/ramulator2_configs/gen_configs.py     | 37 ++++++++++-----
 configs/stonne_single_c1_simple_noc.yml       |  2 +-
 .../systolic_ws_128x128_c1_booksim_tpuv2.yml  |  2 +-
 ...ystolic_ws_128x128_c1_simple_noc_tpuv2.yml |  2 +-
 ...ystolic_ws_128x128_c1_simple_noc_tpuv4.yml |  2 +-
 ...ystolic_ws_128x128_c2_simple_noc_tpuv2.yml |  2 +-
 ...ystolic_ws_128x128_c2_simple_noc_tpuv4.yml |  2 +-
 .../artifact/cycle_validation/run_cycle.sh    | 16 +++----
 experiments/artifact/speedup/run_speedup.sh   | 24 +++++-----
 12 files changed, 103 insertions(+), 37 deletions(-)
 create mode 100644 configs/ramulator2_configs/HBM2_TPUv4.yaml

diff --git a/TOGSim/extern/ramulator2 b/TOGSim/extern/ramulator2
index d33bf3ac..272ea843 160000
--- a/TOGSim/extern/ramulator2
+++ b/TOGSim/extern/ramulator2
@@ -1 +1 @@
-Subproject commit d33bf3ac26f3e7f838386ff7923ea6bc3ba61c31
+Subproject commit 272ea843dffdef0719efe69c68d67de0ed9194db
diff --git a/configs/ramulator2_configs/HBM2.yaml b/configs/ramulator2_configs/HBM2.yaml
index 3c14bedb..743f28b6 100644
--- a/configs/ramulator2_configs/HBM2.yaml
+++ b/configs/ramulator2_configs/HBM2.yaml
@@ -13,8 +13,8 @@
       "impl": "HBM",
       "wr_low_watermark": 0.2,
       "wr_high_watermark": 0.8,
-      "read_buffer_size": 32,
-      "write_buffer_size": 32,
+      "read_buffer_size": 64,
+      "write_buffer_size": 64,
       "priority_buffer_size": 1568,
       "scheduler": {
         "impl": "FRFCFS"
diff --git a/configs/ramulator2_configs/HBM2_TPUv4.yaml b/configs/ramulator2_configs/HBM2_TPUv4.yaml
new file mode 100644
index 00000000..aefaa9c8
--- /dev/null
+++ b/configs/ramulator2_configs/HBM2_TPUv4.yaml
@@ -0,0 +1,45 @@
+{
+  "frontend": {
+    "impl": "External",
+    "clock_ratio": 1
+  },
+  "memory_system": {
+    "impl": "GenericDRAM",
+    "clock_ratio": 1,
+    "channel_mapper": {
+      "impl": "PassThroughChannelMapper"
+    },
+    "controllers": [{
+      "impl": "HBM",
+      "wr_low_watermark": 0.2,
+      "wr_high_watermark": 0.8,
+      "read_buffer_size": 64,
+      "write_buffer_size": 64,
+      "priority_buffer_size": 1568,
+      "scheduler": {
+        "impl": "FRFCFS"
+      },
+      "refresh_manager": {
+        "impl": "AllBank",
+        "scope": "PseudoChannel"
+      },
+      "row_policy": {
+        "impl": "Open"
+      },
+      "addr_mapper": {
+        "impl": "RoBaRaCoCh"
+      },
+      "dram": {
+        "impl": "HBM2",
+        "org": {
+          "dq": 64,
+          "count": [1, 2, 4, 4, 65536, 32]
+        },
+        "timing": [2400, 2, 17, 17, 14, 17, 40, 57, 19, 6, 6, 2, 4, 5, 5, 8, 10, 19, 421, 193, 10, 4682, 147, 833],
+        "channel_width": 64,
+        "read_latency": 19,
+        "timing_constraints": [[0, [0], [0, 1, 2, 7, 8], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [3, 5], 2], [1, [4, 6], [4, 6], 2], [1, [3, 5], [4, 6], 15], [1, [4, 6], [3, 5], 16], [1, [3], [2], 6], [1, [4], [2], 27], [1, [0], [0], 5], [1, [0], [0], 19, 4], [1, [0], [2], 41], [1, [2], [0], 16], [1, [0], [7], 58], [1, [1, 2], [7], 17], [1, [5], [7], 23], [1, [6], [7], 44], [1, [7], [0], 420], [1, [7], [2], 421], [1, [8], [0], 9], [1, [0], [8], 6], [2, [3, 5], [3, 5], 4], [2, [4, 6], [4, 6], 4], [2, [4, 6], [3, 5], 18], [2, [0], [0], 5], [3, [0], [0], 57], [3, [0], [3, 5], 18], [3, [0], [4, 6], 15], [3, [0], [1], 41], [3, [1], [0], 16], [3, [3], [1], 6], [3, [4], [1], 27], [3, [5], [0], 22], [3, [6], [0], 43], [3, [8], [0], 192], [3, [0], [8], 58], [3, [1], [8], 17]]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/configs/ramulator2_configs/gen_configs.py b/configs/ramulator2_configs/gen_configs.py
index 32d6a0bd..65721f28 100644
--- a/configs/ramulator2_configs/gen_configs.py
+++ b/configs/ramulator2_configs/gen_configs.py
@@ -51,18 +51,31 @@ def make_config(dram_obj, clock_ratio=1, refresh_scope="Channel"):
     """
     dram_name = str(_dram_standard_name(dram_obj)).upper()
     if dram_name.startswith("HBM"):
-        ctrl_cls = ramulator.controller.HBM
+        ctrl = ramulator.controller.HBM(
+            dram=dram_obj,
+            scheduler=ramulator.scheduler.FRFCFS(),
+            refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
+            row_policy=ramulator.row_policy.Open(),
+            addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(),
+            read_buffer_size=64,
+            write_buffer_size=64,
+        )
     elif dram_name.startswith("LPDDR"):
-        ctrl_cls = ramulator.controller.LPDDR5
+        ctrl = ramulator.controller.LPDDR5(
+            dram=dram_obj,
+            scheduler=ramulator.scheduler.FRFCFS(),
+            refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
+            row_policy=ramulator.row_policy.Open(),
+            addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(),
+        )
     else:
-        ctrl_cls = ramulator.controller.GenericDDR
-    ctrl = ctrl_cls(
-        dram=dram_obj,
-        scheduler=ramulator.scheduler.FRFCFS(),
-        refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
-        row_policy=ramulator.row_policy.Open(),
-        addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(),
-    )
+        ctrl = ramulator.controller.GenericDDR(
+            dram=dram_obj,
+            scheduler=ramulator.scheduler.FRFCFS(),
+            refresh_manager=ramulator.refresh_manager.AllBank(scope=refresh_scope),
+            row_policy=ramulator.row_policy.Open(),
+            addr_mapper=ramulator.addr_mapper.RoBaRaCoCh(),
+        )
     ms = ramulator.memory_system.GenericDRAM(
         clock_ratio=clock_ratio,
         controllers=[ctrl],
@@ -81,6 +94,9 @@ def gen_hbm2():
     dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2000Mbps")
     return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
 
+def gen_hbm2_tpuv4():
+    dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_2400Mbps")
+    return make_config(dram, clock_ratio=1, refresh_scope="PseudoChannel")
 
 def gen_hbm2_tpuv3():
     dram = ramulator.dram.HBM2(org_preset="HBM2_8Gb", timing_preset="HBM2_1880Mbps")
@@ -109,6 +125,7 @@ def gen_lpddr5x():
 
 CONFIGS = {
     "HBM2.yaml":        gen_hbm2,
+    "HBM2_TPUv4.yaml":  gen_hbm2_tpuv4,
     "HBM2_TPUv3.yaml":  gen_hbm2_tpuv3,
     "HBM2_TPUv2.yaml":  gen_hbm2_tpuv2,
     "DDR4.yaml":        gen_ddr4,
diff --git a/configs/stonne_single_c1_simple_noc.yml b/configs/stonne_single_c1_simple_noc.yml
index d1087301..f862d7a9 100644
--- a/configs/stonne_single_c1_simple_noc.yml
+++ b/configs/stonne_single_c1_simple_noc.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 8
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv2.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
index fb07eb6a..6d2537d9 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
@@ -10,7 +10,7 @@ dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv2.yaml
 
 icnt_type: booksim2
 icnt_freq_mhz: 700
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
index 6277cc39..1a8c60f6 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
@@ -10,7 +10,7 @@ dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 32
 dram_stats_print_period_cycless: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv2.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
index 3328cf77..39d195b0 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 1200
 dram_channels: 16
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv4.yaml
 l2d_type: datacache
 l2d_config: S:128:128:512,32,L:T:m:W:L,A:192:4,32:0,32
 
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
index 918510d8..348babae 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
@@ -10,7 +10,7 @@ dram_type: ramulator2
 dram_freq_mhz: 700
 dram_channels: 32
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv2.yaml
 
 icnt_type: simple
 icnt_latency_cycles: 10
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
index 9e87511f..9100c22a 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
@@ -11,7 +11,7 @@ dram_type: ramulator2
 dram_freq_mhz: 1200
 dram_channels: 32
 dram_stats_print_period_cycles: 10000
-ramulator_config_path: ../configs/ramulator2_configs/HBM2.yaml
+ramulator_config_path: ../configs/ramulator2_configs/HBM2_TPUv4.yaml
 l2d_type: datacache
 l2d_config: S:64:128:512,32,L:B:m:W:L,A:192:4,32:0,32
 
diff --git a/experiments/artifact/cycle_validation/run_cycle.sh b/experiments/artifact/cycle_validation/run_cycle.sh
index e49538d0..d6a501fd 100755
--- a/experiments/artifact/cycle_validation/run_cycle.sh
+++ b/experiments/artifact/cycle_validation/run_cycle.sh
@@ -65,7 +65,7 @@ if should_run matmul; then
     echo "==================================================="
     echo "[*] Running Matmul size=$sz"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz | tee $LOG_DIR/${name}.log
+    python3 $TORCHSIM_DIR/experiments/gemm.py --size $sz 2>&1 | tee $LOG_DIR/${name}.log
   done
 fi
 
@@ -85,7 +85,7 @@ if should_run conv; then
     echo "==================================================="
     echo "[*] Running Conv size=$sz"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/conv.py --size $sz | tee $LOG_DIR/${name}.log
+    python3 $TORCHSIM_DIR/experiments/conv.py --size $sz 2>&1 | tee $LOG_DIR/${name}.log
   done
 fi
 
@@ -97,7 +97,7 @@ if should_run layernorm; then
     echo "==================================================="
     echo "[*] Running LayerNorm size=$sz"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz | tee $LOG_DIR/${name}.log
+    python3 $TORCHSIM_DIR/experiments/layernorm.py --size $sz 2>&1 | tee $LOG_DIR/${name}.log
   done
 fi
 
@@ -109,7 +109,7 @@ if should_run softmax; then
     echo "==================================================="
     echo "[*] Running Softmax size=$sz"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz | tee $LOG_DIR/${name}.log
+    python3 $TORCHSIM_DIR/experiments/softmax.py --size $sz 2>&1 | tee $LOG_DIR/${name}.log
   done
 fi
 
@@ -121,7 +121,7 @@ if should_run attention; then
     echo "==================================================="
     echo "[*] Running Attention size=$sz"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/attention.py --size $sz | tee $LOG_DIR/${name}.log
+    python3 $TORCHSIM_DIR/experiments/attention.py --size $sz 2>&1 | tee $LOG_DIR/${name}.log
   done
 fi
 
@@ -132,7 +132,7 @@ if should_run resnet; then
     echo "==================================================="
     echo "[*] Running $model"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/${model}.py | tee $LOG_DIR/${model}.log
+    python3 $TORCHSIM_DIR/experiments/${model}.py 2>&1 | tee $LOG_DIR/${model}.log
   done
 fi
 
@@ -143,11 +143,11 @@ if should_run bert; then
     echo "==================================================="
     echo "[*] Running BERT size=$model"
     echo "==================================================="
-    python3 $TORCHSIM_DIR/experiments/BERT.py --size $model | tee $LOG_DIR/bert_${model}.log
+    python3 $TORCHSIM_DIR/experiments/BERT.py --size $model 2>&1 | tee $LOG_DIR/bert_${model}.log
   done
 fi
 
 # Cycle Summary
 if should_run summary; then
-  python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out"
+  python3 $TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.py 2>&1 | tee "$TORCHSIM_DIR/experiments/artifact/cycle_validation/summary_cycle.out"
 fi
diff --git a/experiments/artifact/speedup/run_speedup.sh b/experiments/artifact/speedup/run_speedup.sh
index a8018ffe..eaea64f7 100755
--- a/experiments/artifact/speedup/run_speedup.sh
+++ b/experiments/artifact/speedup/run_speedup.sh
@@ -3,12 +3,9 @@ set -e
 
 LOG_DIR=$TORCHSIM_DIR/experiments/artifact/logs
 CONFIG_DIR="$TORCHSIM_DIR/configs"
-EXTRACT_TRACE="$TORCHSIM_DIR/experiments/artifact/speedup/scripts/extract_trace_from_log.py"
-TRACE_CACHE_DIR="$TORCHSIM_DIR/experiments/artifact/speedup/trace_cache"
 # CI: e.g. SKIP_ILS=1, SPEEDUP_ITERS=1 (shorter, no ILS re-runs)
 : "${SKIP_ILS:=0}"
 : "${SPEEDUP_ITERS:=5}"
-mkdir -p "$TRACE_CACHE_DIR"
 
 configs=(
     "systolic_ws_128x128_c2_simple_noc_tpuv3.yml"
@@ -32,7 +29,7 @@ output_dir="$TORCHSIM_DIR/experiments/artifact/speedup/results"
 mkdir -p "$output_dir"
 
 echo "[*] Scanning log files in: $LOG_DIR"
-echo "[*] Extracting [TOGSim] Run command and trace from logs"
+echo "[*] Extracting Simulator command and trace path from logs ([TOGSim] Run command|Command line, Trace log is stored to)"
 echo ""
 
 for log_file in "$LOG_DIR"/*.log; do
@@ -45,17 +42,24 @@ for log_file in "$LOG_DIR"/*.log; do
   fi
   echo "==> Workload: $workload"
 
-  # === Extract [TOGSim] Run command from log ===
-  base_cmd=$(grep "\[TOGSim\] Run command:" "$log_file" 2>/dev/null | sed 's/.*\[TOGSim\] Run command: //' | head -1)
+  # === Extract Simulator invocation from log (TOGSim renamed the log tag) ===
+  # Legacy logs: "[TOGSim] Run command: ..."  Current TOGSim/main.cc: "[TOGSim] Command line: ..."
+  base_cmd=$(grep -E "\[TOGSim\] (Run command|Command line):" "$log_file" 2>/dev/null | sed -E 's/.*\[TOGSim\] (Run command|Command line): //' | head -1)
   if [[ -z "$base_cmd" ]]; then
-    echo "    Skipping: no [TOGSim] Run command found in $log_file"
+    echo "    Skipping: no [TOGSim] Run command / Command line found in $log_file"
     continue
   fi
 
-  # === Get trace file (replace FIFO in command; stored trace or generate from log) ===
-  trace_file=$(python3 "$EXTRACT_TRACE" "$log_file" "$TRACE_CACHE_DIR/${workload}.trace" 2>/dev/null) || true
+  # === Trace file: PyTorchSim logs it as "[TOGSim] Trace log is stored to \"<path>.trace\"" (often on stderr, now merged in cycle logs) ===
+  trace_line=$(grep -F '[TOGSim] Trace log is stored to' "$log_file" 2>/dev/null | tail -n 1) || true
+  if [[ -z "$trace_line" ]]; then
+    echo "    Skipping: no [TOGSim] Trace log is stored to ... line in $log_file"
+    continue
+  fi
+  trace_file="${trace_line#*Trace log is stored to \"}"
+  trace_file="${trace_file%%\"*}"
   if [[ -z "$trace_file" || ! -f "$trace_file" ]]; then
-    echo "    Skipping: could not extract trace from $log_file"
+    echo "    Skipping: trace path missing or not a file: ${trace_file:-<empty>} (from $log_file)"
     continue
   fi
 

From 794c5f3dc6dca3f58a0497fc788b8dfd06143a44 Mon Sep 17 00:00:00 2001
From: Your Name <tsdaf@google.com>
Date: Thu, 23 Apr 2026 02:14:04 +0000
Subject: [PATCH 189/194] [Tutorial] change param *_out to *_output

---
 tutorial/session1/CompilerOptimization.ipynb | 11 ++------
 tutorial/session1/DNNServing.ipynb           |  7 -----
 tutorial/session1/ExecutionMode.ipynb        | 13 ++-------
 tutorial/session1/Inference.ipynb            | 29 ++++++++------------
 tutorial/session1/LogAnalysis.ipynb          | 11 ++------
 tutorial/session1/Mapping.ipynb              | 13 ++-------
 tutorial/session1/TOGSimConfig.ipynb         | 11 ++------
 tutorial/session1/Training.ipynb             | 26 ++++++++----------
 tutorial/session2/Hands_on.ipynb             | 21 ++++++++------
 9 files changed, 47 insertions(+), 95 deletions(-)

diff --git a/tutorial/session1/CompilerOptimization.ipynb b/tutorial/session1/CompilerOptimization.ipynb
index f8eea728..d83a253c 100644
--- a/tutorial/session1/CompilerOptimization.ipynb
+++ b/tutorial/session1/CompilerOptimization.ipynb
@@ -41,7 +41,7 @@
     "    return torch.relu(torch.matmul(a, b))\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n",
-    "out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -77,7 +77,7 @@
     "    return torch.relu(torch.matmul(a, b))\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(gemm_relu)\n",
-    "out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -91,13 +91,6 @@
     "log_path = \"\"\n",
     "!cat $log_path | grep \"Total execution cycle\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/DNNServing.ipynb b/tutorial/session1/DNNServing.ipynb
index f7f2ea4d..beb5d8d1 100644
--- a/tutorial/session1/DNNServing.ipynb
+++ b/tutorial/session1/DNNServing.ipynb
@@ -87,13 +87,6 @@
     "            timestamp=int(t_msec),\n",
     "        )"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/ExecutionMode.ipynb b/tutorial/session1/ExecutionMode.ipynb
index 9d0b051f..8d499184 100644
--- a/tutorial/session1/ExecutionMode.ipynb
+++ b/tutorial/session1/ExecutionMode.ipynb
@@ -36,7 +36,7 @@
     "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -58,7 +58,7 @@
     "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -80,15 +80,8 @@
     "weight = torch.randn(512, 512).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/Inference.ipynb b/tutorial/session1/Inference.ipynb
index 18325d80..07759ff2 100644
--- a/tutorial/session1/Inference.ipynb
+++ b/tutorial/session1/Inference.ipynb
@@ -37,7 +37,7 @@
     "weight = torch.randn(128, 128).to(device)\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
-    "cpu_out = opt_fn(input, weight)"
+    "cpu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -60,7 +60,7 @@
     "weight = torch.randn(128, 128).to(device)\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -69,22 +69,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
-    "    torch.set_printoptions(edgeitems=3)\n",
-    "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
+    "def test_result(name, npu_output, cpu_output, rtol=1e-4, atol=1e-4, preview=5):\n",
+    "    npu_output_cpu = npu_output.cpu()\n",
+    "\n",
+    "    if torch.allclose(npu_output_cpu, cpu_output, rtol=rtol, atol=atol):\n",
     "        message = f\"|{name} Functionality Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu()[0, :5])\n",
-    "        print(\"cpu out: \", cpu_out[0, :5])\n",
+    "\n",
+    "        print(f\"npu output (first {preview}): {npu_output_cpu[0, :preview]} ...\")\n",
+    "        print(f\"cpu output (first {preview}): {cpu_output[0, :preview]} ...\")\n",
     "    else:\n",
     "        message = f\"|{name} Functionality Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu())\n",
-    "        print(\"cpu out: \", cpu_out)\n",
+    "        print(\"npu output: \", npu_output_cpu)\n",
+    "        print(\"cpu output: \", cpu_output)\n",
     "        exit(1)"
    ]
   },
@@ -94,15 +96,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "test_result(\"MatMul\", npu_out, cpu_out)"
+    "test_result(\"MatMul\", npu_output, cpu_output)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/LogAnalysis.ipynb
index 9b393384..d7ca4fa2 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/LogAnalysis.ipynb
@@ -38,7 +38,7 @@
     "weight = torch.randn(1024, 1024).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_output = opt_fn(input, weight)"
    ]
   },
   {
@@ -60,15 +60,8 @@
     "weight = torch.randn(1024, 1024).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/Mapping.ipynb b/tutorial/session1/Mapping.ipynb
index d463c287..d3e1dde5 100644
--- a/tutorial/session1/Mapping.ipynb
+++ b/tutorial/session1/Mapping.ipynb
@@ -36,7 +36,7 @@
     "weight = torch.randn(1024, 1024).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
   },
   {
@@ -71,7 +71,7 @@
     "weight = torch.randn(1024, 1024).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
   },
   {
@@ -105,7 +105,7 @@
     "weight = torch.randn(1024, 1024).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
   },
   {
@@ -117,13 +117,6 @@
     "log_path = \"\"\n",
     "!cat $log_path | grep \"Total execution cycle\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/TOGSimConfig.ipynb b/tutorial/session1/TOGSimConfig.ipynb
index a8c1bb6e..1add32f1 100644
--- a/tutorial/session1/TOGSimConfig.ipynb
+++ b/tutorial/session1/TOGSimConfig.ipynb
@@ -20,7 +20,7 @@
     "weight = torch.randn(2048, 2048).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
   },
   {
@@ -52,7 +52,7 @@
     "weight = torch.randn(2048, 2048).to(device=device)\n",
     "\n",
     "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_out = opt_fn(input, weight)"
+    "npu_outut = opt_fn(input, weight)"
    ]
   },
   {
@@ -64,13 +64,6 @@
     "log_path = \"\"\n",
     "!cat $log_path | grep \"Total execution cycle\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session1/Training.ipynb b/tutorial/session1/Training.ipynb
index 0ec85a3d..e4191342 100644
--- a/tutorial/session1/Training.ipynb
+++ b/tutorial/session1/Training.ipynb
@@ -37,7 +37,7 @@
     "cpu_target = torch.randn(128, 128).to(device).requires_grad_()\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
-    "cpu_out = opt_fn(cpu_input, cpu_weight)\n",
+    "cpu_output = opt_fn(cpu_input, cpu_weight)\n",
     "\n",
     "loss_fn = torch.nn.CrossEntropyLoss()\n",
     "cpu_loss = loss_fn(cpu_out, cpu_target)\n",
@@ -65,7 +65,7 @@
     "npu_target = torch.randn(128, 128).to(device).requires_grad_()\n",
     "\n",
     "opt_fn = torch.compile(torch.matmul)\n",
-    "npu_out = opt_fn(npu_input, npu_weight)\n",
+    "npu_output = opt_fn(npu_input, npu_weight)\n",
     "\n",
     "loss_fn = torch.nn.CrossEntropyLoss()\n",
     "npu_loss = loss_fn(npu_out, npu_target)\n",
@@ -78,21 +78,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
-    "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
+    "def test_result(name, npu_output, cpu_output, rtol=1e-4, atol=1e-4, preview=5):\n",
+    "    npu_output_cpu = npu_output.cpu()\n",
+    "\n",
+    "    if torch.allclose(npu_output_cpu, cpu_output, rtol=rtol, atol=atol):\n",
     "        message = f\"|{name} Functionality Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu()[0, :5])\n",
-    "        print(\"cpu out: \", cpu_out[0, :5])\n",
+    "\n",
+    "        print(f\"npu output (first {preview}): {npu_output_cpu[0, :preview]} ...\")\n",
+    "        print(f\"cpu output (first {preview}): {cpu_output[0, :preview]} ...\")\n",
     "    else:\n",
     "        message = f\"|{name} Functionality Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu())\n",
-    "        print(\"cpu out: \", cpu_out)\n",
+    "        print(\"npu output: \", npu_output_cpu)\n",
+    "        print(\"cpu output: \", cpu_output)\n",
     "        exit(1)"
    ]
   },
@@ -105,13 +108,6 @@
     "test_result(\"MatMul Input Grad\", npu_input.grad, cpu_input.grad)\n",
     "test_result(\"MatMul Weight Grad\", npu_weight.grad, cpu_weight.grad)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/tutorial/session2/Hands_on.ipynb b/tutorial/session2/Hands_on.ipynb
index a2e6899f..3e81e410 100644
--- a/tutorial/session2/Hands_on.ipynb
+++ b/tutorial/session2/Hands_on.ipynb
@@ -37,21 +37,24 @@
     "\n",
     "device = torch.device(\"npu:0\")\n",
     "\n",
-    "def test_result(name, npu_out, cpu_out, rtol=1e-4, atol=1e-4):\n",
-    "    if torch.allclose(npu_out.cpu(), cpu_out, rtol=rtol, atol=atol):\n",
-    "        message = f\"|{name} Test Passed|\"\n",
+    "def test_result(name, npu_output, cpu_output, rtol=1e-4, atol=1e-4, preview=5):\n",
+    "    npu_output_cpu = npu_output.cpu()\n",
+    "\n",
+    "    if torch.allclose(npu_output_cpu, cpu_output, rtol=rtol, atol=atol):\n",
+    "        message = f\"|{name} Functionality Test Passed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu().reshape(-1)[:5])\n",
-    "        print(\"cpu out: \", cpu_out.reshape(-1)[:5])\n",
+    "\n",
+    "        print(f\"npu output (first {preview}): {npu_output_cpu[0, :preview]} ...\")\n",
+    "        print(f\"cpu output (first {preview}): {cpu_output.reshape(-1)[0, :preview]} ...\")\n",
     "    else:\n",
-    "        message = f\"|{name} Test Failed|\"\n",
+    "        message = f\"|{name} Functionality Test Failed|\"\n",
     "        print(\"-\" * len(message))\n",
     "        print(message)\n",
     "        print(\"-\" * len(message))\n",
-    "        print(\"npu out: \", npu_out.cpu())\n",
-    "        print(\"cpu out: \", cpu_out)\n",
+    "        print(\"npu output: \", npu_output_cpu)\n",
+    "        print(\"cpu output: \", cpu_output)\n",
     "        exit(1)\n",
     "\n",
     "def test_exponent2(device, size=(128, 128)):\n",
@@ -255,7 +258,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.13"
   }
  },
  "nbformat": 4,

From 81ce7c3d6fca166364aa2a78d8a2dbfdc6831c00 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 23 Apr 2026 15:46:08 +0900
Subject: [PATCH 190/194] [Tutorial] Set default shell as bash

---
 tutorial/jupyterhub/jupyterhub_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tutorial/jupyterhub/jupyterhub_config.py b/tutorial/jupyterhub/jupyterhub_config.py
index 36b03981..0657e93b 100644
--- a/tutorial/jupyterhub/jupyterhub_config.py
+++ b/tutorial/jupyterhub/jupyterhub_config.py
@@ -7,6 +7,7 @@
 # ------------------------------------------------------------------------------
 c.JupyterHub.spawner_class = 'dockerspawner.DockerSpawner'
 c.DockerSpawner.image = "ghcr.io/psal-postech/torchsim-tutorial:ispass2026"
+c.DockerSpawner.environment = {'SHELL': '/bin/bash'}
 
 # Resource limit
 c.DockerSpawner.mem_limit = '32G'

From b00a967032e5f58fe3444bf980adf2bf30c02b1d Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 23 Apr 2026 17:22:12 +0900
Subject: [PATCH 191/194] [Tutorial] Seperate log analysis hands-on

---
 tutorial/session1/InfoLogAnalysis.ipynb       | 66 +++++++++++++++++++
 ...gAnalysis.ipynb => TraceLogAnalysis.ipynb} | 25 +------
 2 files changed, 68 insertions(+), 23 deletions(-)
 create mode 100644 tutorial/session1/InfoLogAnalysis.ipynb
 rename tutorial/session1/{LogAnalysis.ipynb => TraceLogAnalysis.ipynb} (75%)

diff --git a/tutorial/session1/InfoLogAnalysis.ipynb b/tutorial/session1/InfoLogAnalysis.ipynb
new file mode 100644
index 00000000..42dabe19
--- /dev/null
+++ b/tutorial/session1/InfoLogAnalysis.ipynb
@@ -0,0 +1,66 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TOGSim Log Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import os\n",
+    "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
+    "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"info_results\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### log level info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = torch.device(\"npu:0\")\n",
+    "\n",
+    "input = torch.randn(1024, 1024).to(device=device)\n",
+    "weight = torch.randn(1024, 1024).to(device=device)\n",
+    "\n",
+    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
+    "npu_output = opt_fn(input, weight)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/tutorial/session1/LogAnalysis.ipynb b/tutorial/session1/TraceLogAnalysis.ipynb
similarity index 75%
rename from tutorial/session1/LogAnalysis.ipynb
rename to tutorial/session1/TraceLogAnalysis.ipynb
index d7ca4fa2..6b7f6f40 100644
--- a/tutorial/session1/LogAnalysis.ipynb
+++ b/tutorial/session1/TraceLogAnalysis.ipynb
@@ -16,29 +16,7 @@
     "import torch\n",
     "import os\n",
     "os.environ['TOGSIM_CONFIG']=\"/workspace/PyTorchSim/tutorial/session1/togsim_configs/togsim_config_timing_only.yml\"\n",
-    "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"togsim_results\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### log level info"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "device = torch.device(\"npu:0\")\n",
-    "\n",
-    "input = torch.randn(1024, 1024).to(device=device)\n",
-    "weight = torch.randn(1024, 1024).to(device=device)\n",
-    "\n",
-    "opt_fn = torch.compile(dynamic=False)(torch.matmul)\n",
-    "npu_output = opt_fn(input, weight)"
+    "os.environ['TORCHSIM_LOG_PATH']=os.path.join(os.getcwd(), \"trace_results\")"
    ]
   },
   {
@@ -55,6 +33,7 @@
    "outputs": [],
    "source": [
     "os.environ['TOGSIM_DEBUG_LEVEL']=\"trace\"\n",
+    "device = torch.device(\"npu:0\")\n",
     "\n",
     "input = torch.randn(1024, 1024).to(device=device)\n",
     "weight = torch.randn(1024, 1024).to(device=device)\n",

From 614aa2c99c127343844a7be670f41588f8a3a65c Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Thu, 23 Apr 2026 17:38:29 +0900
Subject: [PATCH 192/194] [CI] Fix speedup script + add missing file

---
 .../artifact/speedup/check_speedup_smoke.sh   | 34 ++++++++++
 .../artifact/speedup/summary_speedup.py       | 62 ++++++++++++++++---
 2 files changed, 86 insertions(+), 10 deletions(-)
 create mode 100644 experiments/artifact/speedup/check_speedup_smoke.sh

diff --git a/experiments/artifact/speedup/check_speedup_smoke.sh b/experiments/artifact/speedup/check_speedup_smoke.sh
new file mode 100644
index 00000000..a907faa5
--- /dev/null
+++ b/experiments/artifact/speedup/check_speedup_smoke.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Post-condition for run_speedup.sh: at least one (workload, config) result
+# with a numeric average. Run after cycle_validation logs + run_speedup.
+set -euo pipefail
+
+if [[ -z "${TORCHSIM_DIR:-}" ]]; then
+  echo "check_speedup_smoke: TORCHSIM_DIR is not set" >&2
+  exit 1
+fi
+
+results="${TORCHSIM_DIR}/experiments/artifact/speedup/results"
+if [[ ! -d "$results" ]]; then
+  echo "check_speedup_smoke: missing results dir: $results" >&2
+  exit 1
+fi
+
+n_ok=0
+shopt -s nullglob
+files=("$results"/*.txt)
+if ((${#files[@]} == 0)); then
+  echo "check_speedup_smoke: no .txt under $results" >&2
+  exit 1
+fi
+for f in "${files[@]}"; do
+  if grep -qE "Average simulation time[[:space:]]*=[[:space:]]*[0-9]+([.][0-9]+)?" "$f"; then
+    n_ok=$((n_ok + 1))
+  fi
+done
+
+if (( n_ok < 1 )); then
+  echo "check_speedup_smoke: no .txt in $results with a numeric Average simulation time" >&2
+  exit 1
+fi
+echo "check_speedup_smoke: OK ($n_ok result file(s) with a numeric average)"
diff --git a/experiments/artifact/speedup/summary_speedup.py b/experiments/artifact/speedup/summary_speedup.py
index 67a741a0..710930b4 100644
--- a/experiments/artifact/speedup/summary_speedup.py
+++ b/experiments/artifact/speedup/summary_speedup.py
@@ -9,7 +9,7 @@
 BASELINE_CSV = os.path.join(TORCHSIM_DIR, "experiments/artifact/baseline_latency.csv")
 
 
-def plot_speedup_bars(data: dict, filename: str):
+def plot_speedup_bars(data: dict, filename: str, geomean_speedups: tuple | None = None):
     colors = {
         'Accel-Sim': '#A6A6A6',
         'mNPUSim': '#E97132',
@@ -19,6 +19,7 @@ def plot_speedup_bars(data: dict, filename: str):
     }
 
     labels = list(data.keys())
+    geomean_row = bool(labels) and labels[-1] == "Geomean"
     num_sims = len(colors)
     bar_width = 1
     fig, ax = plt.subplots(figsize=(48, 16))
@@ -34,13 +35,18 @@ def plot_speedup_bars(data: dict, filename: str):
         x_pos.append(x_offset + bar_width * (num_sims // 2))
         x_offset += bar_width * (num_sims + 2)
 
-    for sim, (heights, xpos) in grouped_data.items():
+    for sim_i, (sim, (heights, xpos)) in enumerate(grouped_data.items()):
         bars = ax.bar(xpos, heights, width=bar_width, color=colors[sim], label=sim, edgecolor='black')
         mae_val = heights[-1]
+        if geomean_row and geomean_speedups is not None and sim_i < len(geomean_speedups):
+            raw_g = geomean_speedups[sim_i]
+            bar_lbl = "N/A" if raw_g is None else f"{float(raw_g):.2f}x"
+        else:
+            bar_lbl = f"{mae_val:.1f}x"
         ax.text(
             xpos[-1],
             mae_val + 2 if mae_val >= 0 else mae_val - 6,
-            f'{mae_val:.1f}x',
+            bar_lbl,
             ha='center',
             va='bottom' if mae_val >= 0 else 'top',
             fontsize=9,
@@ -76,16 +82,30 @@ def format_with_speedup(value, ref, speedup_list=None):
         return "N/A", 0.0
 
 def compute_geomean(errors):
+    """Geometric mean of positive speedups, or None if unavailable (CI may skip ILS / some sims)."""
     if not errors:
-        return "N/A"
+        return None
     filtered = [abs(e) for e in errors if e > 0]
     if not filtered:
-        return "0.00x"
+        return None
     prod = 1.0
     for e in filtered:
         prod *= e
-    geo = prod ** (1.0 / len(filtered))
-    return geo
+    return prod ** (1.0 / len(filtered))
+
+
+def format_geomean_cell(g):
+    """One table cell for geomean row (25 chars: same as '{x:>24.2f}x')."""
+    if g is None:
+        return f"{'N/A x':>25}"
+    return f"{float(g):>24.2f}x"
+
+
+def geomean_bar_height(g):
+    """Bar height for log-scale plot when geomean is missing."""
+    if isinstance(g, (int, float)) and g > 0:
+        return float(g)
+    return 1.0
 
 if __name__ == "__main__":
     # 1. Generate cycle_map
@@ -149,8 +169,30 @@ def compute_geomean(errors):
     geomean_torchsim_ils_sn = compute_geomean(torchsim_ils_sn_speedup)
     geomean_torchsim_sn = compute_geomean(torchsim_sn_speedup)
     geomean_torchsim_cn = compute_geomean(torchsim_cn_speedup)
-    plot_data["Geomean"] = [geomean_accelsim, geomean_mnpusim, geomean_torchsim_ils_sn, geomean_torchsim_sn, geomean_torchsim_cn]
+    plot_data["Geomean"] = [
+        geomean_accelsim,
+        geomean_bar_height(geomean_mnpusim),
+        geomean_bar_height(geomean_torchsim_ils_sn),
+        geomean_bar_height(geomean_torchsim_sn),
+        geomean_bar_height(geomean_torchsim_cn),
+    ]
     print("=" * 165)
-    print(f"{'Geomean Speedup':>30} {'1x':>25} {geomean_mnpusim:>24.2f}x {geomean_torchsim_ils_sn:>24.2f}x {geomean_torchsim_sn:>24.2f}x {geomean_torchsim_cn:>24.2f}x")
+    print(
+        f"{'Geomean Speedup':>30} {'1x':>25} "
+        f"{format_geomean_cell(geomean_mnpusim)} "
+        f"{format_geomean_cell(geomean_torchsim_ils_sn)} "
+        f"{format_geomean_cell(geomean_torchsim_sn)} "
+        f"{format_geomean_cell(geomean_torchsim_cn)}"
+    )
     path = os.path.join(TORCHSIM_DIR, "experiments/artifact/speedup/speedup.png")
-    plot_speedup_bars(plot_data, path)
+    plot_speedup_bars(
+        plot_data,
+        path,
+        geomean_speedups=(
+            geomean_accelsim,
+            geomean_mnpusim,
+            geomean_torchsim_ils_sn,
+            geomean_torchsim_sn,
+            geomean_torchsim_cn,
+        ),
+    )

From 1b1fa6da0e3042b1af5154f2e02fc183a3942c3a Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 24 Apr 2026 15:10:04 +0900
Subject: [PATCH 193/194] [Doc] Update artifact evaluation explanation in
 README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 43954af7..2e11cb0f 100644
--- a/README.md
+++ b/README.md
@@ -478,7 +478,7 @@ KSC 2025 tutorial recordings are only available in Korean. The tutorial material
 We plan to broaden **model coverage** (more architectures and workloads), improve **dynamic-shape** support in the compiler and simulator path, and extend **eager-mode** integration so a wider range of PyTorch programs can be exercised without relying solely on `torch.compile`-style flows.
 
 ## Artifact Evaluation
-Artifact evaluation is being prepared for v1.0.0.
+Artifact evaluation is available for v1.0.0.
 The following scripts reproduce the validation and speedup results from the paper.
 ### Build
 ```bash

From 48c4979d0bc4903dea993bffb61b22159e903152 Mon Sep 17 00:00:00 2001
From: Wonhyuk Yang <wonhyuk@postech.ac.kr>
Date: Fri, 24 Apr 2026 15:43:36 +0900
Subject: [PATCH 194/194] [Doc] Enhance README clarity and examples

---
 README.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 2e11cb0f..2dd3c490 100644
--- a/README.md
+++ b/README.md
@@ -147,7 +147,7 @@ Running log in CLI
 Simulation consists of three steps
 
 1. `Gem5` obtains compute latency for TOG.
-2. `Spike` verifies the output code.
+2. `Spike` verifies the output code. It can also be used to model data-dependent timings.
 3. `TOGSim` simulates an NPU architecture.
 
 The log contains memory & core stats.
@@ -204,9 +204,9 @@ compiled_step()
 
 ## One TOGSim session, one continuous log
 
-By default, **each compiled operation** can run TOGSim in a **standalone** way—typically **one simulator process and one log file per kernel**. That matches single-kernel workflows but splits traces when you run many forwards in a row.
+By default, each compiled operation can run TOGSim in a standalone way—typically **one simulator process and one log file per kernel**. That matches single-kernel workflows but splits traces when you run many forwards in a row.
 
-**`with TOGSimulator(config_path=...)`** keeps **one TOGSim session** open for the block: successive calls (e.g. multiple **`compiled_model(...)`** forwards) run **in sequence in the same process**, so the timeline and shared resources **continue in a single log** instead of restarting for every op. **`TOGSIM_CONFIG`** is set to the given YAML for the block so **codegen and TOGSim** still share one hardware file.
+`with TOGSimulator(config_path=...)` keeps **one TOGSim session** open for the block: successive calls (e.g. multiple `compiled_model(...)` forwards) run in sequence in the same process, so the timeline and shared resources continue in a single log instead of restarting for every op. `TOGSIM_CONFIG` is set to the given YAML for the block so codegen and TOGSim still share one hardware file.
 
 Use the same API you already use; only wrap the region you want co-simulated:
 
@@ -224,11 +224,11 @@ with TOGSimulator(config_path=config):
 
 ## Multi-tenancy and explicit scheduling (`launch_model`)
 
-For **multi-tenant** or **interleaved** execution, you usually need to attach a **timestamp** and a **`stream_index`** to each launch so the simulator can order work correctly. Use **`torch.npu.launch_model(compiled_model, *inputs, stream_index=..., timestamp=...)`** for that; plain `compiled_model(x)` does not carry those parameters.
+For **multi-tenant** or **interleaved** execution, you usually need to attach a **timestamp** and a `stream_index` to each launch so the simulator can order work correctly. Use `torch.npu.launch_model(compiled_model, *inputs, stream_index=..., timestamp=...)` for that; plain `compiled_model(x)` does not carry those parameters.
 
-**`stream_index`** is the **request-queue / partition index** in the TOGSim config: it must match the **values** in the **`partition`** map (each queue index is mapped to a **core**). For example, `stream_index=0` goes to the queue bound to `core_0`, `stream_index=1` to the queue for `core_1`, and so on.
+`stream_index` is the **request-queue / partition index** in the TOGSim config: it must match the **values** in the `partition` map (each queue index is mapped to a **core**). For example, `stream_index=0` goes to the queue bound to `core_0`, `stream_index=1` to the queue for `core_1`, and so on.
 
-**`timestamp`** is in **nanoseconds** (simulation time for ordering launches). Use `0` when you do not need explicit times beyond submission order.
+`timestamp`** is in **nanoseconds** (simulation time for ordering launches). Use `0` when you do not need explicit times beyond submission order.
 
 ```python
 with TOGSimulator(config_path=config):
@@ -239,7 +239,7 @@ with TOGSimulator(config_path=config):
     torch.npu.launch_model(opt_model2, x2, stream_index=1, timestamp=0)
 ```
 
-Here **`synchronize()`** acts as a barrier: it does not return until every **`launch_model`** issued **above** it has finished in the simulator. The later pair of `launch_model` calls therefore runs only after those earlier models have fully completed—so the sync is the point in the timeline where **all preceding launches are done**.
+Here `synchronize()` acts as a barrier: it does not return until every `launch_model` issued **above** it has finished in the simulator. The later pair of `launch_model` calls therefore runs only after those earlier models have fully completed—so the sync is the point in the timeline where **all preceding launches are done**.
 
 ```bash
 python tests/test_scheduler.py
@@ -247,8 +247,8 @@ python tests/test_scheduler.py
 
 Use a TOGSim config(`.yml`) that defines **partitions** when mapping queues to cores, for example:
 
-- **`num_partition`**: Number of independent request queues (valid **`stream_index`** values are `0 … num_partition-1`).
-- **`partition`**: Maps each **core** name to a **queue index**; that index is the same **`stream_index`** you pass to **`launch_model`**.
+- `num_partition`: Number of independent request queues (valid `stream_index` values are `0 … num_partition-1`).
+- `partition`: Maps each **core** name to a **queue index**; that index is the same `stream_index` you pass to `launch_model`.
 
 ```
   "num_partition" : 2,
@@ -262,7 +262,7 @@ Here `stream_index=0` selects queue `0` (core_0), `stream_index=1` selects queue
 
 ### 3. Load generation (Poisson arrivals)
 
-The **`poisson_request_generator`** in **`Scheduler.scheduler`** yields synthetic **arrival times** (in **milliseconds**). Merge those with **`launch_model`**: convert each time to **nanoseconds** for **`timestamp`**, set **`stream_index`** to the target partition queue, and run all launches inside one **`with TOGSimulator(...)`** so a **single** log captures the full trace.
+The `poisson_request_generator` in `Scheduler.scheduler` yields synthetic **arrival times** (in **milliseconds**). Merge those with `launch_model`: convert each time to **nanoseconds** for `timestamp`, set `stream_index` to the target partition queue, and run all launches inside one `with TOGSimulator(...)` so a **single** log captures the full trace.
 
 ```python
 from Scheduler.scheduler import poisson_request_generator
@@ -376,7 +376,7 @@ export TORCHSIM_USE_TIMING_POOLING=0 # use lightweight pooling for timing
 
 The `configs/` directory holds **YAML** (`.yml`) hardware descriptions. Set `TOGSIM_CONFIG` to one of these files. The **same file** is read by the **compiler** (`PyTorchSimFrontend/extension_config.py`) for `vpu_*`, `pytorchsim_*`, and `codegen_*` fields, and by **TOGSim** (`TOGSim/src/Common.cc`) for the simulator-specific keys below.
 
-### Reference layout (matches `configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml`)
+### Reference layout
 
 ```yaml
 # --- Core (TOGSim) ---
@@ -475,7 +475,7 @@ KSC 2025 tutorial recordings are only available in Korean. The tutorial material
 
 
 ## Future Works
-We plan to broaden **model coverage** (more architectures and workloads), improve **dynamic-shape** support in the compiler and simulator path, and extend **eager-mode** integration so a wider range of PyTorch programs can be exercised without relying solely on `torch.compile`-style flows.
+We plan to broaden **model coverage** (more workloads), support **dynamic-shape**, and extend **eager-mode** integration so a wider range of PyTorch programs can be simulated.
 
 ## Artifact Evaluation
 Artifact evaluation is available for v1.0.0.