From 56057dbdb22c1a96a1b436b794cbf7b0f633036e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 19 May 2026 04:39:58 +0000
Subject: [PATCH] Cache _load_architecture by model_type

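The architecture YAML file was reloaded on every scheduler iteration
when running with --analytical-modeling. The non-analytical path goes
through _load_perf_db, which is cached, but the analytical branch in
_build_trace_ctx calls _load_architecture directly, with no cache.

Memoize the parsed architecture in a module-level dict keyed by
model_type so that each yaml is loaded at most once per process.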

For a 33-iteration baseline run on Llama-3.1-8B, this accounted for
~12% of total wall-clock time (~10 ms per iteration in yaml.safe_load).
---
 serving/core/trace_generator.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/serving/core/trace_generator.py b/serving/core/trace_generator.py
index d0c770ac..0297d528 100644
--- a/serving/core/trace_generator.py
+++ b/serving/core/trace_generator.py
@@ -22,6 +22,10 @@
 # ----------------------------------------------------------------------
 _perf_db_cache = {}
 
+# Memoized results of _load_architecture.
+# key: model_type; value: parsed architecture yaml dict (shared, not copied)
+_arch_cache = {}
+
 logger = get_logger("TraceGenerator")
 
 
@@ -245,6 +249,8 @@ def flush(self, ctx, enable_attn_offloading=False):
 
 def _load_architecture(model_type):
     """Load catalog + sequence from profiler/models/<model_type>.yaml."""
+    if model_type in _arch_cache:
+        return _arch_cache[model_type]
     path = _arch_yaml_path(model_type)
     if not os.path.isfile(path):
         raise FileNotFoundError(
@@ -257,6 +263,7 @@ def _load_architecture(model_type):
         raise KeyError(
             f"Architecture yaml {path} must define both 'catalog' and 'sequence'."
         )
+    _arch_cache[model_type] = arch
     return arch