From 56057dbdb22c1a96a1b436b794cbf7b0f633036e Mon Sep 17 00:00:00 2001
From: Claude
Date: Tue, 19 May 2026 04:39:58 +0000
Subject: [PATCH] Cache _load_architecture by model_type

Architecture yaml was reloaded once per scheduler iteration when running
with --analytical-modeling. The non-analytical path goes through
_load_perf_db which is cached, but the analytical branch in
_build_trace_ctx calls _load_architecture directly with no cache. For a
33-iter baseline run on Llama-3.1-8B this accounted for ~12% of total
wall-clock (~10ms per iter on yaml.safe_load).
---
 serving/core/trace_generator.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/serving/core/trace_generator.py b/serving/core/trace_generator.py
index d0c770ac..0297d528 100644
--- a/serving/core/trace_generator.py
+++ b/serving/core/trace_generator.py
@@ -22,6 +22,10 @@
 # ----------------------------------------------------------------------
 _perf_db_cache = {}
 
+# key: model_type
+# value: parsed architecture yaml dict
+_arch_cache = {}
+
 logger = get_logger("TraceGenerator")
 
 
@@ -245,6 +249,8 @@ def flush(self, ctx, enable_attn_offloading=False):
 
 def _load_architecture(model_type):
     """Load catalog + sequence from profiler/models/.yaml."""
+    if model_type in _arch_cache:
+        return _arch_cache[model_type]
     path = _arch_yaml_path(model_type)
     if not os.path.isfile(path):
         raise FileNotFoundError(
@@ -257,6 +263,7 @@ def _load_architecture(model_type):
         raise KeyError(
             f"Architecture yaml {path} must define both 'catalog' and 'sequence'."
         )
+    _arch_cache[model_type] = arch
     return arch
 
 