From 269e19d8df8c2217863a7b78d82220b0f138f8eb Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 18 May 2026 14:16:44 +0000
Subject: [PATCH] Disable prefix caching + chunked prefill in
 spec_compression_stress.sh

The simulator defaults --enable-prefix-caching and
--enable-chunked-prefill to true. The stress script never opted out,
so all three modes (baseline / self_verify / cpu_verify) were
implicitly comparing radix-attention + chunked-prefill paths instead
of the lean prefill/decode pipeline the comparison is designed to
isolate.

Pin both off so the run results reflect only the spec-decode and
KV-compression deltas the stress test is meant to study. Print the
choice in the run banner so it's visible in the log.
---
 serving/spec_compression_stress.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/serving/spec_compression_stress.sh b/serving/spec_compression_stress.sh
index cdf2d667..0d93f332 100755
--- a/serving/spec_compression_stress.sh
+++ b/serving/spec_compression_stress.sh
@@ -199,6 +199,7 @@ echo "Workload:  $NUM_REQ requests, prompt=$PROMPT_LEN, decode=$OUTPUT_LEN, t=0
 echo "H100:      ${H100_SCALE}x (nominal: tflops=989 mem_bw=3350)"
 echo "Spec:      draft=$DRAFT_TOKEN_COUNT  alpha=$ACCEPTANCE_RATE  max_pending=$MAX_PENDING_TOKENS  max_verify_tokens=$MAX_VERIFY_TOKENS"
 echo "Compress:  ratio=$KV_COMPRESSION_RATIO  period=$KV_COMPRESSION_PERIOD"
+echo "Scheduler: prefix-caching=off  chunked-prefill=off"
 echo "Log level: $LOG_LEVEL"
 echo ""
 
@@ -230,6 +231,8 @@ run_mode() {
         --output "$csv"
         --num-req "$NUM_REQ"
         --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
+        --no-enable-prefix-caching
+        --no-enable-chunked-prefill
         --log-level "$LOG_LEVEL"
         "${extra_args[@]}")