From 269e19d8df8c2217863a7b78d82220b0f138f8eb Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 18 May 2026 14:16:44 +0000 Subject: [PATCH] Disable prefix caching + chunked prefill in spec_compression_stress.sh The simulator defaults --enable-prefix-caching and --enable-chunked-prefill to true. The stress script never opted out, so all three modes (baseline / self_verify / cpu_verify) were implicitly comparing radix-attention + chunked-prefill paths instead of the lean prefill/decode pipeline the comparison is designed to isolate. Pin both off so the run results reflect only the spec-decode and KV-compression deltas the stress script is meant to study. Print the choice in the run banner so it's visible in the log. --- serving/spec_compression_stress.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/serving/spec_compression_stress.sh b/serving/spec_compression_stress.sh index cdf2d667..0d93f332 100755 --- a/serving/spec_compression_stress.sh +++ b/serving/spec_compression_stress.sh @@ -199,6 +199,7 @@ echo "Workload: $NUM_REQ requests, prompt=$PROMPT_LEN, decode=$OUTPUT_LEN, t=0 echo "H100: ${H100_SCALE}x (nominal: tflops=989 mem_bw=3350)" echo "Spec: draft=$DRAFT_TOKEN_COUNT alpha=$ACCEPTANCE_RATE max_pending=$MAX_PENDING_TOKENS max_verify_tokens=$MAX_VERIFY_TOKENS" echo "Compress: ratio=$KV_COMPRESSION_RATIO period=$KV_COMPRESSION_PERIOD" +echo "Scheduler: prefix-caching=off chunked-prefill=off" echo "Log level: $LOG_LEVEL" echo "" @@ -230,6 +231,8 @@ run_mode() { --output "$csv" --num-req "$NUM_REQ" --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" + --no-enable-prefix-caching + --no-enable-chunked-prefill --log-level "$LOG_LEVEL" "${extra_args[@]}")