FuJacob · Jam-Cai · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -143,7 +143,7 @@ struct LlamaRuntimeConfiguration: Equatable, Sendable {
             "gemma-3-1b-it-Q4_K_M.gguf",
             "Qwen3-0.6B-Q4_K_M.gguf"
         ],
-        contextWindowTokens: 2048,
+        contextWindowTokens: 4096,
         batchSize: 512,
         gpuLayerCount: -1
     )

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -165,6 +165,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
     // MARK: - Summary generation (concurrent with autocomplete)
 
+    private static let summarizeContextWindowCap = 2048 // Token ceiling on the window summarize() budgets against.
+
     /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected.
     /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active.
     func summarize(
@@ -195,7 +197,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
         }
 
-        let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens)
+        // Summary generation is auxiliary visual-context work, so its KV slot budget is capped
+        // below autocomplete's full window. The KV cache is a single pool allocated once at model
+        // load, so this cap allocates no extra memory; it only keeps the summarize sequence from
+        // monopolising slots and evicting the autocomplete cache.
+        let summarizeContextWindow = min(
+            preparedRuntime.contextWindowTokens,
+            Self.summarizeContextWindowCap
+        )
+
+        // Clamp prediction tokens to the capped window so prompt + generated tokens together stay
+        // within budget. The loop below shares this bound, keeping the invariant correct even if a
+        // future caller passes a maxPredictionTokens larger than the cap.
+        let maxPredictionTokens = min(options.maxPredictionTokens, summarizeContextWindow - 1)
+        let maxPromptTokens = max(1, summarizeContextWindow - maxPredictionTokens)
         let promptTokens = allPromptTokens.count > maxPromptTokens
             ? Array(allPromptTokens.suffix(maxPromptTokens))
             : allPromptTokens
@@ -214,7 +229,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         }
 
         var generatedText = ""
-        for _ in 0 ..< options.maxPredictionTokens {
+        for _ in 0 ..< maxPredictionTokens {
             // Cooperative cancellation: return partial text on timeout.
             if Task.isCancelled { break }