From 993f245a7834dfe10c4c948d470461c14f23510f Mon Sep 17 00:00:00 2001
From: jam-cai <jamescaicjm@gmail.com>
Date: Sun, 24 May 2026 23:24:33 -0400
Subject: [PATCH 1/2] Increase llama context window safely

---
 Cotabby/Models/LlamaRuntimeModels.swift         |  2 +-
 Cotabby/Services/Runtime/LlamaRuntimeCore.swift | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
index c8c8ba15..94d1e217 100644
--- a/Cotabby/Models/LlamaRuntimeModels.swift
+++ b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -285,7 +285,7 @@ struct LlamaRuntimeConfiguration: Equatable, Sendable {
             "gemma-3-1b-it-Q4_K_M.gguf",
             "Qwen3-0.6B-Q4_K_M.gguf"
         ],
-        contextWindowTokens: 2048,
+        contextWindowTokens: 4096,
         batchSize: 512,
         gpuLayerCount: -1
     )
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index 2761b2ca..22515110 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -174,6 +174,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
     // MARK: - Summary generation (concurrent with autocomplete)
 
+    private static let summarizeContextWindowCap = 2048
+
     /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected.
     /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active.
     func summarize(
@@ -204,7 +206,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
         }
 
-        let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens)
+        // Summary generation is auxiliary visual-context work, so keep its temporary context
+        // smaller than autocomplete's main KV cache. That lets autocomplete use a larger default
+        // window without doubling peak memory when summarization runs alongside it.
+        let summarizeContextWindow = min(
+            preparedRuntime.contextWindowTokens,
+            Self.summarizeContextWindowCap
+        )
+
+        let maxPromptTokens = max(1, summarizeContextWindow - options.maxPredictionTokens)
         let promptTokens = allPromptTokens.count > maxPromptTokens
             ? Array(allPromptTokens.suffix(maxPromptTokens))
             : allPromptTokens

From c12dde48f1a57db34d2663ccf3255b43330122dd Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Mon, 25 May 2026 04:04:07 -0700
Subject: [PATCH 2/2] Clamp summarize prediction tokens to KV slot budget and
 clarify comment

Bound the summarize generation loop by the same cap as the prompt budget so
prompt + generated tokens stay within the capped window even if a future
caller passes a larger maxPredictionTokens. Reword the rationale to describe
the shared KV slot budget rather than peak memory, since the KV cache is a
single pool allocated once at model load.
---
 Cotabby/Services/Runtime/LlamaRuntimeCore.swift | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index 9a3531a2..9391b130 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -197,15 +197,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
         }
 
-        // Summary generation is auxiliary visual-context work, so keep its temporary context
-        // smaller than autocomplete's main KV cache. That lets autocomplete use a larger default
-        // window without doubling peak memory when summarization runs alongside it.
+        // Summary generation is auxiliary visual-context work, so cap the KV slot budget it may
+        // consume from the shared pool below autocomplete's full window. The KV cache is a single
+        // pool allocated once at model load, so this cap does not save memory; it keeps the
+        // summarize sequence from monopolising slots and evicting the autocomplete cache.
         let summarizeContextWindow = min(
             preparedRuntime.contextWindowTokens,
             Self.summarizeContextWindowCap
         )
 
-        let maxPromptTokens = max(1, summarizeContextWindow - options.maxPredictionTokens)
+        // Clamp prediction tokens to the capped window so prompt + generated tokens together stay
+        // within budget. The loop below shares this bound, keeping the invariant correct even if a
+        // future caller passes a maxPredictionTokens larger than the cap.
+        let maxPredictionTokens = min(options.maxPredictionTokens, summarizeContextWindow - 1)
+        let maxPromptTokens = max(1, summarizeContextWindow - maxPredictionTokens)
         let promptTokens = allPromptTokens.count > maxPromptTokens
             ? Array(allPromptTokens.suffix(maxPromptTokens))
             : allPromptTokens
@@ -224,7 +229,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         }
 
         var generatedText = ""
-        for _ in 0 ..< options.maxPredictionTokens {
+        for _ in 0 ..< maxPredictionTokens {
             // Cooperative cancellation: return partial text on timeout.
             if Task.isCancelled { break }