From 993f245a7834dfe10c4c948d470461c14f23510f Mon Sep 17 00:00:00 2001 From: jam-cai Date: Sun, 24 May 2026 23:24:33 -0400 Subject: [PATCH 1/2] Increase llama context window safely --- Cotabby/Models/LlamaRuntimeModels.swift | 2 +- Cotabby/Services/Runtime/LlamaRuntimeCore.swift | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index c8c8ba15..94d1e217 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -285,7 +285,7 @@ struct LlamaRuntimeConfiguration: Equatable, Sendable { "gemma-3-1b-it-Q4_K_M.gguf", "Qwen3-0.6B-Q4_K_M.gguf" ], - contextWindowTokens: 2048, + contextWindowTokens: 4096, batchSize: 512, gpuLayerCount: -1 ) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 2761b2ca..22515110 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -174,6 +174,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // MARK: - Summary generation (concurrent with autocomplete) + private static let summarizeContextWindowCap = 2048 + /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected. /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active. func summarize( @@ -204,7 +206,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.") } - let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens) + // Summary generation is auxiliary visual-context work, so keep its temporary context + // smaller than autocomplete's main KV cache. That lets autocomplete use a larger default + // window without doubling peak memory when summarization runs alongside it. + let summarizeContextWindow = min( + preparedRuntime.contextWindowTokens, + Self.summarizeContextWindowCap + ) + + let maxPromptTokens = max(1, summarizeContextWindow - options.maxPredictionTokens) let promptTokens = allPromptTokens.count > maxPromptTokens ? Array(allPromptTokens.suffix(maxPromptTokens)) : allPromptTokens From c12dde48f1a57db34d2663ccf3255b43330122dd Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Mon, 25 May 2026 04:04:07 -0700 Subject: [PATCH 2/2] Clamp summarize prediction tokens to KV slot budget and clarify comment Bound the summarize generation loop by the same cap as the prompt budget so prompt + generated tokens stay within the capped window even if a future caller passes a larger maxPredictionTokens. Reword the rationale to describe the shared KV slot budget rather than peak memory, since the KV cache is a single pool allocated once at model load. --- Cotabby/Services/Runtime/LlamaRuntimeCore.swift | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 9a3531a2..9391b130 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -197,15 +197,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.") } - // Summary generation is auxiliary visual-context work, so keep its temporary context - // smaller than autocomplete's main KV cache. That lets autocomplete use a larger default - // window without doubling peak memory when summarization runs alongside it. + // Summary generation is auxiliary visual-context work, so cap the KV slot budget it may + // consume from the shared pool below autocomplete's full window. The KV cache is a single + // pool allocated once at model load; this cap keeps the summarize sequence from monopolising + // slots and evicting the autocomplete cache, rather than allocating any extra memory. let summarizeContextWindow = min( preparedRuntime.contextWindowTokens, Self.summarizeContextWindowCap ) - let maxPromptTokens = max(1, summarizeContextWindow - options.maxPredictionTokens) + // Clamp prediction tokens to the capped window so prompt + generated tokens together stay + // within budget. The loop below shares this bound, keeping the invariant correct even if a + // future caller passes a maxPredictionTokens larger than the cap. + let maxPredictionTokens = min(options.maxPredictionTokens, summarizeContextWindow - 1) + let maxPromptTokens = max(1, summarizeContextWindow - maxPredictionTokens) let promptTokens = allPromptTokens.count > maxPromptTokens ? Array(allPromptTokens.suffix(maxPromptTokens)) : allPromptTokens @@ -224,7 +229,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { } var generatedText = "" - for _ in 0 ..< options.maxPredictionTokens { + for _ in 0 ..< maxPredictionTokens { // Cooperative cancellation: return partial text on timeout. if Task.isCancelled { break }