diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index fe7c8ff2..e996e056 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -143,7 +143,7 @@ struct LlamaRuntimeConfiguration: Equatable, Sendable { "gemma-3-1b-it-Q4_K_M.gguf", "Qwen3-0.6B-Q4_K_M.gguf" ], - contextWindowTokens: 2048, + contextWindowTokens: 4096, batchSize: 512, gpuLayerCount: -1 ) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index fb8eed5b..9391b130 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -165,6 +165,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // MARK: - Summary generation (concurrent with autocomplete) + private static let summarizeContextWindowCap = 2048 + /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected. /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active. func summarize( @@ -195,7 +197,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.") } - let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens) + // Summary generation is auxiliary visual-context work, so cap the KV slot budget it may + // consume from the shared pool below autocomplete's full window. The KV cache is a single + // pool allocated once at model load; this cap keeps the summarize sequence from monopolising + // slots and evicting the autocomplete cache, rather than allocating any extra memory. + let summarizeContextWindow = min( + preparedRuntime.contextWindowTokens, + Self.summarizeContextWindowCap + ) + + // Clamp prediction tokens to the capped window so prompt + generated tokens together stay + // within budget. The loop below shares this bound, keeping the invariant correct even if a + // future caller passes a maxPredictionTokens larger than the cap. + let maxPredictionTokens = min(options.maxPredictionTokens, summarizeContextWindow - 1) + let maxPromptTokens = max(1, summarizeContextWindow - maxPredictionTokens) let promptTokens = allPromptTokens.count > maxPromptTokens ? Array(allPromptTokens.suffix(maxPromptTokens)) : allPromptTokens @@ -214,7 +229,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { } var generatedText = "" - for _ in 0 ..< options.maxPredictionTokens { + for _ in 0 ..< maxPredictionTokens { // Cooperative cancellation: return partial text on timeout. if Task.isCancelled { break }