Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cotabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ struct LlamaRuntimeConfiguration: Equatable, Sendable {
"gemma-3-1b-it-Q4_K_M.gguf",
"Qwen3-0.6B-Q4_K_M.gguf"
],
contextWindowTokens: 2048,
contextWindowTokens: 4096,
Comment thread
greptile-apps[bot] marked this conversation as resolved.
batchSize: 512,
gpuLayerCount: -1
)
Expand Down
19 changes: 17 additions & 2 deletions Cotabby/Services/Runtime/LlamaRuntimeCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {

// MARK: - Summary generation (concurrent with autocomplete)

/// Upper bound, in tokens, on the context window that summary generation may use.
/// Summary generation works within the smaller of this value and the runtime's
/// `contextWindowTokens`, so it never claims more than this many slots.
private static let summarizeContextWindowCap = 2048

/// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected.
/// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active.
func summarize(
Expand Down Expand Up @@ -195,7 +197,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
}

let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens)
// Summary generation is auxiliary visual-context work, so cap the share of the KV slot pool
// it may consume, keeping it below autocomplete's full window. The KV cache is a single pool
// allocated once at model load, so this cap does not allocate any extra memory; it only keeps
// the summarize sequence from monopolising slots and evicting the autocomplete cache.
let summarizeContextWindow = min(
preparedRuntime.contextWindowTokens,
Self.summarizeContextWindowCap
)

// Clamp prediction tokens to the capped window so prompt + generated tokens together stay
// within budget. The loop below shares this bound, keeping the invariant correct even if a
// future caller passes a maxPredictionTokens larger than the cap.
let maxPredictionTokens = min(options.maxPredictionTokens, summarizeContextWindow - 1)
let maxPromptTokens = max(1, summarizeContextWindow - maxPredictionTokens)
let promptTokens = allPromptTokens.count > maxPromptTokens
? Array(allPromptTokens.suffix(maxPromptTokens))
: allPromptTokens
Expand All @@ -214,7 +229,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
}

var generatedText = ""
for _ in 0 ..< options.maxPredictionTokens {
for _ in 0 ..< maxPredictionTokens {
// Cooperative cancellation: return partial text on timeout.
if Task.isCancelled { break }

Expand Down
Loading