Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions Cotabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -201,19 +201,6 @@ struct LlamaGenerationOptions: Equatable, Sendable {
/// Average per-token log-probability below which a completion is suppressed as low-confidence.
/// Defaults to -infinity, which disables suppression entirely.
var confidenceFloor: Double = -.infinity

/// Builds the sampling preset used for summary generation.
///
/// Only the token budget and the temperature vary per call; top-k, top-p, min-p and the
/// repetition penalty are fixed by this preset.
///
/// - Parameters:
///   - maxPredictionTokens: Upper bound on the number of tokens to generate.
///   - temperature: Sampling temperature forwarded unchanged.
/// - Returns: Generation options configured for summaries.
static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions {
    let preset: LlamaGenerationOptions = .init(
        maxPredictionTokens: maxPredictionTokens,
        temperature: temperature,
        topK: 40,
        topP: 0.95,
        minP: 0.05,
        // Deliberately stronger than the autocomplete penalty (1.05): summaries run for more
        // tokens and tend to loop when the OCR input repeats phrases.
        repetitionPenalty: 1.4
    )
    return preset
}
}

/// The concrete runtime assets selected during bootstrap after checking available model files.
Expand Down
64 changes: 0 additions & 64 deletions Cotabby/Services/Runtime/LlamaRuntimeCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -250,70 +250,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
return generatedText
}

// MARK: - Summary generation (concurrent with autocomplete)

/// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected.
///
/// The call registers itself as an active operation under `lifecycleCondition` for its whole
/// duration and broadcasts when it finishes; this lifecycle guard prevents `shutdown()` from
/// unloading the model while sampling is active.
///
/// - Parameters:
///   - prompt: Text to summarize. If it tokenizes to more tokens than fit next to the
///     prediction budget, only the trailing tokens are kept.
///   - options: Sampling options. A negative `maxPredictionTokens` is treated as zero.
/// - Returns: The generated text. This may be partial (or empty) when the surrounding task is
///   cancelled or the engine reports end-of-sequence / cancellation.
/// - Throws: `LlamaRuntimeError.unavailable` when no model is loaded or the runtime is shutting
///   down; `LlamaRuntimeError.generationFailed` when tokenization yields nothing, the sequence
///   cannot be created, or prompt decoding fails.
func summarize(
    prompt: String,
    options: LlamaGenerationOptions
) throws -> String {
    guard let preparedRuntime else {
        throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
    }

    // Register as an active operation unless shutdown has already begun.
    lifecycleCondition.lock()
    guard !isShuttingDown else {
        lifecycleCondition.unlock()
        throw LlamaRuntimeError.unavailable("The runtime is shutting down.")
    }
    activeOperationCount += 1
    lifecycleCondition.unlock()

    // Always deregister and wake any waiter, whichever way this function exits.
    defer {
        lifecycleCondition.lock()
        activeOperationCount -= 1
        lifecycleCondition.broadcast()
        lifecycleCondition.unlock()
    }

    let allPromptTokens = tokenize(prompt)
    guard !allPromptTokens.isEmpty else {
        throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
    }

    // A negative budget would make `0 ..< budget` below trap at runtime, so clamp it once and
    // use the clamped value for both the truncation arithmetic and the sampling loop.
    let predictionBudget = max(0, options.maxPredictionTokens)

    // Reserve room for the prediction budget; keep the tail of an over-long prompt.
    let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - predictionBudget)
    let promptTokens = allPromptTokens.count > maxPromptTokens
        ? Array(allPromptTokens.suffix(maxPromptTokens))
        : allPromptTokens

    let config = Self.samplingConfig(from: options)
    let seqID = engine.createSequence(config)
    guard seqID >= 0 else {
        throw LlamaRuntimeError.generationFailed("Unable to create summary sequence.")
    }
    // The sequence is ephemeral: tear it down on every exit path.
    defer { engine.destroySequence(seqID) }

    var tokens = promptTokens
    let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0)
    guard status == .ok else {
        throw LlamaRuntimeError.generationFailed("Summary prompt decoding failed.")
    }

    var generatedText = ""
    for _ in 0 ..< predictionBudget {
        // Cooperative cancellation: return partial text on timeout.
        if Task.isCancelled { break }

        let result = engine.sampleNext(seqID)
        if result.is_eos || result.was_cancelled { break }

        generatedText += Self.extractPiece(result)
    }

    return generatedText
}

// MARK: - Cache and lifecycle

/// Drops the reusable autocomplete sequence while keeping the loaded model alive.
Expand Down
42 changes: 0 additions & 42 deletions Cotabby/Services/Runtime/LlamaRuntimeManager.swift
Original file line number Diff line number Diff line change
Expand Up @@ -145,48 +145,6 @@ final class LlamaRuntimeManager: ObservableObject {
}
}

/// Produces a short summary for `prompt` without disturbing the autocomplete cache.
///
/// The blocking core call runs in a detached task. Cancelling the caller cancels that task,
/// and the cancellation is reported as `LlamaRuntimeError.cancelled` instead of returning the
/// partial text the core hands back.
///
/// - Parameters:
///   - prompt: Text to summarize.
///   - maxPredictionTokens: Upper bound on generated tokens.
///   - temperature: Sampling temperature.
/// - Returns: The generated summary text.
/// - Throws: `LlamaRuntimeError.cancelled` on cancellation; any other `LlamaRuntimeError` is
///   rethrown as-is, and every other error is wrapped in `.generationFailed`. Non-cancellation
///   failures are also recorded in `diagnostics.lastError`.
func summarize(
    prompt: String,
    maxPredictionTokens: Int,
    temperature: Double
) async throws -> String {
    _ = try await preparedRuntime()

    // Capture the core by value so the detached task does not hold on to `self`.
    let runtimeCore = core
    let summaryOptions = LlamaGenerationOptions.summary(
        maxPredictionTokens: maxPredictionTokens,
        temperature: temperature
    )
    let worker = Task.detached {
        try runtimeCore.summarize(prompt: prompt, options: summaryOptions)
    }

    do {
        return try await withTaskCancellationHandler {
            // Mirrors `generate`: the worker yields partial text when cancelled, so re-check
            // cancellation here to route into the `CancellationError` handler below and keep
            // the error vocabulary identical across both entry points.
            let text = try await worker.value
            try Task.checkCancellation()
            return text
        } onCancel: {
            worker.cancel()
        }
    } catch is CancellationError {
        throw LlamaRuntimeError.cancelled
    } catch let runtimeError as LlamaRuntimeError {
        diagnostics.lastError = runtimeError.localizedDescription
        throw runtimeError
    } catch {
        let wrapped = LlamaRuntimeError.generationFailed(error.localizedDescription)
        diagnostics.lastError = wrapped.localizedDescription
        throw wrapped
    }
}

/// Clears the native prompt KV cache without unloading the model.
func resetPromptCache() {
core.resetPromptCache()
Expand Down