diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 864775c..d39f8ed 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -201,19 +201,6 @@ struct LlamaGenerationOptions: Equatable, Sendable { /// Average per-token log-probability below which a completion is suppressed as low-confidence. /// Defaults to -infinity, which disables suppression entirely. var confidenceFloor: Double = -.infinity - - static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions { - LlamaGenerationOptions( - maxPredictionTokens: maxPredictionTokens, - temperature: temperature, - topK: 40, - topP: 0.95, - minP: 0.05, - // Higher penalty than autocomplete (1.05) because summaries span more tokens and - // are more prone to looping when OCR input contains repeated phrases. - repetitionPenalty: 1.4 - ) - } } /// The concrete runtime assets selected during bootstrap after checking available model files. diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 5403e6e..f230f16 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -250,70 +250,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return generatedText } - // MARK: - Summary generation (concurrent with autocomplete) - - /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected. - /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active. - func summarize( - prompt: String, - options: LlamaGenerationOptions - ) throws -> String { - guard let preparedRuntime else { - throw LlamaRuntimeError.unavailable("The llama model is not loaded.") - } - - lifecycleCondition.lock() - guard !isShuttingDown else { - lifecycleCondition.unlock() - throw LlamaRuntimeError.unavailable("The runtime is shutting down.") - } - activeOperationCount += 1 - lifecycleCondition.unlock() - - defer { - lifecycleCondition.lock() - activeOperationCount -= 1 - lifecycleCondition.broadcast() - lifecycleCondition.unlock() - } - - let allPromptTokens = tokenize(prompt) - guard !allPromptTokens.isEmpty else { - throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.") - } - - let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens) - let promptTokens = allPromptTokens.count > maxPromptTokens - ? Array(allPromptTokens.suffix(maxPromptTokens)) - : allPromptTokens - - let config = Self.samplingConfig(from: options) - let seqID = engine.createSequence(config) - guard seqID >= 0 else { - throw LlamaRuntimeError.generationFailed("Unable to create summary sequence.") - } - defer { engine.destroySequence(seqID) } - - var tokens = promptTokens - let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0) - guard status == .ok else { - throw LlamaRuntimeError.generationFailed("Summary prompt decoding failed.") - } - - var generatedText = "" - for _ in 0 ..< options.maxPredictionTokens { - // Cooperative cancellation: return partial text on timeout. - if Task.isCancelled { break } - - let result = engine.sampleNext(seqID) - if result.is_eos || result.was_cancelled { break } - - generatedText += Self.extractPiece(result) - } - - return generatedText - } - // MARK: - Cache and lifecycle /// Drops the reusable autocomplete sequence while keeping the loaded model alive. diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index d8d2680..6d3e079 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -145,48 +145,6 @@ final class LlamaRuntimeManager: ObservableObject { } } - /// Generates a short summary using an ephemeral context so the autocomplete cache is unaffected. - func summarize( - prompt: String, - maxPredictionTokens: Int, - temperature: Double - ) async throws -> String { - _ = try await preparedRuntime() - - let core = self.core - let options = LlamaGenerationOptions.summary( - maxPredictionTokens: maxPredictionTokens, - temperature: temperature - ) - do { - let task = Task.detached { - try core.summarize( - prompt: prompt, - options: options - ) - } - return try await withTaskCancellationHandler { - // Same pattern as `generate`: the detached task returns partial text on cancel, - // so surface the cancel here via `Task.checkCancellation()` to keep the catch - // below reachable and the runtime vocabulary consistent across both paths. - let partial = try await task.value - try Task.checkCancellation() - return partial - } onCancel: { - task.cancel() - } - } catch is CancellationError { - throw LlamaRuntimeError.cancelled - } catch let error as LlamaRuntimeError { - diagnostics.lastError = error.localizedDescription - throw error - } catch { - let runtimeError = LlamaRuntimeError.generationFailed(error.localizedDescription) - diagnostics.lastError = runtimeError.localizedDescription - throw runtimeError - } - } - /// Clears the native prompt KV cache without unloading the model. func resetPromptCache() { core.resetPromptCache()