FuJacob · FuJacob · Jun 1, 2026 · Jun 1, 2026
diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -201,19 +201,6 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     /// Average per-token log-probability below which a completion is suppressed as low-confidence.
     /// Defaults to -infinity, which disables suppression entirely.
     var confidenceFloor: Double = -.infinity
-
-    static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions {
-        LlamaGenerationOptions(
-            maxPredictionTokens: maxPredictionTokens,
-            temperature: temperature,
-            topK: 40,
-            topP: 0.95,
-            minP: 0.05,
-            // Higher penalty than autocomplete (1.05) because summaries span more tokens and
-            // are more prone to looping when OCR input contains repeated phrases.
-            repetitionPenalty: 1.4
-        )
-    }
 }
 
 /// The concrete runtime assets selected during bootstrap after checking available model files.

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -250,70 +250,6 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return generatedText
     }
 
-    // MARK: - Summary generation (concurrent with autocomplete)
-
-    /// Generates a summary using an ephemeral sequence so the autocomplete cache is unaffected.
-    /// The lifecycle guard prevents `shutdown()` from unloading the model while sampling is active.
-    func summarize(
-        prompt: String,
-        options: LlamaGenerationOptions
-    ) throws -> String {
-        guard let preparedRuntime else {
-            throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
-        }
-
-        lifecycleCondition.lock()
-        guard !isShuttingDown else {
-            lifecycleCondition.unlock()
-            throw LlamaRuntimeError.unavailable("The runtime is shutting down.")
-        }
-        activeOperationCount += 1
-        lifecycleCondition.unlock()
-
-        defer {
-            lifecycleCondition.lock()
-            activeOperationCount -= 1
-            lifecycleCondition.broadcast()
-            lifecycleCondition.unlock()
-        }
-
-        let allPromptTokens = tokenize(prompt)
-        guard !allPromptTokens.isEmpty else {
-            throw LlamaRuntimeError.generationFailed("Tokenization returned no prompt tokens.")
-        }
-
-        let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens)
-        let promptTokens = allPromptTokens.count > maxPromptTokens
-            ? Array(allPromptTokens.suffix(maxPromptTokens))
-            : allPromptTokens
-
-        let config = Self.samplingConfig(from: options)
-        let seqID = engine.createSequence(config)
-        guard seqID >= 0 else {
-            throw LlamaRuntimeError.generationFailed("Unable to create summary sequence.")
-        }
-        defer { engine.destroySequence(seqID) }
-
-        var tokens = promptTokens
-        let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0)
-        guard status == .ok else {
-            throw LlamaRuntimeError.generationFailed("Summary prompt decoding failed.")
-        }
-
-        var generatedText = ""
-        for _ in 0 ..< options.maxPredictionTokens {
-            // Cooperative cancellation: return partial text on timeout.
-            if Task.isCancelled { break }
-
-            let result = engine.sampleNext(seqID)
-            if result.is_eos || result.was_cancelled { break }
-
-            generatedText += Self.extractPiece(result)
-        }
-
-        return generatedText
-    }
-
     // MARK: - Cache and lifecycle
 
     /// Drops the reusable autocomplete sequence while keeping the loaded model alive.

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -145,48 +145,6 @@ final class LlamaRuntimeManager: ObservableObject {
         }
     }
 
-    /// Generates a short summary using an ephemeral context so the autocomplete cache is unaffected.
-    func summarize(
-        prompt: String,
-        maxPredictionTokens: Int,
-        temperature: Double
-    ) async throws -> String {
-        _ = try await preparedRuntime()
-
-        let core = self.core
-        let options = LlamaGenerationOptions.summary(
-            maxPredictionTokens: maxPredictionTokens,
-            temperature: temperature
-        )
-        do {
-            let task = Task.detached {
-                try core.summarize(
-                    prompt: prompt,
-                    options: options
-                )
-            }
-            return try await withTaskCancellationHandler {
-                // Same pattern as `generate`: the detached task returns partial text on cancel,
-                // so surface the cancel here via `Task.checkCancellation()` to keep the catch
-                // below reachable and the runtime vocabulary consistent across both paths.
-                let partial = try await task.value
-                try Task.checkCancellation()
-                return partial
-            } onCancel: {
-                task.cancel()
-            }
-        } catch is CancellationError {
-            throw LlamaRuntimeError.cancelled
-        } catch let error as LlamaRuntimeError {
-            diagnostics.lastError = error.localizedDescription
-            throw error
-        } catch {
-            let runtimeError = LlamaRuntimeError.generationFailed(error.localizedDescription)
-            diagnostics.lastError = runtimeError.localizedDescription
-            throw runtimeError
-        }
-    }
-
     /// Clears the native prompt KV cache without unloading the model.
     func resetPromptCache() {
         core.resetPromptCache()