FuJacob · FuJacob · May 27, 2026 · greptile-apps · May 27, 2026
diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj
@@ -1173,7 +1173,7 @@
 			isa = XCRemoteSwiftPackageReference;
 			repositoryURL = "https://github.com/FuJacob/cotabbyinference.git";
 			requirement = {
-				branch = main;
+				branch = "feat/per-sequence-thread-budget";
 				kind = branch;
 			};
 		};

diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift
@@ -204,6 +204,11 @@ struct LlamaGenerationOptions: Equatable, Sendable {
     let minP: Double
     let repetitionPenalty: Double
     var seed: UInt32?
+    /// CPU threads the sequence's context may use. 0 means "the runtime default" (all hardware
+    /// threads). Autocomplete leaves this at 0 to stay as fast as possible; the background
+    /// summarizer sets a smaller budget so it decodes concurrently with autocomplete instead of
+    /// oversubscribing every core. Does not affect sampling output, only execution width.
+    var threadCount: Int = 0
 
     static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions {
         LlamaGenerationOptions(
@@ -214,9 +219,17 @@ struct LlamaGenerationOptions: Equatable, Sendable {
             minP: 0.05,
             // Higher penalty than autocomplete (1.05) because summaries span more tokens and
             // are more prone to looping when OCR input contains repeated phrases.
-            repetitionPenalty: 1.4
+            repetitionPenalty: 1.4,
+            // Background work: cap to roughly a quarter of the cores (min 2) so the summarizer
+            // runs alongside latency-critical autocomplete rather than fighting it for every core.
+            threadCount: Self.backgroundThreadBudget
         )
     }
+
+    /// Background (summary) thread budget: a quarter of the active processor count (integer division), floored at 2.
+    static var backgroundThreadBudget: Int {
+        max(2, ProcessInfo.processInfo.activeProcessorCount / 4)
+    }
 }
 
 /// The concrete runtime assets selected during bootstrap after checking available model files.

diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -381,7 +381,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             top_p: Float(options.topP),
             min_p: Float(options.minP),
             repetition_penalty: Float(options.repetitionPenalty),
-            seed: options.seed ?? 0
+            seed: options.seed ?? 0,
+            thread_count: Int32(options.threadCount)
         )
     }
 

diff --git a/project.yml b/project.yml
@@ -13,7 +13,9 @@ packages:
     exactVersion: 2.9.1
   CotabbyInference:
     url: https://github.com/FuJacob/cotabbyinference.git
-    branch: main
+    # Temporarily tracks the per-sequence thread-budget branch (cotabbyinference#1).
+    # Move back to `branch: main` once that PR merges.
+    branch: feat/per-sequence-thread-budget
   swift-log:
     url: https://github.com/apple/swift-log.git
     from: 1.12.1