diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index 1ef9e6ae..12ef7377 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -1173,7 +1173,7 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/FuJacob/cotabbyinference.git"; requirement = { - branch = main; + branch = "feat/per-sequence-thread-budget"; kind = branch; }; }; diff --git a/Cotabby/Models/LlamaRuntimeModels.swift b/Cotabby/Models/LlamaRuntimeModels.swift index 44045981..e69c2901 100644 --- a/Cotabby/Models/LlamaRuntimeModels.swift +++ b/Cotabby/Models/LlamaRuntimeModels.swift @@ -204,6 +204,11 @@ struct LlamaGenerationOptions: Equatable, Sendable { let minP: Double let repetitionPenalty: Double var seed: UInt32? + /// CPU threads the sequence's context may use. 0 means "the runtime default" (all hardware + /// threads). Autocomplete leaves this at 0 to stay as fast as possible; the background + /// summarizer sets a smaller budget so it decodes concurrently with autocomplete instead of + /// oversubscribing every core. Does not affect sampling output, only execution width. + var threadCount: Int = 0 static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions { LlamaGenerationOptions( @@ -214,9 +219,17 @@ struct LlamaGenerationOptions: Equatable, Sendable { minP: 0.05, // Higher penalty than autocomplete (1.05) because summaries span more tokens and // are more prone to looping when OCR input contains repeated phrases. - repetitionPenalty: 1.4 + repetitionPenalty: 1.4, + // Background work: cap to roughly a quarter of the cores (min 2) so the summarizer + // runs alongside latency-critical autocomplete rather than fighting it for every core. + threadCount: Self.backgroundThreadBudget ) } + + /// ~1/4 of the machine's cores, floored at 2, used for background (summary) sequences. + static var backgroundThreadBudget: Int { + max(2, ProcessInfo.processInfo.activeProcessorCount / 4) + } } /// The concrete runtime assets selected during bootstrap after checking available model files. diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index fb8eed5b..d3fcb7df 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -381,7 +381,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { top_p: Float(options.topP), min_p: Float(options.minP), repetition_penalty: Float(options.repetitionPenalty), - seed: options.seed ?? 0 + seed: options.seed ?? 0, + thread_count: Int32(options.threadCount) ) } diff --git a/project.yml b/project.yml index c12d3dbe..e16a5b5f 100644 --- a/project.yml +++ b/project.yml @@ -13,7 +13,9 @@ packages: exactVersion: 2.9.1 CotabbyInference: url: https://github.com/FuJacob/cotabbyinference.git - branch: main + # Temporarily tracks the per-sequence thread-budget branch (cotabbyinference#1). + # Move back to `branch: main` once that PR merges. + branch: feat/per-sequence-thread-budget swift-log: url: https://github.com/apple/swift-log.git from: 1.12.1