Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cotabby.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -1173,7 +1173,7 @@
isa = XCRemoteSwiftPackageReference;
repositoryURL = "https://github.com/FuJacob/cotabbyinference.git";
requirement = {
branch = main;
branch = "feat/per-sequence-thread-budget";
kind = branch;
};
};
Expand Down
15 changes: 14 additions & 1 deletion Cotabby/Models/LlamaRuntimeModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ struct LlamaGenerationOptions: Equatable, Sendable {
let minP: Double
let repetitionPenalty: Double
var seed: UInt32?
/// CPU threads the sequence's context may use. 0 means "the runtime default" (all hardware
/// threads). Autocomplete leaves this at 0 to stay as fast as possible; the background
/// summarizer sets a smaller budget so it decodes concurrently with autocomplete instead of
/// oversubscribing every core. Does not affect sampling output, only execution width.
var threadCount: Int = 0

static func summary(maxPredictionTokens: Int, temperature: Double) -> LlamaGenerationOptions {
LlamaGenerationOptions(
Expand All @@ -214,9 +219,17 @@ struct LlamaGenerationOptions: Equatable, Sendable {
minP: 0.05,
// Higher penalty than autocomplete (1.05) because summaries span more tokens and
// are more prone to looping when OCR input contains repeated phrases.
repetitionPenalty: 1.4
repetitionPenalty: 1.4,
// Background work: cap to roughly a quarter of the cores (min 2) so the summarizer
// runs alongside latency-critical autocomplete rather than fighting it for every core.
threadCount: Self.backgroundThreadBudget
)
}

/// Thread budget for background (summary) sequences: one quarter of the active processor
/// count (integer division), raised to 2 whenever that quotient is smaller.
static var backgroundThreadBudget: Int {
    let quarterOfCores = ProcessInfo.processInfo.activeProcessorCount / 4
    return quarterOfCores < 2 ? 2 : quarterOfCores
}
}

/// The concrete runtime assets selected during bootstrap after checking available model files.
Expand Down
3 changes: 2 additions & 1 deletion Cotabby/Services/Runtime/LlamaRuntimeCore.swift
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
top_p: Float(options.topP),
min_p: Float(options.minP),
repetition_penalty: Float(options.repetitionPenalty),
seed: options.seed ?? 0
seed: options.seed ?? 0,
thread_count: Int32(options.threadCount)
)
}

Expand Down
4 changes: 3 additions & 1 deletion project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ packages:
exactVersion: 2.9.1
CotabbyInference:
url: https://github.com/FuJacob/cotabbyinference.git
branch: main
# Temporarily tracks the per-sequence thread-budget branch (cotabbyinference#1).
# Move back to `branch: main` once that PR merges.
Comment on lines 15 to +17
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Upstream branch dependency must land before merge

CotabbyInference currently tracks the feat/per-sequence-thread-budget branch, resolved at commit c58a938. If cotabbyinference#1 is force-pushed, rebased, or deleted before this PR merges, CI will resolve a different (or missing) tree, and the thread_count field this PR depends on could disappear, breaking the build. The same branch reference appears in project.pbxproj. Make sure to restore branch: main in both files after the upstream dependency merges, and treat those edits as a hard merge-order gate.

Fix in Codex Fix in Claude Code

branch: feat/per-sequence-thread-budget
swift-log:
url: https://github.com/apple/swift-log.git
from: 1.12.1
Expand Down