From c58a938a15d73215bd9dae54a4ac652482ea4f95 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Wed, 27 May 2026 01:06:53 -0700 Subject: [PATCH 1/2] Add per-sequence thread budget to SamplingConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit createSequence now reads SamplingConfig.thread_count for a context's n_threads / n_threads_batch, falling back to the all-cores default when it is <= 0. This lets a caller cap a background sequence (e.g. the visual-context summarizer) to a smaller thread budget so it decodes concurrently with latency-critical autocomplete instead of oversubscribing every core and starving it — the dominant reason two CPU sequences showed no real concurrency before. Backward compatible: thread_count == 0 preserves prior behavior. --- .../CotabbyInferenceEngine/CotabbyInferenceEngine.cpp | 10 ++++++++-- .../include/CotabbyInferenceEngine.h | 5 +++++ .../CotabbyInferenceTests/LlamaMiddlewareTests.swift | 11 ++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp index 251ebd1..b1205bb 100644 --- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp +++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp @@ -252,8 +252,14 @@ int32_t CotabbyInferenceEngine::createSequence(SamplingConfig config) { ctx_params.n_batch = static_cast<uint32_t>(impl_->batch_size); ctx_params.n_ubatch = static_cast<uint32_t>(impl_->batch_size); ctx_params.n_seq_max = 1; - ctx_params.n_threads = static_cast<int32_t>(impl_->thread_count); - ctx_params.n_threads_batch = static_cast<int32_t>(impl_->thread_count); + // Per-sequence thread budget: a positive config value lets callers cap a background sequence + // (e.g. the summarizer) so it shares cores with autocomplete instead of oversubscribing them. + // Anything <= 0 falls back to the engine default of all hardware threads. 
+ const int32_t sequence_threads = config.thread_count > 0 + ? static_cast<int32_t>(config.thread_count) + : static_cast<int32_t>(impl_->thread_count); + ctx_params.n_threads = sequence_threads; + ctx_params.n_threads_batch = sequence_threads; ctx_params.offload_kqv = true; llama_context* ctx = llama_init_from_model(impl_->model, ctx_params); diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h index dca6f12..651a988 100644 --- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h +++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h @@ -11,6 +11,11 @@ struct SamplingConfig { float min_p; float repetition_penalty; uint32_t seed; + // CPU threads this sequence's context may use during decode. 0 (or negative) keeps the + // engine default of all hardware threads. Set a smaller budget on background sequences (e.g. + // the visual-context summarizer) so they decode concurrently with latency-critical + // autocomplete instead of oversubscribing every core and starving it. 
+ int thread_count; }; struct SWIFT_SELF_CONTAINED SampleResult { diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift index 064c1d2..a15bf22 100644 --- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift +++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift @@ -31,7 +31,8 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.7, min_p: 0.08, repetition_penalty: 1.05, - seed: 0 + seed: 0, + thread_count: 0 ) let seqId = engine.createSequence(config) XCTAssertEqual(seqId, -1) @@ -107,7 +108,8 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.7, min_p: 0.08, repetition_penalty: 1.05, - seed: 42 + seed: 42, + thread_count: 0 ) let seqA = engine.createSequence(autoConfig) XCTAssertGreaterThan(seqA, 0) @@ -152,7 +154,10 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.95, min_p: 0.05, repetition_penalty: 1.4, - seed: 0 + seed: 0, + // Background summary sequence runs on a reduced thread budget so it can decode + // concurrently with autocomplete instead of oversubscribing every core. + thread_count: 2 ) let seqB = engine.createSequence(summaryConfig) XCTAssertGreaterThan(seqB, 0) From 97bcc2d942e0b53f68446d08740dc2e57f13bd00 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Wed, 27 May 2026 01:32:04 -0700 Subject: [PATCH 2/2] Revert "Add per-sequence thread budget to SamplingConfig" This reverts commit c58a938a15d73215bd9dae54a4ac652482ea4f95. 
--- .../CotabbyInferenceEngine/CotabbyInferenceEngine.cpp | 10 ++-------- .../include/CotabbyInferenceEngine.h | 5 ----- .../CotabbyInferenceTests/LlamaMiddlewareTests.swift | 11 +++-------- 3 files changed, 5 insertions(+), 21 deletions(-) diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp index b1205bb..251ebd1 100644 --- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp +++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp @@ -252,14 +252,8 @@ int32_t CotabbyInferenceEngine::createSequence(SamplingConfig config) { ctx_params.n_batch = static_cast<uint32_t>(impl_->batch_size); ctx_params.n_ubatch = static_cast<uint32_t>(impl_->batch_size); ctx_params.n_seq_max = 1; - // Per-sequence thread budget: a positive config value lets callers cap a background sequence - // (e.g. the summarizer) so it shares cores with autocomplete instead of oversubscribing them. - // Anything <= 0 falls back to the engine default of all hardware threads. - const int32_t sequence_threads = config.thread_count > 0 - ? static_cast<int32_t>(config.thread_count) - : static_cast<int32_t>(impl_->thread_count); - ctx_params.n_threads = sequence_threads; - ctx_params.n_threads_batch = sequence_threads; + ctx_params.n_threads = static_cast<int32_t>(impl_->thread_count); + ctx_params.n_threads_batch = static_cast<int32_t>(impl_->thread_count); ctx_params.offload_kqv = true; llama_context* ctx = llama_init_from_model(impl_->model, ctx_params); diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h index 651a988..dca6f12 100644 --- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h +++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h @@ -11,11 +11,6 @@ struct SamplingConfig { float min_p; float repetition_penalty; uint32_t seed; - // CPU threads this sequence's context may use during decode. 
0 (or negative) keeps the - // engine default of all hardware threads. Set a smaller budget on background sequences (e.g. - // the visual-context summarizer) so they decode concurrently with latency-critical - // autocomplete instead of oversubscribing every core and starving it. - int thread_count; }; struct SWIFT_SELF_CONTAINED SampleResult { diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift index a15bf22..064c1d2 100644 --- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift +++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift @@ -31,8 +31,7 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.7, min_p: 0.08, repetition_penalty: 1.05, - seed: 0, - thread_count: 0 + seed: 0 ) let seqId = engine.createSequence(config) XCTAssertEqual(seqId, -1) @@ -108,8 +107,7 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.7, min_p: 0.08, repetition_penalty: 1.05, - seed: 42, - thread_count: 0 + seed: 42 ) let seqA = engine.createSequence(autoConfig) XCTAssertGreaterThan(seqA, 0) @@ -154,10 +152,7 @@ final class LlamaMiddlewareTests: XCTestCase { top_p: 0.95, min_p: 0.05, repetition_penalty: 1.4, - seed: 0, - // Background summary sequence runs on a reduced thread budget so it can decode - // concurrently with autocomplete instead of oversubscribing every core. - thread_count: 2 + seed: 0 ) let seqB = engine.createSequence(summaryConfig) XCTAssertGreaterThan(seqB, 0)