From c58a938a15d73215bd9dae54a4ac652482ea4f95 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Wed, 27 May 2026 01:06:53 -0700
Subject: [PATCH 1/2] Add per-sequence thread budget to SamplingConfig
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

createSequence now sets a context's n_threads / n_threads_batch from
SamplingConfig.thread_count when that value is positive, and falls back
to the engine's all-cores default (impl_->thread_count) when it is
<= 0. This lets a caller cap a background sequence (e.g. the
visual-context summarizer) to a smaller thread budget so it can decode
concurrently with latency-critical autocomplete instead of
oversubscribing every core and starving it — the dominant reason two
CPU sequences showed no real concurrency before.

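Runtime behavior is backward compatible: thread_count <= 0 preserves the
prior behavior. Swift call sites that construct SamplingConfig with all
fields must now also pass thread_count, as the updated tests do.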
---
 .../CotabbyInferenceEngine/CotabbyInferenceEngine.cpp | 10 ++++++++--
 .../include/CotabbyInferenceEngine.h                  |  5 +++++
 .../CotabbyInferenceTests/LlamaMiddlewareTests.swift  | 11 ++++++++---
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
index 251ebd1..b1205bb 100644
--- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
+++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
@@ -252,8 +252,14 @@ int32_t CotabbyInferenceEngine::createSequence(SamplingConfig config) {
     ctx_params.n_batch = static_cast<uint32_t>(impl_->batch_size);
     ctx_params.n_ubatch = static_cast<uint32_t>(impl_->batch_size);
     ctx_params.n_seq_max = 1;
-    ctx_params.n_threads = static_cast<int32_t>(impl_->thread_count);
-    ctx_params.n_threads_batch = static_cast<int32_t>(impl_->thread_count);
+    // Per-sequence thread budget: a positive config value lets callers cap a background sequence
+    // (e.g. the summarizer) so it shares cores with autocomplete instead of oversubscribing them.
+    // Anything <= 0 falls back to the engine default of all hardware threads.
+    const int32_t sequence_threads = config.thread_count > 0
+        ? static_cast<int32_t>(config.thread_count)
+        : static_cast<int32_t>(impl_->thread_count);
+    ctx_params.n_threads = sequence_threads;
+    ctx_params.n_threads_batch = sequence_threads;
     ctx_params.offload_kqv = true;
 
     llama_context* ctx = llama_init_from_model(impl_->model, ctx_params);
diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
index dca6f12..651a988 100644
--- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
+++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
@@ -11,6 +11,11 @@ struct SamplingConfig {
     float min_p;
     float repetition_penalty;
     uint32_t seed;
+    // CPU threads this sequence's context may use during decode. 0 (or negative) keeps the
+    // engine default of all hardware threads. Set a smaller budget on background sequences (e.g.
+    // the visual-context summarizer) so they decode concurrently with latency-critical
+    // autocomplete instead of oversubscribing every core and starving it.
+    int thread_count;
 };
 
 struct SWIFT_SELF_CONTAINED SampleResult {
diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
index 064c1d2..a15bf22 100644
--- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
+++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
@@ -31,7 +31,8 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.7,
             min_p: 0.08,
             repetition_penalty: 1.05,
-            seed: 0
+            seed: 0,
+            thread_count: 0
         )
         let seqId = engine.createSequence(config)
         XCTAssertEqual(seqId, -1)
@@ -107,7 +108,8 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.7,
             min_p: 0.08,
             repetition_penalty: 1.05,
-            seed: 42
+            seed: 42,
+            thread_count: 0
         )
         let seqA = engine.createSequence(autoConfig)
         XCTAssertGreaterThan(seqA, 0)
@@ -152,7 +154,10 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.95,
             min_p: 0.05,
             repetition_penalty: 1.4,
-            seed: 0
+            seed: 0,
+            // Background summary sequence runs on a reduced thread budget so it can decode
+            // concurrently with autocomplete instead of oversubscribing every core.
+            thread_count: 2
         )
         let seqB = engine.createSequence(summaryConfig)
         XCTAssertGreaterThan(seqB, 0)

From 97bcc2d942e0b53f68446d08740dc2e57f13bd00 Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Wed, 27 May 2026 01:32:04 -0700
Subject: [PATCH 2/2] Revert "Add per-sequence thread budget to SamplingConfig"

This reverts commit c58a938a15d73215bd9dae54a4ac652482ea4f95.
---
 .../CotabbyInferenceEngine/CotabbyInferenceEngine.cpp | 10 ++--------
 .../include/CotabbyInferenceEngine.h                  |  5 -----
 .../CotabbyInferenceTests/LlamaMiddlewareTests.swift  | 11 +++--------
 3 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
index b1205bb..251ebd1 100644
--- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
+++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp
@@ -252,14 +252,8 @@ int32_t CotabbyInferenceEngine::createSequence(SamplingConfig config) {
     ctx_params.n_batch = static_cast<uint32_t>(impl_->batch_size);
     ctx_params.n_ubatch = static_cast<uint32_t>(impl_->batch_size);
     ctx_params.n_seq_max = 1;
-    // Per-sequence thread budget: a positive config value lets callers cap a background sequence
-    // (e.g. the summarizer) so it shares cores with autocomplete instead of oversubscribing them.
-    // Anything <= 0 falls back to the engine default of all hardware threads.
-    const int32_t sequence_threads = config.thread_count > 0
-        ? static_cast<int32_t>(config.thread_count)
-        : static_cast<int32_t>(impl_->thread_count);
-    ctx_params.n_threads = sequence_threads;
-    ctx_params.n_threads_batch = sequence_threads;
+    ctx_params.n_threads = static_cast<int32_t>(impl_->thread_count);
+    ctx_params.n_threads_batch = static_cast<int32_t>(impl_->thread_count);
     ctx_params.offload_kqv = true;
 
     llama_context* ctx = llama_init_from_model(impl_->model, ctx_params);
diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
index 651a988..dca6f12 100644
--- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
+++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h
@@ -11,11 +11,6 @@ struct SamplingConfig {
     float min_p;
     float repetition_penalty;
     uint32_t seed;
-    // CPU threads this sequence's context may use during decode. 0 (or negative) keeps the
-    // engine default of all hardware threads. Set a smaller budget on background sequences (e.g.
-    // the visual-context summarizer) so they decode concurrently with latency-critical
-    // autocomplete instead of oversubscribing every core and starving it.
-    int thread_count;
 };
 
 struct SWIFT_SELF_CONTAINED SampleResult {
diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
index a15bf22..064c1d2 100644
--- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
+++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift
@@ -31,8 +31,7 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.7,
             min_p: 0.08,
             repetition_penalty: 1.05,
-            seed: 0,
-            thread_count: 0
+            seed: 0
         )
         let seqId = engine.createSequence(config)
         XCTAssertEqual(seqId, -1)
@@ -108,8 +107,7 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.7,
             min_p: 0.08,
             repetition_penalty: 1.05,
-            seed: 42,
-            thread_count: 0
+            seed: 42
         )
         let seqA = engine.createSequence(autoConfig)
         XCTAssertGreaterThan(seqA, 0)
@@ -154,10 +152,7 @@ final class LlamaMiddlewareTests: XCTestCase {
             top_p: 0.95,
             min_p: 0.05,
             repetition_penalty: 1.4,
-            seed: 0,
-            // Background summary sequence runs on a reduced thread budget so it can decode
-            // concurrently with autocomplete instead of oversubscribing every core.
-            thread_count: 2
+            seed: 0
         )
         let seqB = engine.createSequence(summaryConfig)
         XCTAssertGreaterThan(seqB, 0)