FuJacob · FuJacob · Jun 1, 2026 · Jun 1, 2026
diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
@@ -43,20 +43,22 @@ enum SuggestionWordCountPreset: String, CaseIterable, Equatable, Hashable, Senda
     }
 
     /// Token budget is the sole governor of completion length on the local model (the in-prompt
-    /// word-range cue was removed), so it must track the upper word bound closely. Sized at
-    /// ~1.5x the upper word count to leave headroom for multi-token words (contractions, proper
-    /// nouns, punctuation) without overrunning the preset. The earlier 50% bump (17/27/45) let
-    /// completions blow past the setting, e.g. ~12 words on the shortest preset (#271).
+    /// word-range cue was removed), so it must track the upper word bound closely. English BPE
+    /// averages ~1.3 tokens per word, so these are sized at ~1.25x the upper word count: the cap
+    /// lands at or just under the upper bound instead of past it. History: a ~1.5x sizing
+    /// (6/11/18/30) still overran because real prose uses fewer tokens per word than that assumed,
+    /// and an earlier 50% bump overran further, e.g. ~12 words on the shortest preset (#271). When
+    /// unsure, bias shorter; a clipped suggestion is cheaper than one that blows past the setting.
     var suggestedPredictionTokenBudget: Int {
         switch self {
         case .twoToFour:
-            return 6
+            return 5
         case .fourToSeven:
-            return 11
+            return 9
         case .sevenToTwelve:
-            return 18
+            return 15
         case .twelveToTwenty:
-            return 30
+            return 25
         }
     }
 }
@@ -101,8 +103,10 @@ struct SuggestionConfiguration: Equatable, Sendable {
     /// The configuration shipped by the app today.
     /// These are product defaults, not temporary debug overrides.
     static let standard = SuggestionConfiguration(
-        // Keep completions short so ghost text stays fast and easy to accept.
-        maxPredictionTokens: 8,
+        // Floor for the per-request token budget (see SuggestionRequestFactory.activeMaxPredictionTokens).
+        // Kept equal to the smallest preset's budget (2-4 words -> 5 tokens) so that preset governs
+        // instead of being silently raised; keeps ghost text short, fast, and easy to accept.
+        maxPredictionTokens: 5,
         // Aggressive debounce: 20ms keeps time-to-first-suggestion low while still collapsing
         // bursts (superseded generations are cancelled; the host-publish poll absorbs AX lag).
         debounceMilliseconds: 20,

diff --git a/CotabbyTests/ModelAndPresentationValueTests.swift b/CotabbyTests/ModelAndPresentationValueTests.swift
@@ -40,16 +40,16 @@ final class SuggestionTextColorCodecTests: XCTestCase {
 final class SuggestionModelValueTests: XCTestCase {
     func test_wordCountPresetsExposeMatchingPromptInstructionsAndTokenBudgets() {
         XCTAssertEqual(SuggestionWordCountPreset.twoToFour.promptInstruction, "Return only the next 2 to 4 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.twoToFour.suggestedPredictionTokenBudget, 6)
+        XCTAssertEqual(SuggestionWordCountPreset.twoToFour.suggestedPredictionTokenBudget, 5)
 
         XCTAssertEqual(SuggestionWordCountPreset.fourToSeven.promptInstruction, "Return only the next 4 to 7 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.fourToSeven.suggestedPredictionTokenBudget, 11)
+        XCTAssertEqual(SuggestionWordCountPreset.fourToSeven.suggestedPredictionTokenBudget, 9)
 
         XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.promptInstruction, "Return only the next 7 to 12 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 18)
+        XCTAssertEqual(SuggestionWordCountPreset.sevenToTwelve.suggestedPredictionTokenBudget, 15)
 
         XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.promptInstruction, "Return only the next 12 to 20 words.")
-        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 30)
+        XCTAssertEqual(SuggestionWordCountPreset.twelveToTwenty.suggestedPredictionTokenBudget, 25)
     }
 
     func test_activeSuggestionSession_clampsConsumedCountAndSlicesByCharacters() {

diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift
@@ -167,7 +167,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
             result.request.completionLengthInstruction,
             "Return only the next 12 to 20 words."
         )
-        XCTAssertEqual(result.request.maxPredictionTokens, 30)
+        XCTAssertEqual(result.request.maxPredictionTokens, 25)
         XCTAssertEqual(result.promptPreview, result.request.prompt)
     }