diff --git a/Cotabby/Models/SuggestionModels.swift b/Cotabby/Models/SuggestionModels.swift
index f7485eeb..54104b5a 100644
--- a/Cotabby/Models/SuggestionModels.swift
+++ b/Cotabby/Models/SuggestionModels.swift
@@ -216,6 +216,11 @@ struct SuggestionRequest: Equatable, Sendable {
     /// Engines that prefer a separate instructions channel can derive their own request text from
     /// `prefixText` and the other shared fields instead of consuming this string directly.
     let prompt: String
+    /// The same llama policy as `prompt`, split into chat roles for models that ship a chat
+    /// template. Optional: the initializer defaults it to `nil`, and engines that do not use chat
+    /// roles can ignore it. The local runtime renders this through the model's own template when
+    /// available and falls back to the single-string `prompt` for base models with no template.
+    let llamaChatPrompt: LlamaPromptRenderer.ChatPrompt?
     let generation: UInt64
     let maxPredictionTokens: Int
     let temperature: Double
@@ -262,6 +267,7 @@ struct SuggestionRequest: Equatable, Sendable {
         context: FocusedInputContext,
         prefixText: String,
         prompt: String,
+        llamaChatPrompt: LlamaPromptRenderer.ChatPrompt? = nil,
         generation: UInt64,
         maxPredictionTokens: Int,
         temperature: Double,
@@ -284,6 +290,7 @@ struct SuggestionRequest: Equatable, Sendable {
         self.context = context
         self.prefixText = prefixText
         self.prompt = prompt
+        self.llamaChatPrompt = llamaChatPrompt
         self.generation = generation
         self.maxPredictionTokens = maxPredictionTokens
         self.temperature = temperature
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index a1e19769..d358ae48 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -115,6 +115,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
     /// Holds `autocompleteLock` for the full call to prevent concurrent KV cache mutation.
     func generate(
         prompt: String,
+        chatPrompt: LlamaPromptRenderer.ChatPrompt? = nil,
         cachedPrefixBytes: Int? = nil,
         options: LlamaGenerationOptions
     ) throws -> String {
@@ -137,8 +138,23 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             lifecycleCondition.unlock()
         }
 
-        let promptBytes = Array(prompt.utf8)
-        let allPromptTokens = tokenize(prompt)
+        // Prefer the model's own chat template when it ships one and the caller supplied a
+        // role-split prompt; otherwise fall back to the raw single-string path that base models
+        // need. Both branches derive `promptBytes` from the SAME string they tokenize, so the
+        // KV-cache reuse comparison downstream stays self-consistent (the external byte hint is
+        // only ever used as an upper bound via `min`, so a mismatched hint clamps reuse safely).
+        let promptBytes: [UInt8]
+        let allPromptTokens: [Int32]
+        let promptStyle: String
+        if let chatPrompt, let templated = templatedPromptTokens(chatPrompt) {
+            promptBytes = templated.bytes
+            allPromptTokens = templated.tokens
+            promptStyle = "chat_template"
+        } else {
+            promptBytes = Array(prompt.utf8)
+            allPromptTokens = tokenize(prompt)
+            promptStyle = "raw"
+        }
         guard !allPromptTokens.isEmpty else {
             CotabbyLogger.runtime.error(
                 "Tokenization returned no prompt tokens",
@@ -150,6 +166,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             "Decode start",
             metadata: [
                 "kind": .string("generate"),
+                "prompt_style": .string(promptStyle),
                 "prompt_tokens": .stringConvertible(allPromptTokens.count),
                 "max_tokens": .stringConvertible(options.maxPredictionTokens),
                 "cached_prefix_bytes": .string(cachedPrefixBytes.map(String.init) ?? "none")
@@ -443,6 +460,48 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return Array(vec)
     }
 
+    /// Renders `chatPrompt` through the loaded model's own chat template (via the engine's C-ABI
+    /// buffer accessor) and tokenizes the result with special tokens parsed, so the template's
+    /// control markers become real token IDs. Returns `nil` when the model ships no template or
+    /// rendering fails, signaling `generate` to fall back to the raw single-string prompt path.
+    ///
+    /// `add_special` is true on the tokenize call so BOS is added per the model's metadata,
+    /// matching the raw `tokenize` path. This relies on the rendered template not emitting BOS
+    /// itself (NOTE(review): a template that embeds BOS would produce a duplicate — confirm).
+    private func templatedPromptTokens(
+        _ chatPrompt: LlamaPromptRenderer.ChatPrompt
+    ) -> (bytes: [UInt8], tokens: [Int32])? {
+        guard engine.hasChatTemplate() else { return nil }
+
+        // applyChatTemplate writes the rendered prompt into our buffer and returns the byte count;
+        // a negative return is -(required size) when the buffer was too small. A count beyond the
+        // buffer is treated alike: resize once, retry. 0 means render failure → fall back to raw.
+        var buffer = [CChar](repeating: 0, count: 4096)
+        var written = engine.applyChatTemplate(
+            chatPrompt.system, chatPrompt.user, true, &buffer, Int32(buffer.count)
+        )
+        if written < 0 || Int(written) > buffer.count {
+            buffer = [CChar](repeating: 0, count: Int(written.magnitude)) // no Int32.min negation trap
+            written = engine.applyChatTemplate(
+                chatPrompt.system, chatPrompt.user, true, &buffer, Int32(buffer.count)
+            )
+        }
+        guard written > 0, Int(written) <= buffer.count else { return nil }
+
+        // `buffer` is CChar (Int8); reinterpret the written prefix as UTF-8 bytes. The failable
+        // initializer returns nil on invalid UTF-8, which we treat as a render failure (fall back).
+        let renderedBytes = buffer.prefix(Int(written)).map { UInt8(bitPattern: $0) }
+        guard let rendered = String(bytes: renderedBytes, encoding: .utf8), !rendered.isEmpty else {
+            return nil
+        }
+
+        let vec = engine.tokenizeWithOptions(rendered, Int32(rendered.utf8.count), true, true)
+        let tokens = Array(vec)
+        guard !tokens.isEmpty else { return nil }
+
+        return (Array(rendered.utf8), tokens)
+    }
+
     private static func extractPiece(_ result: SampleResult) -> String {
         guard let piece = result.piece, result.piece_length > 0 else { return "" }
         let buffer = UnsafeBufferPointer(
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index b4ed4f24..08f191f7 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -90,6 +90,7 @@ final class LlamaRuntimeManager: ObservableObject {
     /// still validates the token prefix before trusting any native KV state.
     func generate(
         prompt: String,
+        chatPrompt: LlamaPromptRenderer.ChatPrompt? = nil,
         cachedPrefixBytes: Int? = nil,
         options: LlamaGenerationOptions
     ) async throws -> String {
@@ -104,6 +105,7 @@ final class LlamaRuntimeManager: ObservableObject {
             let task = Task.detached {
                 try core.generate(
                     prompt: prompt,
+                    chatPrompt: chatPrompt,
                     cachedPrefixBytes: cachedPrefixBytes,
                     options: options
                 )
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index 03266932..3a44234f 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -37,6 +37,7 @@ final class LlamaSuggestionEngine {
             )
             let rawSuggestion = try await runtimeManager.generate(
                 prompt: request.prompt,
+                chatPrompt: request.llamaChatPrompt,
                 cachedPrefixBytes: cachedPrefixBytes,
                 options: LlamaGenerationOptions(
                     maxPredictionTokens: request.maxPredictionTokens,
diff --git a/Cotabby/Support/LlamaPromptRenderer.swift b/Cotabby/Support/LlamaPromptRenderer.swift
index 854005ff..1045f074 100644
--- a/Cotabby/Support/LlamaPromptRenderer.swift
+++ b/Cotabby/Support/LlamaPromptRenderer.swift
@@ -1,12 +1,15 @@
 import Foundation
 
 /// File overview:
-/// Renders the single prompt string consumed by the local llama runtime.
+/// Renders the prompts consumed by the local llama runtime, in two shapes: `prompt(...)` for the
+/// raw single-string path (base / no-template models) and `messages(...)` for the chat-template
+/// path (instruct models that ship a template). Both are plain prose with no standalone `Label:`
+/// lines, because small instruct models echo a bare label line straight into the ghost text.
 ///
 /// Why this file exists:
 /// llama.cpp does not give us a separate "instructions" channel the way Foundation Models does.
-/// That means all base behavior, user preferences, and request context must be composed into one
-/// prompt string. Keeping that composition isolated here prevents prompt policy from leaking into
+/// That means all base behavior, user preferences, and request context must be composed by us.
+/// Keeping that composition isolated here prevents prompt policy from leaking into
 /// `SuggestionRequestFactory` or the runtime lifecycle layer.
 enum LlamaPromptRenderer {
     /// Renders Cotabby's local-model prompt.
@@ -24,82 +27,164 @@ enum LlamaPromptRenderer {
         clipboardContext: String? = nil,
         visualContextSummary: String? = nil
     ) -> String {
-        var sections = [
-            "Task:",
-            "- Continue the user's existing text exactly at the caret position.",
-            "- This is autocomplete, not chat. Do not answer the user or start a conversation.",
-            "- Never repeat, restate, or quote the text before the caret.",
-            "- Use clipboard context only when it directly helps the inline continuation.",
-            "- Return plain text only with no thinking, labels, bullets, markdown, quotes, or explanation."
+        // Composed entirely as prose, with no standalone `Label:` lines. Small instruct models echo
+        // a lone "Task:" / "Screen context:" / "Text before caret:" line straight into the ghost
+        // text — they read a bare label as content to continue. Folding everything into sentences
+        // removes that surface. The one invariant that actually locates the caret is preserved:
+        // `prefixText` is the LAST thing in the string, so the model (templated or base) continues
+        // from where the user stopped. The instruction sentences sit before it; the declared-language
+        // hint stays last among the instructions so it keeps its high-attention slot right before the
+        // prefix. `completionLengthInstruction` remains intentionally unused — length is governed by
+        // the token budget (`SuggestionWordCountPreset.suggestedPredictionTokenBudget`).
+        var sentences = [
+            "You complete partially-typed text. The user is the author; produce the next few words "
+                + "they would type, continuing directly from where their text stops.",
+            "This is autocomplete, not chat. Do not answer the user or start a conversation.",
+            "Never repeat, restate, or quote the text the user has already typed.",
+            "Use clipboard or screen context only when it directly helps the inline continuation.",
+            "Return plain text only, with no thinking, labels, bullets, markdown, quotes, or explanation."
         ]
 
-        var profileSections: [String] = []
         if let name = userName, !name.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
-            profileSections.append("- The user's name is \(name).")
-        }
-        if !profileSections.isEmpty {
-            sections.append("")
-            sections.append("User Profile Context:")
-            sections.append(contentsOf: profileSections)
+            sentences.append("The user's name is \(name).")
         }
 
-        // User style rules render after the base task rules and profile, with an explicit
-        // subordination line so a user "rule" can never override the autocomplete/output contract
-        // above (prompt-injection guard).
+        // User style rules are folded into a single sentence with an explicit subordination clause so
+        // a user "rule" can never override the autocomplete/output contract above (prompt-injection
+        // guard), matching the prior labeled form's intent.
         let trimmedRules = customRules
             .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
             .filter { !$0.isEmpty }
         if !trimmedRules.isEmpty {
-            sections.append("")
-            sections.append("Your style preferences:")
-            sections.append(contentsOf: trimmedRules.map { "- \($0)" })
-            sections.append("Apply these only when they fit the continuation naturally; never break the rules above.")
+            let joinedRules = trimmedRules.joined(separator: "; ")
+            sentences.append(
+                "When it fits the continuation naturally, also honor the user's own writing "
+                    + "preferences (\(joinedRules)), but never break the rules above."
+            )
         }
 
-        // Free-form user-authored reference notes (glossary, jargon, style guide). Rendered as a
-        // verbatim block rather than line-by-line bullets so the user's structure (lists, headings,
-        // examples) is preserved. The subordination line is the same prompt-injection guard used
-        // for style preferences above: this is reference material, not an override of the base
-        // autocomplete contract.
+        // Free-form user-authored reference notes (glossary, jargon, style guide). The notes can
+        // carry their own structure (lists, headings), so they go in verbatim after an introducing
+        // sentence rather than being flattened — but with no standalone `Label:` line of our own.
+        // The subordination clause is the same prompt-injection guard used for style preferences:
+        // this is reference material, not an override of the base autocomplete contract.
         if let extendedContext, !extendedContext.isEmpty {
-            sections.append("")
-            sections.append("Reference notes from the user:")
-            sections.append(extendedContext)
-            sections.append("Use these notes only when they fit the continuation naturally; never break the rules above.")
+            sentences.append(
+                "Reference notes from the user (use only when they fit the continuation naturally, "
+                    + "and never to break the rules above):\n\(extendedContext)"
+            )
         }
 
-        sections.append("")
-        sections.append("Screen context:")
-        sections.append("User is on \(applicationName).")
+        sentences.append("The user is writing in \(applicationName).")
         if let summary = visualContextSummary, !summary.isEmpty {
-            sections.append("Screen content:")
-            sections.append(summary)
+            sentences.append("Nearby on screen, the user can see \(summary)")
         }
         if let clipboardContext, !clipboardContext.isEmpty {
-            sections.append("User's clipboard:")
-            sections.append(clipboardContext)
+            sentences.append("The user's clipboard currently contains \(clipboardContext)")
         }
 
-        // The final task cue sits immediately before the prefix so small instruct models see the
-        // current length policy right before the text they must continue, while the prefix itself
-        // still remains the last payload in the prompt.
-        sections.append("")
-        sections.append("Final instruction:")
-        // The declared-language hint sits in the late, high-attention block right before the prefix
-        // so small instruct models actually weigh it — without it they tend to drift to English when
-        // the surrounding text is short or ambiguous.
+        _ = completionLengthInstruction
+        // The declared-language hint sits last among the instructions (highest attention, right
+        // before the prefix) — without it small models drift to English when the surrounding text is
+        // short or ambiguous.
         if let languageInstruction, !languageInstruction.isEmpty {
-            sections.append("- \(languageInstruction)")
+            sentences.append(languageInstruction)
+        }
+
+        // Blank line then the bare prefix as the final payload: the model continues from the last
+        // text, and the blank line keeps the prefix visually distinct from the instructions without
+        // a label the model could echo.
+        let instructions = sentences.joined(separator: "\n")
+        return instructions + "\n\n" + prefixText
+    }
+
+    /// A system/user message pair for the chat-template path, as returned by `messages(...)`.
+    struct ChatPrompt: Equatable, Sendable {
+        /// Every rule and context sentence for the request, joined with newlines.
+        let system: String
+        /// Only the text to continue. Kept alone in the user turn so that, when the model's template
+        /// opens an assistant turn after it, the model continues the text rather than answering it.
+        let user: String
+    }
+
+    /// Renders the same policy as `prompt(...)` but split into chat roles, for models that ship a
+    /// chat template (see `CotabbyInferenceEngine.hasChatTemplate`). The raw `prompt(...)` stays the
+    /// fallback for base models with no template. The system turn also carries one rule the raw
+    /// prompt does not: match the existing language, register, casing, and punctuation.
+    ///
+    /// Why the split matters: the single-string `prompt(...)` must fit rules, context, and prefix
+    /// into one string, so it ends on the bare prefix after a blank line. A templated model instead
+    /// gets the rules and context in the system turn and the bare prefix in the user turn. Both are
+    /// prose with no standalone `Label:` lines, because small instruct models echo a lone
+    /// `Screen context:` / `App:` line straight into the ghost text. The framing mirrors
+    /// `FoundationModelPromptRenderer`: continue, do not converse.
+    static func messages(
+        prefixText: String,
+        applicationName: String,
+        completionLengthInstruction: String,
+        userName: String?,
+        customRules: [String] = [],
+        extendedContext: String? = nil,
+        languageInstruction: String? = nil,
+        clipboardContext: String? = nil,
+        visualContextSummary: String? = nil
+    ) -> ChatPrompt {
+        var sentences = [
+            "You complete partially-typed text. The user is the author; produce the next few words "
+                + "they would type, continuing directly from where their text stops.",
+            "This is autocomplete, not chat. Do not answer the user, greet them, or start a "
+                + "conversation.",
+            "Never repeat, restate, or quote the text the user has already typed.",
+            "Match the existing language, register, casing, and punctuation.",
+            "Use clipboard or screen context only when it directly helps the inline continuation.",
+            "Return plain text only, with no thinking, labels, bullets, markdown, quotes, or explanation."
+        ]
+
+        // Context is written as plain sentences rather than "Label:" blocks. The earlier labeled
+        // form (a standalone line reading e.g. "Screen context:") was the thing small instruct
+        // models echoed verbatim into ghost text — they treat a lone "Label:" line as content to
+        // continue. Folding the same information into prose removes the label surface entirely while
+        // keeping every value the model needs, so there is nothing label-shaped left to copy.
+        if let name = userName, !name.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+            sentences.append("The user's name is \(name).")
+        }
+
+        let trimmedRules = customRules
+            .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+            .filter { !$0.isEmpty }
+        if !trimmedRules.isEmpty {
+            let joinedRules = trimmedRules.joined(separator: "; ")
+            sentences.append(
+                "When it fits the continuation naturally, also honor the user's own writing "
+                    + "preferences (\(joinedRules)), but never break the rules above."
+            )
         }
-        // Experiment: the explicit word-range line (`completionLengthInstruction`) is intentionally
-        // omitted from the local-model prompt so length is governed purely by the token budget
-        // (`SuggestionWordCountPreset.suggestedPredictionTokenBudget`). The parameter stays wired so
-        // re-enabling the in-prompt cue is a one-line change. Apple Intelligence still gets the cue.
+
+        // Free-form reference notes, same treatment as the raw prompt() path: introduced by a
+        // sentence then included verbatim (preserving the user's own structure), subordinate to the
+        // base rules. Kept in sync with prompt() so both engines see the user's notes.
+        if let extendedContext, !extendedContext.isEmpty {
+            sentences.append(
+                "Reference notes from the user (use only when they fit the continuation naturally, "
+                    + "and never to break the rules above):\n\(extendedContext)"
+            )
+        }
+
+        sentences.append("The user is writing in \(applicationName).")
+        if let summary = visualContextSummary, !summary.isEmpty {
+            sentences.append("Nearby on screen, the user can see \(summary)")
+        }
+        if let clipboardContext, !clipboardContext.isEmpty {
+            sentences.append("The user's clipboard currently contains \(clipboardContext)")
+        }
+
+        // Length is governed by the token budget for the local path (see `prompt(...)`), so the
+        // explicit word-range cue stays omitted here too; the parameter is kept wired for symmetry.
         _ = completionLengthInstruction
-        sections.append("- The next line must begin directly with the continuation text.")
-        sections.append("Text before caret:")
-        sections.append(prefixText)
+        if let languageInstruction, !languageInstruction.isEmpty {
+            sentences.append(languageInstruction)
+        }
 
-        return sections.joined(separator: "\n")
+        return ChatPrompt(system: sentences.joined(separator: "\n"), user: prefixText)
     }
 }
diff --git a/Cotabby/Support/SuggestionRequestFactory.swift b/Cotabby/Support/SuggestionRequestFactory.swift
index 996b0db8..94704961 100644
--- a/Cotabby/Support/SuggestionRequestFactory.swift
+++ b/Cotabby/Support/SuggestionRequestFactory.swift
@@ -73,11 +73,25 @@ enum SuggestionRequestFactory {
             clipboardContext: boundedClipboardContext,
             visualContextSummary: boundedVisualContextSummary
         )
+        // Role-split variant for chat-template-capable local models. Built unconditionally and
+        // cheaply; the runtime decides per-model whether to use it or fall back to `prompt`.
+        let llamaChatPrompt = LlamaPromptRenderer.messages(
+            prefixText: prefixText,
+            applicationName: context.applicationName,
+            completionLengthInstruction: completionLengthInstruction,
+            userName: userName,
+            customRules: customRules,
+            extendedContext: activeExtendedContext,
+            languageInstruction: languageInstruction,
+            clipboardContext: boundedClipboardContext,
+            visualContextSummary: boundedVisualContextSummary
+        )
 
         let request = SuggestionRequest(
             context: context,
             prefixText: prefixText,
             prompt: prompt,
+            llamaChatPrompt: llamaChatPrompt,
             generation: context.generation,
             maxPredictionTokens: activeMaxPredictionTokens(
                 configuration: configuration,
diff --git a/CotabbyTests/CustomRulesTests.swift b/CotabbyTests/CustomRulesTests.swift
index afcc866b..288824f5 100644
--- a/CotabbyTests/CustomRulesTests.swift
+++ b/CotabbyTests/CustomRulesTests.swift
@@ -46,14 +46,15 @@ final class CustomRulesTests: XCTestCase {
             customRules: ["Use British spelling", "Never use em dashes"]
         )
 
-        XCTAssertTrue(prompt.contains("Your style preferences:"))
-        XCTAssertTrue(prompt.contains("- Use British spelling"))
-        XCTAssertTrue(prompt.contains("- Never use em dashes"))
+        // The prompt is prose now (no "Your style preferences:" label block), but the user's rules
+        // and the subordination clause must still be present.
+        XCTAssertTrue(prompt.contains("Use British spelling"))
+        XCTAssertTrue(prompt.contains("Never use em dashes"))
         XCTAssertTrue(prompt.contains("never break the rules above"))
 
-        // The base task rules must precede the user style section.
-        let baseIndex = try? XCTUnwrap(prompt.range(of: "Task:"))
-        let rulesIndex = try? XCTUnwrap(prompt.range(of: "Your style preferences:"))
+        // The base autocomplete rules must precede the user style preferences.
+        let baseIndex = try? XCTUnwrap(prompt.range(of: "autocomplete, not chat"))
+        let rulesIndex = try? XCTUnwrap(prompt.range(of: "Use British spelling"))
         if let baseIndex, let rulesIndex {
             XCTAssertLessThan(baseIndex.lowerBound, rulesIndex.lowerBound)
         }
@@ -68,6 +69,8 @@ final class CustomRulesTests: XCTestCase {
             customRules: []
         )
 
+        // No user rules → no style-preferences sentence (and no leftover label form either).
+        XCTAssertFalse(prompt.contains("writing preferences"))
         XCTAssertFalse(prompt.contains("Your style preferences:"))
     }
 
diff --git a/CotabbyTests/ExtendedContextTests.swift b/CotabbyTests/ExtendedContextTests.swift
index a0e374a0..88ceaad5 100644
--- a/CotabbyTests/ExtendedContextTests.swift
+++ b/CotabbyTests/ExtendedContextTests.swift
@@ -113,7 +113,7 @@ final class ExtendedContextTests: XCTestCase {
             configuration: .standard
         )
 
-        XCTAssertTrue(result.promptPreview.contains("Reference notes from the user:"))
+        XCTAssertTrue(result.promptPreview.contains("Reference notes from the user"))
         XCTAssertTrue(result.promptPreview.contains("RULE: Every other word should be 'meow'"))
     }
 
@@ -129,14 +129,17 @@ final class ExtendedContextTests: XCTestCase {
             extendedContext: "Project codenames: Aurora = the iOS app. Borealis = the macOS app."
         )
 
-        XCTAssertTrue(prompt.contains("Reference notes from the user:"))
+        XCTAssertTrue(prompt.contains("Reference notes from the user"))
         XCTAssertTrue(prompt.contains("Project codenames: Aurora = the iOS app."))
         XCTAssertTrue(prompt.contains("never break the rules above"))
 
-        // Reference notes must follow custom rules, which must themselves follow the base task block.
-        guard let baseRange = prompt.range(of: "Task:"),
-              let rulesRange = prompt.range(of: "Your style preferences:"),
-              let notesRange = prompt.range(of: "Reference notes from the user:")
+        // The renderer is now plain prose with no standalone `Label:` lines (small instruct models
+        // echoed bare labels into ghost text), so anchor the ordering on stable phrases instead of the
+        // old "Task:" / "Your style preferences:" labels. Reference notes must still follow custom
+        // rules, which must themselves follow the base autocomplete instructions.
+        guard let baseRange = prompt.range(of: "You complete partially-typed text"),
+              let rulesRange = prompt.range(of: "honor the user's own writing preferences"),
+              let notesRange = prompt.range(of: "Reference notes from the user")
         else {
             return XCTFail("expected base/rules/notes sections to be present")
         }
@@ -152,7 +155,7 @@ final class ExtendedContextTests: XCTestCase {
             userName: nil
         )
 
-        XCTAssertFalse(prompt.contains("Reference notes from the user:"))
+        XCTAssertFalse(prompt.contains("Reference notes from the user"))
     }
 
     // MARK: - foundation model rendering
diff --git a/CotabbyTests/LanguageSupportTests.swift b/CotabbyTests/LanguageSupportTests.swift
index 3761e5c9..da4b6d5a 100644
--- a/CotabbyTests/LanguageSupportTests.swift
+++ b/CotabbyTests/LanguageSupportTests.swift
@@ -69,9 +69,11 @@ final class LanguageSupportTests: XCTestCase {
 
     // MARK: - rendering
 
-    func test_llamaRenderer_placesLanguageHintInFinalBlock() {
-        // The length cue is no longer rendered (token-budget-only experiment), so this guards that
-        // the language hint still lands in the late, high-attention final-instruction block.
+    func test_llamaRenderer_placesLanguageHintLateRightBeforePrefix() {
+        // The length cue is no longer rendered (token-budget-only experiment), and the prompt is now
+        // prose with no "Final instruction:" header. This guards that the language hint still lands
+        // late — after the app-context sentence and immediately before the prefix, its
+        // high-attention slot — so small models actually weigh it.
         let prompt = LlamaPromptRenderer.prompt(
             prefixText: "Hola",
             applicationName: "Notes",
@@ -82,12 +84,16 @@ final class LanguageSupportTests: XCTestCase {
 
         XCTAssertFalse(prompt.contains("UNIQUE_LENGTH_CUE"))
 
-        guard let finalRange = prompt.range(of: "Final instruction:"),
-              let langRange = prompt.range(of: "Spanish") else {
-            XCTFail("Expected final instruction header and language hint in the prompt")
+        guard let contextRange = prompt.range(of: "writing in Notes"),
+              let langRange = prompt.range(of: "Spanish"),
+              let prefixRange = prompt.range(of: "Hola") else {
+            XCTFail("Expected app-context sentence, language hint, and prefix in the prompt")
             return
         }
-        XCTAssertLessThan(finalRange.lowerBound, langRange.lowerBound)
+        // Order: app context → language hint → prefix (last).
+        XCTAssertLessThan(contextRange.lowerBound, langRange.lowerBound)
+        XCTAssertLessThan(langRange.lowerBound, prefixRange.lowerBound)
+        XCTAssertTrue(prompt.hasSuffix("Hola"))
     }
 
     func test_llamaRenderer_emitsNoLanguageLineWhenNoneDeclared() {
diff --git a/CotabbyTests/LlamaPromptRendererTests.swift b/CotabbyTests/LlamaPromptRendererTests.swift
index 1ac9b039..0c1c7eb8 100644
--- a/CotabbyTests/LlamaPromptRendererTests.swift
+++ b/CotabbyTests/LlamaPromptRendererTests.swift
@@ -60,10 +60,11 @@ final class LlamaPromptRendererTests: XCTestCase {
 
     // MARK: - instruction prompt
 
-    /// The structural contract for local instruct models: stable task rules first, supporting
-    /// context in the middle, then a late length cue right before the prefix the model must
-    /// continue. Losing one of these sections tends to degrade prompt-following without throwing.
-    func test_instructionPrompt_containsTaskScreenContextAndFinalInstruction() {
+    /// The prose contract for the raw single-string prompt: autocomplete rules, then context as
+    /// sentences, then the bare prefix as the final payload. No standalone `Label:` lines (the
+    /// thing small models echo into ghost text), and the prefix stays last so the model continues
+    /// from where the user stopped.
+    func test_instructionPrompt_carriesAutocompleteRulesAndAppContextAsProse() {
         let prompt = LlamaPromptRenderer.prompt(
             prefixText: "Once upon",
             applicationName: "Messages",
@@ -71,16 +72,32 @@ final class LlamaPromptRendererTests: XCTestCase {
             userName: nil
         )
 
-        XCTAssertTrue(prompt.contains("Task:"), "instruction prompt should include Task section")
-        XCTAssertTrue(
-            prompt.contains("Screen context:"),
-            "instruction prompt should include Screen context section"
-        )
-        XCTAssertTrue(
-            prompt.contains("Final instruction:"),
-            "instruction prompt should include a late final instruction section"
+        XCTAssertTrue(prompt.contains("autocomplete, not chat"))
+        XCTAssertTrue(prompt.contains("writing in Messages"))
+    }
+
+    /// No standalone `Label:` line may appear, even with every context block populated — those are
+    /// exactly what small instruct models parrot back as ghost text.
+    func test_instructionPrompt_containsNoLabelScaffolding() {
+        let prompt = LlamaPromptRenderer.prompt(
+            prefixText: "Once upon",
+            applicationName: "Messages",
+            completionLengthInstruction: "Keep completion short.",
+            userName: "Jacob",
+            customRules: ["Be concise"],
+            languageInstruction: "Respond in German.",
+            clipboardContext: "clip",
+            visualContextSummary: "a form"
         )
-        XCTAssertTrue(prompt.contains("Text before caret:"), "instruction prompt should include the prefix header")
+
+        XCTAssertFalse(prompt.contains("Task:"))
+        XCTAssertFalse(prompt.contains("Screen context:"))
+        XCTAssertFalse(prompt.contains("Screen content:"))
+        XCTAssertFalse(prompt.contains("Final instruction:"))
+        XCTAssertFalse(prompt.contains("Text before caret:"))
+        XCTAssertFalse(prompt.contains("User Profile Context:"))
+        XCTAssertFalse(prompt.contains("Your style preferences:"))
+        XCTAssertFalse(prompt.contains("User's clipboard:"))
     }
 
     func test_instructionPrompt_includesApplicationNameAndPrefix() {
@@ -91,16 +108,13 @@ final class LlamaPromptRendererTests: XCTestCase {
             userName: nil
         )
 
-        XCTAssertTrue(prompt.contains("User is on Slack."))
+        XCTAssertTrue(prompt.contains("writing in Slack"))
         XCTAssertTrue(prompt.contains("My prefix text here"))
     }
 
     /// Length is enforced by the token budget, not by an in-prompt word range, so the
     /// completion-length cue must never reach the local-model prompt even if a caller passes one.
     func test_instructionPrompt_omitsCompletionLengthInstruction() {
-        // Experiment: the local-model prompt no longer carries the word-range cue; length is
-        // governed solely by the token budget. The cue must not leak into the prompt even when a
-        // caller still passes one.
         let prompt = LlamaPromptRenderer.prompt(
             prefixText: "PREFIX_BODY_XYZ",
             applicationName: "App",
@@ -109,14 +123,8 @@ final class LlamaPromptRendererTests: XCTestCase {
         )
 
         XCTAssertFalse(prompt.contains("UNIQUE_LENGTH_MARKER_7_TO_12_WORDS"))
-
-        guard let finalInstructionRange = prompt.range(of: "Final instruction:"),
-              let prefixRange = prompt.range(of: "PREFIX_BODY_XYZ") else {
-            XCTFail("Expected final instruction header and prefix in the prompt")
-            return
-        }
-
-        XCTAssertLessThan(finalInstructionRange.lowerBound, prefixRange.lowerBound)
+        // The prefix is still the last payload regardless.
+        XCTAssertTrue(prompt.hasSuffix("PREFIX_BODY_XYZ"))
     }
 
     func test_instructionPrompt_includesProfileContextWhenProvided() {
@@ -131,9 +139,9 @@ final class LlamaPromptRendererTests: XCTestCase {
                       "instruction prompt should carry user-provided profile name")
     }
 
-    /// The prefix remains the last payload in the prompt so the model still ends on the actual
-    /// text it must continue, even though the length cue is moved later in the prompt.
-    func test_instructionPrompt_prefixAppearsAfterScreenContextAndEndsPrompt() {
+    /// The prefix remains the last payload in the prompt so the model ends on the actual text it
+    /// must continue. This is the one structural invariant the prose rewrite must preserve.
+    func test_instructionPrompt_prefixAppearsAfterContextAndEndsPrompt() {
         let prompt = LlamaPromptRenderer.prompt(
             prefixText: "PREFIX_BODY_XYZ",
             applicationName: "App",
@@ -141,14 +149,14 @@ final class LlamaPromptRendererTests: XCTestCase {
             userName: nil
         )
 
-        guard let contextRange = prompt.range(of: "Screen context:"),
+        guard let contextRange = prompt.range(of: "writing in App"),
               let prefixRange = prompt.range(of: "PREFIX_BODY_XYZ") else {
-            XCTFail("Expected both Screen context: and PREFIX_BODY_XYZ in the prompt")
+            XCTFail("Expected both the app-context sentence and PREFIX_BODY_XYZ in the prompt")
             return
         }
 
         XCTAssertLessThan(contextRange.lowerBound, prefixRange.lowerBound,
-                          "prefix must appear after the Screen context header")
+                          "prefix must appear after the app-context sentence")
         XCTAssertTrue(prompt.hasSuffix("PREFIX_BODY_XYZ"))
     }
 
@@ -161,7 +169,7 @@ final class LlamaPromptRendererTests: XCTestCase {
             visualContextSummary: "A window describing a cat."
         )
 
-        XCTAssertTrue(prompt.contains("Screen content:"))
+        XCTAssertTrue(prompt.contains("Nearby on screen, the user can see"))
         XCTAssertTrue(prompt.contains("A window describing a cat."))
     }
 
@@ -174,7 +182,7 @@ final class LlamaPromptRendererTests: XCTestCase {
             clipboardContext: "UNIQUE_CLIPBOARD_MARKER"
         )
 
-        XCTAssertTrue(prompt.contains("User's clipboard:"))
+        XCTAssertTrue(prompt.contains("clipboard currently contains"))
         XCTAssertTrue(prompt.contains("UNIQUE_CLIPBOARD_MARKER"))
     }
 
@@ -187,7 +195,114 @@ final class LlamaPromptRendererTests: XCTestCase {
             visualContextSummary: nil
         )
 
-        XCTAssertFalse(prompt.contains("Screen content:"))
+        XCTAssertFalse(prompt.contains("Nearby on screen"))
+    }
+
+    // MARK: - messages() chat-template path
+    //
+    // The chat-template path (used when the model ships a template) splits the prompt into a
+    // system turn (rules + context) and a user turn (the bare prefix). These tests guard the
+    // invariant that fixes the prompt-scaffolding echo bug: the user turn must be exactly the
+    // text to continue, with none of the "Text before caret:" / "Task:" labels that small
+    // instruct models were parroting back into the ghost text.
+
+    func test_messages_userTurnIsExactlyThePrefixWithNoScaffolding() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "I was just about to",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "Return only the next 3 to 7 words.",
+            userName: nil,
+            customRules: []
+        )
+
+        XCTAssertEqual(chat.user, "I was just about to")
+    }
+
+    func test_messages_systemTurnDropsRawLabelScaffolding() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "hello",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "",
+            userName: nil,
+            customRules: []
+        )
+
+        XCTAssertFalse(chat.system.contains("Text before caret:"))
+        XCTAssertFalse(chat.system.contains("Final instruction:"))
+    }
+
+    func test_messages_systemTurnDoesNotContainThePrefix() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "Zxqv distinctive prefix marker",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "",
+            userName: nil,
+            customRules: []
+        )
+
+        XCTAssertFalse(chat.system.contains("Zxqv distinctive prefix marker"))
+    }
+
+    func test_messages_systemTurnCarriesAutocompleteRules() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "x",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "",
+            userName: nil,
+            customRules: []
+        )
+
+        XCTAssertTrue(chat.system.contains("autocomplete, not chat"))
+        // App context is now prose ("The user is writing in TextEdit."), not a "Screen context:"
+        // label block — but the application name itself must still be present.
+        XCTAssertTrue(chat.system.contains("writing in TextEdit"))
+    }
+
+    func test_messages_systemTurnIncludesProfileRulesContextWhenProvided() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "x",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "",
+            userName: "Jacob",
+            customRules: ["Always be concise"],
+            languageInstruction: "Respond in German.",
+            clipboardContext: "copied text",
+            visualContextSummary: "a login form"
+        )
+
+        XCTAssertTrue(chat.system.contains("Jacob"))
+        XCTAssertTrue(chat.system.contains("Always be concise"))
+        XCTAssertTrue(chat.system.contains("Respond in German."))
+        XCTAssertTrue(chat.system.contains("copied text"))
+        XCTAssertTrue(chat.system.contains("a login form"))
+
+        // The prose invariant: even with every context block populated, the system turn must carry
+        // no standalone "Label:" lines — those are exactly what small models echoed into ghost text.
+        XCTAssertFalse(chat.system.contains("User Profile Context:"))
+        XCTAssertFalse(chat.system.contains("Your style preferences:"))
+        XCTAssertFalse(chat.system.contains("Screen context:"))
+        XCTAssertFalse(chat.system.contains("Screen content:"))
+        XCTAssertFalse(chat.system.contains("User's clipboard:"))
+    }
+
+    func test_messages_omitsOptionalContextWhenAbsent() {
+        let chat = LlamaPromptRenderer.messages(
+            prefixText: "x",
+            applicationName: "TextEdit",
+            completionLengthInstruction: "",
+            userName: nil,
+            customRules: [],
+            clipboardContext: nil,
+            visualContextSummary: nil
+        )
+
+        // Check the prose lead-ins that introduce the optional blocks, not bare rule words: the
+        // base rules always mention "clipboard". The retired "Label:" headers are never rendered
+        // even when context is present, so the last two checks only guard against their return.
+        XCTAssertFalse(chat.system.contains("clipboard currently contains"))
+        XCTAssertFalse(chat.system.contains("Nearby on screen"))
+        XCTAssertFalse(chat.system.contains("User Profile Context:"))
+        XCTAssertFalse(chat.system.contains("Your style preferences:"))
     }
 
     private func makeRequest(
diff --git a/CotabbyTests/SuggestionRequestFactoryTests.swift b/CotabbyTests/SuggestionRequestFactoryTests.swift
index ad8abe06..bddc8d63 100644
--- a/CotabbyTests/SuggestionRequestFactoryTests.swift
+++ b/CotabbyTests/SuggestionRequestFactoryTests.swift
@@ -239,7 +239,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
         )
 
         XCTAssertEqual(result.request.clipboardContext, "Copied project notes.")
-        XCTAssertTrue(result.promptPreview.contains("User's clipboard:"))
+        XCTAssertTrue(result.promptPreview.contains("clipboard currently contains"))
         XCTAssertTrue(result.promptPreview.contains("Copied project notes."))
     }
 
@@ -272,7 +272,7 @@ final class SuggestionRequestFactoryTests: XCTestCase {
         )
 
         XCTAssertNil(result.request.clipboardContext)
-        XCTAssertFalse(result.promptPreview.contains("User's clipboard:"))
+        XCTAssertFalse(result.promptPreview.contains("clipboard currently contains"))
         XCTAssertFalse(result.promptPreview.contains("Copied project notes."))
     }