From 1a15d5f0e1d98141de1ee2af820473f07bca5aec Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Sat, 30 May 2026 16:49:30 -0700 Subject: [PATCH] Make applyChatTemplate a buffer-based C ABI so it bridges to objcxx interop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous applyChatTemplate took a std::vector<ChatMessage> (a struct with std::string members) and returned std::string. That compiles under the Cxx interop mode this package's test target uses, but does NOT bridge into the Cotabby app target, which uses objcxx interop — a returned std::string has no usable Swift String initializer there, so the app could never call it. Replace it with a detokenize-style C ABI that crosses the boundary cleanly: int applyChatTemplate(const char* system_text, const char* user_text, bool add_assistant, char* buffer, int buffer_size) system + user are passed directly (autocomplete is always exactly those two turns), matching how the app already calls tokenize/detokenize with const char* and a caller buffer. Return contract: >0 bytes written, 0 = no model / no template / render failure (caller falls back to raw), <0 = -(required size) so the caller can resize and retry. Drops the ChatMessage struct entirely. Tests updated to the new signature; swift test green (15 tests, 0 failures, 3 model-dependent skipped). 
--- .../CotabbyInferenceEngine.cpp | 74 +++++++++---------- .../include/CotabbyInferenceEngine.h | 33 +++++---- .../LlamaMiddlewareTests.swift | 38 ++++++---- 3 files changed, 74 insertions(+), 71 deletions(-) diff --git a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp index 026db00..fc64bea 100644 --- a/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp +++ b/Sources/CotabbyInferenceEngine/CotabbyInferenceEngine.cpp @@ -584,55 +584,49 @@ bool CotabbyInferenceEngine::hasChatTemplate() const { return llama_model_chat_template(impl_->model, /*name=*/nullptr) != nullptr; } -std::string CotabbyInferenceEngine::applyChatTemplate( - const ChatMessage* messages, int message_count, - bool add_assistant) const { - if (!impl_->model || !messages || message_count <= 0) { - return {}; +int CotabbyInferenceEngine::applyChatTemplate( + const char* system_text, + const char* user_text, + bool add_assistant, + char* buffer, + int buffer_size) const { + if (!impl_->model || !system_text || !user_text || + !buffer || buffer_size <= 0) { + return 0; } const char* tmpl = llama_model_chat_template(impl_->model, /*name=*/nullptr); if (!tmpl) { - return {}; + return 0; } - // `llama_chat_message` holds borrowed `const char*`. The backing - // std::strings live in `messages` for the duration of this call, so - // pointing at their c_str() is safe. - std::vector chat; - chat.reserve(message_count); - size_t total_chars = 0; - for (int i = 0; i < message_count; ++i) { - chat.push_back(llama_chat_message{ - messages[i].role.c_str(), - messages[i].content.c_str() - }); - total_chars += messages[i].role.size() + messages[i].content.size(); - } + // Borrowed `const char*` from the caller; valid for this call's duration. 
+ llama_chat_message chat[2] = { + { "system", system_text }, + { "user", user_text } + }; - // The header recommends an initial buffer of 2x the total message - // characters; grow and retry if the template expands beyond that. - std::vector buf(std::max(total_chars * 2, 256)); - while (true) { - int32_t n = llama_chat_apply_template( - tmpl, - chat.data(), - chat.size(), - add_assistant, - buf.data(), - static_cast(buf.size()) - ); + int32_t n = llama_chat_apply_template( + tmpl, + chat, + 2, + add_assistant, + buffer, + static_cast(buffer_size) + ); - if (n < 0) { - // Template not supported by llama.cpp's predefined list, or some - // other failure. Signal "fall back to the raw path". - return {}; - } - if (static_cast(n) <= buf.size()) { - return std::string(buf.data(), static_cast(n)); - } - buf.resize(static_cast(n)); + // Contract of llama_chat_apply_template: returns the total byte length of + // the formatted prompt; negative means the template is unsupported by + // llama.cpp's predefined list. A positive value larger than the buffer + // means the output did not fit and the caller must retry with a bigger + // buffer. Map all three onto this function's documented C-ABI contract. 
+ if (n < 0) { + return 0; // genuine render failure → caller falls back to raw + } + if (n > buffer_size) { + return -n; // too small → -(required size); caller resizes and retries } + return n; // success: n bytes written (n <= buffer_size) } int CotabbyInferenceEngine::detokenize(int32_t token, char* buffer, diff --git a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h index 9ca3ae4..5fdab1c 100644 --- a/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h +++ b/Sources/CotabbyInferenceEngine/include/CotabbyInferenceEngine.h @@ -1,6 +1,5 @@ #pragma once #include -#include #include #include @@ -14,13 +13,6 @@ struct SamplingConfig { uint32_t seed; }; -/// One message in a chat-template conversation, mirroring `llama_chat_message`. -/// Roles are the usual "system" / "user" / "assistant". Owned by the caller. -struct ChatMessage { - std::string role; - std::string content; -}; - struct SWIFT_SELF_CONTAINED SampleResult { int32_t token; const char* piece; @@ -78,13 +70,24 @@ class CotabbyInferenceEngine { // chat-template prompt path and the legacy raw-continuation path so a // user-supplied base model keeps working. bool hasChatTemplate() const; - // Renders `messages` through the model's built-in chat template and returns - // the formatted prompt string. `add_assistant` appends the assistant-turn - // opening marker so the model continues as the assistant. Returns an empty - // string if no model is loaded, the model has no template, or formatting - // fails — callers must treat empty as "fall back to the raw path". - std::string applyChatTemplate(const ChatMessage* messages, int message_count, - bool add_assistant) const; + // Renders a system + user turn through the model's built-in chat template + // into `buffer`. `add_assistant` appends the assistant-turn opening marker + // so the model continues as the assistant. 
Returns: + // > 0 : number of bytes written (<= buffer_size) — the formatted prompt. + // < 0 : -(required buffer size); the buffer was too small, retry at that size. + // = 0 : no model, no template, or render failure — caller falls back to raw. + // + // Autocomplete needs exactly one system turn (rules + context) and one user + // turn (the text to continue), so the signature takes those two directly + // rather than a message array. This buffer-based C ABI mirrors `detokenize` + // and deliberately avoids std::string / struct parameter and return types, + // so it bridges cleanly into the Swift objcxx interop mode the app target + // uses (where a std::string return does not bridge). + int applyChatTemplate(const char* system_text, + const char* user_text, + bool add_assistant, + char* buffer, + int buffer_size) const; // Prompt decoding EngineStatus decodePrompt(int32_t sequence_id, diff --git a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift index 5f0d0b1..096eb0f 100644 --- a/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift +++ b/Tests/CotabbyInferenceTests/LlamaMiddlewareTests.swift @@ -69,14 +69,14 @@ final class LlamaMiddlewareTests: XCTestCase { XCTAssertFalse(engine.hasChatTemplate()) } - func testApplyChatTemplateWithoutModelReturnsEmpty() { + func testApplyChatTemplateWithoutModelReturnsZero() { let engine = CotabbyInferenceEngine() - var messages = [ChatMessage]() - messages.append(ChatMessage(role: "user", content: "hi")) - let rendered = messages.withUnsafeBufferPointer { buf in - engine.applyChatTemplate(buf.baseAddress, Int32(buf.count), true) - } - XCTAssertTrue(rendered.isEmpty) + var buffer = [CChar](repeating: 0, count: 256) + let written = engine.applyChatTemplate( + "You complete text.", "The quick brown", true, &buffer, Int32(buffer.count) + ) + // No model loaded → 0 (caller falls back to the raw path). 
+ XCTAssertEqual(written, 0) } func testDiagnosticsDefaultToZero() { @@ -122,16 +122,22 @@ final class LlamaMiddlewareTests: XCTestCase { // rendering a simple conversation must produce a non-empty prompt that // tokenizes (with parse_special) to a non-empty token list. if engine.hasChatTemplate() { - var messages = [ChatMessage]() - messages.append(ChatMessage(role: "system", content: "You complete text.")) - messages.append(ChatMessage(role: "user", content: "The quick brown")) - let rendered = messages.withUnsafeBufferPointer { buf in - engine.applyChatTemplate(buf.baseAddress, Int32(buf.count), true) + // Render system + user through the model's template into a caller buffer. + var buffer = [CChar](repeating: 0, count: 4096) + let written = engine.applyChatTemplate( + "You complete text.", "The quick brown", true, &buffer, Int32(buffer.count) + ) + XCTAssertGreaterThan(written, 0, "Model reports a template but rendering produced no bytes") + + let rendered = buffer.prefix(Int(written)).withUnsafeBufferPointer { ptr in + String( + bytes: UnsafeRawBufferPointer(ptr), + encoding: .utf8 + ) } - // applyChatTemplate returns a C++ std::string; bridge to a Swift - // String before using String APIs like .utf8. - let renderedSwift = String(rendered) - XCTAssertFalse(renderedSwift.isEmpty, "Model reports a template but rendering was empty") + let renderedSwift = try XCTUnwrap(rendered, "Rendered template was not valid UTF-8") + XCTAssertFalse(renderedSwift.isEmpty) + let templated = engine.tokenizeWithOptions( renderedSwift, Int32(renderedSwift.utf8.count), false, true )