diff --git a/Package.swift b/Package.swift index 1827dbc..2f786df 100644 --- a/Package.swift +++ b/Package.swift @@ -33,5 +33,18 @@ let package = Package( .interoperabilityMode(.Cxx), ] ), + // Phase 0 spike: standalone executable that compares aggregate decode + // tok/s between the current "separate llama_context per sequence" + // architecture and a prospective shared-context-with-batched-decode + // architecture. Used to decide whether the Phase 1 refactor is worth + // doing on M-series + Metal. Not linked into CotabbyInference itself. + .executableTarget( + name: "CotabbyInferenceBench", + dependencies: ["llama-cpp"], + path: "Sources/CotabbyInferenceBench", + cxxSettings: [ + .unsafeFlags(["-std=c++17"]), + ] + ), ] ) diff --git a/Sources/CotabbyInferenceBench/baseline_a.cpp b/Sources/CotabbyInferenceBench/baseline_a.cpp new file mode 100644 index 0000000..88ad2d1 --- /dev/null +++ b/Sources/CotabbyInferenceBench/baseline_a.cpp @@ -0,0 +1,207 @@ +#include "bench_common.h" + +#include +#include +#include +#include + +#include + +namespace { + +llama_sampler* make_greedy_sampler() { + auto params = llama_sampler_chain_default_params(); + llama_sampler* chain = llama_sampler_chain_init(params); + if (!chain) return nullptr; + llama_sampler_chain_add(chain, llama_sampler_init_greedy()); + return chain; +} + +llama_context* create_isolated_context( + llama_model* model, + const BenchConfig& cfg +) { + auto p = llama_context_default_params(); + p.n_ctx = static_cast(cfg.ctx_size); + p.n_batch = static_cast(cfg.batch_size); + p.n_ubatch = static_cast(cfg.batch_size); + p.n_seq_max = 1; + int t = static_cast( + std::max(1u, std::thread::hardware_concurrency()) + ); + p.n_threads = t; + p.n_threads_batch = t; + p.offload_kqv = true; + return llama_init_from_model(model, p); +} + +// Walks the prompt in chunks of `batch_cap`, setting logits=1 only on the +// final token of the final chunk. 
That leaves a single logits row at batch +// index 0 of the last decode, which is where the first sample reads from. +bool decode_prompt( + llama_context* ctx, + const std::vector& tokens, + int batch_cap +) { + llama_batch batch = llama_batch_init(batch_cap, 0, 1); + int cursor = 0; + int n = static_cast(tokens.size()); + while (cursor < n) { + int chunk_end = std::min(cursor + batch_cap, n); + int chunk_size = chunk_end - cursor; + batch.n_tokens = chunk_size; + for (int i = 0; i < chunk_size; ++i) { + int idx = cursor + i; + batch.token[i] = tokens[idx]; + batch.pos[i] = idx; + batch.n_seq_id[i] = 1; + if (batch.seq_id && batch.seq_id[i]) batch.seq_id[i][0] = 0; + bool is_last = (chunk_end == n && i == chunk_size - 1); + batch.logits[i] = is_last ? 1 : 0; + } + if (llama_decode(ctx, batch) != 0) { + llama_batch_free(batch); + return false; + } + cursor = chunk_end; + } + llama_batch_free(batch); + return true; +} + +// Steady-state sampling loop, run once per sequence in its own thread. Takes +// the seed token from warmup as input. Each iteration feedback-decodes the +// previous sample then takes a new one. Mirrors CotabbyInferenceEngine's +// sampleNext pattern minus the per-call batch allocation. 
+int run_sampling_loop( + llama_context* ctx, + llama_sampler* sampler, + llama_token seed_token, + int start_position, + int sample_count +) { + if (sample_count <= 0) return 0; + + llama_token next = seed_token; + int position = start_position; + int sampled = 0; + + llama_batch batch = llama_batch_init(1, 0, 1); + for (int i = 0; i < sample_count; ++i) { + batch.n_tokens = 1; + batch.token[0] = next; + batch.pos[0] = position; + batch.n_seq_id[0] = 1; + if (batch.seq_id && batch.seq_id[0]) batch.seq_id[0][0] = 0; + batch.logits[0] = 1; + if (llama_decode(ctx, batch) != 0) { + llama_batch_free(batch); + return sampled; + } + position++; + next = llama_sampler_sample(sampler, ctx, -1); + llama_sampler_accept(sampler, next); + sampled++; + } + llama_batch_free(batch); + return sampled; +} + +} // namespace + +BenchResult run_baseline_a_two_contexts( + llama_model* model, + const BenchConfig& cfg +) { + BenchResult r; + r.scenario = "a_two_contexts"; + r.num_sequences = cfg.num_sequences; + r.prompt_tokens = cfg.prompt_tokens; + r.sample_tokens = cfg.sample_tokens; + + const llama_vocab* vocab = llama_model_get_vocab(model); + auto prompt = make_synthetic_prompt(vocab, cfg.prompt_tokens); + + std::vector ctxs(cfg.num_sequences, nullptr); + std::vector samplers(cfg.num_sequences, nullptr); + + auto cleanup = [&]() { + for (auto* s : samplers) if (s) llama_sampler_free(s); + for (auto* c : ctxs) if (c) llama_free(c); + }; + + if (prompt.empty()) { + r.error = "Failed to tokenize synthetic prompt"; + cleanup(); + return r; + } + + // Allocate all resources upfront. Any failure here aborts before any + // timing happens. 
+ for (int i = 0; i < cfg.num_sequences; ++i) { + ctxs[i] = create_isolated_context(model, cfg); + if (!ctxs[i]) { + r.error = "Failed to create context"; + cleanup(); + return r; + } + samplers[i] = make_greedy_sampler(); + if (!samplers[i]) { + r.error = "Failed to create sampler"; + cleanup(); + return r; + } + } + + // Warmup: prompt decode + first sample per context. Both untimed so the + // timed section measures only steady-state decode+sample work, which is + // what the user actually feels in the real app (prompt is mostly cached + // across requests via KV reuse). Pulling the seed sample out also makes + // the count symmetric with baseline B's timed section. + std::vector seed_tokens(cfg.num_sequences, 0); + for (int i = 0; i < cfg.num_sequences; ++i) { + if (!decode_prompt(ctxs[i], prompt, cfg.batch_size)) { + r.error = "Prompt decode failed"; + cleanup(); + return r; + } + // -1 reads from the last logits row, which is where the prompt's + // final-token logits live (we set logits=1 only on that position + // during prompt decode). + seed_tokens[i] = llama_sampler_sample(samplers[i], ctxs[i], -1); + llama_sampler_accept(samplers[i], seed_tokens[i]); + } + + // Timed: one thread per sequence, each runs (sample_tokens - 1) feedback + // decode + sample iterations against its own context. 
+ Timer timer; + timer.start(); + std::vector threads; + std::atomic total{0}; + threads.reserve(cfg.num_sequences); + for (int i = 0; i < cfg.num_sequences; ++i) { + threads.emplace_back([&, i]() { + int n = run_sampling_loop( + ctxs[i], + samplers[i], + seed_tokens[i], + cfg.prompt_tokens, + cfg.sample_tokens - 1 + ); + total.fetch_add(n); + }); + } + for (auto& t : threads) t.join(); + r.elapsed_seconds = timer.elapsed_seconds(); + r.total_tokens_sampled = total.load(); + + if (r.elapsed_seconds > 0.0) { + r.aggregate_tokens_per_second = + r.total_tokens_sampled / r.elapsed_seconds; + r.per_sequence_tokens_per_second = + r.aggregate_tokens_per_second / cfg.num_sequences; + } + + cleanup(); + return r; +} diff --git a/Sources/CotabbyInferenceBench/baseline_b.cpp b/Sources/CotabbyInferenceBench/baseline_b.cpp new file mode 100644 index 0000000..43f25b9 --- /dev/null +++ b/Sources/CotabbyInferenceBench/baseline_b.cpp @@ -0,0 +1,195 @@ +#include "bench_common.h" + +#include +#include +#include + +#include + +namespace { + +llama_sampler* make_greedy_sampler() { + auto params = llama_sampler_chain_default_params(); + llama_sampler* chain = llama_sampler_chain_init(params); + if (!chain) return nullptr; + llama_sampler_chain_add(chain, llama_sampler_init_greedy()); + return chain; +} + +// Single shared context with room for `num_sequences` distinct seq_ids. The +// KV cache size is multiplied by `num_sequences` so each sequence still gets +// its configured `ctx_size` of slots — Phase 0 also wants to confirm whether +// n_ctx is shared or per-sequence in the b9310 build. 
+llama_context* create_shared_context( + llama_model* model, + const BenchConfig& cfg +) { + auto p = llama_context_default_params(); + p.n_ctx = static_cast(cfg.ctx_size * cfg.num_sequences); + p.n_batch = static_cast(cfg.batch_size); + p.n_ubatch = static_cast(cfg.batch_size); + p.n_seq_max = static_cast(cfg.num_sequences); + int t = static_cast( + std::max(1u, std::thread::hardware_concurrency()) + ); + p.n_threads = t; + p.n_threads_batch = t; + p.offload_kqv = true; + return llama_init_from_model(model, p); +} + +// Decodes one sequence's prompt into the shared context, tagged with `seq_id`. +// Only the final token of the final chunk has logits=1, so the next sampler +// call reads from batch index 0 of that decode. +bool decode_prompt_for_seq( + llama_context* ctx, + llama_seq_id seq_id, + const std::vector& tokens, + int batch_cap +) { + llama_batch batch = llama_batch_init(batch_cap, 0, 1); + int cursor = 0; + int n = static_cast(tokens.size()); + while (cursor < n) { + int chunk_end = std::min(cursor + batch_cap, n); + int chunk_size = chunk_end - cursor; + batch.n_tokens = chunk_size; + for (int i = 0; i < chunk_size; ++i) { + int idx = cursor + i; + batch.token[i] = tokens[idx]; + batch.pos[i] = idx; + batch.n_seq_id[i] = 1; + if (batch.seq_id && batch.seq_id[i]) batch.seq_id[i][0] = seq_id; + bool is_last = (chunk_end == n && i == chunk_size - 1); + batch.logits[i] = is_last ? 
1 : 0; + } + if (llama_decode(ctx, batch) != 0) { + llama_batch_free(batch); + return false; + } + cursor = chunk_end; + } + llama_batch_free(batch); + return true; +} + +} // namespace + +BenchResult run_baseline_b_shared_context( + llama_model* model, + const BenchConfig& cfg +) { + BenchResult r; + r.scenario = "b_shared_context"; + r.num_sequences = cfg.num_sequences; + r.prompt_tokens = cfg.prompt_tokens; + r.sample_tokens = cfg.sample_tokens; + + const llama_vocab* vocab = llama_model_get_vocab(model); + auto prompt = make_synthetic_prompt(vocab, cfg.prompt_tokens); + + llama_context* ctx = nullptr; + std::vector samplers(cfg.num_sequences, nullptr); + + auto cleanup = [&]() { + for (auto* s : samplers) if (s) llama_sampler_free(s); + if (ctx) llama_free(ctx); + }; + + if (prompt.empty()) { + r.error = "Failed to tokenize synthetic prompt"; + cleanup(); + return r; + } + + ctx = create_shared_context(model, cfg); + if (!ctx) { + r.error = "Failed to create shared context"; + cleanup(); + return r; + } + + for (int s = 0; s < cfg.num_sequences; ++s) { + samplers[s] = make_greedy_sampler(); + if (!samplers[s]) { + r.error = "Failed to create sampler"; + cleanup(); + return r; + } + } + + std::vector last_tokens(cfg.num_sequences, 0); + std::vector positions(cfg.num_sequences, cfg.prompt_tokens); + + // Warmup: decode each prompt and immediately sample one token while that + // sequence's prompt logits are still resident. The next prompt decode for + // sequence s+1 overwrites the logits buffer, so we must sample-before- + // advance. The sampled token becomes the seed input to the timed loop. + for (int s = 0; s < cfg.num_sequences; ++s) { + if (!decode_prompt_for_seq( + ctx, static_cast(s), + prompt, cfg.batch_size)) { + r.error = "Prompt decode failed"; + cleanup(); + return r; + } + // -1 reads from the last logits row, which is where this sequence's + // prompt-final-token logits live until the next decode overwrites + // them. 
+ last_tokens[s] = llama_sampler_sample(samplers[s], ctx, -1); + llama_sampler_accept(samplers[s], last_tokens[s]); + } + + // Timed: build one batch carrying num_sequences tokens (different seq_ids), + // decode all of them in a single llama_decode call, then sample one new + // token per sequence from its respective logit row. Each iteration + // produces one new token per sequence. + llama_batch batch = llama_batch_init(cfg.num_sequences + 4, 0, 1); + Timer timer; + timer.start(); + bool decode_failed = false; + + for (int step = 1; step < cfg.sample_tokens; ++step) { + batch.n_tokens = cfg.num_sequences; + for (int s = 0; s < cfg.num_sequences; ++s) { + batch.token[s] = last_tokens[s]; + batch.pos[s] = positions[s]; + batch.n_seq_id[s] = 1; + if (batch.seq_id && batch.seq_id[s]) { + batch.seq_id[s][0] = static_cast(s); + } + batch.logits[s] = 1; + positions[s]++; + } + if (llama_decode(ctx, batch) != 0) { + decode_failed = true; + break; + } + for (int s = 0; s < cfg.num_sequences; ++s) { + last_tokens[s] = llama_sampler_sample(samplers[s], ctx, s); + llama_sampler_accept(samplers[s], last_tokens[s]); + } + } + + r.elapsed_seconds = timer.elapsed_seconds(); + llama_batch_free(batch); + + if (decode_failed) { + r.error = "llama_decode failed mid-loop"; + cleanup(); + return r; + } + + // Exclude the warmup seed sample from total_tokens_sampled so the + // numerator is comparable to baseline A's timed-only count. 
+ r.total_tokens_sampled = (cfg.sample_tokens - 1) * cfg.num_sequences; + if (r.elapsed_seconds > 0.0) { + r.aggregate_tokens_per_second = + r.total_tokens_sampled / r.elapsed_seconds; + r.per_sequence_tokens_per_second = + r.aggregate_tokens_per_second / cfg.num_sequences; + } + + cleanup(); + return r; +} diff --git a/Sources/CotabbyInferenceBench/bench_common.cpp b/Sources/CotabbyInferenceBench/bench_common.cpp new file mode 100644 index 0000000..dbc9c5e --- /dev/null +++ b/Sources/CotabbyInferenceBench/bench_common.cpp @@ -0,0 +1,73 @@ +#include "bench_common.h" + +#include +#include + +#include + +std::vector make_synthetic_prompt( + const llama_vocab* vocab, + int target_tokens +) { + if (!vocab || target_tokens <= 0) return {}; + + // Real text so the tokenizer produces a sensible distribution. The seed is + // short; we pad until tokenization yields at least `target_tokens` tokens. + static const char seed[] = + "The quick brown fox jumps over the lazy dog. "; + std::string text; + while (static_cast(text.size()) < target_tokens * 6) { + text += seed; + } + + bool add_bos = llama_vocab_get_add_bos(vocab); + int capacity = target_tokens * 4 + 16; + std::vector tokens(capacity); + int n = llama_tokenize( + vocab, + text.c_str(), + static_cast(text.size()), + tokens.data(), + static_cast(capacity), + add_bos, + false + ); + if (n <= 0) return {}; + + tokens.resize(n); + if (static_cast(tokens.size()) > target_tokens) { + tokens.resize(target_tokens); + } + return tokens; +} + +void print_result(const BenchResult& r) { + if (!r.error.empty()) { + std::printf( + "{\"scenario\":\"%s\",\"error\":\"%s\"}\n", + r.scenario.c_str(), + r.error.c_str() + ); + return; + } + std::printf( + "{" + "\"scenario\":\"%s\"," + "\"num_sequences\":%d," + "\"prompt_tokens\":%d," + "\"sample_tokens\":%d," + "\"elapsed_seconds\":%.4f," + "\"total_tokens_sampled\":%d," + "\"aggregate_tokens_per_second\":%.2f," + "\"per_sequence_tokens_per_second\":%.2f" + "}\n", + r.scenario.c_str(), 
+ r.num_sequences, + r.prompt_tokens, + r.sample_tokens, + r.elapsed_seconds, + r.total_tokens_sampled, + r.aggregate_tokens_per_second, + r.per_sequence_tokens_per_second + ); +} diff --git a/Sources/CotabbyInferenceBench/bench_common.h b/Sources/CotabbyInferenceBench/bench_common.h new file mode 100644 index 0000000..de1c7bf --- /dev/null +++ b/Sources/CotabbyInferenceBench/bench_common.h @@ -0,0 +1,80 @@ +#pragma once + +#include +#include +#include +#include + +// Phase 0 spike: compare aggregate decode tok/s between the current "separate +// llama_context per sequence" architecture and a prospective "single shared +// llama_context with n_seq_max > 1, batched llama_decode" architecture. +// +// The goal is a yes/no decision: on the M-series + Metal hardware our users +// run, does batched decode actually move the needle (>= 1.4x aggregate tok/s) +// or is the GPU command queue serializing everything regardless? +// +// Two baselines, identical warmup, identical token count. Both report total +// samples produced in the timed section and elapsed wall-clock so the caller +// can compute tok/s without us baking in a definition. 
// Command-line configuration for one benchmark run. The defaults are the
// values used when the matching option is not given.
struct BenchConfig {
    std::string model_path;
    std::string scenario;
    int num_sequences = 2;
    int prompt_tokens = 256;
    int sample_tokens = 200;
    int gpu_layers = -1;
    int batch_size = 512;
    int ctx_size = 2048;
    bool verbose = false;
};

// Outcome of one benchmark run. When `error` is non-empty the numeric
// throughput fields are not meaningful.
struct BenchResult {
    std::string scenario;
    int num_sequences = 0;
    int prompt_tokens = 0;
    int sample_tokens = 0;
    double elapsed_seconds = 0.0;
    int total_tokens_sampled = 0;
    double aggregate_tokens_per_second = 0.0;
    double per_sequence_tokens_per_second = 0.0;
    std::string error;
};

// Wall-clock stopwatch on std::chrono::steady_clock.
// The reference point is taken at construction and can be reset with start(),
// so elapsed_seconds() is meaningful even if start() was never called
// (previously it would have measured from the clock's epoch).
class Timer {
public:
    // Resets the reference point to "now".
    void start() { t0_ = std::chrono::steady_clock::now(); }

    // Seconds elapsed since construction or the most recent start().
    double elapsed_seconds() const {
        const auto dt = std::chrono::steady_clock::now() - t0_;
        return std::chrono::duration<double>(dt).count();
    }

private:
    std::chrono::steady_clock::time_point t0_ =
        std::chrono::steady_clock::now();
};

// Pads a short English seed string to a target token count using the supplied
// vocab. Returning real tokens (not random IDs) keeps the sampler in a regime
// the model was trained for, so the decode cost matches what real usage hits.
//
// NOTE(review): the vector's element type was not visible in the reviewed
// text; int32_t is presumably what llama_token aliases — confirm.
std::vector<int32_t> make_synthetic_prompt(
    const struct llama_vocab* vocab,
    int target_tokens
);

// Emits a single-line JSON record to stdout. One result per invocation; the
// caller is expected to run the binary multiple times and compare lines.
void print_result(const BenchResult& r);

// Baseline A: N separate llama_context instances, each decoded from its own
// thread. This is what `CotabbyInferenceEngine` does today.
BenchResult run_baseline_a_two_contexts(
    struct llama_model* model,
    const BenchConfig& cfg
);

// Baseline B: one shared llama_context with n_seq_max = N, batched
// llama_decode calls carrying tokens for all sequences. This is the candidate
// architecture for Phase 1.
+BenchResult run_baseline_b_shared_context( + struct llama_model* model, + const BenchConfig& cfg +); diff --git a/Sources/CotabbyInferenceBench/main.cpp b/Sources/CotabbyInferenceBench/main.cpp new file mode 100644 index 0000000..5cceef6 --- /dev/null +++ b/Sources/CotabbyInferenceBench/main.cpp @@ -0,0 +1,129 @@ +#include "bench_common.h" + +#include +#include +#include +#include + +#include + +namespace { + +void silenced_log(ggml_log_level, const char*, void*) {} + +bool parse_args(int argc, char** argv, BenchConfig& cfg) { + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + auto next = [&]() -> const char* { + if (i + 1 >= argc) return nullptr; + return argv[++i]; + }; + if (a == "--model") { + const char* v = next(); + if (v) cfg.model_path = v; + } else if (a == "--scenario") { + const char* v = next(); + if (v) cfg.scenario = v; + } else if (a == "--num-sequences") { + const char* v = next(); + if (v) cfg.num_sequences = std::atoi(v); + } else if (a == "--prompt-tokens") { + const char* v = next(); + if (v) cfg.prompt_tokens = std::atoi(v); + } else if (a == "--sample-tokens") { + const char* v = next(); + if (v) cfg.sample_tokens = std::atoi(v); + } else if (a == "--gpu-layers") { + const char* v = next(); + if (v) cfg.gpu_layers = std::atoi(v); + } else if (a == "--batch-size") { + const char* v = next(); + if (v) cfg.batch_size = std::atoi(v); + } else if (a == "--ctx-size") { + const char* v = next(); + if (v) cfg.ctx_size = std::atoi(v); + } else if (a == "--verbose") { + cfg.verbose = true; + } else if (a == "--help" || a == "-h") { + return false; + } + } + return !cfg.model_path.empty() && !cfg.scenario.empty(); +} + +void usage() { + std::fprintf(stderr, + "CotabbyInferenceBench — Phase 0 spike for batched-decode evaluation\n" + "\n" + "Usage:\n" + " CotabbyInferenceBench --model PATH --scenario SCENARIO [options]\n" + "\n" + "Scenarios:\n" + " a_two_contexts N separate llama_context instances, one thread each\n" + " (current 
CotabbyInferenceEngine architecture)\n" + " b_shared_context One shared llama_context with n_seq_max=N, batched\n" + " llama_decode (candidate Phase 1 architecture)\n" + "\n" + "Options:\n" + " --num-sequences N Number of concurrent sequences (default 2)\n" + " --prompt-tokens N Synthetic prompt length per sequence (default 256)\n" + " --sample-tokens N Tokens to generate per sequence (default 200)\n" + " --gpu-layers N llama n_gpu_layers; -1 means all (default -1)\n" + " --batch-size N Decode batch size (default 512)\n" + " --ctx-size N Per-sequence KV slots (default 2048)\n" + " --verbose Don't silence llama's internal logging\n" + "\n" + "Output: one line of JSON to stdout with elapsed_seconds,\n" + " total_tokens_sampled, aggregate_tokens_per_second.\n" + "\n" + "Both scenarios exclude prompt decode and the seed sample from the\n" + "timed section, so the numerator counts (sample_tokens - 1) *\n" + "num_sequences steady-state samples and is directly comparable.\n" + ); +} + +} // namespace + +int main(int argc, char** argv) { + BenchConfig cfg; + if (!parse_args(argc, argv, cfg)) { + usage(); + return 1; + } + + if (!cfg.verbose) { + llama_log_set(silenced_log, nullptr); + } + llama_backend_init(); + + auto model_params = llama_model_default_params(); + model_params.n_gpu_layers = cfg.gpu_layers; + model_params.use_mmap = true; + model_params.use_mlock = false; + + llama_model* model = llama_model_load_from_file( + cfg.model_path.c_str(), model_params); + if (!model) { + std::fprintf(stderr, "Failed to load model: %s\n", cfg.model_path.c_str()); + llama_backend_free(); + return 2; + } + + BenchResult r; + if (cfg.scenario == "a_two_contexts") { + r = run_baseline_a_two_contexts(model, cfg); + } else if (cfg.scenario == "b_shared_context") { + r = run_baseline_b_shared_context(model, cfg); + } else { + std::fprintf(stderr, "Unknown scenario: %s\n", cfg.scenario.c_str()); + llama_model_free(model); + llama_backend_free(); + return 3; + } + + print_result(r); + + 
llama_model_free(model); + llama_backend_free(); + return r.error.empty() ? 0 : 4; +}