diff --git a/Package.swift b/Package.swift
index 1827dbc..2f786df 100644
--- a/Package.swift
+++ b/Package.swift
@@ -33,5 +33,18 @@ let package = Package(
                 .interoperabilityMode(.Cxx),
             ]
         ),
+        // Phase 0 spike: standalone executable that compares aggregate decode
+        // tok/s between the current "separate llama_context per sequence"
+        // architecture and a prospective shared-context-with-batched-decode
+        // architecture. Used to decide whether the Phase 1 refactor is worth
+        // doing on M-series + Metal. Not linked into CotabbyInference itself.
+        .executableTarget(
+            name: "CotabbyInferenceBench",
+            dependencies: ["llama-cpp"],
+            path: "Sources/CotabbyInferenceBench",
+            cxxSettings: [
+                .unsafeFlags(["-std=c++17"]),
+            ]
+        ),
     ]
 )
diff --git a/Sources/CotabbyInferenceBench/baseline_a.cpp b/Sources/CotabbyInferenceBench/baseline_a.cpp
new file mode 100644
index 0000000..88ad2d1
--- /dev/null
+++ b/Sources/CotabbyInferenceBench/baseline_a.cpp
@@ -0,0 +1,207 @@
+#include "bench_common.h"
+
+#include <algorithm>
+#include <atomic>
+#include <thread>
+#include <vector>
+
+#include <llama/llama.h>
+
+namespace {
+
+// Builds a sampler chain containing only the greedy stage; nullptr on failure.
+llama_sampler* make_greedy_sampler() {
+    auto* greedy = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (greedy != nullptr)
+        llama_sampler_chain_add(greedy, llama_sampler_init_greedy());
+    return greedy;
+}
+
+// Creates a context that serves exactly one sequence (n_seq_max = 1), sized
+// from cfg; returns whatever llama_init_from_model returns.
+llama_context* create_isolated_context(
+    llama_model* model,
+    const BenchConfig& cfg
+) {
+    const unsigned hw_threads = std::max(1u, std::thread::hardware_concurrency());
+    const auto batch_tokens = static_cast<uint32_t>(cfg.batch_size);
+
+    llama_context_params params = llama_context_default_params();
+    params.n_seq_max = 1;
+    params.n_ctx = static_cast<uint32_t>(cfg.ctx_size);
+    params.n_batch = params.n_ubatch = batch_tokens;
+    params.n_threads = params.n_threads_batch = static_cast<int>(hw_threads);
+    params.offload_kqv = true;
+    return llama_init_from_model(model, params);
+}
+
+// Feeds `tokens` to `ctx` as sequence 0, at positions 0..n-1, in chunks of
+// at most `batch_cap`. Only the last prompt token requests logits, so the
+// final decode yields a single logits row (the one sampler index -1 reads).
+// Returns false for an empty prompt, a non-positive `batch_cap`, or a
+// failed llama_decode; true once every chunk has been decoded.
+bool decode_prompt(
+    llama_context* ctx,
+    const std::vector<int32_t>& tokens,
+    int batch_cap
+) {
+    const int total = static_cast<int>(tokens.size());
+    // A zero/negative cap cannot size a batch, and an empty prompt would
+    // "succeed" without leaving any logits for the caller to sample.
+    if (batch_cap <= 0 || total == 0) return false;
+
+    llama_batch batch = llama_batch_init(batch_cap, 0, 1);
+    bool ok = true;
+    for (int start = 0; ok && start < total; start += batch.n_tokens) {
+        batch.n_tokens = std::min(batch_cap, total - start);
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            batch.token[i] = tokens[start + i];
+            batch.pos[i] = start + i;
+            batch.n_seq_id[i] = 1;
+            if (batch.seq_id && batch.seq_id[i]) batch.seq_id[i][0] = 0;
+            // Request logits only for the final token of the whole prompt.
+            batch.logits[i] = (start + i == total - 1) ? 1 : 0;
+        }
+        ok = (llama_decode(ctx, batch) == 0);
+    }
+    // Single exit path, so the batch is released whether or not decode failed.
+    llama_batch_free(batch);
+    return ok;
+}
+
+// Timed inner loop for one sequence; intended to run on its own thread
+// against a context no other thread uses. Starting from `seed_token` at
+// `start_position`, each pass decodes the most recent token as a one-token
+// batch on sequence 0, then samples and accepts its successor. The single
+// llama_batch is allocated once and reused. Returns the number of tokens
+// sampled, which is less than `sample_count` only if llama_decode failed.
+int run_sampling_loop(
+    llama_context* ctx,
+    llama_sampler* sampler,
+    llama_token seed_token,
+    int start_position,
+    int sample_count
+) {
+    int produced = 0;
+    if (sample_count <= 0) return produced;
+
+    llama_token current = seed_token;
+    // One-slot batch, refilled on every pass.
+    llama_batch step = llama_batch_init(1, 0, 1);
+    while (produced < sample_count) {
+        step.n_tokens = 1;
+        step.token[0] = current;
+        // The position advances by one for every successfully decoded token.
+        step.pos[0] = start_position + produced;
+        step.n_seq_id[0] = 1;
+        if (step.seq_id && step.seq_id[0]) step.seq_id[0][0] = 0;
+        step.logits[0] = 1;
+        if (llama_decode(ctx, step) != 0) break;
+
+        // Index -1 selects the last logits row of the decode just issued.
+        current = llama_sampler_sample(sampler, ctx, -1);
+        llama_sampler_accept(sampler, current);
+        ++produced;
+    }
+    llama_batch_free(step);
+    return produced;
+}
+
+} // namespace
+
+// Baseline A: one llama_context + greedy sampler per sequence, each driven
+// from its own thread in the timed section; setup, prompt decode and the
+// seed sample are untimed. On any failure `r.error` is set and throughput
+// stays zero, including a llama_decode failure inside a worker thread,
+// which would otherwise surface only as a silently lower token count.
+BenchResult run_baseline_a_two_contexts(
+    llama_model* model,
+    const BenchConfig& cfg
+) {
+    BenchResult r;
+    r.scenario = "a_two_contexts";
+    r.num_sequences = cfg.num_sequences;
+    r.prompt_tokens = cfg.prompt_tokens;
+    r.sample_tokens = cfg.sample_tokens;
+
+    // A non-positive count would wrap to a huge size in the vectors below.
+    if (cfg.num_sequences <= 0) {
+        r.error = "num_sequences must be positive";
+        return r;
+    }
+
+    const llama_vocab* vocab = llama_model_get_vocab(model);
+    const auto prompt = make_synthetic_prompt(vocab, cfg.prompt_tokens);
+    if (prompt.empty()) {
+        r.error = "Failed to tokenize synthetic prompt";
+        return r;
+    }
+    // Continue right after the tokens actually decoded (may be < requested).
+    const int first_position = static_cast<int>(prompt.size());
+    const int passes = std::max(0, cfg.sample_tokens - 1);
+
+    std::vector<llama_context*> ctxs(cfg.num_sequences, nullptr);
+    std::vector<llama_sampler*> samplers(cfg.num_sequences, nullptr);
+    auto cleanup = [&]() {
+        for (auto* s : samplers) if (s) llama_sampler_free(s);
+        for (auto* c : ctxs) if (c) llama_free(c);
+    };
+    auto fail = [&](const char* message) {
+        r.error = message;
+        cleanup();
+        return r;
+    };
+
+    // Allocate everything up front so no allocation cost is ever timed.
+    for (int i = 0; i < cfg.num_sequences; ++i) {
+        ctxs[i] = create_isolated_context(model, cfg);
+        if (!ctxs[i]) return fail("Failed to create context");
+        samplers[i] = make_greedy_sampler();
+        if (!samplers[i]) return fail("Failed to create sampler");
+    }
+
+    // Warmup (untimed): prompt decode plus the first sample per context, so
+    // the timed count is symmetric with baseline B's timed section.
+    std::vector<llama_token> seed_tokens(cfg.num_sequences, 0);
+    for (int i = 0; i < cfg.num_sequences; ++i) {
+        if (!decode_prompt(ctxs[i], prompt, cfg.batch_size)) {
+            return fail("Prompt decode failed");
+        }
+        // -1 reads the last logits row, i.e. the prompt's final token.
+        seed_tokens[i] = llama_sampler_sample(samplers[i], ctxs[i], -1);
+        llama_sampler_accept(samplers[i], seed_tokens[i]);
+    }
+
+    // Timed: one thread per sequence, each against its own context.
+    std::atomic<int> total{0};
+    std::atomic<bool> short_run{false};
+    std::vector<std::thread> threads;
+    threads.reserve(cfg.num_sequences);
+    Timer timer;
+    timer.start();
+    for (int i = 0; i < cfg.num_sequences; ++i) {
+        threads.emplace_back([&, i]() {
+            const int n = run_sampling_loop(
+                ctxs[i], samplers[i], seed_tokens[i], first_position, passes);
+            // run_sampling_loop only stops early when llama_decode fails.
+            if (n != passes) short_run.store(true);
+            total.fetch_add(n);
+        });
+    }
+    for (auto& t : threads) t.join();
+    r.elapsed_seconds = timer.elapsed_seconds();
+    cleanup();
+    if (short_run.load()) {
+        r.error = "llama_decode failed mid-loop";
+        return r;
+    }
+
+    r.total_tokens_sampled = total.load();
+    if (r.elapsed_seconds > 0.0) {
+        r.aggregate_tokens_per_second =
+            r.total_tokens_sampled / r.elapsed_seconds;
+        r.per_sequence_tokens_per_second =
+            r.aggregate_tokens_per_second / cfg.num_sequences;
+    }
+    return r;
+}
diff --git a/Sources/CotabbyInferenceBench/baseline_b.cpp b/Sources/CotabbyInferenceBench/baseline_b.cpp
new file mode 100644
index 0000000..43f25b9
--- /dev/null
+++ b/Sources/CotabbyInferenceBench/baseline_b.cpp
@@ -0,0 +1,195 @@
+#include "bench_common.h"
+
+#include <algorithm>
+#include <thread>
+#include <vector>
+
+#include <llama/llama.h>
+
+namespace {
+
+// Sampler chain with a single greedy stage; nullptr if chain creation fails.
+llama_sampler* make_greedy_sampler() {
+    auto* stack = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    if (stack == nullptr) return stack;
+    llama_sampler_chain_add(stack, llama_sampler_init_greedy());
+    return stack;
+}
+
+// Creates the single context shared by all sequences: n_seq_max is set to
+// `num_sequences`, and n_ctx to ctx_size * num_sequences so that each
+// sequence keeps its configured ctx_size of KV slots. Whether llama treats
+// n_ctx as shared or per-sequence in the b9310 build is something Phase 0
+// still wants to confirm.
+llama_context* create_shared_context(
+    llama_model* model,
+    const BenchConfig& cfg
+) {
+    const int worker_threads = static_cast<int>(
+        std::max(1u, std::thread::hardware_concurrency()));
+    llama_context_params params = llama_context_default_params();
+    params.n_seq_max = static_cast<uint32_t>(cfg.num_sequences);
+    params.n_ctx = static_cast<uint32_t>(cfg.ctx_size * cfg.num_sequences);
+    params.n_batch = static_cast<uint32_t>(cfg.batch_size);
+    params.n_ubatch = params.n_batch;
+    params.n_threads = worker_threads;
+    params.n_threads_batch = worker_threads;
+    params.offload_kqv = true;
+    return llama_init_from_model(model, params);
+}
+
+// Feeds `tokens` into the shared `ctx` tagged with `seq_id`, at positions
+// 0..n-1, in chunks of at most `batch_cap`. Only the last prompt token
+// requests logits, so the final decode yields one logits row (sampler index
+// -1). Returns false on empty prompt, batch_cap <= 0, or llama_decode failure.
+bool decode_prompt_for_seq(
+    llama_context* ctx,
+    llama_seq_id seq_id,
+    const std::vector<int32_t>& tokens,
+    int batch_cap
+) {
+    const int prompt_len = static_cast<int>(tokens.size());
+    // An empty prompt or non-positive cap cannot leave logits to sample.
+    if (prompt_len == 0 || batch_cap <= 0) return false;
+
+    llama_batch batch = llama_batch_init(batch_cap, 0, 1);
+    bool decoded = true;
+    int done = 0;
+    while (decoded && done < prompt_len) {
+        const int count = std::min(batch_cap, prompt_len - done);
+        batch.n_tokens = count;
+        for (int i = 0; i < count; ++i) {
+            batch.token[i] = tokens[done + i];
+            batch.pos[i] = done + i;
+            batch.n_seq_id[i] = 1;
+            if (batch.seq_id && batch.seq_id[i]) batch.seq_id[i][0] = seq_id;
+            // Logits are requested for the prompt's final token only.
+            batch.logits[i] = (done + i + 1 == prompt_len) ? 1 : 0;
+        }
+        decoded = (llama_decode(ctx, batch) == 0);
+        done += count;
+    }
+    llama_batch_free(batch);
+    return decoded;
+}
+
+} // namespace
+
+// Baseline B: one shared llama_context carrying `num_sequences` sequences.
+// Untimed setup: create the context and one greedy sampler per sequence,
+// decode each prompt and draw its seed sample. Timed section:
+// (sample_tokens - 1) steps, each a single batched llama_decode plus one
+// sample per sequence. On any failure `r.error` is set; throughput stays 0.
+BenchResult run_baseline_b_shared_context(
+    llama_model* model,
+    const BenchConfig& cfg
+) {
+    BenchResult r;
+    r.scenario = "b_shared_context";
+    r.num_sequences = cfg.num_sequences;
+    r.prompt_tokens = cfg.prompt_tokens;
+    r.sample_tokens = cfg.sample_tokens;
+
+    // A non-positive count would wrap to a huge size in the vectors below.
+    if (cfg.num_sequences <= 0) {
+        r.error = "num_sequences must be positive";
+        return r;
+    }
+
+    const llama_vocab* vocab = llama_model_get_vocab(model);
+    const auto prompt = make_synthetic_prompt(vocab, cfg.prompt_tokens);
+    if (prompt.empty()) {
+        r.error = "Failed to tokenize synthetic prompt";
+        return r;
+    }
+
+    llama_context* ctx = nullptr;
+    std::vector<llama_sampler*> samplers(cfg.num_sequences, nullptr);
+    auto cleanup = [&]() {
+        for (auto* s : samplers) if (s) llama_sampler_free(s);
+        if (ctx) llama_free(ctx);
+    };
+    auto fail = [&](const char* message) {
+        r.error = message;
+        cleanup();
+        return r;
+    };
+
+    ctx = create_shared_context(model, cfg);
+    if (!ctx) return fail("Failed to create shared context");
+
+    for (int s = 0; s < cfg.num_sequences; ++s) {
+        samplers[s] = make_greedy_sampler();
+        if (!samplers[s]) return fail("Failed to create sampler");
+    }
+
+    std::vector<llama_token> last_tokens(cfg.num_sequences, 0);
+    // Continue right after the tokens actually decoded (may be < requested).
+    std::vector<int> positions(
+        cfg.num_sequences, static_cast<int>(prompt.size()));
+
+    // Warmup (untimed): decode each prompt and sample one token right away,
+    // because the prompt decode for sequence s+1 overwrites the logits buffer.
+    // That sample becomes the seed input to the timed loop.
+    for (int s = 0; s < cfg.num_sequences; ++s) {
+        if (!decode_prompt_for_seq(ctx, static_cast<llama_seq_id>(s),
+                                   prompt, cfg.batch_size)) {
+            return fail("Prompt decode failed");
+        }
+        // -1 reads the last logits row: this sequence's final prompt token.
+        last_tokens[s] = llama_sampler_sample(samplers[s], ctx, -1);
+        llama_sampler_accept(samplers[s], last_tokens[s]);
+    }
+
+    // Timed: each step decodes one token per sequence (distinct seq_ids) in a
+    // single llama_decode call, then samples once per sequence from its row.
+    llama_batch batch = llama_batch_init(cfg.num_sequences + 4, 0, 1);
+    bool decode_failed = false;
+    int steps_completed = 0;
+    Timer timer;
+    timer.start();
+
+    for (int step = 1; step < cfg.sample_tokens; ++step) {
+        batch.n_tokens = cfg.num_sequences;
+        for (int s = 0; s < cfg.num_sequences; ++s) {
+            batch.token[s] = last_tokens[s];
+            batch.pos[s] = positions[s];
+            batch.n_seq_id[s] = 1;
+            if (batch.seq_id && batch.seq_id[s]) {
+                batch.seq_id[s][0] = static_cast<llama_seq_id>(s);
+            }
+            batch.logits[s] = 1;
+            positions[s]++;
+        }
+        if (llama_decode(ctx, batch) != 0) {
+            decode_failed = true;
+            break;
+        }
+        // Batch slot s carries sequence s, so its logits are at index s.
+        for (int s = 0; s < cfg.num_sequences; ++s) {
+            last_tokens[s] = llama_sampler_sample(samplers[s], ctx, s);
+            llama_sampler_accept(samplers[s], last_tokens[s]);
+        }
+        ++steps_completed;
+    }
+
+    r.elapsed_seconds = timer.elapsed_seconds();
+    llama_batch_free(batch);
+    cleanup();
+
+    if (decode_failed) {
+        r.error = "llama_decode failed mid-loop";
+        return r;
+    }
+
+    // Count completed steps instead of deriving the total from sample_tokens
+    // (negative for sample_tokens <= 0); the seed sample stays excluded.
+    r.total_tokens_sampled = steps_completed * cfg.num_sequences;
+    if (r.elapsed_seconds > 0.0) {
+        r.aggregate_tokens_per_second =
+            r.total_tokens_sampled / r.elapsed_seconds;
+        r.per_sequence_tokens_per_second =
+            r.aggregate_tokens_per_second / cfg.num_sequences;
+    }
+    return r;
+}
diff --git a/Sources/CotabbyInferenceBench/bench_common.cpp b/Sources/CotabbyInferenceBench/bench_common.cpp
new file mode 100644
index 0000000..dbc9c5e
--- /dev/null
+++ b/Sources/CotabbyInferenceBench/bench_common.cpp
@@ -0,0 +1,73 @@
+#include "bench_common.h"
+
+#include <cstdio>
+#include <string>
+
+#include <llama/llama.h>
+
+// Builds a prompt of real English text truncated to `target_tokens` tokens.
+// The text is doubled and re-tokenized (up to 4 attempts) while it yields
+// too few tokens, and a too-small token buffer is retried at the size
+// llama_tokenize asks for. Returns {} for a null vocab, target_tokens <= 0,
+// or tokenizer failure; a still-short prompt is returned as-is.
+std::vector<int32_t> make_synthetic_prompt(
+    const llama_vocab* vocab,
+    int target_tokens
+) {
+    if (!vocab || target_tokens <= 0) return {};
+    static const char seed[] =
+        "The quick brown fox jumps over the lazy dog. ";
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    const std::size_t wanted = static_cast<std::size_t>(target_tokens);
+    std::size_t min_chars = wanted * 6;  // first guess: ~6 chars per token
+    std::vector<int32_t> tokens;
+    std::string text;
+    for (int attempt = 0; attempt < 4; ++attempt, min_chars *= 2) {
+        while (text.size() < min_chars) text += seed;
+        tokens.assign(wanted * 4 + 16, 0);
+        int32_t n = 0;
+        for (int pass = 0; pass < 2; ++pass) {
+            n = llama_tokenize(vocab, text.c_str(),
+                static_cast<int32_t>(text.size()), tokens.data(),
+                static_cast<int32_t>(tokens.size()), add_bos, false);
+            // A negative count is the capacity llama_tokenize needs, negated.
+            if (n >= 0 || n == INT32_MIN) break;
+            tokens.assign(static_cast<std::size_t>(-static_cast<int64_t>(n)), 0);
+        }
+        if (n <= 0) return {};
+        tokens.resize(n < target_tokens ? n : target_tokens);
+        if (n >= target_tokens) break;
+    }
+    return tokens;
+}
+
+// Writes `r` to stdout as one line of JSON. When `r.error` is non-empty only
+// the scenario and the error text are emitted; otherwise the full metrics
+// record. Strings are printed verbatim (no JSON escaping is applied).
+void print_result(const BenchResult& r) {
+    const char* const scenario = r.scenario.c_str();
+    const bool failed = !r.error.empty();
+    if (!failed) {
+        std::printf(
+            "{"
+            "\"scenario\":\"%s\","
+            "\"num_sequences\":%d,"
+            "\"prompt_tokens\":%d,"
+            "\"sample_tokens\":%d,"
+            "\"elapsed_seconds\":%.4f,"
+            "\"total_tokens_sampled\":%d,"
+            "\"aggregate_tokens_per_second\":%.2f,"
+            "\"per_sequence_tokens_per_second\":%.2f"
+            "}\n",
+            scenario, r.num_sequences, r.prompt_tokens, r.sample_tokens,
+            r.elapsed_seconds, r.total_tokens_sampled,
+            r.aggregate_tokens_per_second, r.per_sequence_tokens_per_second);
+        return;
+    }
+    // Failure record: the metric fields are meaningless, so leave them out.
+    std::printf(
+        "{\"scenario\":\"%s\",\"error\":\"%s\"}\n",
+        scenario,
+        r.error.c_str()
+    );
+}
diff --git a/Sources/CotabbyInferenceBench/bench_common.h b/Sources/CotabbyInferenceBench/bench_common.h
new file mode 100644
index 0000000..de1c7bf
--- /dev/null
+++ b/Sources/CotabbyInferenceBench/bench_common.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <chrono>
+#include <string>
+#include <vector>
+#include <cstdint>
+
+// Phase 0 spike: compare aggregate decode tok/s between the current "separate
+// llama_context per sequence" architecture and a prospective "single shared
+// llama_context with n_seq_max > 1, batched llama_decode" architecture.
+//
+// The goal is a yes/no decision: on the M-series + Metal hardware our users
+// run, does batched decode actually move the needle (>= 1.4x aggregate tok/s)
+// or is the GPU command queue serializing everything regardless?
+//
+// Two baselines, identical warmup, identical token count. Both report total
+// samples produced in the timed section and elapsed wall-clock so the caller
+// can compute tok/s without us baking in a definition.
+
+struct BenchConfig {  // Options for one benchmark run, filled in by the argument parser.
+    std::string model_path;   // --model: path handed to llama_model_load_from_file
+    std::string scenario;     // --scenario: "a_two_contexts" or "b_shared_context"
+    int num_sequences = 2;    // --num-sequences: number of concurrent sequences
+    int prompt_tokens = 256;  // --prompt-tokens: synthetic prompt length per sequence
+    int sample_tokens = 200;  // --sample-tokens: tokens per sequence, incl. the untimed seed
+    int gpu_layers = -1;      // --gpu-layers: llama n_gpu_layers; -1 means all
+    int batch_size = 512;     // --batch-size: used for both n_batch and n_ubatch
+    int ctx_size = 2048;      // --ctx-size: KV slots per sequence
+    bool verbose = false;     // --verbose: leave llama's internal logging enabled
+};
+
+struct BenchResult {  // Outcome of one scenario run; print_result() emits it as JSON.
+    std::string scenario;          // scenario name set by the baseline that ran
+    int num_sequences = 0;         // echoed from BenchConfig
+    int prompt_tokens = 0;         // echoed from BenchConfig
+    int sample_tokens = 0;         // echoed from BenchConfig
+    double elapsed_seconds = 0.0;  // wall-clock duration of the timed section
+    int total_tokens_sampled = 0;  // tokens sampled in the timed section, all sequences
+    double aggregate_tokens_per_second = 0.0;     // total_tokens_sampled / elapsed_seconds
+    double per_sequence_tokens_per_second = 0.0;  // aggregate / num_sequences
+    std::string error;  // non-empty on failure; print_result then emits scenario + error only
+};
+
+// Wall-clock stopwatch on std::chrono::steady_clock; call start() first.
+class Timer {
+    using Clock = std::chrono::steady_clock;
+    Clock::time_point began_;
+public:
+    void start() { began_ = Clock::now(); }
+    double elapsed_seconds() const {
+        return std::chrono::duration<double>(Clock::now() - began_).count();
+    }
+};
+
+// Builds a prompt of at most `target_tokens` tokens by tokenizing a repeated
+// English sentence with `vocab`; real text rather than random IDs keeps the
+// decode cost close to real usage. Returns an empty vector on failure.
+std::vector<int32_t> make_synthetic_prompt(
+    const struct llama_vocab* vocab,
+    int target_tokens
+);
+
+// Prints `r` to stdout as a single-line JSON record (scenario and error only
+// when `r.error` is set). Callers run the binary repeatedly and compare lines.
+void print_result(const BenchResult& r);
+
+// Baseline A: `cfg.num_sequences` separate llama_context instances, each
+// decoded from its own thread (what `CotabbyInferenceEngine` does today).
+BenchResult run_baseline_a_two_contexts(
+    struct llama_model* model,
+    const BenchConfig& cfg
+);
+
+// Baseline B: one shared llama_context with n_seq_max = `cfg.num_sequences`
+// and batched llama_decode calls carrying one token per sequence. This is
+// the candidate architecture for Phase 1.
+BenchResult run_baseline_b_shared_context(
+    struct llama_model* model,
+    const BenchConfig& cfg
+);
diff --git a/Sources/CotabbyInferenceBench/main.cpp b/Sources/CotabbyInferenceBench/main.cpp
new file mode 100644
index 0000000..5cceef6
--- /dev/null
+++ b/Sources/CotabbyInferenceBench/main.cpp
@@ -0,0 +1,129 @@
+#include "bench_common.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#include <llama/llama.h>
+
+namespace {
+
+void silenced_log(ggml_log_level, const char*, void*) {}  // No-op log sink; main() installs it via llama_log_set unless --verbose.
+
+// Parses command-line options into `cfg`. Returns true only when every
+// argument was understood and valid and both --model and --scenario were
+// given; on false the caller prints usage. Rejected: unknown options,
+// options missing their value, non-numeric or out-of-range numbers
+// (counts and sizes must be >= 1; --gpu-layers must be >= -1), and --help.
+bool parse_args(int argc, char** argv, BenchConfig& cfg) {
+    // Strict integer parse: the whole string must be a number in range.
+    auto parse_int = [](const char* text, long lowest, int& out) {
+        char* end = nullptr;
+        const long value = std::strtol(text, &end, 10);
+        if (end == text || *end != '\0') return false;
+        if (value < lowest || value > 1000000000L) return false;
+        out = static_cast<int>(value);
+        return true;
+    };
+
+    for (int i = 1; i < argc; ++i) {
+        const std::string opt = argv[i];
+        if (opt == "--verbose") { cfg.verbose = true; continue; }
+        if (opt == "--help" || opt == "-h") return false;
+
+        // Every remaining option takes exactly one value.
+        const char* v = (i + 1 < argc) ? argv[++i] : nullptr;
+        if (!v) return false;
+
+        bool ok = true;
+        if (opt == "--model") cfg.model_path = v;
+        else if (opt == "--scenario") cfg.scenario = v;
+        else if (opt == "--num-sequences") ok = parse_int(v, 1, cfg.num_sequences);
+        else if (opt == "--prompt-tokens") ok = parse_int(v, 1, cfg.prompt_tokens);
+        else if (opt == "--sample-tokens") ok = parse_int(v, 1, cfg.sample_tokens);
+        else if (opt == "--gpu-layers") ok = parse_int(v, -1, cfg.gpu_layers);
+        else if (opt == "--batch-size") ok = parse_int(v, 1, cfg.batch_size);
+        else if (opt == "--ctx-size") ok = parse_int(v, 1, cfg.ctx_size);
+        else ok = false;  // unknown option
+        if (!ok) return false;
+    }
+    return !cfg.model_path.empty() && !cfg.scenario.empty();
+}
+
+// Prints the help text to stderr; main() calls this whenever parse_args()
+// returns false. The text is a single constant with no format specifiers,
+// so it is written with fputs rather than a printf-style call. The
+// defaults quoted under "Options" mirror the BenchConfig initializers and
+// must be updated together with them.
+void usage() {
+    static const char kHelpText[] =
+        "CotabbyInferenceBench — Phase 0 spike for batched-decode evaluation\n"
+        "\nUsage:\n"
+        "  CotabbyInferenceBench --model PATH --scenario SCENARIO [options]\n"
+        "\nScenarios:\n"
+        "  a_two_contexts     N separate llama_context instances, one thread each\n"
+        "                     (current CotabbyInferenceEngine architecture)\n"
+        "  b_shared_context   One shared llama_context with n_seq_max=N, batched\n"
+        "                     llama_decode (candidate Phase 1 architecture)\n"
+        "\nOptions:\n"
+        "  --num-sequences N   Number of concurrent sequences (default 2)\n"
+        "  --prompt-tokens N   Synthetic prompt length per sequence (default 256)\n"
+        "  --sample-tokens N   Tokens to generate per sequence (default 200)\n"
+        "  --gpu-layers N      llama n_gpu_layers; -1 means all (default -1)\n"
+        "  --batch-size N      Decode batch size (default 512)\n"
+        "  --ctx-size N        Per-sequence KV slots (default 2048)\n"
+        "  --verbose           Don't silence llama's internal logging\n"
+        "\nOutput: one line of JSON to stdout with elapsed_seconds,\n"
+        "        total_tokens_sampled, aggregate_tokens_per_second.\n"
+        "\nBoth scenarios exclude prompt decode and the seed sample from the\n"
+        "timed section, so the numerator counts (sample_tokens - 1) *\n"
+        "num_sequences steady-state samples and is directly comparable.\n";
+    std::fputs(kHelpText, stderr);
+}
+
+} // namespace
+
+// Entry point: parse options, load the model, run the chosen scenario, and
+// print its JSON line. Exit codes: 0 ok, 1 bad arguments (usage shown),
+// 2 model load failed, 3 unknown scenario, 4 scenario reported an error.
+int main(int argc, char** argv) {
+    BenchConfig cfg;
+    if (!parse_args(argc, argv, cfg)) {
+        usage();
+        return 1;
+    }
+
+    // Resolve the scenario first so a typo fails before the costly model load.
+    BenchResult (*run_scenario)(llama_model*, const BenchConfig&) = nullptr;
+    if (cfg.scenario == "a_two_contexts") {
+        run_scenario = run_baseline_a_two_contexts;
+    } else if (cfg.scenario == "b_shared_context") {
+        run_scenario = run_baseline_b_shared_context;
+    } else {
+        std::fprintf(stderr, "Unknown scenario: %s\n", cfg.scenario.c_str());
+        return 3;
+    }
+
+    if (!cfg.verbose) llama_log_set(silenced_log, nullptr);
+    llama_backend_init();
+
+    auto model_params = llama_model_default_params();
+    model_params.n_gpu_layers = cfg.gpu_layers;
+    model_params.use_mmap = true;
+    model_params.use_mlock = false;
+
+    llama_model* model = llama_model_load_from_file(
+        cfg.model_path.c_str(), model_params);
+    if (!model) {
+        std::fprintf(stderr, "Failed to load model: %s\n", cfg.model_path.c_str());
+        llama_backend_free();
+        return 2;
+    }
+
+    const BenchResult r = run_scenario(model, cfg);
+    print_result(r);
+    llama_model_free(model);
+    llama_backend_free();
+    return r.error.empty() ? 0 : 4;
+}