diff --git a/Cargo.lock b/Cargo.lock
index f9c99776..05880f38 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1234,6 +1234,7 @@ dependencies = [
  "libtest-mimic",
  "llama-cpp-bindings",
  "llama-cpp-test-harness-macros",
+ "thiserror",
 ]
 
 [[package]]
diff --git a/llama-cpp-bindings-tests/Cargo.toml b/llama-cpp-bindings-tests/Cargo.toml
index c17b881d..cba73b08 100644
--- a/llama-cpp-bindings-tests/Cargo.toml
+++ b/llama-cpp-bindings-tests/Cargo.toml
@@ -15,135 +15,19 @@ llama-cpp-test-harness = { workspace = true }
 serde_json = { workspace = true }
 
 [[test]]
-name = "context"
+name = "backend_initialization"
 harness = false
 
 [[test]]
-name = "llama_backend"
+name = "chat_template_and_message_parsing"
 harness = false
 
 [[test]]
-name = "context_kv_cache"
+name = "embedding_and_encoder"
 harness = false
 
 [[test]]
-name = "deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_gemma_paired_quote"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_glm_key_value_tags"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_mistral_bracketed_json"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_qwen_xml"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested"
-harness = false
-
-[[test]]
-name = "context_session"
-harness = false
-
-[[test]]
-name = "embeddings"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "gemma4_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "gemma4_template_override_returns_full_markers"
-harness = false
-
-[[test]]
-name = "glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "glm47_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "glm47_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "glm47_template_override_returns_full_markers"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "mistral3_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "eval_multimodal_chunks_records_exact_token_counts"
-harness = false
-
-[[test]]
-name = "ingest_prompt_chunk"
-harness = false
-
-[[test]]
-name = "llguidance"
-harness = false
-
-[[test]]
-name = "model_chat_template"
-harness = false
-
-[[test]]
-name = "model_context_creation"
-harness = false
-
-[[test]]
-name = "model_helpers"
-harness = false
-
-[[test]]
-name = "model_params"
+name = "kv_cache_and_session"
 harness = false
 
 [[test]]
@@ -151,127 +35,19 @@ name = "model_loading_errors"
 harness = false
 
 [[test]]
-name = "model_lora_adapter_errors"
-harness = false
-
-[[test]]
-name = "model_metadata_kv"
-harness = false
-
-[[test]]
-name = "model_properties"
-harness = false
-
-[[test]]
-name = "model_sampling"
-harness = false
-
-[[test]]
-name = "model_special_tokens"
-harness = false
-
-[[test]]
-name = "model_str_to_token"
-harness = false
-
-[[test]]
-name = "model_token_to_piece"
-harness = false
-
-[[test]]
-name = "model_tokens_iterator"
-harness = false
-
-[[test]]
-name = "mtmd_bitmap"
-harness = false
-
-[[test]]
-name = "mtmd_chunk_operations"
-harness = false
-
-[[test]]
-name = "mtmd_chunk_structure"
-harness = false
-
-[[test]]
-name = "mtmd_context"
-harness = false
-
-[[test]]
-name = "mtmd_evaluation"
-harness = false
-
-[[test]]
-name = "mtmd_tokenization"
-harness = false
-
-[[test]]
-name = "multimodal"
-harness = false
-
-[[test]]
-name = "parse_chat_message"
-harness = false
-
-[[test]]
-name = "qwen35_chat_inference_emits_reasoning_when_template_auto_opens"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "qwen35_parses_constrained_schema_payload"
-harness = false
-
-[[test]]
-name = "qwen35_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested"
-harness = false
-
-[[test]]
-name = "qwen36_chat_inference_emits_reasoning_when_template_auto_opens"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "reranker"
+name = "multimodal_vision"
 harness = false
 
 [[test]]
-name = "sampled_token_classifier_markers"
+name = "reasoning_markers_and_tool_calls"
 harness = false
 
 [[test]]
-name = "sampling"
+name = "sampling_and_constrained_decoding"
 harness = false
 
 [[test]]
-name = "text_generation"
+name = "vocabulary_and_metadata"
 harness = false
 
 [features]
diff --git a/llama-cpp-bindings-tests/tests/llama_backend.rs b/llama-cpp-bindings-tests/tests/backend_initialization.rs
similarity index 100%
rename from llama-cpp-bindings-tests/tests/llama_backend.rs
rename to llama-cpp-bindings-tests/tests/backend_initialization.rs
diff --git a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
new file mode 100644
index 00000000..a7e18245
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
@@ -0,0 +1,567 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_chat_template {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::ChatTemplateError;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // NOTE(review): only success is asserted; the template's emptiness is never checked.
+        assert!(fixture.model.chat_template(None).is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let template = model.chat_template(None)?;
+        let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
+
+        // `?` reports the underlying error on failure instead of a bare `is_ok()` panic.
+        let prompt = model.apply_chat_template(&template, &[message], true)?;
+        assert!(!prompt.is_empty(), "chat template rendered an empty prompt");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn apply_chat_template_buffer_resize_with_long_messages(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let template = model.chat_template(None)?;
+        // 2000 ASCII bytes; presumably enough to force the buffer resize named in the test.
+        let long_content = "a".repeat(2000);
+        let message = LlamaChatMessage::new("user".to_string(), long_content)?;
+        // `?` reports the underlying error on failure instead of a bare `is_ok()` panic.
+        let prompt = model.apply_chat_template(&template, &[message], true)?;
+        assert!(!prompt.is_empty(), "chat template rendered an empty prompt");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Looking up this made-up name is expected to fail with `MissingTemplate`.
+        let unknown_name = "nonexistent_template_name_xyz";
+        let lookup = fixture.model.chat_template(Some(unknown_name));
+        assert_eq!(lookup.unwrap_err(), ChatTemplateError::MissingTemplate);
+        Ok(())
+    }
+}
+
+mod parse_chat_message {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let plain_text = "hello world";
+        // Empty tools array plus plain text: expect a recognized message without tool calls.
+        let ChatMessageParseOutcome::Recognized(message) =
+            model.parse_chat_message("[]", plain_text, false)?
+        else {
+            bail!("expected Recognized for plain content; got Unrecognized");
+        };
+        assert!(message.tool_calls.is_empty());
+        assert!(!message.is_empty());
+        assert!(message.content.contains(plain_text));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let input = "<think>step one, step two</think>\n\nactual response";
+        let ChatMessageParseOutcome::Recognized(message) =
+            fixture.model.parse_chat_message("[]", input, false)?
+        else {
+            bail!("expected Recognized for reasoning section; got Unrecognized");
+        };
+        // "step" may appear in either field, so the split itself is not pinned by this check.
+        assert!(
+            message.reasoning_content.contains("step") || message.content.contains("step"),
+            "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
+            message.content,
+            message.reasoning_content
+        );
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let ChatMessageParseOutcome::Recognized(message) =
+            fixture.model.parse_chat_message("[]", "", false)?
+        else {
+            bail!("expected Recognized for empty input; got Unrecognized");
+        };
+        // NOTE(review): only `tool_calls` is checked; the message's own emptiness is not.
+        assert!(message.tool_calls.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_malformed_tools_json_returns_tools_json_invalid_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::ParseChatMessageError;
+
+        // The tools argument is not valid JSON; the message text is ordinary.
+        let model = fixture.model;
+        let malformed_tools = "not_a_json[}";
+        let outcome = model.parse_chat_message(malformed_tools, "hello", false);
+
+        // Any payload is accepted; only the `ToolsJsonInvalid` variant is pinned.
+        assert!(matches!(outcome, Err(ParseChatMessageError::ToolsJsonInvalid(_))));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_non_array_tools_json_returns_tools_json_not_array_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::ParseChatMessageError;
+
+        // Well-formed JSON whose top-level value is an object instead of an array.
+        let model = fixture.model;
+        let object_tools = r#"{"foo": 1}"#;
+        let outcome = model.parse_chat_message(object_tools, "hello", false);
+
+        assert!(matches!(outcome, Err(ParseChatMessageError::ToolsJsonNotArray)));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::ParseChatMessageError;
+
+        // Tools string with an interior NUL byte followed by trailing text.
+        let model = fixture.model;
+        let nul_tools = "[]\0extra";
+        let outcome = model.parse_chat_message(nul_tools, "hello", false);
+
+        // Any payload is accepted; only the `ToolsJsonInvalid` variant is pinned.
+        assert!(matches!(outcome, Err(ParseChatMessageError::ToolsJsonInvalid(_))));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn parses_with_input_null_byte_returns_tools_serialization_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::ParseChatMessageError;
+
+        // NOTE(review): the NUL sits in the message text, not the tools JSON, yet the asserted
+        // variant is `ToolsSerialization` — confirm that mapping is intended.
+        let nul_input = "hello\0world";
+        let outcome = fixture.model.parse_chat_message("[]", nul_input, false);
+
+        assert!(matches!(outcome, Err(ParseChatMessageError::ToolsSerialization(_))));
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/constrained_decoding.rs b/llama-cpp-bindings-tests/tests/constrained_decoding.rs
deleted file mode 100644
index 533981c9..00000000
--- a/llama-cpp-bindings-tests/tests/constrained_decoding.rs
+++ /dev/null
@@ -1,124 +0,0 @@
-use std::io::Write;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
-
-    let mut ctx = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let tokens_list = model.str_to_token(prompt, AddBos::Always)?;
-
-    let mut batch = LlamaBatch::new(512, 1)?;
-    let last_index = i32::try_from(tokens_list.len())? - 1;
-
-    for (index, token) in (0_i32..).zip(&tokens_list) {
-        batch.add(
-            &SampledToken::Content(*token),
-            index,
-            &[0],
-            index == last_index,
-        )?;
-    }
-
-    ctx.decode(&mut batch)?;
-
-    let schema = r#"{
-  "type": "object",
-  "properties": {
-    "city": { "type": "string" },
-    "temperature": { "type": "number" }
-  },
-  "required": ["city", "temperature"]
-}"#;
-
-    let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?;
-    let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
-    let mut n_cur = batch.n_tokens();
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let mut generated = String::new();
-
-    while n_cur <= 128 {
-        let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?);
-
-        if model.is_eog_token(&token) {
-            break;
-        }
-
-        let output_string = model.token_to_piece(&token, &mut decoder, true, None)?;
-        generated.push_str(&output_string);
-        print!("{output_string}");
-        std::io::stdout().flush()?;
-
-        batch.clear();
-        batch.add(&token, n_cur, &[0], true)?;
-        n_cur += 1;
-        ctx.decode(&mut batch)?;
-    }
-
-    println!();
-
-    let parsed = serde_json::Deserializer::from_str(&generated)
-        .into_iter::<serde_json::Value>()
-        .next()
-        .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??;
-
-    assert!(parsed.get("city").is_some());
-    assert!(parsed.get("temperature").is_some());
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context.rs b/llama-cpp-bindings-tests/tests/context.rs
deleted file mode 100644
index 1e3a6b08..00000000
--- a/llama-cpp-bindings-tests/tests/context.rs
+++ /dev/null
@@ -1,917 +0,0 @@
-use std::ptr::NonNull;
-use std::sync::Arc;
-use std::sync::atomic::AtomicBool;
-
-use anyhow::Result;
-use llama_cpp_bindings::DecodeError;
-use llama_cpp_bindings::LogitsError;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaLoraAdapter;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-// =========================================================================================
-// Group A: default Qwen model, embeddings=false. Most context tests fall here.
-// =========================================================================================
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    assert!(context.n_ctx() > 0);
-    assert!(context.n_batch() > 0);
-    assert!(context.n_ubatch() > 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let decode_result = context.decode(&mut batch);
-    assert!(decode_result.is_ok());
-
-    let logits = context.get_logits()?;
-    assert!(!logits.is_empty());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.reset_timings();
-    let timings = context.timings();
-    assert!(timings.t_start_ms() >= 0.0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let token_data_array = context.token_data_array()?;
-
-    assert!(!token_data_array.data.is_empty());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let last_index = i32::try_from(tokens.len() - 1)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let logits = context.get_logits_ith(last_index)?;
-
-    assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let last_index = i32::try_from(tokens.len() - 1)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let token_data_array = context.token_data_array_ith(last_index)?;
-
-    assert_eq!(
-        token_data_array.data.len(),
-        usize::try_from(fixture.model.n_vocab())?
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let result = context.embeddings_ith(0);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let result = context.embeddings_seq_ith(0);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let count = context.candidates()?.count();
-
-    assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let debug_output = format!("{context:?}");
-
-    assert!(debug_output.contains("LlamaContext"));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let last_index = i32::try_from(tokens.len() - 1)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let count = context.candidates_ith(last_index)?.count();
-
-    assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let mut adapter = LlamaLoraAdapter {
-        lora_adapter: NonNull::dangling(),
-    };
-
-    let result = context.lora_adapter_remove(&mut adapter);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.encode(&mut batch);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let mut adapter = LlamaLoraAdapter {
-        lora_adapter: NonNull::dangling(),
-    };
-
-    let result = context.lora_adapter_set(&mut adapter, 1.0);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let result = context.embeddings_seq_ith(999);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    let result = context.decode(&mut batch);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let abort_flag = Arc::new(AtomicBool::new(true));
-    context.set_abort_flag(abort_flag);
-
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.decode(&mut batch);
-
-    assert_eq!(result, Err(DecodeError::Aborted));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let abort_flag = Arc::new(AtomicBool::new(false));
-    context.set_abort_flag(abort_flag);
-
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.decode(&mut batch);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let abort_flag = Arc::new(AtomicBool::new(true));
-    context.set_abort_flag(abort_flag);
-    context.clear_abort_callback();
-
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.decode(&mut batch);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.synchronize();
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.detach_threadpool();
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let result = context.get_logits_ith(7);
-
-    assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7))));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 64,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let huge_index = i32::try_from(context.n_ctx())?;
-    context.mark_logits_initialized(huge_index);
-    let result = context.get_logits_ith(huge_index);
-
-    assert!(matches!(
-        result,
-        Err(LogitsError::TokenIndexExceedsContext { .. })
-    ));
-
-    Ok(())
-}
-
-// =========================================================================================
-// Group B: Qwen embedding model, embeddings=true. Six embedding-specific tests.
-// =========================================================================================
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.decode(&mut batch);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let embeddings = context.embeddings_seq_ith(0)?;
-
-    assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    n_seq_max = 4,
-    embeddings = true,
-)]
-fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let inputs = [
-        "alpha is here",
-        "beta runs fast",
-        "gamma waits",
-        "delta jumps",
-    ];
-    let mut batch = LlamaBatch::new(64, 4)?;
-
-    for (sequence_index, text) in inputs.iter().enumerate() {
-        let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
-        let sequence_id = i32::try_from(sequence_index)?;
-
-        batch.add_sequence(&tokens, sequence_id, true)?;
-    }
-
-    context.decode(&mut batch)?;
-
-    let n_embd = usize::try_from(fixture.model.n_embd())?;
-    let mut collected: Vec<Vec<f32>> = Vec::with_capacity(inputs.len());
-
-    for sequence_index in 0..inputs.len() {
-        let sequence_id = i32::try_from(sequence_index)?;
-        let embedding = context.embeddings_seq_ith(sequence_id)?;
-
-        assert_eq!(
-            embedding.len(),
-            n_embd,
-            "sequence {sequence_index} embedding length mismatch"
-        );
-
-        collected.push(embedding.to_vec());
-    }
-
-    for (left_index, left) in collected.iter().enumerate() {
-        for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
-            assert_ne!(
-                left, right,
-                "embedding for sequence {left_index} must differ from sequence {right_index}",
-            );
-        }
-    }
-
-    Ok(())
-}
-
-/// Reproduces paddler's embedding batching loop exactly with the document strings, batch
-/// shape, and iteration pattern from the failing harness test
-/// `agent_embedding_batch_distribution_independent_of_context_size`.
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    n_seq_max = 4,
-    embeddings = true,
-)]
-fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let iterations = [
-        [
-            "This is the first document with enough content to contribute meaningfully to the batch size calculation",
-            "This is the second document that should be processed in a potentially different batch from the first",
-        ],
-        [
-            "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
-            "This is the fourth document which should demonstrate that batching distributes across agent requests",
-        ],
-    ];
-
-    let n_embd = usize::try_from(fixture.model.n_embd())?;
-    let mut batch = LlamaBatch::new(64, 4)?;
-    let mut collected: Vec<Vec<f32>> = Vec::new();
-
-    for iteration_inputs in iterations {
-        for (sequence_index, text) in iteration_inputs.iter().enumerate() {
-            let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
-            let sequence_id = i32::try_from(sequence_index)?;
-
-            batch.add_sequence(&tokens, sequence_id, true)?;
-        }
-
-        context.clear_kv_cache();
-        context.decode(&mut batch)?;
-
-        for sequence_index in 0..iteration_inputs.len() {
-            let sequence_id = i32::try_from(sequence_index)?;
-            let embedding = context.embeddings_seq_ith(sequence_id)?;
-
-            assert_eq!(
-                embedding.len(),
-                n_embd,
-                "iteration sequence {sequence_index} embedding length mismatch"
-            );
-
-            collected.push(embedding.to_vec());
-        }
-
-        batch.clear();
-    }
-
-    assert_eq!(
-        collected.len(),
-        iterations.iter().flatten().count(),
-        "expected one embedding per input across every iteration"
-    );
-
-    for (left_index, left) in collected.iter().enumerate() {
-        for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
-            assert_ne!(
-                left, right,
-                "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
-            );
-        }
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
-    let last_index = i32::try_from(tokens.len() - 1)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let embeddings = context.embeddings_ith(last_index)?;
-
-    assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let result = context.embeddings_ith(999);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-// =========================================================================================
-// Group C: t5-small encoder model, embeddings=true. Single trial.
-// =========================================================================================
-
-#[llama_test(
-    model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    embeddings = true,
-)]
-fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-
-    let result = context.encode(&mut batch);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context_kv_cache.rs b/llama-cpp-bindings-tests/tests/context_kv_cache.rs
deleted file mode 100644
index 467a2aa4..00000000
--- a/llama-cpp-bindings-tests/tests/context_kv_cache.rs
+++ /dev/null
@@ -1,961 +0,0 @@
-use std::num::NonZeroU8;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
-use llama_cpp_bindings::error::KvCacheSeqAddError;
-use llama_cpp_bindings::error::KvCacheSeqDivError;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result<LlamaContext<'context>> {
-    Ok(LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?)
-}
-
-fn decode_hello_world(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> {
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    context.clear_kv_cache();
-    assert_eq!(context.kv_cache_seq_pos_max(0), -1);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1));
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let result = context.copy_kv_cache_seq(0, 1, None, None);
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let pos_max = context.kv_cache_seq_pos_max(0);
-    context.copy_cache(0, 1, pos_max + 1);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let result = context.kv_cache_seq_add(0, Some(0), None, 1);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqAddError::IncompatibleRopeType,
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-    let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqDivError::IncompatibleRopeType,
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    context.kv_cache_seq_keep(0);
-
-    assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1));
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let result = context.kv_cache_seq_add(0, Some(0), None, 1);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    decode_hello_world(fixture, &mut context)?;
-
-    let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-    let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    let result = context.kv_cache_seq_pos_max(999);
-
-    assert_eq!(result, -1);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheConversionError::P0TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX));
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheConversionError::P1TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheConversionError::SeqIdTooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheConversionError::P0TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX));
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheConversionError::P1TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqAddError::P0TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqAddError::P1TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-    let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqDivError::P0TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
-    let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor);
-
-    assert!(matches!(
-        result.unwrap_err(),
-        KvCacheSeqDivError::P1TooLarge(_),
-    ));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context_session.rs b/llama-cpp-bindings-tests/tests/context_session.rs
deleted file mode 100644
index d32f7ecf..00000000
--- a/llama-cpp-bindings-tests/tests/context_session.rs
+++ /dev/null
@@ -1,1162 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result<LlamaContext<'context>> {
-    Ok(LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?)
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_session.bin");
-    context.state_save_file(&session_path, &tokens)?;
-
-    let loaded_tokens = context.state_load_file(&session_path, 512)?;
-    assert_eq!(loaded_tokens, tokens);
-
-    std::fs::remove_file(&session_path)?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    assert!(context.get_state_size() > 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_seq_state.bin");
-    let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
-    assert!(bytes_written > 0);
-
-    let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?;
-    assert_eq!(loaded_tokens, tokens);
-    assert!(bytes_read > 0);
-
-    std::fs::remove_file(&session_path)?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let state_size = context.get_state_size();
-    let mut state_data = vec![0u8; state_size];
-    let bytes_copied = unsafe { context.copy_state_data(&mut state_data) };
-    assert!(bytes_copied > 0);
-
-    let bytes_read = unsafe { context.set_state_data(&state_data) };
-    assert!(bytes_read > 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.state_load_file("/nonexistent/session.bin", 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_load_file_with_nonexistent_file_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_save_file_to_invalid_directory_returns_failed_to_save(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    let result = context.state_save_file("/nonexistent_dir/session.bin", &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
-    context.state_save_file(&session_path, &tokens)?;
-
-    let result = context.state_load_file(&session_path, 0);
-
-    assert!(result.is_err());
-    let _ = std::fs::remove_file(&session_path);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_load_file_with_zero_max_tokens_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
-    context.state_seq_save_file(&session_path, 0, &tokens)?;
-
-    let result = context.state_seq_load_file(&session_path, 0, 0);
-
-    assert!(result.is_err());
-    let _ = std::fs::remove_file(&session_path);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_load_file_with_insufficient_max_tokens_returns_length_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token(
-        "Hello world this is a longer string for more tokens",
-        AddBos::Always,
-    )?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
-    context.state_save_file(&session_path, &tokens)?;
-
-    let result = context.state_load_file(&session_path, 1);
-
-    assert!(result.is_err());
-    let _ = std::fs::remove_file(&session_path);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token(
-        "Hello world this is a longer string for more tokens",
-        AddBos::Always,
-    )?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
-    context.state_seq_save_file(&session_path, 0, &tokens)?;
-
-    let result = context.state_seq_load_file(&session_path, 0, 1);
-
-    assert!(result.is_err());
-    let _ = std::fs::remove_file(&session_path);
-
-    Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::ffi::OsStr;
-    use std::os::unix::ffi::OsStrExt;
-
-    let context = build_context(fixture)?;
-
-    let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-    let result = context.state_save_file(non_utf8_path, &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::ffi::OsStr;
-    use std::os::unix::ffi::OsStrExt;
-
-    let mut context = build_context(fixture)?;
-
-    let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-    let result = context.state_load_file(non_utf8_path, 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::ffi::OsStr;
-    use std::os::unix::ffi::OsStrExt;
-
-    let context = build_context(fixture)?;
-
-    let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-    let result = context.state_seq_save_file(non_utf8_path, 0, &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::ffi::OsStr;
-    use std::os::unix::ffi::OsStrExt;
-
-    let mut context = build_context(fixture)?;
-
-    let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
-    let result = context.state_seq_load_file(non_utf8_path, 0, 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-    let result = context.state_save_file(path_with_null, &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-    let result = context.state_load_file(path_with_null, 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_save_file_with_null_byte_in_path_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let context = build_context(fixture)?;
-
-    let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-    let result = context.state_seq_save_file(path_with_null, 0, &[]);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_load_file_with_null_byte_in_path_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mut context = build_context(fixture)?;
-
-    let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
-    let result = context.state_seq_load_file(path_with_null, 0, 512);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let flags = LlamaStateSeqFlags::empty();
-    let size = context.state_seq_get_size_ext(0, &flags);
-
-    assert!(size > 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
-    let mut context = build_context(fixture)?;
-
-    let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let flags = LlamaStateSeqFlags::empty();
-    let size = context.state_seq_get_size_ext(0, &flags);
-    let mut buffer = vec![0u8; size];
-    let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) };
-
-    assert!(bytes_written > 0);
-
-    let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) };
-
-    assert!(bytes_read > 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 712397df..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,126 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\
-<｜User｜>What is 2 + 2?<｜Assistant｜><think>
-
-</think>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens =
-        model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "DeepSeek-R1-8B: must generate at least one token"
-    );
-    assert_eq!(
-        outcome.observed_reasoning, 0,
-        "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
-         when the prompt closes the think block before generation begins; \
-         generated={:?}",
-        outcome.generated_raw
-    );
-    assert_eq!(
-        outcome.observed_undeterminable, 0,
-        "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
-         before generation, so no Undeterminable tokens may be emitted; \
-         generated={:?}",
-        outcome.generated_raw
-    );
-    assert_eq!(
-        usage.reasoning_tokens, 0,
-        "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, 0,
-        "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
-    );
-    assert!(
-        outcome.observed_content > 0,
-        "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content,
-        "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
-    );
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(
-            !outcome.content_stream.contains(forbidden),
-            "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
-             content_stream={:?}",
-            outcome.content_stream
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs
deleted file mode 100644
index 6bed6bbe..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,151 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-// DeepSeek-R1-Distill-Llama-8B uses `<think>...</think>` reasoning markers
-// and full-width-bar role tokens `<｜User｜>` / `<｜Assistant｜>` (U+FF5C,
-// not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends
-// `<｜Assistant｜><think>\n` — DeepSeek-R1 is a pure reasoner with no
-// thinking-disabled mode — so the model resumes generation already inside
-// the reasoning block.
-const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\
-<｜User｜>What is 2 + 2?<｜Assistant｜><think>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "DeepSeek-R1-8B: must generate at least one token"
-    );
-    assert!(
-        outcome.observed_reasoning > 0,
-        "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
-         opens a <think> block; outcome={outcome:?}",
-    );
-    assert!(
-        usage.reasoning_tokens > 0,
-        "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
-         <think> block; usage was {usage:?}"
-    );
-    assert_eq!(
-        outcome.observed_undeterminable, 0,
-        "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
-         so no Undeterminable tokens may be emitted; outcome={outcome:?}"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, 0,
-        "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning,
-        "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
-    );
-
-    if parsed.reasoning_content.is_empty() {
-        eprintln!(
-            "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
-             tokens — skipping strict parser-equality assertions"
-        );
-    } else {
-        assert_eq!(
-            outcome.reasoning_stream, parsed.reasoning_content,
-            "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
-             (any difference means a marker leaked into the user-visible stream)",
-        );
-        assert_eq!(
-            outcome.content_stream, parsed.content,
-            "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
-             (any difference means a marker leaked into the user-visible stream)",
-        );
-    }
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(
-            !outcome.reasoning_stream.contains(forbidden),
-            "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
-             reasoning_stream={:?}",
-            outcome.reasoning_stream
-        );
-        assert!(
-            !outcome.content_stream.contains(forbidden),
-            "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
-             content_stream={:?}",
-            outcome.content_stream
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs
deleted file mode 100644
index ce2b922d..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome =
-        fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "duck-type pass must recognise Gemma paired-quote on a model with no registered \
-             template; got Unrecognized"
-        );
-    };
-    assert_eq!(
-        parsed.tool_calls.len(),
-        1,
-        "expected one tool call; got {:?}",
-        parsed.tool_calls
-    );
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs
deleted file mode 100644
index 7b9e052b..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const GLM_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
-<arg_key>location</arg_key>\
-<arg_value>Paris</arg_value>\
-</tool_call>";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "duck-type pass must recognise GLM key-value tags on a model with no registered \
-             template; got Unrecognized"
-        );
-    };
-    assert_eq!(
-        parsed.tool_calls.len(),
-        1,
-        "expected one tool call; got {:?}",
-        parsed.tool_calls
-    );
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs
deleted file mode 100644
index 66b4caab..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome =
-        fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
-             template; got Unrecognized"
-        );
-    };
-    assert_eq!(
-        parsed.tool_calls.len(),
-        1,
-        "expected one tool call; got {:?}",
-        parsed.tool_calls
-    );
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs
deleted file mode 100644
index 203ae0e8..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs
+++ /dev/null
@@ -1,75 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
-<function=get_weather>\n\
-<parameter=location>\n\
-Paris\n\
-</parameter>\n\
-</function>\n\
-</tool_call>";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "duck-type pass must recognise Qwen XML on a model with no registered template; \
-             got Unrecognized"
-        );
-    };
-    assert_eq!(
-        parsed.tool_calls.len(),
-        1,
-        "expected one tool call; got {:?}",
-        parsed.tool_calls
-    );
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
deleted file mode 100644
index 2921b3d6..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "plain content with tools requested must produce Recognized (with empty tool_calls); \
-             got Unrecognized"
-        );
-    };
-    assert!(
-        parsed.tool_calls.is_empty(),
-        "expected no tool calls; got {:?}",
-        parsed.tool_calls
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs
deleted file mode 100644
index cc48350f..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs
+++ /dev/null
@@ -1,38 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const PLAIN_CONTENT: &str = "Hello there.";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message("[]", PLAIN_CONTENT, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
-    };
-    assert!(
-        parsed.tool_calls.is_empty(),
-        "expected no tool calls; got {:?}",
-        parsed.tool_calls
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
new file mode 100644
index 00000000..cebd47c1
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
@@ -0,0 +1,707 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod embeddings {
+    use std::time::Duration;
+
+    use anyhow::{Context, Result};
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::ggml_time_us;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    fn normalize(input: &[f32]) -> Vec<f32> {
+        let magnitude = input
+            .iter()
+            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
+            .sqrt();
+
+        input.iter().map(|&value| value / magnitude).collect()
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_threads_batch = 8,
+        embeddings = true,
+    )]
+    fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+
+        let mut ctx = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .with_context(|| "unable to create context")?;
+
+        let prompt = "Hello my name is";
+        let tokens = model
+            .str_to_token(prompt, AddBos::Always)
+            .with_context(|| format!("failed to tokenize {prompt}"))?;
+        let prompt_token_count = u64::try_from(tokens.len())?;
+
+        let n_ctx = usize::try_from(ctx.n_ctx())?;
+        assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
+
+        let t_main_start = ggml_time_us();
+
+        let mut classifier = model.sampled_token_classifier();
+        let mut batch = LlamaBatch::new(n_ctx, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+
+        ctx.clear_kv_cache();
+        ctx.decode(&mut batch)
+            .with_context(|| "llama_decode() failed")?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let embedding = ctx
+            .embeddings_seq_ith(0)
+            .with_context(|| "failed to get embeddings")?;
+        let normalized = normalize(embedding);
+
+        let t_main_end = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+
+        eprintln!(
+            "created embedding with {} dimensions in {:.2} s",
+            normalized.len(),
+            duration.as_secs_f32()
+        );
+
+        assert!(
+            !normalized.is_empty(),
+            "embedding should have at least one dimension"
+        );
+
+        let magnitude: f32 = normalized
+            .iter()
+            .map(|value| value * value)
+            .sum::<f32>()
+            .sqrt();
+        assert!(
+            (magnitude - 1.0).abs() < 0.01,
+            "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
+        );
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, prompt_token_count);
+        assert_eq!(usage.completion_tokens(), 0);
+
+        Ok(())
+    }
+}
+
+mod reranker {
+    use std::time::Duration;
+
+    use anyhow::{Context, Result, bail};
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::ggml_time_us;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    fn normalize(input: &[f32]) -> Vec<f32> {
+        let magnitude = input
+            .iter()
+            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
+            .sqrt();
+
+        input.iter().map(|&value| value / magnitude).collect()
+    }
+
+    fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
+        vec_a
+            .iter()
+            .zip(vec_b.iter())
+            .map(|(left, right)| left * right)
+            .sum::<f32>()
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 2,
+        n_threads_batch = 8,
+        embeddings = true,
+    )]
+    fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+
+        let query = "What is machine learning?";
+        let documents = [
+            "Machine learning is a subset of artificial intelligence.",
+            "The weather today is sunny and warm.",
+        ];
+
+        let document_count = documents.len();
+        assert_eq!(
+            u32::try_from(document_count)?,
+            fixture.context_params.n_seq_max,
+            "attribute n_seq_max must match the document count this trial expects",
+        );
+
+        let mut ctx = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .with_context(|| "unable to create context")?;
+
+        let prompt_lines: Vec<String> = documents
+            .iter()
+            .map(|document| format!("{query}</s><s>{document}"))
+            .collect();
+
+        let tokens_lines_list = prompt_lines
+            .iter()
+            .map(|line| model.str_to_token(line, AddBos::Always))
+            .collect::<std::result::Result<Vec<_>, _>>()
+            .with_context(|| "failed to tokenize prompts")?;
+
+        let n_ctx = usize::try_from(ctx.n_ctx())?;
+
+        if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
+            bail!("one of the provided prompts exceeds the size of the context window");
+        }
+
+        let mut classifier = model.sampled_token_classifier();
+        let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
+        let t_main_start = ggml_time_us();
+
+        for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
+            classifier.feed_prompt_sequence_to_batch(
+                &mut batch,
+                tokens,
+                i32::try_from(sequence_index)?,
+                false,
+            )?;
+        }
+
+        let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
+        let total_token_count = u64::try_from(total_tokens)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+
+        ctx.clear_kv_cache();
+        ctx.decode(&mut batch)
+            .with_context(|| "llama_decode() failed")?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, total_token_count);
+
+        let mut embeddings = Vec::with_capacity(document_count);
+
+        for sequence_index in 0..document_count {
+            let raw_embedding = ctx
+                .embeddings_seq_ith(i32::try_from(sequence_index)?)
+                .with_context(|| "failed to get sequence embeddings")?;
+            embeddings.push(normalize(raw_embedding));
+        }
+
+        let t_main_end = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+
+        #[expect(
+            clippy::cast_precision_loss,
+            reason = "logged throughput tolerates f32 precision"
+        )]
+        let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
+
+        eprintln!(
+            "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+            duration.as_secs_f32(),
+        );
+
+        assert_eq!(
+            embeddings.len(),
+            document_count,
+            "should produce one embedding per document"
+        );
+
+        for (index, embedding) in embeddings.iter().enumerate() {
+            assert!(
+                !embedding.is_empty(),
+                "embedding {index} should not be empty"
+            );
+        }
+
+        let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
+        eprintln!("cosine similarity between document embeddings: {similarity:.4}");
+
+        assert!(
+            similarity.is_finite(),
+            "cosine similarity should be a finite number"
+        );
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, total_token_count);
+        assert_eq!(usage.completion_tokens(), 0);
+
+        Ok(())
+    }
+}
+
+mod context_embedding_and_encoder {
+
+    use anyhow::Result;
+
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // =========================================================================================
+    // Group A: Qwen3-Embedding model, embeddings=true. Most tests in this module fall here.
+    // =========================================================================================
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        // The fixture attribute above requests `embeddings = true`; decode one short sequence.
+        let result = context.decode(&mut batch);
+        // Report the underlying error on failure rather than a bare `is_ok()` assertion.
+        assert!(result.is_ok(), "decode failed: {:?}", result.as_ref().err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+        // The embedding fetched for sequence id 0 must be exactly `n_embd` values long.
+        let embeddings = context.embeddings_seq_ith(0)?;
+
+        assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 4,
+        embeddings = true,
+    )]
+    fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let inputs = [
+            "alpha is here",
+            "beta runs fast",
+            "gamma waits",
+            "delta jumps",
+        ];
+        let mut batch = LlamaBatch::new(64, 4)?;
+        // Queue every input as its own sequence, using its array index as the sequence id.
+        for (sequence_index, text) in inputs.iter().enumerate() {
+            let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+            let sequence_id = i32::try_from(sequence_index)?;
+
+            batch.add_sequence(&tokens, sequence_id, true)?;
+        }
+
+        context.decode(&mut batch)?;
+        // Each sequence id must yield an embedding of exactly `n_embd` values.
+        let n_embd = usize::try_from(fixture.model.n_embd())?;
+        let mut collected: Vec<Vec<f32>> = Vec::with_capacity(inputs.len());
+
+        for sequence_index in 0..inputs.len() {
+            let sequence_id = i32::try_from(sequence_index)?;
+            let embedding = context.embeddings_seq_ith(sequence_id)?;
+
+            assert_eq!(
+                embedding.len(),
+                n_embd,
+                "sequence {sequence_index} embedding length mismatch"
+            );
+
+            collected.push(embedding.to_vec());
+        }
+        // Pairwise comparison: no two distinct inputs may produce identical embeddings.
+        for (left_index, left) in collected.iter().enumerate() {
+            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+                assert_ne!(
+                    left, right,
+                    "embedding for sequence {left_index} must differ from sequence {right_index}",
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 4,
+        embeddings = true,
+    )]
+    fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        // Two rounds of two documents each; the same batch object is reused for both rounds.
+        let iterations = [
+            [
+                "This is the first document with enough content to contribute meaningfully to the batch size calculation",
+                "This is the second document that should be processed in a potentially different batch from the first",
+            ],
+            [
+                "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
+                "This is the fourth document which should demonstrate that batching distributes across agent requests",
+            ],
+        ];
+
+        let n_embd = usize::try_from(fixture.model.n_embd())?;
+        let mut batch = LlamaBatch::new(64, 4)?;
+        let mut collected: Vec<Vec<f32>> = Vec::new();
+
+        for iteration_inputs in iterations {
+            for (sequence_index, text) in iteration_inputs.iter().enumerate() {
+                let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+                let sequence_id = i32::try_from(sequence_index)?;
+
+                batch.add_sequence(&tokens, sequence_id, true)?;
+            }
+            // Sequence ids restart at 0 every round; the KV cache is cleared before each decode.
+            context.clear_kv_cache();
+            context.decode(&mut batch)?;
+
+            for sequence_index in 0..iteration_inputs.len() {
+                let sequence_id = i32::try_from(sequence_index)?;
+                let embedding = context.embeddings_seq_ith(sequence_id)?;
+
+                assert_eq!(
+                    embedding.len(),
+                    n_embd,
+                    "iteration sequence {sequence_index} embedding length mismatch"
+                );
+
+                collected.push(embedding.to_vec());
+            }
+            // Clear the batch before the next round reuses it.
+            batch.clear();
+        }
+
+        assert_eq!(
+            collected.len(),
+            iterations.iter().flatten().count(),
+            "expected one embedding per input across every iteration"
+        );
+        // Embeddings from all rounds are compared pairwise; any exact duplicate fails the test.
+        for (left_index, left) in collected.iter().enumerate() {
+            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+                assert_ne!(
+                    left, right,
+                    "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let last_index = i32::try_from(tokens.len() - 1)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+        // Query by token index: `last_index` is the position of the final token in `tokens`.
+        let embeddings = context.embeddings_ith(last_index)?;
+
+        assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        // No batch is decoded on this context; requesting token index 999 is expected to error.
+        let result = context.embeddings_ith(999);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        // Uses `encode` rather than `decode`; the fixture model is t5-small, tokenized without BOS.
+        let result = context.encode(&mut batch);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+}
+
+mod context_kv_cache_embedding {
+    use std::num::NonZeroU8;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+    /// Creates a `LlamaContext` from the fixture's model, backend, and context params.
+    fn build_context<'context>(
+        fixture: &'context LlamaFixture<'_>,
+    ) -> Result<LlamaContext<'context>> {
+        Ok(LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?)
+    }
+    /// Tokenizes "Hello world" with BOS, adds it as sequence 0, and decodes it on `context`.
+    fn decode_hello_world(
+        fixture: &LlamaFixture<'_>,
+        context: &mut LlamaContext<'_>,
+    ) -> Result<()> {
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        decode_hello_world(fixture, &mut context)?;
+        // Runs after one decode; only the `Ok`/`Err` outcome of the call is asserted.
+        let result = context.kv_cache_seq_add(0, Some(0), None, 1);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        decode_hello_world(fixture, &mut context)?;
+        // 2 is non-zero, so `NonZeroU8::new(2)` is `Some`; `ok_or_else` just adapts it for `?`.
+        let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+        let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+}
+
+mod model_helpers_embedding {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn embedding_model_tool_call_markers_call_does_not_panic(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let _markers = fixture.model.tool_call_markers();
+        // The returned markers are discarded; the test only requires the call to return.
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let _markers = fixture.model.streaming_markers()?;
+        // `?` turns an `Err` from `streaming_markers` into a test failure; the value is unused.
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let env = fixture.model.approximate_tok_env();
+        let env_again = fixture.model.approximate_tok_env();
+        // Two calls must hand back the very same `Arc` allocation (pointer equality).
+        assert!(
+            std::sync::Arc::ptr_eq(&env, &env_again),
+            "approximate_tok_env must return the same cached Arc for any model, including \
+             the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
+        );
+
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/embeddings.rs b/llama-cpp-bindings-tests/tests/embeddings.rs
deleted file mode 100644
index 7e531cec..00000000
--- a/llama-cpp-bindings-tests/tests/embeddings.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-use std::time::Duration;
-
-use anyhow::{Context, Result};
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn normalize(input: &[f32]) -> Vec<f32> {
-    let magnitude = input
-        .iter()
-        .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
-        .sqrt();
-
-    input.iter().map(|&value| value / magnitude).collect()
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    n_threads_batch = 8,
-    embeddings = true,
-)]
-fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-
-    let mut ctx = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )
-    .with_context(|| "unable to create context")?;
-
-    let prompt = "Hello my name is";
-    let tokens = model
-        .str_to_token(prompt, AddBos::Always)
-        .with_context(|| format!("failed to tokenize {prompt}"))?;
-    let prompt_token_count = u64::try_from(tokens.len())?;
-
-    let n_ctx = usize::try_from(ctx.n_ctx())?;
-    assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
-
-    let t_main_start = ggml_time_us();
-
-    let mut classifier = model.sampled_token_classifier();
-    let mut batch = LlamaBatch::new(n_ctx, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-    assert_eq!(classifier.usage().prompt_tokens, 0);
-
-    ctx.clear_kv_cache();
-    ctx.decode(&mut batch)
-        .with_context(|| "llama_decode() failed")?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let embedding = ctx
-        .embeddings_seq_ith(0)
-        .with_context(|| "failed to get embeddings")?;
-    let normalized = normalize(embedding);
-
-    let t_main_end = ggml_time_us();
-    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-
-    eprintln!(
-        "created embedding with {} dimensions in {:.2} s",
-        normalized.len(),
-        duration.as_secs_f32()
-    );
-
-    assert!(
-        !normalized.is_empty(),
-        "embedding should have at least one dimension"
-    );
-
-    let magnitude: f32 = normalized
-        .iter()
-        .map(|value| value * value)
-        .sum::<f32>()
-        .sqrt();
-    assert!(
-        (magnitude - 1.0).abs() < 0.01,
-        "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
-    );
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.prompt_tokens, prompt_token_count);
-    assert_eq!(usage.completion_tokens(), 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs b/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs
deleted file mode 100644
index dcef4ded..00000000
--- a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs
+++ /dev/null
@@ -1,185 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::TokenUsage;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputChunks;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const PROMPT_QUESTION: &str = "What animals do you see in this image?";
-
-struct ExpectedChunkTotals {
-    text: u64,
-    image: u64,
-    audio: u64,
-}
-
-fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result<ExpectedChunkTotals> {
-    let mut totals = ExpectedChunkTotals {
-        text: 0,
-        image: 0,
-        audio: 0,
-    };
-    for index in 0..chunks.len() {
-        let chunk = chunks
-            .get(index)
-            .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?;
-        let n_tokens = u64::try_from(chunk.n_tokens())?;
-        match chunk.chunk_type()? {
-            MtmdInputChunkType::Text => {
-                totals.text = totals.text.saturating_add(n_tokens);
-            }
-            MtmdInputChunkType::Image => {
-                totals.image = totals.image.saturating_add(n_tokens);
-            }
-            MtmdInputChunkType::Audio => {
-                totals.audio = totals.audio.saturating_add(n_tokens);
-            }
-        }
-    }
-    Ok(totals)
-}
-
-fn build_multimodal_chunks_and_eval_into_usage(
-    fixture: &LlamaFixture<'_>,
-) -> Result<(TokenUsage, ExpectedChunkTotals)> {
-    let model = fixture.model;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let prompt = format!("{marker}{PROMPT_QUESTION}");
-
-    let input_text = MtmdInputText {
-        text: prompt,
-        add_special: false,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-    let expected = sum_chunk_token_counts_by_type(&chunks)?;
-
-    let context_params = (*fixture.context_params).into_llama_context_params();
-    let context = LlamaContext::from_model(model, fixture.backend, context_params)?;
-
-    let mut classifier = model.sampled_token_classifier();
-    classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-    Ok((classifier.into_usage(), expected))
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-    if usage.prompt_tokens != expected.text {
-        anyhow::bail!(
-            "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
-            expected.text,
-            usage.prompt_tokens
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-    if usage.input_image_tokens != expected.image {
-        anyhow::bail!(
-            "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
-            expected.image,
-            usage.input_image_tokens
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-    if expected.audio != 0 {
-        anyhow::bail!(
-            "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
-            expected.audio
-        );
-    }
-    if usage.input_audio_tokens != 0 {
-        anyhow::bail!(
-            "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
-            usage.input_audio_tokens
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn completion_tokens_are_zero_after_eval_before_generation(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
-    if usage.completion_tokens() != 0 {
-        anyhow::bail!(
-            "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
-            usage.completion_tokens()
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index e20b99a2..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,115 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\
-<bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
-<start_of_turn>model\n<|channel>thought\n<channel|>\n";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "Gemma 4 must generate at least one token"
-    );
-    assert_eq!(
-        outcome.observed_reasoning, 0,
-        "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
-         when the prompt closes the thought channel before generation begins; \
-         generated={:?}",
-        outcome.generated_raw
-    );
-    assert_eq!(
-        outcome.observed_undeterminable, 0,
-        "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
-         before generation, so no Undeterminable tokens may be emitted; \
-         generated={:?}",
-        outcome.generated_raw
-    );
-    assert_eq!(
-        usage.reasoning_tokens, 0,
-        "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, 0,
-        "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
-    );
-    assert!(
-        outcome.observed_content > 0,
-        "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content,
-        "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
-    );
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(
-            !outcome.content_stream.contains(forbidden),
-            "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
-             content_stream={:?}",
-            outcome.content_stream
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs
deleted file mode 100644
index 6a7aaba0..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,124 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const GEMMA4_THINKING_PROMPT: &str = "\
-<bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
-<start_of_turn>model\n<|channel>thought\n";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "Gemma 4 must generate at least one token"
-    );
-    assert!(
-        outcome.observed_reasoning > 0,
-        "Gemma 4 classifier must emit at least one Reasoning token when the model \
-         emits a `<|channel>thought` block; outcome={outcome:?}",
-    );
-    assert!(
-        usage.reasoning_tokens > 0,
-        "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
-         reasoning block; usage was {usage:?}"
-    );
-    assert_eq!(
-        outcome.observed_undeterminable, 0,
-        "Gemma 4: classifier must not emit Undeterminable when the model emits a \
-         detected `<|channel>thought` marker; outcome={outcome:?}"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, 0,
-        "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning,
-        "Gemma 4: completion tokens must equal observed Content + Reasoning"
-    );
-    assert!(
-        !parsed.reasoning_content.is_empty(),
-        "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
-         increase the budget or pick a more direct prompt. generated={:?}",
-        outcome.generated_raw,
-    );
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(
-            !outcome.reasoning_stream.contains(forbidden),
-            "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
-             reasoning_stream={:?}",
-            outcome.reasoning_stream
-        );
-        assert!(
-            !outcome.content_stream.contains(forbidden),
-            "Gemma 4: content_stream leaked marker {forbidden:?}; \
-             content_stream={:?}",
-            outcome.content_stream
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index e810ca3e..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
-)]
-fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let prompt = format!(
-        "<bos><start_of_turn>user\n{marker}What animals do you see in this image?<end_of_turn>\n<start_of_turn>model\n<|channel>thought\n"
-    );
-
-    let input_text = MtmdInputText {
-        text: prompt,
-        add_special: false,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position: n_past,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    if outcome.observed_reasoning == 0 {
-        anyhow::bail!(
-            "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
-             when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
-        );
-    }
-    if usage.reasoning_tokens == 0 {
-        anyhow::bail!(
-            "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs
deleted file mode 100644
index 2f3d3eaa..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,68 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str =
-    "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome =
-        fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized");
-    };
-    assert_eq!(
-        parsed.tool_calls.len(),
-        1,
-        "expected one tool call; got {:?}",
-        parsed.tool_calls
-    );
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs
deleted file mode 100644
index dc8099d7..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ToolCallArgsShape;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let template = model
-        .chat_template(None)
-        .expect("Gemma 4 chat template must be present");
-    let template_str = template.to_str().expect("template must be valid UTF-8");
-    assert!(
-        template_str.contains("<|tool_call>call:"),
-        "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
-         template starts with: {:?}",
-        &template_str[..template_str.len().min(200)],
-    );
-
-    let markers = model
-        .tool_call_markers()
-        .expect("Gemma 4 must produce ToolCallMarkers via override registry");
-
-    assert_eq!(markers.open, "<|tool_call>call:");
-    assert_eq!(markers.close, "}");
-    let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else {
-        panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
-    };
-    assert_eq!(shape.name_args_separator, "{");
-    assert_eq!(shape.value_quote.open, "<|\"|>");
-    assert_eq!(shape.value_quote.close, "<|\"|>");
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 7b614ef9..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const GLM47_THINKING_DISABLED_PROMPT: &str = "\
-<|user|>
-What is 2 + 2?
-<|assistant|>
-</think>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert_eq!(outcome.observed_reasoning, 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.reasoning_tokens, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs
deleted file mode 100644
index d4677a14..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const GLM47_THINKING_PROMPT: &str = "\
-<|user|>
-What is 2 + 2?
-<|assistant|>
-<think>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(usage.reasoning_tokens > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning
-    );
-
-    if parsed.reasoning_content.is_empty() {
-        eprintln!(
-            "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
-             skipping strict parser-equality assertions"
-        );
-    } else {
-        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-        assert_eq!(outcome.content_stream, parsed.content);
-    }
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.reasoning_stream.contains(forbidden));
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs
deleted file mode 100644
index 8f31901e..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,66 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const GLM47_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
-<arg_key>location</arg_key>\
-<arg_value>Paris</arg_value>\
-</tool_call>";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
-        );
-    };
-    assert_eq!(parsed.tool_calls.len(), 1);
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs
deleted file mode 100644
index 491c46c4..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ToolCallArgsShape;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let template = model
-        .chat_template(None)
-        .expect("GLM-4.7 chat template must be present");
-    let template_str = template.to_str().expect("template must be valid UTF-8");
-    assert!(template_str.contains("<arg_key>"));
-
-    let markers = model
-        .tool_call_markers()
-        .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
-
-    assert_eq!(markers.open, "<tool_call>");
-    assert_eq!(markers.close, "</tool_call>");
-    let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else {
-        panic!(
-            "expected KeyValueXmlTags variant, got {:?}",
-            markers.args_shape
-        );
-    };
-    assert_eq!(shape.key_open, "<arg_key>");
-    assert_eq!(shape.key_close, "</arg_key>");
-    assert_eq!(shape.value_open, "<arg_value>");
-    assert_eq!(shape.value_close, "</arg_value>");
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs b/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs
deleted file mode 100644
index 24045f7c..00000000
--- a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs
+++ /dev/null
@@ -1,181 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let input_text = MtmdInputText {
-        text: "hello world".to_owned(),
-        add_special: false,
-        parse_special: false,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
-    let text_chunk = (0..chunks.len())
-        .filter_map(|index| chunks.get(index))
-        .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text))
-        .ok_or_else(|| {
-            anyhow::anyhow!("text-only tokenization should produce at least one text chunk")
-        })?;
-
-    let n_tokens = u64::try_from(text_chunk.n_tokens())?;
-
-    let mut classifier = model.sampled_token_classifier();
-
-    ingest_prompt_chunk(&mut classifier, &text_chunk)?;
-
-    let usage = classifier.usage();
-    if usage.prompt_tokens != n_tokens {
-        anyhow::bail!(
-            "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
-            usage.prompt_tokens
-        );
-    }
-    if usage.input_image_tokens != 0 {
-        anyhow::bail!(
-            "text chunk must not bump input_image_tokens; got {}",
-            usage.input_image_tokens
-        );
-    }
-    if usage.input_audio_tokens != 0 {
-        anyhow::bail!(
-            "text chunk must not bump input_audio_tokens; got {}",
-            usage.input_audio_tokens
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let input_text = MtmdInputText {
-        text: marker.to_owned(),
-        add_special: false,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    let image_chunk = (0..chunks.len())
-        .filter_map(|index| chunks.get(index))
-        .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image))
-        .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image chunk"))?;
-
-    let n_tokens = u64::try_from(image_chunk.n_tokens())?;
-    if n_tokens == 0 {
-        anyhow::bail!("image chunk should report at least one token");
-    }
-
-    let mut classifier = model.sampled_token_classifier();
-
-    ingest_prompt_chunk(&mut classifier, &image_chunk)?;
-
-    let usage = classifier.usage();
-    if usage.input_image_tokens != n_tokens {
-        anyhow::bail!(
-            "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
-            usage.input_image_tokens
-        );
-    }
-    if usage.prompt_tokens != 0 {
-        anyhow::bail!(
-            "image chunk must not bump prompt_tokens; got {}",
-            usage.prompt_tokens
-        );
-    }
-    if usage.input_audio_tokens != 0 {
-        anyhow::bail!(
-            "image chunk must not bump input_audio_tokens; got {}",
-            usage.input_audio_tokens
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let input_text = MtmdInputText {
-        text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n<think>\n".to_owned(),
-        add_special: false,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
-    let mut classifier = model.sampled_token_classifier();
-
-    for index in 0..chunks.len() {
-        let chunk = chunks
-            .get(index)
-            .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
-        ingest_prompt_chunk(&mut classifier, &chunk)?;
-    }
-
-    if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning {
-        anyhow::bail!(
-            "text chunk replay must transition the classifier section to Reasoning when the \
-             prompt opens a `<think>` block; got {:?}",
-            classifier.current_section()
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
new file mode 100644
index 00000000..de316e42
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
@@ -0,0 +1,2836 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_context_creation {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    // Builds a context from the fixture's model, backend and context parameters and checks
+    // that the resulting context reports `n_ctx()` greater than zero.
+    fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        assert!(context.n_ctx() > 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4294967295,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4294967295,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4294967295,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4294967295,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    // The attributes above request `n_ctx = 4294967295`; building a context from the fixture
+    // parameters is expected to fail, and only the `Err` outcome is asserted.
+    fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let result = LlamaContext::from_model(fixture.model, fixture.backend, params);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+}
+
+mod context {
+    use std::ptr::NonNull;
+    use std::sync::Arc;
+    use std::sync::atomic::AtomicBool;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::DecodeError;
+    use llama_cpp_bindings::LogitsError;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaLoraAdapter;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // =========================================================================================
+    // Group A: default Qwen model; embeddings off unless overridden. Most context tests fall here.
+    // =========================================================================================
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // A context built from the fixture must report non-zero `n_ctx()`, `n_batch()` and
+    // `n_ubatch()`.
+    fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // One assertion per accessor, so a failure points at the accessor that returned zero.
+        assert!(context.n_ctx() > 0);
+        assert!(context.n_batch() > 0);
+        assert!(context.n_ubatch() > 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that the context then returns non-empty logits.
+    // Every fallible step is propagated with `?`, so a failure carries the underlying error.
+    fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+
+        // Propagate the decode error instead of asserting `is_ok()`, which hid the cause.
+        context.decode(&mut batch)?;
+
+        let logits = context.get_logits()?;
+        assert!(!logits.is_empty());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Resets the context's timings, reads them back and checks that the reported start time
+    // (`t_start_ms()`) is not negative.
+    fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        context.reset_timings();
+        let timings = context.timings();
+
+        assert!(timings.t_start_ms() >= 0.0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that `token_data_array()` yields an array whose
+    // `data` is not empty.
+    fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // The array is requested only after the prompt has been decoded successfully.
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let token_data_array = context.token_data_array()?;
+        assert!(!token_data_array.data.is_empty());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that the logits fetched for the last prompt index
+    // have exactly `n_vocab()` entries.
+    fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let last_index = i32::try_from(tokens.len() - 1)?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let logits = context.get_logits_ith(last_index)?;
+        let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+        assert_eq!(logits.len(), vocab_size);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that `token_data_array_ith` for the last prompt
+    // index yields exactly `n_vocab()` entries in `data`.
+    fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let last_index = i32::try_from(tokens.len() - 1)?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // The array is requested for the index derived from the prompt length above.
+        let token_data_array = context.token_data_array_ith(last_index)?;
+        let entry_count = token_data_array.data.len();
+        let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+
+        assert_eq!(entry_count, vocab_size);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // The attributes above do not enable embeddings; asking a freshly built context for
+    // `embeddings_ith(0)` is expected to return an error.
+    fn embeddings_ith_returns_error_when_embeddings_disabled(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // No batch is decoded first; index 0 is queried straight away.
+        let result = context.embeddings_ith(0);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // The attributes above do not enable embeddings; asking a freshly built context for
+    // `embeddings_seq_ith(0)` is expected to return an error.
+    fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // No batch is decoded first; argument 0 is queried straight away.
+        let result = context.embeddings_seq_ith(0);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that counting the items returned by `candidates()`
+    // gives exactly `n_vocab()`.
+    fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let candidates = context.candidates()?;
+        let count = candidates.count();
+        let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+        assert_eq!(count, vocab_size);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Formats a freshly built context with `{:?}` and checks that the output mentions the
+    // type name "LlamaContext".
+    fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        let debug_output = format!("{context:?}");
+
+        assert!(debug_output.contains("LlamaContext"));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Decodes the prompt "hello" and checks that counting the items returned by
+    // `candidates_ith` for the last prompt index gives exactly `n_vocab()`.
+    fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let last_index = i32::try_from(tokens.len() - 1)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let candidates = context.candidates_ith(last_index)?;
+        let count = candidates.count();
+        let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+        assert_eq!(count, vocab_size);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Calls `lora_adapter_remove` on a freshly built context with an adapter value that was
+    // never set on it, and expects the call to return `Ok`.
+    fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // The adapter only wraps a dangling pointer; this test never attaches it to the context.
+        let lora_adapter = NonNull::dangling();
+        let mut adapter = LlamaLoraAdapter { lora_adapter };
+
+        let result = context.lora_adapter_remove(&mut adapter);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Fills a batch with the prompt "hello" and passes it to `encode`; for the model named in
+    // the attribute the call is expected to return an error.
+    fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+
+        let result = context.encode(&mut batch);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Calls `lora_adapter_set` with scale 1.0 on an adapter that wraps a dangling pointer and
+    // asserts that the call returns `Ok`.
+    fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        let lora_adapter = NonNull::dangling();
+        let mut adapter = LlamaLoraAdapter { lora_adapter };
+
+        // NOTE(review): the name says "succeeds_or_errors", yet only success is accepted here.
+        let result = context.lora_adapter_set(&mut adapter, 1.0);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    // With `embeddings = true` in the attribute, decodes the prompt "hello" and expects
+    // `embeddings_seq_ith(999)` to return an error.
+    fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // The batch was filled with `0` (presumably the sequence id); 999 was never used.
+        let result = context.embeddings_seq_ith(999);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Passes a newly created batch, to which no tokens were added, to `decode` and expects
+    // the call to return an error.
+    fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // Nothing is added to the batch before it is handed to `decode`.
+        let mut batch = LlamaBatch::new(512, 1)?;
+        let result = context.decode(&mut batch);
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Installs an abort flag that is already set and expects the following `decode` call to
+    // return `Err(DecodeError::Aborted)`.
+    fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // The flag is `true` at the moment it is installed, i.e. before `decode` is called.
+        context.set_abort_flag(Arc::new(AtomicBool::new(true)));
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+
+        let result = context.decode(&mut batch);
+
+        assert_eq!(result, Err(DecodeError::Aborted));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Installs an abort flag holding `false` and expects the following `decode` call to
+    // return `Ok`.
+    fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // The installed flag is `false` and this test never changes it.
+        context.set_abort_flag(Arc::new(AtomicBool::new(false)));
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+
+        let result = context.decode(&mut batch);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Installs an abort flag that is already set, clears the abort callback again, and
+    // expects the following `decode` call to return `Ok`.
+    fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // Install a flag that is `true`, then remove the abort callback before decoding.
+        context.set_abort_flag(Arc::new(AtomicBool::new(true)));
+        context.clear_abort_callback();
+
+        let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+
+        let result = context.decode(&mut batch);
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Calls `synchronize()` on a freshly built context. Nothing is asserted; the test passes
+    // as long as the call returns.
+    fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // No batch is decoded before the call.
+        context.synchronize();
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Calls `detach_threadpool()` on a freshly built context. Nothing is asserted; the test
+    // passes as long as the call returns.
+    fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // This test does not attach a threadpool before detaching.
+        context.detach_threadpool();
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Queries `get_logits_ith(7)` on a freshly built context and expects
+    // `LogitsError::TokenNotInitialized(7)`.
+    fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // No batch is decoded before index 7 is queried.
+        let result = context.get_logits_ith(7);
+
+        assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7))));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 64,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    // Marks the index equal to `n_ctx()` as initialized, then expects `get_logits_ith` for
+    // that index to fail with `LogitsError::TokenIndexExceedsContext`.
+    fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // The value of `n_ctx()` itself is used as the index.
+        let huge_index = i32::try_from(context.n_ctx())?;
+        context.mark_logits_initialized(huge_index);
+
+        let result = context.get_logits_ith(huge_index);
+        assert!(matches!(
+            result,
+            Err(LogitsError::TokenIndexExceedsContext { .. })
+        ));
+
+        Ok(())
+    }
+}
+
+mod context_kv_cache {
+    use std::num::NonZeroU8;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
+    use llama_cpp_bindings::error::KvCacheSeqAddError;
+    use llama_cpp_bindings::error::KvCacheSeqDivError;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Creates a `LlamaContext` from the fixture's model, backend and context parameters,
+    // converting a construction failure into the module's `anyhow` result type.
+    fn build_context<'context>(
+        fixture: &'context LlamaFixture<'_>,
+    ) -> Result<LlamaContext<'context>> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+        Ok(context)
+    }
+
+    fn decode_hello_world(
+        fixture: &LlamaFixture<'_>,
+        context: &mut LlamaContext<'_>,
+    ) -> Result<()> {
+        let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut prompt_batch)?; // shared setup: decodes "Hello world" into `context`
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    // After decoding a prompt and calling `clear_kv_cache()`, `kv_cache_seq_pos_max(0)` is -1.
+    fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+        context.clear_kv_cache();
+
+        let pos_max = context.kv_cache_seq_pos_max(0);
+        assert_eq!(pos_max, -1);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    // Once the helper prompt has been decoded, `kv_cache_seq_pos_max(0)` must not be negative.
+    fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        assert!(context.kv_cache_seq_pos_max(0) >= 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    // Decodes a prompt, then expects `clear_kv_cache_seq` with explicit arguments to be `Ok`.
+    fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        // NOTE(review): the three arguments are presumably (sequence, start, end) - confirm.
+        let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1));
+        assert!(result.is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    // Decodes a prompt, then expects `copy_kv_cache_seq(0, 1, None, None)` to return `Ok`.
+    fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        // NOTE(review): arguments are presumably (source, destination, start, end) - confirm.
+        let result = context.copy_kv_cache_seq(0, 1, None, None);
+        assert!(result.is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    // Decodes a prompt and calls `copy_cache(0, 1, pos_max + 1)`; nothing is asserted after.
+    fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        let one_past_pos_max = context.kv_cache_seq_pos_max(0) + 1;
+        context.copy_cache(0, 1, one_past_pos_max);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        decode_hello_world(fixture, &mut context)?;
+
+        // Per the test name, the models registered for this test are mrope models; the call
+        // must come back as an error, and specifically the IncompatibleRopeType variant.
+        let outcome = context.kv_cache_seq_add(0, Some(0), None, 1);
+        let shift_error = outcome.unwrap_err();
+
+        assert!(matches!(shift_error, KvCacheSeqAddError::IncompatibleRopeType));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        // NonZeroU8::new yields None only for zero, so the fallback error is never hit for 2.
+        let two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+        // Per the test name, the models registered for this test are mrope models; the call
+        // must come back as an error, and specifically the IncompatibleRopeType variant.
+        let outcome = context.kv_cache_seq_div(0, Some(0), None, two);
+        let division_error = outcome.unwrap_err();
+
+        assert!(matches!(division_error, KvCacheSeqDivError::IncompatibleRopeType));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+        context.kv_cache_seq_keep(0);
+
+        // Sequence 0 must still report a non-negative maximum position after the keep call.
+        let highest_position = context.kv_cache_seq_pos_max(0);
+        assert!(highest_position >= 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        decode_hello_world(fixture, &mut context)?;
+
+        // Explicit bounds p0 = Some(0) and p1 = Some(1) between sequence ids 0 and 2.
+        // On failure the returned error is part of the assertion message.
+        let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1));
+        assert!(result.is_ok(), "copy_kv_cache_seq(0, 2, Some(0), Some(1)) failed: {result:?}");
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Nothing is decoded in this test before sequence id 999 is queried.
+        let context = build_context(fixture)?;
+        let highest_position = context.kv_cache_seq_pos_max(999);
+
+        assert_eq!(highest_position, -1);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // u32::MAX does not fit in an i32, so the p0 argument (third position) is expected to
+        // be rejected. Nothing is decoded first; only the argument check is exercised.
+        let outcome = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None);
+        let conversion_error = outcome.unwrap_err();
+
+        assert!(matches!(conversion_error, KvCacheConversionError::P0TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // p0 is a valid Some(0); only p1 (fourth position) carries the out-of-range u32::MAX,
+        // so the error is expected to name p1.
+        let outcome = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX));
+        let conversion_error = outcome.unwrap_err();
+
+        assert!(matches!(conversion_error, KvCacheConversionError::P1TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // The first argument (the sequence id, "src" in the test name) is u32::MAX, which does
+        // not fit in an i32; both position bounds are left as None.
+        let outcome = context.clear_kv_cache_seq(Some(u32::MAX), None, None);
+        let conversion_error = outcome.unwrap_err();
+
+        assert!(matches!(conversion_error, KvCacheConversionError::SeqIdTooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // The sequence id is a valid Some(0); only p0 (second position) is u32::MAX, so the
+        // error is expected to name p0.
+        let outcome = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None);
+        let conversion_error = outcome.unwrap_err();
+
+        assert!(matches!(conversion_error, KvCacheConversionError::P0TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // Sequence id and p0 are both a valid Some(0); only p1 (third position) is u32::MAX,
+        // so the error is expected to name p1.
+        let outcome = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX));
+        let conversion_error = outcome.unwrap_err();
+
+        assert!(matches!(conversion_error, KvCacheConversionError::P1TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // p0 (second position) is u32::MAX, which does not fit in an i32. The test runs for
+        // every registered model and expects P0TooLarge from each of them.
+        let outcome = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1);
+        let shift_error = outcome.unwrap_err();
+
+        assert!(matches!(shift_error, KvCacheSeqAddError::P0TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // p0 is a valid Some(0); only p1 (third position) is u32::MAX, so the error is
+        // expected to name p1.
+        let outcome = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1);
+        let shift_error = outcome.unwrap_err();
+
+        assert!(matches!(shift_error, KvCacheSeqAddError::P1TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // NonZeroU8::new yields None only for zero, so the fallback error is never hit for 2.
+        let two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+        // p0 (second position) is u32::MAX, which does not fit in an i32.
+        let outcome = context.kv_cache_seq_div(0, Some(u32::MAX), None, two);
+        let division_error = outcome.unwrap_err();
+
+        assert!(matches!(division_error, KvCacheSeqDivError::P0TooLarge(_)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // NonZeroU8::new yields None only for zero, so the fallback error is never hit for 2.
+        let two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+        // p0 is a valid Some(0); only p1 (third position) is u32::MAX.
+        let outcome = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), two);
+        let division_error = outcome.unwrap_err();
+
+        assert!(matches!(division_error, KvCacheSeqDivError::P1TooLarge(_)));
+
+        Ok(())
+    }
+}
+
+mod context_session {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    fn build_context<'context>(
+        fixture: &'context LlamaFixture<'_>,
+    ) -> Result<LlamaContext<'context>> {
+        // The fixture's context parameters sit behind a pointer-like field, hence the deref.
+        let params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        Ok(context)
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Per-process, per-thread name: variants of this test must not share one temp file.
+        let unique = format!("{}_{:?}", std::process::id(), std::thread::current().id());
+        let session_path = std::env::temp_dir().join(format!("llama_test_session_{unique}.bin"));
+        context.state_save_file(&session_path, &tokens)?;
+
+        // Remove the file before checking anything so a failed load or mismatch cannot leak it.
+        let loaded_tokens = context.state_load_file(&session_path, 512);
+        let removed = std::fs::remove_file(&session_path);
+        assert_eq!(loaded_tokens?, tokens);
+        Ok(removed?)
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Nothing is decoded first; the size is read from a freshly built context.
+        let state_size = build_context(fixture)?.get_state_size();
+
+        assert!(state_size > 0);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Per-process, per-thread name: variants of this test must not share one temp file.
+        let unique = format!("{}_{:?}", std::process::id(), std::thread::current().id());
+        let session_path = std::env::temp_dir().join(format!("llama_test_seq_state_{unique}.bin"));
+        let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
+        // Remove the file before checking anything so a failed load or mismatch cannot leak it.
+        let loaded = context.state_seq_load_file(&session_path, 0, 512);
+        let removed = std::fs::remove_file(&session_path);
+        assert!(bytes_written > 0);
+        let (loaded_tokens, bytes_read) = loaded?;
+        assert_eq!(loaded_tokens, tokens);
+        assert!(bytes_read > 0);
+        Ok(removed?)
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let mut buffer = vec![0u8; context.get_state_size()];
+        // SAFETY: buffer is get_state_size() bytes long; assumed to be enough - TODO confirm.
+        let written = unsafe { context.copy_state_data(&mut buffer) };
+        assert!(written > 0);
+
+        // SAFETY: buffer was just filled by copy_state_data on this same context.
+        let restored = unsafe { context.set_state_data(&buffer) };
+        assert!(restored > 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_nonexistent_file_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // The path is assumed not to exist on the test machine; any error satisfies the test.
+        let load_outcome = context.state_load_file("/nonexistent/session.bin", 512);
+        assert!(load_outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_nonexistent_file_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = build_context(fixture)?;
+
+        // The path is assumed not to exist on the test machine; any error satisfies the test.
+        let load_outcome = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512);
+        assert!(load_outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_save_file_to_invalid_directory_returns_failed_to_save(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Saving a session into a directory that does not exist must yield an `Err`.
+        // NOTE(review): only `is_err()` is checked, not the specific variant the name implies.
+        let ctx = build_context(fixture)?;
+        let unwritable_path = "/nonexistent_dir/session.bin";
+
+        assert!(ctx.state_save_file(unwritable_path, &[]).is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Writing sequence state into a directory that does not exist must yield an `Err`.
+        // NOTE(review): only `is_err()` is checked, not the specific variant the name implies.
+        let ctx = build_context(fixture)?;
+        let unwritable_path = "/nonexistent_dir/seq_state.bin";
+
+        assert!(ctx.state_seq_save_file(unwritable_path, 0, &[]).is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_zero_max_tokens_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A session saved with tokens must fail to load when the token limit passed is 0.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // NOTE(review): fixed file name; confirm model variants never run concurrently.
+        let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
+        context.state_save_file(&session_path, &tokens)?;
+        let result = context.state_load_file(&session_path, 0);
+
+        // Clean up before asserting so a failed assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_zero_max_tokens_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Sequence state saved with tokens must fail to load when the token limit passed is 0.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // NOTE(review): fixed file name; confirm model variants never run concurrently.
+        let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
+        context.state_seq_save_file(&session_path, 0, &tokens)?;
+        let result = context.state_seq_load_file(&session_path, 0, 0);
+
+        // Clean up before asserting so a failed assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_insufficient_max_tokens_returns_length_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A token limit of 1 is smaller than the saved prompt, so loading must yield an `Err`.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token(
+            "Hello world this is a longer string for more tokens",
+            AddBos::Always,
+        )?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // NOTE(review): fixed file name; confirm model variants never run concurrently.
+        let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
+        context.state_save_file(&session_path, &tokens)?;
+        let result = context.state_load_file(&session_path, 1);
+
+        // Clean up before asserting so a failed assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A token limit of 1 is smaller than the saved prompt, so loading must yield an `Err`.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token(
+            "Hello world this is a longer string for more tokens",
+            AddBos::Always,
+        )?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // NOTE(review): fixed file name; confirm model variants never run concurrently.
+        let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
+        context.state_seq_save_file(&session_path, 0, &tokens)?;
+        let result = context.state_seq_load_file(&session_path, 0, 1);
+
+        // Clean up before asserting so a failed assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // 0xFF and 0xFE never occur in well-formed UTF-8, so this path has no `str` form.
+        let raw_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+        let ctx = build_context(fixture)?;
+
+        let outcome = ctx.state_save_file(std::path::Path::new(raw_name), &[]);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // 0xFF and 0xFE never occur in well-formed UTF-8, so this path has no `str` form.
+        let raw_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+        let mut ctx = build_context(fixture)?;
+
+        let outcome = ctx.state_load_file(std::path::Path::new(raw_name), 512);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_file_with_non_utf8_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // 0xFF and 0xFE never occur in well-formed UTF-8, so this path has no `str` form.
+        let raw_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+        let ctx = build_context(fixture)?;
+
+        let outcome = ctx.state_seq_save_file(std::path::Path::new(raw_name), 0, &[]);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_non_utf8_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // 0xFF and 0xFE never occur in well-formed UTF-8, so this path has no `str` form.
+        let raw_name = OsStr::from_bytes(b"/tmp/\xff\xfe.bin");
+        let mut ctx = build_context(fixture)?;
+
+        let outcome = ctx.state_seq_load_file(std::path::Path::new(raw_name), 0, 512);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_save_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The path contains an interior NUL byte; saving to it is expected to yield an `Err`.
+        let ctx = build_context(fixture)?;
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+        let outcome = ctx.state_save_file(nul_path, &[]);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The path contains an interior NUL byte; loading from it is expected to yield an `Err`.
+        let mut ctx = build_context(fixture)?;
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+        let outcome = ctx.state_load_file(nul_path, 512);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The path contains an interior NUL byte; saving to it is expected to yield an `Err`.
+        let ctx = build_context(fixture)?;
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+        let outcome = ctx.state_seq_save_file(nul_path, 0, &[]);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The path contains an interior NUL byte; loading from it is expected to yield an `Err`.
+        let mut ctx = build_context(fixture)?;
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+
+        let outcome = ctx.state_seq_load_file(nul_path, 0, 512);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+        let mut ctx = build_context(fixture)?;
+
+        // Decode a short prompt so the context has state to measure.
+        let prompt = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt, 0, false)?;
+        ctx.decode(&mut prompt_batch)?;
+
+        // With no flags set, the size reported after decoding must be non-zero.
+        let no_flags = LlamaStateSeqFlags::empty();
+        assert!(ctx.state_seq_get_size_ext(0, &no_flags) > 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_get_data_ext_and_set_data_ext_round_trip(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+        // Serialize sequence state into a buffer, then feed that same buffer back in.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let flags = LlamaStateSeqFlags::empty();
+        let size = context.state_seq_get_size_ext(0, &flags);
+        let mut buffer = vec![0u8; size];
+        // SAFETY (assumed — confirm `# Safety` docs): `buffer` is exactly `size` bytes long.
+        let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) };
+        assert!(bytes_written > 0);
+
+        // SAFETY (assumed — confirm `# Safety` docs): `buffer` was filled by the call above.
+        let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) };
+        // A faithful round trip consumes exactly the bytes that were produced.
+        assert_eq!(bytes_read, bytes_written);
+        Ok(())
+    }
+}
+
+llama_tests_main!(); // NOTE(review): presumably supplies `main` for this `harness = false` target — confirm
diff --git a/llama-cpp-bindings-tests/tests/llguidance.rs b/llama-cpp-bindings-tests/tests/llguidance.rs
deleted file mode 100644
index 74bd229a..00000000
--- a/llama-cpp-bindings-tests/tests/llguidance.rs
+++ /dev/null
@@ -1,686 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::ffi::CStr;
-use std::sync::Arc;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings::token::LlamaToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const JSON_SCHEMA: &str =
-    r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
-const REGEX_GRAMMAR: &str = r"yes|no";
-const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?;
-
-    assert!(!sampler.sampler.is_null());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-    assert!(!sampler.sampler.is_null());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?;
-
-    assert!(!sampler.sampler.is_null());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything");
-
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = create_llg_sampler(fixture.model, "json", "{this is not valid json");
-
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = create_llg_sampler(fixture.model, "regex", "[invalid");
-
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-    let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) };
-    assert!(!name_ptr.is_null());
-    let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?;
-
-    assert_eq!(name, "llguidance");
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-    let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) };
-
-    assert!(!cloned.is_null());
-
-    unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) };
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let prompt = "Answer yes or no:";
-    let tokens = model.str_to_token(prompt, AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
-    let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
-    let token = chain.sample(&context, batch.n_tokens() - 1)?;
-    chain.accept(token)?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-    let huge_token = LlamaToken(i32::MAX - 1);
-    let _ = sampler.accept(huge_token);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let first = fixture.model.approximate_tok_env();
-    let second = fixture.model.approximate_tok_env();
-
-    assert!(Arc::ptr_eq(&first, &second));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn approximate_tok_env_drives_consistent_grammar_constraint(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-    let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
-    assert!(!first.sampler.is_null());
-    assert!(!second.sampler.is_null());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let tokens = model.str_to_token("Answer:", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
-    let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-    let _ = chain.sample(&context, batch.n_tokens() - 1);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-    let huge_token = LlamaToken(i32::MAX - 1);
-    let _ = sampler.accept(huge_token);
-    sampler.reset();
-    let after = sampler.accept(LlamaToken(0));
-    assert!(
-        after.is_ok() || after.is_err(),
-        "after reset, sampler.accept must return Ok or Err (not panic)"
-    );
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 6ae1d9cd..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\
-[INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]";
-
-const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert_eq!(outcome.observed_reasoning, 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.reasoning_tokens, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs
deleted file mode 100644
index 296ad348..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 768;
-
-const MISTRAL3_THINKING_PROMPT: &str = "\
-[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
-First draft your thinking process (inner monologue) until you arrive at a response. \
-Format your response using Markdown, and use LaTeX for any mathematical equations. \
-Write both your thoughts and the response in the same language as the input.\n\n\
-Your thinking process must follow the template below:\
-[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
-Be as casual and as long as you want until you are confident to generate the response \
-to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
-[INST]Reply with the single word: four. Do not explain.[/INST]";
-
-const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(usage.reasoning_tokens > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning,
-    );
-    assert!(!parsed.reasoning_content.is_empty());
-    assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-    assert_eq!(outcome.content_stream, parsed.content);
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.reasoning_stream.contains(forbidden));
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index abb5c39f..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 768;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
-)]
-fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let prompt = format!(
-        "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
-         First draft your thinking process (inner monologue) until you arrive at a response. \
-         Format your response using Markdown, and use LaTeX for any mathematical equations. \
-         Write both your thoughts and the response in the same language as the input.\n\n\
-         Your thinking process must follow the template below:\
-         [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
-         Be as casual and as long as you want until you are confident to generate the response \
-         to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
-         [INST]{marker}What animals do you see in this image?[/INST]"
-    );
-
-    let input_text = MtmdInputText {
-        text: prompt,
-        add_special: true,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-    let mut sampler = LlamaSampler::greedy();
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position: n_past,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    if outcome.observed_reasoning == 0 {
-        anyhow::bail!(
-            "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
-             when the model opens a `[THINK]` block; outcome={outcome:?}"
-        );
-    }
-    if usage.reasoning_tokens == 0 {
-        anyhow::bail!(
-            "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs
deleted file mode 100644
index b67e0765..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str =
-    r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome =
-        fixture
-            .model
-            .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
-        );
-    };
-    assert_eq!(parsed.tool_calls.len(), 1);
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_chat_template.rs b/llama-cpp-bindings-tests/tests/model_chat_template.rs
deleted file mode 100644
index 88511471..00000000
--- a/llama-cpp-bindings-tests/tests/model_chat_template.rs
+++ /dev/null
@@ -1,194 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ChatTemplateError;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let template = fixture.model.chat_template(None);
-    assert!(template.is_ok());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let template = model.chat_template(None)?;
-    let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
-    let prompt = model.apply_chat_template(&template, &[message], true);
-
-    assert!(prompt.is_ok());
-    assert!(!prompt?.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let template = model.chat_template(None)?;
-    let long_content = "a".repeat(2000);
-    let message = LlamaChatMessage::new("user".to_string(), long_content)?;
-    let prompt = model.apply_chat_template(&template, &[message], true);
-
-    assert!(prompt.is_ok());
-    assert!(!prompt?.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = fixture
-        .model
-        .chat_template(Some("nonexistent_template_name_xyz"));
-    assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate);
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_context_creation.rs b/llama-cpp-bindings-tests/tests/model_context_creation.rs
deleted file mode 100644
index 300027ec..00000000
--- a/llama-cpp-bindings-tests/tests/model_context_creation.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    assert!(context.n_ctx() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4294967295,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4294967295,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4294967295,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4294967295,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    );
-
-    assert!(result.is_err());
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_helpers.rs b/llama-cpp-bindings-tests/tests/model_helpers.rs
deleted file mode 100644
index 3efeae82..00000000
--- a/llama-cpp-bindings-tests/tests/model_helpers.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128
-)]
-fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let formatted = format!("{:?}", fixture.model);
-
-    assert!(formatted.contains("LlamaModel"));
-    assert!(formatted.contains("model"));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128
-)]
-fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let _markers = fixture.model.tool_call_markers();
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128
-)]
-fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let _markers = fixture.model.streaming_markers()?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128
-)]
-fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let first = fixture.model.approximate_tok_env();
-    let second = fixture.model.approximate_tok_env();
-
-    assert!(std::sync::Arc::ptr_eq(&first, &second));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128
-)]
-fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let env = fixture.model.approximate_tok_env();
-    let env_again = fixture.model.approximate_tok_env();
-
-    assert!(
-        std::sync::Arc::ptr_eq(&env, &env_again),
-        "approximate_tok_env must return the same cached Arc for any model, including \
-         the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_loading_errors.rs b/llama-cpp-bindings-tests/tests/model_loading_errors.rs
index cd36eb46..6cf63144 100644
--- a/llama-cpp-bindings-tests/tests/model_loading_errors.rs
+++ b/llama-cpp-bindings-tests/tests/model_loading_errors.rs
@@ -4,8 +4,10 @@
 )]
 
 use std::path::Path;
+use std::path::PathBuf;
 
 use anyhow::Result;
+use llama_cpp_bindings::LlamaLoraAdapterInitError;
 use llama_cpp_bindings::LlamaModelLoadError;
 use llama_cpp_bindings::model::LlamaModel;
 use llama_cpp_bindings::model::params::LlamaModelParams;
@@ -169,4 +171,151 @@ fn load_model_with_non_utf8_path_returns_path_to_str_error(
     Ok(())
 }
 
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    let result = fixture
+        .model
+        .lora_adapter_init("/nonexistent/path/lora.gguf");
+    assert_eq!(
+        result.unwrap_err(),
+        LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf"))
+    );
+    Ok(())
+}
+
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn lora_adapter_init_with_invalid_gguf_returns_unloadable(
+    fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+    let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf");
+    std::fs::write(&dummy_path, b"not a valid gguf")?;
+
+    let result = fixture.model.lora_adapter_init(&dummy_path);
+    // Best-effort cleanup runs before the assertion so a failing check cannot leak the temp file.
+    let _ = std::fs::remove_file(&dummy_path);
+    assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable);
+    Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+#[llama_test(
+    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+    n_gpu_layers = 999,
+    use_mmap = true,
+    use_mlock = false,
+    n_ctx = 512,
+    n_batch = 128,
+    n_ubatch = 64,
+)]
+fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+    use std::ffi::OsStr;
+    use std::os::unix::ffi::OsStrExt;
+
+    let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf"));
+    let result = fixture.model.lora_adapter_init(non_utf8_path);
+
+    assert_eq!(
+        result.unwrap_err(),
+        LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf())
+    );
+    Ok(())
+}
+
 llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs b/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs
deleted file mode 100644
index ae04dad8..00000000
--- a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::path::PathBuf;
-
-use anyhow::Result;
-use llama_cpp_bindings::LlamaLoraAdapterInitError;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = fixture
-        .model
-        .lora_adapter_init("/nonexistent/path/lora.gguf");
-    assert_eq!(
-        result.unwrap_err(),
-        LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf"))
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn lora_adapter_init_with_invalid_gguf_returns_unloadable(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf");
-    std::fs::write(&dummy_path, b"not a valid gguf")?;
-
-    let result = fixture.model.lora_adapter_init(&dummy_path);
-
-    assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable);
-    let _ = std::fs::remove_file(&dummy_path);
-    Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::ffi::OsStr;
-    use std::os::unix::ffi::OsStrExt;
-    use std::path::Path;
-
-    let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf"));
-    let result = fixture.model.lora_adapter_init(non_utf8_path);
-
-    assert_eq!(
-        result.unwrap_err(),
-        LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf())
-    );
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs b/llama-cpp-bindings-tests/tests/model_metadata_kv.rs
deleted file mode 100644
index 7d99b859..00000000
--- a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs
+++ /dev/null
@@ -1,355 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(fixture.model.meta_count() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let key = fixture.model.meta_key_by_index(0)?;
-    assert!(!key.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let value = fixture.model.meta_val_str_by_index(0)?;
-    assert!(!value.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = fixture.model.meta_key_by_index(999_999);
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = fixture.model.meta_val_str_by_index(999_999);
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let first_key = model.meta_key_by_index(0)?;
-    let value = model.meta_val_str(&first_key)?;
-    assert!(!value.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let count = model.meta_count();
-
-    for index in 0..count {
-        let key = model.meta_key_by_index(index);
-        let value = model.meta_val_str_by_index(index);
-        assert!(key.is_ok());
-        assert!(value.is_ok());
-    }
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = fixture.model.meta_val_str("key\0with_null");
-    assert!(result.is_err());
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_params.rs b/llama-cpp-bindings-tests/tests/model_params.rs
deleted file mode 100644
index 6684625e..00000000
--- a/llama-cpp-bindings-tests/tests/model_params.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-#![expect(
-    clippy::similar_names,
-    reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
-)]
-
-use std::ffi::CString;
-use std::pin::pin;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::params::LlamaContextParams;
-use llama_cpp_bindings::max_devices;
-use llama_cpp_bindings::model::params::LlamaModelParams;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model_path_str = fixture
-        .model_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?;
-    let model_path_cstr = CString::new(model_path_str)?;
-
-    let mut params = pin!(LlamaModelParams::default());
-    let mut context_params = LlamaContextParams::default();
-    let mut margins = vec![0usize; max_devices()];
-
-    let result = params.as_mut().fit_params(
-        &model_path_cstr,
-        &mut context_params,
-        &mut margins,
-        512,
-        llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
-    );
-
-    let fit = result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
-    assert!(fit.n_ctx > 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_properties.rs b/llama-cpp-bindings-tests/tests/model_properties.rs
deleted file mode 100644
index bd33ef6b..00000000
--- a/llama-cpp-bindings-tests/tests/model_properties.rs
+++ /dev/null
@@ -1,421 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-
-    assert!(model.n_vocab() > 0);
-    assert!(model.n_embd() > 0);
-    assert!(model.n_params() > 0);
-    assert!(model.n_ctx_train()? > 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(fixture.model.n_layer()? > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(fixture.model.n_head()? > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(fixture.model.n_head_kv()? > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(fixture.model.size() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(!fixture.model.is_recurrent());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(
-        !fixture.model.is_hybrid(),
-        "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
-    assert!(
-        fixture.model.is_hybrid(),
-        "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    use llama_cpp_bindings::model::rope_type::RopeType;
-    let rope = fixture.model.rope_type();
-    assert!(
-        matches!(
-            rope,
-            Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
-        ),
-        "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let rope = fixture.model.rope_type();
-    assert!(
-        rope.is_none(),
-        "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use llama_cpp_bindings::model::vocab_type::VocabType;
-    let vocab = fixture.model.vocab_type()?;
-    assert!(
-        matches!(vocab, VocabType::BPE | VocabType::SPM),
-        "vocab_type must be a known variant; got {vocab:?}"
-    );
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_sampling.rs b/llama-cpp-bindings-tests/tests/model_sampling.rs
deleted file mode 100644
index d6b40ba4..00000000
--- a/llama-cpp-bindings-tests/tests/model_sampling.rs
+++ /dev/null
@@ -1,452 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::json_schema_to_grammar;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 256,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut context = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let tokens = model.str_to_token("Hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    batch.add_sequence(&tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
-    let result = sampler.sample(&context, batch.n_tokens() - 1);
-
-    assert!(result.is_ok());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut context = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-    let tokens = model.str_to_token(prompt, AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    batch.add_sequence(&tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?,
-        LlamaSampler::temp(0.8),
-        LlamaSampler::greedy(),
-    ]);
-
-    let mut classifier = model.sampled_token_classifier();
-    let (raw_token, mut outcomes) =
-        classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
-    outcomes.extend(classifier.flush());
-
-    assert_eq!(
-        outcomes.len(),
-        1,
-        "expected one finalised outcome after flush"
-    );
-    let outcome = &outcomes[0];
-
-    let raw_as_sampled = SampledToken::Content(raw_token);
-    assert!(
-        !model.is_eog_token(&raw_as_sampled),
-        "Grammar sampler should not allow EOS as first token"
-    );
-
-    let piece = &outcome.raw_piece;
-    let first_char = piece
-        .chars()
-        .next()
-        .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))?
-        .to_lowercase()
-        .next()
-        .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
-
-    assert!(
-        first_char == 'y' || first_char == 'n',
-        "Grammar should constrain first token to start with y/n, got: '{piece}'"
-    );
-    assert_eq!(
-        classifier.usage().completion_tokens(),
-        1,
-        "exactly one completion token sampled"
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut context = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-    let tokens = model.str_to_token(prompt, AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    batch.add_sequence(&tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let grammar_str = json_schema_to_grammar(
-        r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
-    )?;
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::grammar(model, &grammar_str, "root")?,
-        LlamaSampler::temp(0.8),
-        LlamaSampler::greedy(),
-    ]);
-
-    let mut classifier = model.sampled_token_classifier();
-    let (raw_token, mut outcomes) =
-        classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
-    outcomes.extend(classifier.flush());
-
-    assert_eq!(
-        outcomes.len(),
-        1,
-        "expected one finalised outcome after flush"
-    );
-    let outcome = &outcomes[0];
-
-    let raw_as_sampled = SampledToken::Content(raw_token);
-    assert!(
-        !model.is_eog_token(&raw_as_sampled),
-        "Grammar sampler should not allow EOS as first token"
-    );
-
-    let piece = &outcome.raw_piece;
-
-    assert!(
-        piece.starts_with('{'),
-        "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
-    );
-    assert_eq!(
-        classifier.usage().completion_tokens(),
-        1,
-        "exactly one completion token sampled"
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn sample_with_grammar_produces_constrained_output_in_loop(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut context = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-    let tokens = model.str_to_token(prompt, AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    let mut classifier = model.sampled_token_classifier();
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-    classifier.commit_prompt_tokens();
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?,
-        LlamaSampler::temp(0.8),
-        LlamaSampler::greedy(),
-    ]);
-
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: 10,
-    }
-    .run()?;
-
-    let lowercase = outcome.generated_raw.to_lowercase();
-    assert!(
-        lowercase == "yes" || lowercase == "no",
-        "Grammar loop should produce 'yes' or 'no', got: '{}'",
-        outcome.generated_raw
-    );
-    assert!(
-        outcome.eog_seen,
-        "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
-    );
-    assert_eq!(outcome.observed_reasoning, 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(outcome.observed_tool_call, 0);
-    assert!(outcome.observed_content > 0);
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.completion_tokens(), outcome.observed_content);
-    assert_eq!(usage.reasoning_tokens, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut context = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let prompt =
-        "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
-    let tokens = model.str_to_token(prompt, AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    batch.add_sequence(&tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
-    let mut classifier = model.sampled_token_classifier();
-    let mut sampled_count: u64 = 0;
-
-    for (position, _) in (batch.n_tokens()..).zip(0..5) {
-        let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?;
-        let raw_as_sampled = SampledToken::Content(raw_token);
-
-        if model.is_eog_token(&raw_as_sampled) {
-            break;
-        }
-
-        sampled_count += 1;
-
-        batch.clear();
-        batch.add(&raw_as_sampled, position, &[0], true)?;
-
-        context.decode(&mut batch)?;
-    }
-
-    let _ = classifier.flush();
-
-    assert!(
-        sampled_count > 0,
-        "Should produce at least one token without grammar"
-    );
-    let usage = classifier.into_usage();
-    assert!(
-        usage.completion_tokens() >= sampled_count,
-        "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
-        usage.completion_tokens()
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_special_tokens.rs b/llama-cpp-bindings-tests/tests/model_special_tokens.rs
deleted file mode 100644
index c719501b..00000000
--- a/llama-cpp-bindings-tests/tests/model_special_tokens.rs
+++ /dev/null
@@ -1,381 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let bos = model.token_bos();
-    let eos = model.token_eos();
-
-    assert_ne!(bos, eos);
-    assert!(model.is_eog_token(&SampledToken::Content(eos)));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let nl_token = fixture.model.token_nl();
-    assert!(nl_token.0 >= 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let eos = model.token_eos();
-    assert!(model.is_eog_token(&SampledToken::Reasoning(eos)));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let eos = model.token_eos();
-    assert!(model.is_eog_token(&SampledToken::ToolCall(eos)));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let eos = model.token_eos();
-    assert!(model.is_eog_token(&SampledToken::Undeterminable(eos)));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let token = model.decode_start_token();
-    let n_vocab = model.n_vocab();
-    assert!(
-        token.0 == -1 || (0..n_vocab).contains(&token.0),
-        "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
-    );
-    assert_eq!(
-        token,
-        model.decode_start_token(),
-        "decode_start_token must be deterministic across calls"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let token = model.token_sep();
-    let n_vocab = model.n_vocab();
-    assert!(
-        token.0 == -1 || (0..n_vocab).contains(&token.0),
-        "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
-    );
-    assert_eq!(
-        token,
-        model.token_sep(),
-        "token_sep must be deterministic across calls"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let bos = model.token_bos();
-    let attrs = model.token_attr(bos)?;
-    let bit_repr = format!("{:?}", *attrs);
-    assert!(
-        !bit_repr.is_empty(),
-        "token_attr(bos) must produce Debug output"
-    );
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_str_to_token.rs b/llama-cpp-bindings-tests/tests/model_str_to_token.rs
deleted file mode 100644
index ea8ebb9c..00000000
--- a/llama-cpp-bindings-tests/tests/model_str_to_token.rs
+++ /dev/null
@@ -1,210 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let tokens = model.str_to_token("hello world", AddBos::Never)?;
-    assert!(!tokens.is_empty());
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let piece = model.token_to_piece(
-        &llama_cpp_bindings::SampledToken::Content(tokens[0]),
-        &mut decoder,
-        false,
-        None,
-    )?;
-
-    assert!(!piece.is_empty());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn str_to_token_grows_buffer_when_initial_estimation_too_small(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let many_short_chars = "a b c d e f g h i j k l";
-    let tokens = fixture
-        .model
-        .str_to_token(many_short_chars, AddBos::Always)?;
-
-    assert!(
-        tokens.len() > 8,
-        "expected regrow; got {} tokens",
-        tokens.len()
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?;
-    let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?;
-
-    assert!(tokens_with_bos.len() >= tokens_without_bos.len());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
-    use std::fmt::Write;
-
-    let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| {
-        let _ = write!(accumulator, "{number} ");
-        accumulator
-    });
-
-    let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?;
-
-    assert!(tokens.len() > many_numbers.len() / 2);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs b/llama-cpp-bindings-tests/tests/model_token_to_piece.rs
deleted file mode 100644
index b86d391b..00000000
--- a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs
+++ /dev/null
@@ -1,364 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::num::NonZeroU16;
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let tokens = model.str_to_token("hello", AddBos::Never)?;
-    let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?;
-
-    assert!(!bytes.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_handles_large_token_requiring_buffer_resize(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-
-    for (token, _) in model.tokens(true).take(200) {
-        let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
-        assert!(result.is_ok());
-    }
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_bytes_insufficient_buffer_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let tokens = model.str_to_token("hello", AddBos::Never)?;
-    let result = model.token_to_piece_bytes(tokens[0], 1, false, None);
-
-    assert!(
-        result
-            .unwrap_err()
-            .to_string()
-            .contains("Insufficient Buffer Space")
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let tokens = model.str_to_token("hello", AddBos::Never)?;
-    let result = model.token_to_piece(
-        &SampledToken::Content(tokens[0]),
-        &mut decoder,
-        false,
-        NonZeroU16::new(1),
-    );
-
-    assert!(result.is_ok());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-    let piece = model.token_to_piece(
-        &SampledToken::Reasoning(tokens[0]),
-        &mut decoder,
-        true,
-        None,
-    )?;
-
-    assert!(!piece.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-    let piece =
-        model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?;
-
-    assert!(!piece.is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-    let tokens = model.str_to_token("hi", AddBos::Never)?;
-
-    let piece = model.token_to_piece(
-        &SampledToken::Undeterminable(tokens[0]),
-        &mut decoder,
-        true,
-        None,
-    )?;
-
-    assert!(!piece.is_empty());
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs b/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs
deleted file mode 100644
index 3f9ad9da..00000000
--- a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs
+++ /dev/null
@@ -1,109 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut count = 0;
-
-    for (token, _piece_result) in model.tokens(false) {
-        assert!(token.0 >= 0);
-        count += 1;
-
-        if count >= 100 {
-            break;
-        }
-    }
-
-    assert_eq!(count, 100);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let n_vocab = model.n_vocab();
-    let count = model.tokens(false).count();
-
-    assert_eq!(count, usize::try_from(n_vocab)?);
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs b/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs
deleted file mode 100644
index 3c66f82f..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let fixtures = test_model::fixtures_dir();
-    let image_path = fixtures.join("llamas.jpg");
-    let image_bytes = std::fs::read(&image_path)?;
-    let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?;
-
-    assert!(bitmap.nx() > 0);
-    assert!(bitmap.ny() > 0);
-    assert!(!bitmap.is_audio());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null");
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs
deleted file mode 100644
index 8a960774..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs
+++ /dev/null
@@ -1,147 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let input_text = MtmdInputText {
-        text: "Hello <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    let copied = first_chunk.copy()?;
-
-    assert!(copied.owned);
-    assert_eq!(copied.n_tokens(), first_chunk.n_tokens());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let input_text = MtmdInputText {
-        text: "Describe: <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    for chunk_index in 0..chunks.len() {
-        let chunk = chunks
-            .get(chunk_index)
-            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-            let result = mtmd_ctx.encode_chunk(&chunk);
-            assert!(result.is_ok());
-            return Ok(());
-        }
-    }
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let input_text = MtmdInputText {
-        text: "Describe: <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-    for chunk_index in 0..chunks.len() {
-        let chunk = chunks
-            .get(chunk_index)
-            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-            let value = mtmd_ctx.decode_use_non_causal(&chunk);
-            let printed = format!("{value:?}");
-            assert!(
-                !printed.is_empty(),
-                "decode_use_non_causal must return a Debug-printable bool"
-            );
-            return Ok(());
-        }
-    }
-    anyhow::bail!("tokenization should produce at least one Image chunk");
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs
deleted file mode 100644
index 1114af3c..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs
+++ /dev/null
@@ -1,242 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn tokenize_synthetic(
-    fixture: &LlamaFixture<'_>,
-    prompt: &str,
-) -> Result<llama_cpp_bindings::mtmd::MtmdInputChunks> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let input_text = MtmdInputText {
-        text: prompt.to_owned(),
-        add_special: true,
-        parse_special: true,
-    };
-    Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?)
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    let tokens = first_chunk.text_tokens();
-    assert!(tokens.is_some());
-    assert!(!tokens.expect("tokens should be some").is_empty());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    assert!(first_chunk.n_tokens() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    assert!(first_chunk.n_positions() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-    let first_chunk = chunks
-        .get(0)
-        .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
-    assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
-    assert!(first_chunk.id().is_none());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-    for chunk_index in 0..chunks.len() {
-        let chunk = chunks
-            .get(chunk_index)
-            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-            assert!(chunk.text_tokens().is_none());
-            return Ok(());
-        }
-    }
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
-    for chunk_index in 0..chunks.len() {
-        let chunk = chunks
-            .get(chunk_index)
-            .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
-        if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
-            assert!(chunk.id().is_some());
-            return Ok(());
-        }
-    }
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_context.rs b/llama-cpp-bindings-tests/tests/mtmd_context.rs
deleted file mode 100644
index 8595eb2b..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_context.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdContext;
-use llama_cpp_bindings::mtmd::MtmdContextParams;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    assert!(mtmd_ctx.support_vision());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn init_from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_params = MtmdContextParams::default();
-    let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
-
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    assert!(
-        mtmd_ctx.decode_use_mrope(),
-        "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    assert!(
-        !mtmd_ctx.support_audio(),
-        "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
-    );
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    assert!(
-        mtmd_ctx.get_audio_sample_rate().is_none(),
-        "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
-    );
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs b/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs
deleted file mode 100644
index b6f30f1c..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs
+++ /dev/null
@@ -1,236 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdEvalError;
-use llama_cpp_bindings::mtmd::MtmdInputChunks;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; (width as usize) * (height as usize) * 3];
-    let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?;
-    let input_text = MtmdInputText {
-        text: "Describe: <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-    let n_positions = chunks.total_positions();
-    let required_n_ctx = u32::try_from(n_positions + 256)?;
-    if fixture.context_params.n_ctx < required_n_ctx {
-        anyhow::bail!(
-            "fixture n_ctx ({}) below required ({}) for {}x{} image",
-            fixture.context_params.n_ctx,
-            required_n_ctx,
-            width,
-            height,
-        );
-    }
-
-    let llama_ctx = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let n_batch = i32::try_from(llama_ctx.n_batch())?;
-    chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?;
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 64,
-    n_batch = 64,
-    n_ubatch = 32,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 64,
-    n_batch = 64,
-    n_ubatch = 32,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let llama_ctx = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let chunks = MtmdInputChunks::new()?;
-    let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?;
-
-    let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false);
-
-    assert!(matches!(
-        result,
-        Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let fixtures = test_model::fixtures_dir();
-    let image_path = fixtures.join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-    let input_text = MtmdInputText {
-        text: "What is in this image? <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-    let n_positions = chunks.total_positions();
-    let required_n_ctx = u32::try_from(n_positions + 256)?;
-    assert!(
-        fixture.context_params.n_ctx >= required_n_ctx,
-        "fixture n_ctx ({}) below required ({}); update the attribute literal",
-        fixture.context_params.n_ctx,
-        required_n_ctx,
-    );
-
-    let llama_ctx = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let n_batch = i32::try_from(llama_ctx.n_batch())?;
-    let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
-
-    for (width, height) in test_dimensions {
-        let result = eval_synthetic_bitmap(fixture, width, height);
-        assert!(
-            result.is_ok(),
-            "dimension {width}x{height} should succeed: {result:?}"
-        );
-    }
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let extreme_dimensions: [(u32, u32); 6] = [
-        (1, 1),
-        (7, 13),
-        (3, 1000),
-        (1000, 3),
-        (1920, 1080),
-        (4096, 4096),
-    ];
-
-    let mut any_reached_eval = false;
-
-    for (width, height) in extreme_dimensions {
-        match eval_synthetic_bitmap(fixture, width, height) {
-            Ok(()) => any_reached_eval = true,
-            Err(error) => eprintln!("  {width}x{height} failed: {error}"),
-        }
-    }
-
-    assert!(
-        any_reached_eval,
-        "at least one extreme dimension should reach eval_chunks"
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs b/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs
deleted file mode 100644
index ae5f32c3..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs
+++ /dev/null
@@ -1,121 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let input_text = MtmdInputText {
-        text: "Describe this image: <__media__>".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    assert!(!chunks.is_empty());
-    assert!(chunks.total_tokens() > 0);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let input_text = MtmdInputText {
-        text: "No media markers here".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let image_data = vec![128u8; 64 * 64 * 3];
-    let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
-    let result = mtmd_ctx.tokenize(input_text, &[&bitmap]);
-    assert!(result.is_err());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-    let input_text = MtmdInputText {
-        text: "text\0null".to_string(),
-        add_special: true,
-        parse_special: true,
-    };
-    let result = mtmd_ctx.tokenize(input_text, &[]);
-    assert!(result.is_err());
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/multimodal.rs b/llama-cpp-bindings-tests/tests/multimodal.rs
deleted file mode 100644
index c1108c4d..00000000
--- a/llama-cpp-bindings-tests/tests/multimodal.rs
+++ /dev/null
@@ -1,212 +0,0 @@
-use anyhow::{Context, Result};
-use llama_cpp_bindings::SampledTokenClassifier;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel};
-use llama_cpp_bindings::mtmd::{MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText};
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_sys::llama_pos;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-struct ChunkTokenBreakdown {
-    text: u64,
-    image: u64,
-    audio: u64,
-}
-
-fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
-    let mut breakdown = ChunkTokenBreakdown {
-        text: 0,
-        image: 0,
-        audio: 0,
-    };
-    for index in 0..chunks.len() {
-        let chunk = chunks
-            .get(index)
-            .with_context(|| format!("chunk index {index} is missing"))?;
-        let n_tokens = u64::try_from(chunk.n_tokens())?;
-        match chunk.chunk_type()? {
-            MtmdInputChunkType::Text => breakdown.text += n_tokens,
-            MtmdInputChunkType::Image => breakdown.image += n_tokens,
-            MtmdInputChunkType::Audio => breakdown.audio += n_tokens,
-        }
-    }
-
-    Ok(breakdown)
-}
-
-fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
-    let marker = llama_cpp_bindings::mtmd::mtmd_default_marker();
-    let user_content = format!("{marker}{question}");
-    let chat_template = model.chat_template(None)?;
-    let messages = [LlamaChatMessage::new("user".to_string(), user_content)?];
-
-    Ok(model.apply_chat_template(&chat_template, &messages, true)?)
-}
-
-struct SamplingTotals {
-    generated: String,
-    observed_content: u64,
-    observed_reasoning: u64,
-}
-
-fn drive_sampling_loop(
-    classifier: &mut SampledTokenClassifier,
-    model: &LlamaModel,
-    ctx: &mut LlamaContext,
-    starting_position: llama_pos,
-    max_tokens: usize,
-) -> Result<SamplingTotals> {
-    let mut sampler = LlamaSampler::greedy();
-    let mut totals = SamplingTotals {
-        generated: String::new(),
-        observed_content: 0,
-        observed_reasoning: 0,
-    };
-    let mut batch = LlamaBatch::new(512, 1)?;
-
-    for (current_position, _) in (starting_position..).zip(0..max_tokens) {
-        let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?;
-        for outcome in &outcomes {
-            totals.generated.push_str(&outcome.raw_piece);
-            match outcome.sampled_token {
-                SampledToken::Content(_) => totals.observed_content += 1,
-                SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
-                SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
-            }
-        }
-
-        let raw_as_sampled = SampledToken::Content(raw_token);
-        if model.is_eog_token(&raw_as_sampled) {
-            break;
-        }
-
-        batch.clear();
-        batch.add(&raw_as_sampled, current_position, &[0], true)?;
-
-        ctx.decode(&mut batch)
-            .with_context(|| "failed to decode generated token")?;
-    }
-
-    for outcome in classifier.flush() {
-        totals.generated.push_str(&outcome.raw_piece);
-        match outcome.sampled_token {
-            SampledToken::Content(_) => totals.observed_content += 1,
-            SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
-            SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
-        }
-    }
-
-    Ok(totals)
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let mut ctx = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )
-    .with_context(|| "unable to create llama context")?;
-
-    assert!(
-        mtmd_ctx.support_vision(),
-        "model should support vision input"
-    );
-
-    let image_path = test_model::fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .with_context(|| "image path is not valid UTF-8")?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)
-        .with_context(|| "failed to load image from file")?;
-
-    let formatted_prompt =
-        build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?;
-
-    let input_text = MtmdInputText {
-        text: formatted_prompt,
-        add_special: false,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx
-        .tokenize(input_text, &[&bitmap])
-        .with_context(|| "failed to tokenize multimodal input")?;
-
-    assert!(
-        !chunks.is_empty(),
-        "tokenization should produce at least one chunk"
-    );
-
-    let expected = count_chunk_tokens_by_type(&chunks)?;
-
-    eprintln!(
-        "tokenized into {} chunks, text {} image {} audio {}",
-        chunks.len(),
-        expected.text,
-        expected.image,
-        expected.audio
-    );
-
-    assert!(
-        expected.image > 0,
-        "vision input must produce at least one image chunk"
-    );
-
-    let mut classifier = model.sampled_token_classifier();
-    let n_past = classifier
-        .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true)
-        .with_context(|| "failed to evaluate chunks")?;
-
-    eprintln!("evaluated chunks, n_past = {n_past}");
-
-    {
-        let usage = classifier.usage();
-        assert_eq!(usage.prompt_tokens, expected.text);
-        assert_eq!(usage.input_image_tokens, expected.image);
-        assert_eq!(usage.input_audio_tokens, expected.audio);
-    }
-
-    let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?;
-
-    eprintln!("generated text: {}", totals.generated);
-
-    assert!(
-        !totals.generated.is_empty(),
-        "model should generate at least one token from image input"
-    );
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.prompt_tokens, expected.text);
-    assert_eq!(usage.input_image_tokens, expected.image);
-    assert_eq!(usage.input_audio_tokens, expected.audio);
-    assert_eq!(usage.content_tokens, totals.observed_content);
-    assert_eq!(usage.reasoning_tokens, totals.observed_reasoning);
-    assert_eq!(
-        usage.completion_tokens(),
-        totals.observed_content + totals.observed_reasoning
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/multimodal_vision.rs b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
new file mode 100644
index 00000000..7e596be6
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
@@ -0,0 +1,2001 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod mtmd_bitmap {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings_tests::test_model;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let fixtures = test_model::fixtures_dir();
+        let image_path = fixtures.join("llamas.jpg");
+        let image_bytes = std::fs::read(&image_path)?;
+        let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?;
+
+        assert!(bitmap.nx() > 0);
+        assert!(bitmap.ny() > 0);
+        assert!(!bitmap.is_audio());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null");
+
+        assert!(result.is_err());
+
+        Ok(())
+    }
+}
+
+mod mtmd_chunk_operations {
+    use anyhow::Result;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let image_data = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+        let input_text = MtmdInputText {
+            text: "Hello <__media__>".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        let copied = first_chunk.copy()?;
+
+        assert!(copied.owned);
+        assert_eq!(copied.n_tokens(), first_chunk.n_tokens());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let image_data = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+        let input_text = MtmdInputText {
+            text: "Describe: <__media__>".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+        // Falling out of this loop means encode_chunk was never exercised, so it must fail the test.
+        for chunk_index in 0..chunks.len() {
+            let chunk = chunks
+                .get(chunk_index)
+                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+                let result = mtmd_ctx.encode_chunk(&chunk);
+                assert!(result.is_ok());
+                return Ok(());
+            }
+        }
+        anyhow::bail!("tokenization should produce at least one Image chunk");
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn decode_use_non_causal_returns_bool_for_image_chunk(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let image_data = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+        let input_text = MtmdInputText {
+            text: "Describe: <__media__>".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+        for chunk_index in 0..chunks.len() {
+            let chunk = chunks
+                .get(chunk_index)
+                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+                let value = mtmd_ctx.decode_use_non_causal(&chunk);
+                let printed = format!("{value:?}");
+                assert!(
+                    !printed.is_empty(),
+                    "decode_use_non_causal must return a Debug-printable bool"
+                );
+                return Ok(());
+            }
+        }
+        anyhow::bail!("tokenization should produce at least one Image chunk");
+    }
+}
+
+mod mtmd_chunk_structure {
+    use anyhow::Result;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    fn tokenize_synthetic(
+        fixture: &LlamaFixture<'_>,
+        prompt: &str,
+    ) -> Result<llama_cpp_bindings::mtmd::MtmdInputChunks> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let image_data = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+        let input_text = MtmdInputText {
+            text: prompt.to_owned(),
+            add_special: true,
+            parse_special: true,
+        };
+        Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?)
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        let tokens = first_chunk.text_tokens();
+        assert!(tokens.is_some());
+        assert!(!tokens.expect("tokens should be some").is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        assert!(first_chunk.n_tokens() > 0);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        assert!(first_chunk.n_positions() > 0);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+        let first_chunk = chunks
+            .get(0)
+            .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+        assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
+        assert!(first_chunk.id().is_none());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+        for chunk_index in 0..chunks.len() {
+            let chunk = chunks
+                .get(chunk_index)
+                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+                assert!(chunk.text_tokens().is_none());
+                return Ok(());
+            }
+        }
+        anyhow::bail!("tokenization should produce at least one Image chunk");
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+        for chunk_index in 0..chunks.len() {
+            let chunk = chunks
+                .get(chunk_index)
+                .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+            if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+                assert!(chunk.id().is_some());
+                return Ok(());
+            }
+        }
+        anyhow::bail!("tokenization should produce at least one Image chunk");
+    }
+}
+
+mod mtmd_context {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::mtmd::MtmdContext;
+    use llama_cpp_bindings::mtmd::MtmdContextParams;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        assert!(mtmd_ctx.support_vision());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn init_from_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mtmd_params = MtmdContextParams::default();
+        let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
+
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    // The fixture's multimodal context must report `decode_use_mrope()` as true for both
+    // declared model / mmproj pairs.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let uses_mrope = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute")
+            .decode_use_mrope();
+
+        assert!(
+            uses_mrope,
+            "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
+        );
+        Ok(())
+    }
+
+    // The fixture's multimodal context must report `support_audio()` as false for both declared
+    // model / mmproj pairs.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let supports_audio = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute")
+            .support_audio();
+
+        assert!(
+            !supports_audio,
+            "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
+        );
+        Ok(())
+    }
+
+    // The fixture's multimodal context must report no audio sample rate for both declared
+    // model / mmproj pairs.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn get_audio_sample_rate_is_none_for_vision_only_mmproj(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let audio_sample_rate = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute")
+            .get_audio_sample_rate();
+
+        assert!(
+            audio_sample_rate.is_none(),
+            "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
+        );
+        Ok(())
+    }
+}
+
+mod mtmd_evaluation {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdEvalError;
+    use llama_cpp_bindings::mtmd::MtmdInputChunks;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings_tests::test_model;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Builds a uniform `width` x `height` bitmap (three bytes per pixel, every byte 128),
+    // tokenizes it together with a short prompt, and evaluates the resulting chunks on a freshly
+    // created context.
+    //
+    // Returns an error when the fixture's `n_ctx` is smaller than the tokenized position count
+    // plus 256, or when any of the bitmap / tokenize / context / eval steps fails.
+    fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let pixel_count = (width as usize) * (height as usize);
+        let pixel_bytes = vec![128u8; pixel_count * 3];
+        let bitmap = MtmdBitmap::from_image_data(width, height, &pixel_bytes)?;
+
+        let chunks = mtmd_ctx.tokenize(
+            MtmdInputText {
+                text: "Describe: <__media__>".to_string(),
+                add_special: true,
+                parse_special: true,
+            },
+            &[&bitmap],
+        )?;
+
+        // Refuse to evaluate unless the context leaves 256 positions of headroom.
+        let required_n_ctx = u32::try_from(chunks.total_positions() + 256)?;
+        if required_n_ctx > fixture.context_params.n_ctx {
+            anyhow::bail!(
+                "fixture n_ctx ({}) below required ({}) for {}x{} image",
+                fixture.context_params.n_ctx,
+                required_n_ctx,
+                width,
+                height,
+            );
+        }
+
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let llama_ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+        let batch_size = i32::try_from(llama_ctx.n_batch())?;
+        chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, batch_size, false)?;
+
+        Ok(())
+    }
+
+    // Requests a batch one larger than the context's `n_batch()` and expects `eval_chunks` to
+    // reject it with `MtmdEvalError::BatchSizeExceedsContextLimit`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 64,
+        n_batch = 64,
+        n_ubatch = 32,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 64,
+        n_batch = 64,
+        n_ubatch = 32,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let context = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        // An empty chunk list is enough: the batch size is what is under test.
+        let empty_chunks = MtmdInputChunks::new()?;
+        let oversized_batch = i32::try_from(context.n_batch() + 1)?;
+
+        let result = empty_chunks.eval_chunks(mtmd_ctx, &context, 0, 0, oversized_batch, false);
+
+        assert!(matches!(
+            result,
+            Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
+        ));
+
+        Ok(())
+    }
+
+    // Loads `llamas.jpg` from the fixtures directory, tokenizes it together with a short prompt,
+    // and checks that `eval_chunks` succeeds on a freshly created context.
+    //
+    // The fixture's `n_ctx` must cover the tokenized position count plus 256; otherwise the test
+    // panics with a hint to raise the attribute literal.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let fixtures = test_model::fixtures_dir();
+        let image_path = fixtures.join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+        let input_text = MtmdInputText {
+            text: "What is in this image? <__media__>".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+        let n_positions = chunks.total_positions();
+        let required_n_ctx = u32::try_from(n_positions + 256)?;
+        assert!(
+            fixture.context_params.n_ctx >= required_n_ctx,
+            "fixture n_ctx ({}) below required ({}); update the attribute literal",
+            fixture.context_params.n_ctx,
+            required_n_ctx,
+        );
+
+        let llama_ctx = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let n_batch = i32::try_from(llama_ctx.n_batch())?;
+        let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
+
+        // Include the underlying error so a failure can be diagnosed from the test log.
+        assert!(
+            result.is_ok(),
+            "eval_chunks should succeed for the standard image: {:?}",
+            result.as_ref().err()
+        );
+
+        Ok(())
+    }
+
+    // Runs `eval_synthetic_bitmap` over a handful of ordinary image sizes (square, tall and
+    // non-round) and requires every one of them to succeed.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
+        const TEST_DIMENSIONS: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
+
+        for &(width, height) in &TEST_DIMENSIONS {
+            let result = eval_synthetic_bitmap(fixture, width, height);
+            assert!(
+                result.is_ok(),
+                "dimension {width}x{height} should succeed: {result:?}"
+            );
+        }
+
+        Ok(())
+    }
+
+    // Feeds tiny, heavily skewed and very large synthetic bitmaps through
+    // `eval_synthetic_bitmap`. Individual sizes are allowed to fail: each failure is logged to
+    // stderr, and the test passes as long as at least one size evaluates successfully.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn eval_chunks_with_extreme_dimensions_does_not_crash(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let extreme_dimensions: [(u32, u32); 6] = [
+            (1, 1),
+            (7, 13),
+            (3, 1000),
+            (1000, 3),
+            (1920, 1080),
+            (4096, 4096),
+        ];
+
+        let mut any_reached_eval = false;
+
+        for (width, height) in extreme_dimensions {
+            match eval_synthetic_bitmap(fixture, width, height) {
+                Ok(()) => any_reached_eval = true,
+                // `{:#}` prints the full anyhow cause chain rather than only the outermost
+                // message, so the log shows why a given size was rejected.
+                Err(error) => eprintln!("  {width}x{height} failed: {error:#}"),
+            }
+        }
+
+        assert!(
+            any_reached_eval,
+            "at least one extreme dimension should reach eval_chunks"
+        );
+
+        Ok(())
+    }
+}
+
+mod mtmd_tokenization {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tokenizes a prompt containing one media marker together with one 64x64 synthetic bitmap
+    // and checks that the result is non-empty and reports a positive token total.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let pixel_bytes = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &pixel_bytes)?;
+
+        let chunks = mtmd_ctx.tokenize(
+            MtmdInputText {
+                text: "Describe this image: <__media__>".to_string(),
+                add_special: true,
+                parse_special: true,
+            },
+            &[&bitmap],
+        )?;
+
+        assert!(!chunks.is_empty());
+        assert!(chunks.total_tokens() > 0);
+        Ok(())
+    }
+
+    // Supplies one bitmap for a prompt that contains no media marker and expects `tokenize` to
+    // return an error.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        // One bitmap, zero markers in the text below.
+        let pixel_bytes = vec![128u8; 64 * 64 * 3];
+        let bitmap = MtmdBitmap::from_image_data(64, 64, &pixel_bytes)?;
+        let marker_free_text = MtmdInputText {
+            text: "No media markers here".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+
+        let result = mtmd_ctx.tokenize(marker_free_text, &[&bitmap]);
+        assert!(result.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+        let input_text = MtmdInputText {
+            text: "text\0null".to_string(),
+            add_special: true,
+            parse_special: true,
+        };
+        let result = mtmd_ctx.tokenize(input_text, &[]);
+        assert!(result.is_err());
+        Ok(())
+    }
+}
+
+mod multimodal {
+    use anyhow::{Context, Result};
+    use llama_cpp_bindings::SampledTokenClassifier;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel};
+    use llama_cpp_bindings::mtmd::{
+        MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText,
+    };
+    use llama_cpp_bindings::sampled_token::SampledToken;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_sys::llama_pos;
+    use llama_cpp_bindings_tests::test_model;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Per-type token totals accumulated over a tokenized multimodal input.
+    struct ChunkTokenBreakdown {
+        /// Sum of `n_tokens` over all text chunks.
+        text: u64,
+        /// Sum of `n_tokens` over all image chunks.
+        image: u64,
+        /// Sum of `n_tokens` over all audio chunks.
+        audio: u64,
+    }
+
+    /// Walks every chunk by index and adds its `n_tokens` to the total for its chunk type.
+    ///
+    /// Fails when an index below `chunks.len()` yields no chunk, when a token count does not
+    /// convert to `u64`, or when a chunk's type cannot be determined.
+    fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
+        let (mut text, mut image, mut audio): (u64, u64, u64) = (0, 0, 0);
+
+        for index in 0..chunks.len() {
+            let chunk = chunks
+                .get(index)
+                .with_context(|| format!("chunk index {index} is missing"))?;
+            let n_tokens = u64::try_from(chunk.n_tokens())?;
+            let bucket = match chunk.chunk_type()? {
+                MtmdInputChunkType::Text => &mut text,
+                MtmdInputChunkType::Image => &mut image,
+                MtmdInputChunkType::Audio => &mut audio,
+            };
+            *bucket += n_tokens;
+        }
+
+        Ok(ChunkTokenBreakdown { text, image, audio })
+    }
+
+    /// Renders a single user message, consisting of the default media marker immediately
+    /// followed by `question`, through the model's own chat template.
+    fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
+        let marker = llama_cpp_bindings::mtmd::mtmd_default_marker();
+        let template = model.chat_template(None)?;
+        let user_message = LlamaChatMessage::new("user".to_string(), format!("{marker}{question}"))?;
+        let rendered_prompt = model.apply_chat_template(&template, &[user_message], true)?;
+
+        Ok(rendered_prompt)
+    }
+
+    /// What `drive_sampling_loop` observed while generating.
+    struct SamplingTotals {
+        /// Concatenation of the `raw_piece` of every classifier outcome, in order.
+        generated: String,
+        /// Number of outcomes classified as `SampledToken::Content`.
+        observed_content: u64,
+        /// Number of outcomes classified as `SampledToken::Reasoning`.
+        observed_reasoning: u64,
+    }
+
+    /// Greedily samples at most `max_tokens` tokens through `classifier`, decoding each sampled
+    /// token back into `ctx` at consecutive positions beginning with `starting_position`.
+    ///
+    /// Sampling stops early once the model reports an end-of-generation token. The raw piece of
+    /// every outcome, including those the classifier returns from its final `flush`, is appended
+    /// to the generated text. Only content and reasoning outcomes are counted; tool-call and
+    /// undeterminable outcomes add text but touch neither counter.
+    fn drive_sampling_loop(
+        classifier: &mut SampledTokenClassifier,
+        model: &LlamaModel,
+        ctx: &mut LlamaContext,
+        starting_position: llama_pos,
+        max_tokens: usize,
+    ) -> Result<SamplingTotals> {
+        let mut greedy_sampler = LlamaSampler::greedy();
+        let mut totals = SamplingTotals {
+            generated: String::new(),
+            observed_content: 0,
+            observed_reasoning: 0,
+        };
+        let mut token_batch = LlamaBatch::new(512, 1)?;
+
+        for position in (starting_position..).take(max_tokens) {
+            let (raw_token, outcomes) = classifier.sample(&mut greedy_sampler, ctx, -1)?;
+            for outcome in &outcomes {
+                totals.generated.push_str(&outcome.raw_piece);
+                match outcome.sampled_token {
+                    SampledToken::Content(_) => totals.observed_content += 1,
+                    SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
+                    SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
+                }
+            }
+
+            // The raw token is wrapped as content only to feed the end-of-generation check and
+            // the batch; the wrapper is never counted.
+            let next_input = SampledToken::Content(raw_token);
+            if model.is_eog_token(&next_input) {
+                break;
+            }
+
+            token_batch.clear();
+            token_batch.add(&next_input, position, &[0], true)?;
+
+            ctx.decode(&mut token_batch)
+                .with_context(|| "failed to decode generated token")?;
+        }
+
+        // Drain whatever the classifier was still holding back.
+        for outcome in classifier.flush() {
+            totals.generated.push_str(&outcome.raw_piece);
+            match outcome.sampled_token {
+                SampledToken::Content(_) => totals.observed_content += 1,
+                SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
+                SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
+            }
+        }
+
+        Ok(totals)
+    }
+
+    // End-to-end vision check: loads `llamas.jpg`, tokenizes it with a chat-templated question,
+    // evaluates the chunks through the sampled-token classifier, generates up to 512 tokens, and
+    // verifies that the classifier's usage counters match the per-type chunk totals and the
+    // outcomes observed during sampling.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let llama_context_params = (*fixture.context_params).into_llama_context_params();
+        let mut llama_ctx = LlamaContext::from_model(model, fixture.backend, llama_context_params)
+            .with_context(|| "unable to create llama context")?;
+
+        assert!(
+            mtmd_ctx.support_vision(),
+            "model should support vision input"
+        );
+
+        let llamas_image = test_model::fixtures_dir().join("llamas.jpg");
+        let llamas_image_str = llamas_image
+            .to_str()
+            .with_context(|| "image path is not valid UTF-8")?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_image_str)
+            .with_context(|| "failed to load image from file")?;
+
+        let formatted_prompt =
+            build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?;
+
+        let chunks = mtmd_ctx
+            .tokenize(
+                MtmdInputText {
+                    text: formatted_prompt,
+                    add_special: false,
+                    parse_special: true,
+                },
+                &[&bitmap],
+            )
+            .with_context(|| "failed to tokenize multimodal input")?;
+
+        assert!(
+            !chunks.is_empty(),
+            "tokenization should produce at least one chunk"
+        );
+
+        let chunk_totals = count_chunk_tokens_by_type(&chunks)?;
+
+        eprintln!(
+            "tokenized into {} chunks, text {} image {} audio {}",
+            chunks.len(),
+            chunk_totals.text,
+            chunk_totals.image,
+            chunk_totals.audio
+        );
+
+        assert!(
+            chunk_totals.image > 0,
+            "vision input must produce at least one image chunk"
+        );
+
+        let mut classifier = model.sampled_token_classifier();
+        let n_past = classifier
+            .eval_multimodal_chunks(&chunks, mtmd_ctx, &llama_ctx, 0, 0, 512, true)
+            .with_context(|| "failed to evaluate chunks")?;
+
+        eprintln!("evaluated chunks, n_past = {n_past}");
+
+        // Input counters must already match right after evaluation; the scope ends this usage
+        // view before the classifier is handed out mutably below.
+        {
+            let usage_after_eval = classifier.usage();
+            assert_eq!(usage_after_eval.prompt_tokens, chunk_totals.text);
+            assert_eq!(usage_after_eval.input_image_tokens, chunk_totals.image);
+            assert_eq!(usage_after_eval.input_audio_tokens, chunk_totals.audio);
+        }
+
+        let totals = drive_sampling_loop(&mut classifier, model, &mut llama_ctx, n_past, 512)?;
+
+        eprintln!("generated text: {}", totals.generated);
+
+        assert!(
+            !totals.generated.is_empty(),
+            "model should generate at least one token from image input"
+        );
+
+        // Generation must leave the input counters untouched and account for every observed
+        // content / reasoning outcome.
+        let final_usage = classifier.into_usage();
+        assert_eq!(final_usage.prompt_tokens, chunk_totals.text);
+        assert_eq!(final_usage.input_image_tokens, chunk_totals.image);
+        assert_eq!(final_usage.input_audio_tokens, chunk_totals.audio);
+        assert_eq!(final_usage.content_tokens, totals.observed_content);
+        assert_eq!(final_usage.reasoning_tokens, totals.observed_reasoning);
+        assert_eq!(
+            final_usage.completion_tokens(),
+            totals.observed_content + totals.observed_reasoning
+        );
+
+        Ok(())
+    }
+}
+
+mod eval_multimodal_chunks_records_exact_token_counts {
+    use anyhow::Result;
+    use llama_cpp_bindings::TokenUsage;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+    use llama_cpp_bindings::mtmd::MtmdInputChunks;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Question text appended after the media marker in every prompt built by this module.
+    const PROMPT_QUESTION: &str = "What animals do you see in this image?";
+
+    /// Per-type token totals computed directly from the tokenized chunks; the usage counters
+    /// recorded by the classifier are compared against these.
+    struct ExpectedChunkTotals {
+        /// Sum of `n_tokens` over all text chunks.
+        text: u64,
+        /// Sum of `n_tokens` over all image chunks.
+        image: u64,
+        /// Sum of `n_tokens` over all audio chunks.
+        audio: u64,
+    }
+
+    /// Adds up `n_tokens` for every chunk, grouped by chunk type, using saturating addition.
+    ///
+    /// Fails when an index below `chunks.len()` yields no chunk, when a token count does not
+    /// convert to `u64`, or when a chunk's type cannot be determined.
+    fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result<ExpectedChunkTotals> {
+        let (mut text, mut image, mut audio): (u64, u64, u64) = (0, 0, 0);
+
+        for index in 0..chunks.len() {
+            let Some(chunk) = chunks.get(index) else {
+                anyhow::bail!("chunk index {index} should exist");
+            };
+            let n_tokens = u64::try_from(chunk.n_tokens())?;
+            let bucket = match chunk.chunk_type()? {
+                MtmdInputChunkType::Text => &mut text,
+                MtmdInputChunkType::Image => &mut image,
+                MtmdInputChunkType::Audio => &mut audio,
+            };
+            *bucket = bucket.saturating_add(n_tokens);
+        }
+
+        Ok(ExpectedChunkTotals { text, image, audio })
+    }
+
+    /// Shared setup for the tests in this module.
+    ///
+    /// Tokenizes `llamas.jpg` with a marker-prefixed question, evaluates the chunks through a
+    /// fresh sampled-token classifier on a new context, and returns the classifier's usage
+    /// together with the per-type totals computed from the chunks themselves.
+    fn build_multimodal_chunks_and_eval_into_usage(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<(TokenUsage, ExpectedChunkTotals)> {
+        let model = fixture.model;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let llamas_image = fixtures_dir().join("llamas.jpg");
+        let Some(llamas_image_str) = llamas_image.to_str() else {
+            anyhow::bail!("image path is not valid UTF-8");
+        };
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_image_str)?;
+
+        let marker = mtmd_default_marker();
+        let chunks = mtmd_ctx.tokenize(
+            MtmdInputText {
+                text: format!("{marker}{PROMPT_QUESTION}"),
+                add_special: false,
+                parse_special: true,
+            },
+            &[&bitmap],
+        )?;
+        let expected = sum_chunk_token_counts_by_type(&chunks)?;
+
+        let context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let mut classifier = model.sampled_token_classifier();
+        classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+        let usage = classifier.into_usage();
+
+        Ok((usage, expected))
+    }
+
+    // After evaluating the multimodal chunks, `prompt_tokens` must equal the text-chunk total.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+        anyhow::ensure!(
+            usage.prompt_tokens == expected.text,
+            "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
+            expected.text,
+            usage.prompt_tokens
+        );
+
+        Ok(())
+    }
+
+    // After evaluating the multimodal chunks, `input_image_tokens` must equal the image-chunk
+    // total.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+        anyhow::ensure!(
+            usage.input_image_tokens == expected.image,
+            "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
+            expected.image,
+            usage.input_image_tokens
+        );
+
+        Ok(())
+    }
+
+    // For an image-only input, both the chunk-derived audio total and the recorded
+    // `input_audio_tokens` must be zero. The chunk-side invariant is checked first.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+        anyhow::ensure!(
+            expected.audio == 0,
+            "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
+            expected.audio
+        );
+        anyhow::ensure!(
+            usage.input_audio_tokens == 0,
+            "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
+            usage.input_audio_tokens
+        );
+
+        Ok(())
+    }
+
+    // Evaluating the prompt chunks alone, with no sampling, must leave `completion_tokens()` at
+    // zero.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn completion_tokens_are_zero_after_eval_before_generation(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+        anyhow::ensure!(
+            usage.completion_tokens() == 0,
+            "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
+            usage.completion_tokens()
+        );
+
+        Ok(())
+    }
+}
+
+mod ingest_prompt_chunk {
+    use anyhow::Result;
+    use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Ingesting a single text chunk must record its `n_tokens` as `prompt_tokens` and leave the
+    // image and audio input counters at zero.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_file declared in attribute");
+
+        let chunks = mtmd_ctx.tokenize(
+            MtmdInputText {
+                text: "hello world".to_owned(),
+                add_special: false,
+                parse_special: false,
+            },
+            &[],
+        )?;
+
+        // Pick the first chunk whose type resolves to text.
+        let text_chunk = (0..chunks.len())
+            .filter_map(|index| chunks.get(index))
+            .find(|chunk| matches!(chunk.chunk_type(), Ok(MtmdInputChunkType::Text)))
+            .ok_or_else(|| {
+                anyhow::anyhow!("text-only tokenization should produce at least one text chunk")
+            })?;
+
+        let n_tokens = u64::try_from(text_chunk.n_tokens())?;
+
+        let mut classifier = model.sampled_token_classifier();
+        ingest_prompt_chunk(&mut classifier, &text_chunk)?;
+
+        let recorded = classifier.usage();
+        anyhow::ensure!(
+            recorded.prompt_tokens == n_tokens,
+            "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
+            recorded.prompt_tokens
+        );
+        anyhow::ensure!(
+            recorded.input_image_tokens == 0,
+            "text chunk must not bump input_image_tokens; got {}",
+            recorded.input_image_tokens
+        );
+        anyhow::ensure!(
+            recorded.input_audio_tokens == 0,
+            "text chunk must not bump input_audio_tokens; got {}",
+            recorded.input_audio_tokens
+        );
+
+        Ok(())
+    }
+
+    // An image chunk's token count must land in `input_image_tokens` only.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let image_path = fixtures_dir().join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+        // The text is only the default media marker, tokenized together with the one bitmap.
+        let marker = mtmd_default_marker();
+        let input_text = MtmdInputText {
+            text: marker.to_owned(),
+            add_special: false,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+        // Scan the chunk list by index and keep the first chunk typed as an image.
+        let image_chunk = (0..chunks.len())
+            .filter_map(|index| chunks.get(index))
+            .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image))
+            .ok_or_else(|| {
+                anyhow::anyhow!("multimodal tokenization should produce an image chunk")
+            })?;
+
+        let n_tokens = u64::try_from(image_chunk.n_tokens())?;
+        if n_tokens == 0 {
+            anyhow::bail!("image chunk should report at least one token");
+        }
+
+        let mut classifier = model.sampled_token_classifier();
+
+        ingest_prompt_chunk(&mut classifier, &image_chunk)?;
+
+        let usage = classifier.usage();
+        anyhow::ensure!(
+            usage.input_image_tokens == n_tokens,
+            "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
+            usage.input_image_tokens
+        );
+        anyhow::ensure!(
+            usage.prompt_tokens == 0,
+            "image chunk must not bump prompt_tokens; got {}",
+            usage.prompt_tokens
+        );
+        anyhow::ensure!(
+            usage.input_audio_tokens == 0,
+            "image chunk must not bump input_audio_tokens; got {}",
+            usage.input_audio_tokens
+        );
+
+        Ok(())
+    }
+
+    // A replayed prompt that opens `<think>` must leave the classifier in the Reasoning section.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn text_chunk_drives_marker_state_machine_to_reasoning(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let input_text = MtmdInputText {
+            text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n<think>\n".to_owned(),
+            add_special: false,
+            parse_special: true,
+        };
+        let chunks = mtmd_ctx.tokenize(input_text, &[])?;
+
+        let mut classifier = model.sampled_token_classifier();
+        // Feed every chunk, in index order, through the same classifier instance.
+        for index in 0..chunks.len() {
+            let chunk = chunks
+                .get(index)
+                .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
+            ingest_prompt_chunk(&mut classifier, &chunk)?;
+        }
+
+        anyhow::ensure!(
+            classifier.current_section() == llama_cpp_bindings::SampledTokenSection::Reasoning,
+            "text chunk replay must transition the classifier section to Reasoning when the \
+             prompt opens a `<think>` block; got {:?}",
+            classifier.current_section()
+        );
+
+        Ok(())
+    }
+}
+
+mod gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // Gemma 4: an image prompt that pre-opens `<|channel>thought` must yield Reasoning tokens.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let image_path = fixtures_dir().join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+        // The model turn is already opened with `<|channel>thought` before any token is sampled.
+        let marker = mtmd_default_marker();
+        let prompt = format!(
+            "<bos><start_of_turn>user\n{marker}What animals do you see in this image?<end_of_turn>\n<start_of_turn>model\n<|channel>thought\n"
+        );
+
+        let input_text = MtmdInputText {
+            text: prompt,
+            add_special: false,
+            parse_special: true,
+        };
+
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let n_past =
+            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        // The sampling loop starts at the position returned by the prompt evaluation.
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position: n_past,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        anyhow::ensure!(
+            outcome.observed_reasoning != 0,
+            "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
+        );
+        anyhow::ensure!(
+            usage.reasoning_tokens != 0,
+            "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+
+        Ok(())
+    }
+}
+
+mod mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 768;
+
+    // Ministral 3 reasoning: the model itself must open `[THINK]` and yield Reasoning tokens.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let image_path = fixtures_dir().join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+        // The prompt stops at `[/INST]`; it asks for a `[THINK]` block but does not pre-open one.
+        let marker = mtmd_default_marker();
+        let prompt = format!(
+            "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+             First draft your thinking process (inner monologue) until you arrive at a response. \
+             Format your response using Markdown, and use LaTeX for any mathematical equations. \
+             Write both your thoughts and the response in the same language as the input.\n\n\
+             Your thinking process must follow the template below:\
+             [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+             Be as casual and as long as you want until you are confident to generate the response \
+             to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+             [INST]{marker}What animals do you see in this image?[/INST]"
+        );
+
+        let input_text = MtmdInputText {
+            text: prompt,
+            add_special: true,
+            parse_special: true,
+        };
+
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let n_past =
+            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+        // Greedy sampling; the loop starts at the position returned by the prompt evaluation.
+        let mut sampler = LlamaSampler::greedy();
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position: n_past,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        anyhow::ensure!(
+            outcome.observed_reasoning != 0,
+            "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the model opens a `[THINK]` block; outcome={outcome:?}"
+        );
+        anyhow::ensure!(
+            usage.reasoning_tokens != 0,
+            "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+
+        Ok(())
+    }
+}
+
+mod qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // Qwen: an image prompt that pre-opens `<think>` must yield Reasoning tokens.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 4096,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let image_path = fixtures_dir().join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+        // The assistant turn is already opened with `<think>` before any token is sampled.
+        let marker = mtmd_default_marker();
+        let prompt = format!(
+            "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        );
+
+        let input_text = MtmdInputText {
+            text: prompt,
+            add_special: false,
+            parse_special: true,
+        };
+
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let n_past =
+            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position: n_past,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+        // NOTE(review): also runs on the Qwen3.6 attribute above; messages still say "Qwen 3.5".
+        anyhow::ensure!(
+            outcome.observed_reasoning != 0,
+            "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
+             when the prompt opens a `<think>` block; outcome={outcome:?}"
+        );
+        anyhow::ensure!(
+            usage.reasoning_tokens != 0,
+            "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+
+        Ok(())
+    }
+}
+
+mod qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::mtmd::MtmdBitmap;
+    use llama_cpp_bindings::mtmd::MtmdInputText;
+    use llama_cpp_bindings::mtmd::mtmd_default_marker;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_bindings_tests::test_model::fixtures_dir;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // Qwen 3.6: an image prompt that pre-opens `<think>` must yield Reasoning tokens.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 512,
+        n_ubatch = 512,
+        mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+    )]
+    fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mtmd_ctx = fixture
+            .mtmd_context
+            .expect("mmproj_source declared in attribute");
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let image_path = fixtures_dir().join("llamas.jpg");
+        let image_path_str = image_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+        let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+        // The assistant turn is already opened with `<think>` before any token is sampled.
+        let marker = mtmd_default_marker();
+        let prompt = format!(
+            "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+        );
+
+        let input_text = MtmdInputText {
+            text: prompt,
+            add_special: false,
+            parse_special: true,
+        };
+
+        let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let n_past =
+            classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        // The sampling loop starts at the position returned by the prompt evaluation.
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position: n_past,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        anyhow::ensure!(
+            outcome.observed_reasoning != 0,
+            "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
+        );
+        anyhow::ensure!(
+            usage.reasoning_tokens != 0,
+            "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/parse_chat_message.rs b/llama-cpp-bindings-tests/tests/parse_chat_message.rs
deleted file mode 100644
index d23fe1c2..00000000
--- a/llama-cpp-bindings-tests/tests/parse_chat_message.rs
+++ /dev/null
@@ -1,368 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message("[]", "hello world", false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for plain content; got Unrecognized");
-    };
-    assert!(parsed.tool_calls.is_empty());
-    assert!(!parsed.is_empty());
-    assert!(parsed.content.contains("hello world"));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let input = "<think>step one, step two</think>\n\nactual response";
-    let outcome = fixture.model.parse_chat_message("[]", input, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for reasoning section; got Unrecognized");
-    };
-    assert!(
-        parsed.reasoning_content.contains("step") || parsed.content.contains("step"),
-        "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
-        parsed.content,
-        parsed.reasoning_content
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture.model.parse_chat_message("[]", "", false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for empty input; got Unrecognized");
-    };
-    assert!(parsed.tool_calls.is_empty());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_malformed_tools_json_returns_tools_json_invalid_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let result = fixture
-        .model
-        .parse_chat_message("not_a_json[}", "hello", false);
-
-    assert!(matches!(
-        result,
-        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
-            _
-        ))
-    ));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_non_array_tools_json_returns_tools_json_not_array_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let result = fixture
-        .model
-        .parse_chat_message("{\"foo\": 1}", "hello", false);
-
-    assert!(matches!(
-        result,
-        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray)
-    ));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let result = fixture
-        .model
-        .parse_chat_message("[]\0extra", "hello", false);
-
-    assert!(matches!(
-        result,
-        Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
-            _
-        ))
-    ));
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn parses_with_input_null_byte_returns_tools_serialization_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let result = fixture
-        .model
-        .parse_chat_message("[]", "hello\0world", false);
-
-    assert!(matches!(
-        result,
-        Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_))
-    ));
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs
deleted file mode 100644
index 260dd0f6..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let chat_template = model.chat_template(None)?;
-    let messages = vec![LlamaChatMessage::new(
-        "user".to_owned(),
-        "Hello! How are you?".to_owned(),
-    )?];
-    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-    let prompt_token_count = u64::try_from(tokens.len())?;
-
-    let mut batch = LlamaBatch::new(512, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: 1024,
-    }
-    .run()?;
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(outcome.observed_tool_call, 0);
-
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
-    };
-    assert!(!parsed.content.is_empty());
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.prompt_tokens, prompt_token_count);
-    assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
-    assert_eq!(usage.undeterminable_tokens, 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index df0a9b80..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const QWEN35_THINKING_DISABLED_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-
-</think>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert_eq!(outcome.observed_reasoning, 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.reasoning_tokens, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs
deleted file mode 100644
index f9c98932..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const QWEN35_THINKING_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(usage.reasoning_tokens > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning,
-    );
-
-    if parsed.reasoning_content.is_empty() {
-        eprintln!(
-            "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
-             skipping strict parser-equality assertions"
-        );
-    } else {
-        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-        assert_eq!(outcome.content_stream, parsed.content);
-    }
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.reasoning_stream.contains(forbidden));
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index 414fde9a..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 4096,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let prompt = format!(
-        "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
-    );
-
-    let input_text = MtmdInputText {
-        text: prompt,
-        add_special: false,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position: n_past,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    if outcome.observed_reasoning == 0 {
-        anyhow::bail!(
-            "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
-             when the prompt opens a `<think>` block; outcome={outcome:?}"
-        );
-    }
-    if usage.reasoning_tokens == 0 {
-        anyhow::bail!(
-            "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs
deleted file mode 100644
index f517a4e7..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-use serde_json::Value;
-use serde_json::json;
-
-const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "negotiate_with_cat",
-            "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "topic": {
-                        "type": "string",
-                        "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
-                    },
-                    "bribe": {
-                        "type": "string",
-                        "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
-                        "description": "What you are offering in exchange"
-                    },
-                    "desperation_level": {
-                        "type": "integer",
-                        "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
-                        "minimum": 1,
-                        "maximum": 10
-                    }
-                },
-                "required": ["topic"],
-                "additionalProperties": false
-            }
-        }
-    }
-]"#;
-
-const NEGOTIATE_WITH_CAT_INPUT: &str = "<tool_call>\n\
-<function=negotiate_with_cat>\n\
-<parameter=bribe>\n\
-tuna\n\
-</parameter>\n\
-<parameter=desperation_level>\n\
-8\n\
-</parameter>\n\
-<parameter=topic>\n\
-get off the keyboard\n\
-</parameter>\n\
-</function>\n\
-</tool_call>";
-
-fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
-    match arguments {
-        ToolCallArguments::ValidJson(value) => Ok(value),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson arguments, got InvalidJson: {raw}")
-        }
-    }
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture.model.parse_chat_message(
-        NEGOTIATE_WITH_CAT_TOOLS_JSON,
-        NEGOTIATE_WITH_CAT_INPUT,
-        false,
-    )?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
-             got Unrecognized"
-        );
-    };
-
-    assert_eq!(parsed.tool_calls.len(), 1);
-    assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
-    assert_eq!(parsed.tool_calls[0].id, "call_0");
-    assert_eq!(
-        arguments_as_json(&parsed.tool_calls[0].arguments)?,
-        &json!({
-            "bribe": "tuna",
-            "desperation_level": 8,
-            "topic": "get off the keyboard",
-        }),
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs
deleted file mode 100644
index 2fe2b89c..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
-<function=get_weather>\n\
-<parameter=location>\n\
-Paris\n\
-</parameter>\n\
-</function>\n\
-</tool_call>";
-
-const PARTIAL_QWEN_XML_PAYLOAD: &str = "<tool_call>\n<function=get_weather>\n<parameter=lo";
-
-const TWO_QWEN_XML_PAYLOADS: &str = "<tool_call>\n\
-<function=get_weather>\n\
-<parameter=location>\n\
-Paris\n\
-</parameter>\n\
-</function>\n\
-</tool_call>\n\
-<tool_call>\n\
-<function=get_weather>\n\
-<parameter=location>\n\
-Berlin\n\
-</parameter>\n\
-</function>\n\
-</tool_call>";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
-    };
-    assert_eq!(parsed.tool_calls.len(), 1);
-    assert_eq!(parsed.tool_calls[0].name, "get_weather");
-    let location = match &parsed.tool_calls[0].arguments {
-        ToolCallArguments::ValidJson(value) => value
-            .get("location")
-            .and_then(|v| v.as_str())
-            .map(str::to_owned),
-        ToolCallArguments::InvalidJson(raw) => {
-            bail!("expected ValidJson, got InvalidJson: {raw}");
-        }
-    };
-    assert_eq!(location.as_deref(), Some("Paris"));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
-    };
-    assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
-        );
-    };
-    assert!(
-        !parsed.tool_calls.is_empty(),
-        "expected at least one tool call; got {:?}",
-        parsed.tool_calls
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
deleted file mode 100644
index 96b76cf5..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
-    {
-        "type": "function",
-        "function": {
-            "name": "get_weather",
-            "description": "Get the current weather for a location",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "location": {"type": "string", "description": "The city name"}
-                },
-                "required": ["location"]
-            }
-        }
-    }
-]"#;
-
-const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let outcome = fixture
-        .model
-        .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
-    let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
-        bail!(
-            "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
-             tool_calls); got Unrecognized"
-        );
-    };
-    assert!(
-        parsed.tool_calls.is_empty(),
-        "expected no tool calls; got {:?}",
-        parsed.tool_calls
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs
deleted file mode 100644
index 233cef95..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let chat_template = model.chat_template(None)?;
-    let messages = vec![LlamaChatMessage::new(
-        "user".to_owned(),
-        "Hello! How are you?".to_owned(),
-    )?];
-    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-    let prompt_token_count = u64::try_from(tokens.len())?;
-
-    let mut batch = LlamaBatch::new(512, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: 1024,
-    }
-    .run()?;
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(outcome.observed_tool_call, 0);
-
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
-    };
-    assert!(!parsed.content.is_empty());
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.prompt_tokens, prompt_token_count);
-    assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
-    assert_eq!(usage.undeterminable_tokens, 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 2b57fa17..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const QWEN36_THINKING_DISABLED_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-
-</think>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert_eq!(outcome.observed_reasoning, 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.reasoning_tokens, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert!(outcome.observed_content > 0);
-    assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs
deleted file mode 100644
index c9c16a64..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,108 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const QWEN36_THINKING_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-
-    let mut classifier = model.sampled_token_classifier();
-    let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
-    let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-    let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
-    let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
-        bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
-    };
-
-    assert!(!outcome.generated_raw.is_empty());
-    assert!(outcome.observed_reasoning > 0);
-    assert!(usage.reasoning_tokens > 0);
-    assert_eq!(outcome.observed_undeterminable, 0);
-    assert_eq!(usage.undeterminable_tokens, 0);
-    assert_eq!(
-        usage.completion_tokens(),
-        outcome.observed_content + outcome.observed_reasoning,
-    );
-
-    if parsed.reasoning_content.is_empty() {
-        eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS");
-    } else {
-        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
-        assert_eq!(outcome.content_stream, parsed.content);
-    }
-
-    for forbidden in FORBIDDEN_MARKERS {
-        assert!(!outcome.reasoning_stream.contains(forbidden));
-        assert!(!outcome.content_stream.contains(forbidden));
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index cf43adfd..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 8192,
-    n_batch = 512,
-    n_ubatch = 512,
-    mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mtmd_ctx = fixture
-        .mtmd_context
-        .expect("mmproj_file declared in attribute");
-
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let image_path = fixtures_dir().join("llamas.jpg");
-    let image_path_str = image_path
-        .to_str()
-        .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
-    let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
-    let marker = mtmd_default_marker();
-    let prompt = format!(
-        "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
-    );
-
-    let input_text = MtmdInputText {
-        text: prompt,
-        add_special: false,
-        parse_special: true,
-    };
-
-    let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
-    let mut sampler = LlamaSampler::chain_simple([
-        LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
-        LlamaSampler::top_k(40),
-        LlamaSampler::top_p(0.9, 1),
-        LlamaSampler::min_p(0.05, 1),
-        LlamaSampler::temp(0.7),
-        LlamaSampler::dist(0x00C0_FFEE),
-    ]);
-
-    let mut batch = LlamaBatch::new(2048, 1)?;
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position: n_past,
-        max_generated_tokens: MAX_GENERATED_TOKENS,
-    }
-    .run()?;
-
-    let usage = classifier.usage();
-
-    if outcome.observed_reasoning == 0 {
-        anyhow::bail!(
-            "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
-        );
-    }
-    if usage.reasoning_tokens == 0 {
-        anyhow::bail!(
-            "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
-        );
-    }
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
new file mode 100644
index 00000000..a5aac3d4
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
@@ -0,0 +1,2484 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\
+    <｜User｜>What is 2 + 2?<｜Assistant｜><think>\n\
+    \n\
+    </think>\n\
+    \n\
+    "; // `\n\` continuations keep the module indentation out of the prompt text
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens =
+            model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "DeepSeek-R1-8B: must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
+             when the prompt closes the think block before generation begins; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
+             before generation, so no Undeterminable tokens may be emitted; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    // DeepSeek-R1-Distill-Llama-8B uses `<think>...</think>` reasoning markers
+    // and full-width-bar role tokens `<｜User｜>` / `<｜Assistant｜>` (U+FF5C,
+    // not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends
+    // `<｜Assistant｜><think>\n` — DeepSeek-R1 is a pure reasoner with no
+    // thinking-disabled mode — so the model resumes generation already inside
+    // the reasoning block.
+    const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\
+    <｜User｜>What is 2 + 2?<｜Assistant｜><think>\n\
+    "; // ends exactly at `<think>\n`, with no trailing indentation
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[expect(
+        clippy::too_many_lines,
+        reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time"
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!(
+                "DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized"
+            );
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "DeepSeek-R1-8B: must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
+             opens a <think> block; outcome={outcome:?}",
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
+             <think> block; usage was {usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
+             so no Undeterminable tokens may be emitted; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
+        );
+
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
+                 tokens — skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(
+                outcome.reasoning_stream, parsed.reasoning_content,
+                "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
+                 (any difference means a marker leaked into the user-visible stream)",
+            );
+            assert_eq!(
+                outcome.content_stream, parsed.content,
+                "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
+                 (any difference means a marker leaked into the user-visible stream)",
+            );
+        }
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_gemma_paired_quote {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const GEMMA_PAIRED_QUOTE_PAYLOAD: &str =
+        "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise Gemma paired-quote on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_glm_key_value_tags {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const GLM_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
+    <arg_key>location</arg_key>\
+    <arg_value>Paris</arg_value>\
+    </tool_call>";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise GLM key-value tags on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_mistral_bracketed_json {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const MISTRAL_BRACKETED_JSON_PAYLOAD: &str =
+        r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_qwen_xml {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
+    <function=get_weather>\n\
+    <parameter=location>\n\
+    Paris\n\
+    </parameter>\n\
+    </function>\n\
+    </tool_call>";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise Qwen XML on a model with no registered template; \
+                 got Unrecognized"
+            );
+        };
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "plain content with tools requested must produce Recognized (with empty tool_calls); \
+                 got Unrecognized"
+            );
+        };
+        assert!(
+            parsed.tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested { // plain text + empty tools array => Recognized, zero tool calls
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const PLAIN_CONTENT: &str = "Hello there."; // message text with no tool-call markup
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message("[]", PLAIN_CONTENT, false)?; // "[]" is the empty tools array; NOTE(review): meaning of `false` is not shown here
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else { // any other outcome fails the test via bail!
+            bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
+        };
+        assert!(
+            parsed.tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { // generation after a closed thought channel must classify as Content only
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\
+    <bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
+    <start_of_turn>model\n<|channel>thought\n<channel|>\n"; // thought channel is opened and closed inside the prompt itself
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"]; // must not show up in content_stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; // the literal already begins with <bos>
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Gemma 4 must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
+             when the prompt closes the thought channel before generation begins; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
+             before generation, so no Undeterminable tokens may be emitted; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        for forbidden in FORBIDDEN_MARKERS { // channel markers must never leak into the content text
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod gemma4_classifier_emits_reasoning { // open thought channel: output must split into Reasoning + Content, never Undeterminable
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 1500; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const GEMMA4_THINKING_PROMPT: &str = "\
+    <bos><start_of_turn>user\nReply with the single word: four. Do not explain.<end_of_turn>\n\
+    <start_of_turn>model\n<|channel>thought\n"; // prompt ends with the thought channel still open
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"]; // must not show up in either classified stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn gemma4_classifier_emits_reasoning_for_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?; // the literal already begins with <bos>
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; // re-parse the raw generation with an empty tools array
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { // any other outcome fails the test via bail!
+            bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Gemma 4 must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "Gemma 4 classifier must emit at least one Reasoning token when the model \
+             emits a `<|channel>thought` block; outcome={outcome:?}",
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
+             reasoning block; usage was {usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Gemma 4: classifier must not emit Undeterminable when the model emits a \
+             detected `<|channel>thought` marker; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "Gemma 4: completion tokens must equal observed Content + Reasoning"
+        );
+        assert!(
+            !parsed.reasoning_content.is_empty(),
+            "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
+             increase the budget or pick a more direct prompt. generated={:?}",
+            outcome.generated_raw,
+        );
+
+        for forbidden in FORBIDDEN_MARKERS { // channel markers must be absent from both classified streams
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Gemma 4: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod gemma4_parses_tool_call_payload { // parser-only check: a canned Gemma 4 tool-call string yields one get_weather call
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#; // a single function tool, get_weather, with one required string parameter "location"
+
+    const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str =
+        "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; // the value Paris sits between two <|"|> markers
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; // NOTE(review): meaning of `false` is not shown here
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else { // any other outcome fails the test via bail!
+            bail!(
+                "expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"
+            );
+        };
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather"); // index 0 is safe: len == 1 was asserted just above
+        let location = match &parsed.tool_calls[0].arguments { // pull "location" out as an owned string; None if missing or not a string
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod gemma4_template_override_returns_full_markers { // checks the Gemma 4 template fingerprint and the tool-call markers the model reports
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::ToolCallArgsShape;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let template = model
+            .chat_template(None)
+            .expect("Gemma 4 chat template must be present"); // a missing template panics, which fails the test
+        let template_str = template.to_str().expect("template must be valid UTF-8");
+        assert!(
+            template_str.contains("<|tool_call>call:"),
+            "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
+             template starts with: {:?}",
+            template_str.chars().take(200).collect::<String>(), // char-based preview: a byte slice at 200 can split a UTF-8 char and panic
+        );
+
+        let markers = model
+            .tool_call_markers()
+            .expect("Gemma 4 must produce ToolCallMarkers via override registry");
+
+        assert_eq!(markers.open, "<|tool_call>call:");
+        assert_eq!(markers.close, "}");
+        let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { // any other args shape is a test failure
+            panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
+        };
+        assert_eq!(shape.name_args_separator, "{");
+        assert_eq!(shape.value_quote.open, "<|\"|>"); // opening and closing quote markers are the same string
+        assert_eq!(shape.value_quote.close, "<|\"|>");
+
+        Ok(())
+    }
+}
+
+mod glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { // prompt carries </think> already: output must classify as Content only
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const GLM47_THINKING_DISABLED_PROMPT: &str = "\
+    <|user|>
+    What is 2 + 2?
+    <|assistant|>
+    </think>
+
+    "; // NOTE(review): only the first line break is escaped, so later lines keep their 4-space indent in the prompt - confirm intended
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"]; // must not show up in content_stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE), // constant argument, presumably the RNG seed - confirm against the bindings
+        ]);
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+
+        assert!(!outcome.generated_raw.is_empty()); // something was generated
+        assert_eq!(outcome.observed_reasoning, 0); // no Reasoning tokens seen by the loop
+        assert_eq!(outcome.observed_undeterminable, 0); // no Undeterminable tokens seen by the loop
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(usage.completion_tokens(), outcome.observed_content); // every completion token was Content
+
+        for forbidden in FORBIDDEN_MARKERS { // think markers must never leak into the content text
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod glm47_classifier_emits_reasoning { // prompt ends inside <think>: output must split into Reasoning + Content, never Undeterminable
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 1500; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const GLM47_THINKING_PROMPT: &str = "\
+    <|user|>
+    What is 2 + 2?
+    <|assistant|>
+    <think>
+    "; // NOTE(review): only the first line break is escaped, so later lines keep their 4-space indent in the prompt - confirm intended
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"]; // must not show up in either classified stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE), // constant argument, presumably the RNG seed - confirm against the bindings
+        ]);
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; // re-parse the raw generation with an empty tools array
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { // any other outcome fails the test via bail!
+            bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty()); // something was generated
+        assert!(outcome.observed_reasoning > 0); // the loop saw at least one Reasoning token
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning
+        );
+
+        if parsed.reasoning_content.is_empty() { // parser found no reasoning section: log a notice and skip the equality checks
+            eprintln!(
+                "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+                 skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); // classifier streams must match the parser's split
+            assert_eq!(outcome.content_stream, parsed.content);
+        }
+
+        for forbidden in FORBIDDEN_MARKERS { // think markers must be absent from both classified streams
+            assert!(!outcome.reasoning_stream.contains(forbidden));
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod glm47_parses_tool_call_payload { // parser-only check: a canned GLM-4.7 key/value tool-call string yields one get_weather call
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#; // a single function tool, get_weather, with one required string parameter "location"
+
+    const GLM47_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
+    <arg_key>location</arg_key>\
+    <arg_value>Paris</arg_value>\
+    </tool_call>"; // every line break above is escaped, so the payload is one line with no embedded whitespace
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; // NOTE(review): meaning of `false` is not shown here
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else { // any other outcome fails the test via bail!
+            bail!(
+                "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
+            );
+        };
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "get_weather"); // index 0 is safe: len == 1 was asserted just above
+        let location = match &parsed.tool_calls[0].arguments { // pull "location" out as an owned string; None if missing or not a string
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod glm47_template_override_returns_full_markers { // checks the GLM-4.7 template fingerprint and the tool-call markers the model reports
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::ToolCallArgsShape;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let template = model
+            .chat_template(None)
+            .expect("GLM-4.7 chat template must be present"); // a missing template panics, which fails the test
+        let template_str = template.to_str().expect("template must be valid UTF-8");
+        assert!(template_str.contains("<arg_key>")); // fingerprint: the template text must mention <arg_key>
+
+        let markers = model
+            .tool_call_markers()
+            .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
+
+        assert_eq!(markers.open, "<tool_call>");
+        assert_eq!(markers.close, "</tool_call>");
+        let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { // any other args shape is a test failure
+            panic!(
+                "expected KeyValueXmlTags variant, got {:?}",
+                markers.args_shape
+            );
+        };
+        assert_eq!(shape.key_open, "<arg_key>"); // the four tags that delimit each key/value argument pair
+        assert_eq!(shape.key_close, "</arg_key>");
+        assert_eq!(shape.value_open, "<arg_value>");
+        assert_eq!(shape.value_close, "</arg_value>");
+
+        Ok(())
+    }
+}
+
+mod mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt { // empty [THINK][/THINK] in the prompt: output must classify as Content only
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\
+    [INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]"; // an empty [THINK][/THINK] pair closes the prompt
+
+    const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; // must not show up in content_stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens =
+            model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; // the literal has no BOS marker; AddBos::Always is passed
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+
+        assert!(!outcome.generated_raw.is_empty()); // something was generated
+        assert_eq!(outcome.observed_reasoning, 0); // no Reasoning tokens seen by the loop
+        assert_eq!(outcome.observed_undeterminable, 0); // no Undeterminable tokens seen by the loop
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(usage.completion_tokens(), outcome.observed_content); // every completion token was Content
+
+        for forbidden in FORBIDDEN_MARKERS { // think markers must never leak into the content text
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod mistral3_classifier_emits_reasoning { // system prompt asks for [THINK]...[/THINK]: output must split into Reasoning + Content
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 768; // handed to ClassifySampleLoop as max_generated_tokens
+
+    const MISTRAL3_THINKING_PROMPT: &str = "\
+    [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+    First draft your thinking process (inner monologue) until you arrive at a response. \
+    Format your response using Markdown, and use LaTeX for any mathematical equations. \
+    Write both your thoughts and the response in the same language as the input.\n\n\
+    Your thinking process must follow the template below:\
+    [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+    Be as casual and as long as you want until you are confident to generate the response \
+    to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+    [INST]Reply with the single word: four. Do not explain.[/INST]"; // prompt ends at [/INST]; [THINK] occurs only inside the system prompt text
+
+    const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; // must not show up in either classified stream
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; // the literal has no BOS marker; AddBos::Always is passed
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?; // 2048 equals n_batch in the attribute above
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; // prompt goes into the batch through the classifier
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?; // run the prompt batch through the context before committing it
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count); // commit must report exactly the number of tokenized prompt tokens
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens(); // batch token count after the prompt was fed
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage(); // classifier-side counters, cross-checked against the loop's observed_* counts below
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; // re-parse the raw generation with an empty tools array
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { // any other outcome fails the test via bail!
+            bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty()); // something was generated
+        assert!(outcome.observed_reasoning > 0); // the loop saw at least one Reasoning token
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+        );
+        assert!(!parsed.reasoning_content.is_empty()); // the parser must have extracted a non-empty reasoning section
+        assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); // classifier streams must match the parser's split exactly
+        assert_eq!(outcome.content_stream, parsed.content);
+
+        for forbidden in FORBIDDEN_MARKERS { // think markers must be absent from both classified streams
+            assert!(!outcome.reasoning_stream.contains(forbidden));
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod mistral3_parses_tool_call_payload {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str =
+        r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
+
+    // Parses a canned `[TOOL_CALLS]…[ARGS]…` payload (nothing is generated) and expects exactly
+    // one `get_weather` call whose `location` argument is "Paris".
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
+            );
+        };
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        // Borrow the argument from the parsed JSON; it is only compared, so no owned copy is made.
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value.get("location").and_then(|v| v.as_str()),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location, Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod qwen35_chat_inference_emits_reasoning_when_template_auto_opens {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let chat_template = model.chat_template(None)?;
+        let messages = vec![LlamaChatMessage::new(
+            "user".to_owned(),
+            "Hello! How are you?".to_owned(),
+        )?];
+        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let tokens = model.str_to_token(&prompt, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(tokens.len())?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        }
+        .run()?;
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(outcome.observed_tool_call, 0);
+
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+        };
+        assert!(!parsed.content.is_empty());
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, prompt_token_count);
+        assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
+        assert_eq!(usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+}
+
+mod qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    const QWEN35_THINKING_DISABLED_PROMPT: &str = "\
+    <|im_start|>user
+    What is 2 + 2?<|im_end|>
+    <|im_start|>assistant
+    <think>
+
+    </think>
+
+    ";
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod qwen35_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    const QWEN35_THINKING_PROMPT: &str = "\
+    <|im_start|>user
+    What is 2 + 2?<|im_end|>
+    <|im_start|>assistant
+    <think>
+    ";
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+        );
+
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+                 skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+            assert_eq!(outcome.content_stream, parsed.content);
+        }
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(!outcome.reasoning_stream.contains(forbidden));
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod qwen35_parses_constrained_schema_payload {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+    use serde_json::Value;
+    use serde_json::json;
+
+    const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "negotiate_with_cat",
+                "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "topic": {
+                            "type": "string",
+                            "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
+                        },
+                        "bribe": {
+                            "type": "string",
+                            "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
+                            "description": "What you are offering in exchange"
+                        },
+                        "desperation_level": {
+                            "type": "integer",
+                            "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
+                            "minimum": 1,
+                            "maximum": 10
+                        }
+                    },
+                    "required": ["topic"],
+                    "additionalProperties": false
+                }
+            }
+        }
+    ]"#;
+
+    const NEGOTIATE_WITH_CAT_INPUT: &str = "<tool_call>\n\
+    <function=negotiate_with_cat>\n\
+    <parameter=bribe>\n\
+    tuna\n\
+    </parameter>\n\
+    <parameter=desperation_level>\n\
+    8\n\
+    </parameter>\n\
+    <parameter=topic>\n\
+    get off the keyboard\n\
+    </parameter>\n\
+    </function>\n\
+    </tool_call>";
+
+    fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
+        match arguments {
+            ToolCallArguments::ValidJson(value) => Ok(value),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson arguments, got InvalidJson: {raw}")
+            }
+        }
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture.model.parse_chat_message(
+            NEGOTIATE_WITH_CAT_TOOLS_JSON,
+            NEGOTIATE_WITH_CAT_INPUT,
+            false,
+        )?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
+                 got Unrecognized"
+            );
+        };
+
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
+        assert_eq!(parsed.tool_calls[0].id, "call_0");
+        assert_eq!(
+            arguments_as_json(&parsed.tool_calls[0].arguments)?,
+            &json!({
+                "bribe": "tuna",
+                "desperation_level": 8,
+                "topic": "get off the keyboard",
+            }),
+        );
+
+        Ok(())
+    }
+}
+
+mod qwen35_parses_tool_call_payload {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
+    <function=get_weather>\n\
+    <parameter=location>\n\
+    Paris\n\
+    </parameter>\n\
+    </function>\n\
+    </tool_call>";
+
+    const PARTIAL_QWEN_XML_PAYLOAD: &str = "<tool_call>\n<function=get_weather>\n<parameter=lo";
+
+    const TWO_QWEN_XML_PAYLOADS: &str = "<tool_call>\n\
+    <function=get_weather>\n\
+    <parameter=location>\n\
+    Paris\n\
+    </parameter>\n\
+    </function>\n\
+    </tool_call>\n\
+    <tool_call>\n\
+    <function=get_weather>\n\
+    <parameter=location>\n\
+    Berlin\n\
+    </parameter>\n\
+    </function>\n\
+    </tool_call>";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
+        };
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        // Borrow the argument from the parsed JSON; it is only compared, so no owned copy is made.
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value.get("location").and_then(|v| v.as_str()),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location, Some("Paris"));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_partial_tool_call_returns_pending_state(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
+        };
+        // The payload stops mid-parameter (`<parameter=lo`), so zero or one call is accepted.
+        assert!(parsed.tool_calls.len() <= 1);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
+            );
+        };
+        // NOTE(review): the payload holds two calls, yet only non-emptiness is asserted — tighten?
+        assert!(
+            !parsed.tool_calls.is_empty(),
+            "expected at least one tool call; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
+                 tool_calls); got Unrecognized"
+            );
+        };
+        assert!(
+            parsed.tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod qwen36_chat_inference_emits_reasoning_when_template_auto_opens {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let chat_template = model.chat_template(None)?;
+        let messages = vec![LlamaChatMessage::new(
+            "user".to_owned(),
+            "Hello! How are you?".to_owned(),
+        )?];
+        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let tokens = model.str_to_token(&prompt, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(tokens.len())?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        }
+        .run()?;
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(outcome.observed_tool_call, 0);
+
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+        };
+        assert!(!parsed.content.is_empty());
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, prompt_token_count);
+        assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
+        assert_eq!(usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+}
+
+mod qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    const QWEN36_THINKING_DISABLED_PROMPT: &str = "\
+    <|im_start|>user
+    What is 2 + 2?<|im_end|>
+    <|im_start|>assistant
+    <think>
+
+    </think>
+
+    ";
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+mod qwen36_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    const QWEN36_THINKING_PROMPT: &str = "\
+    <|im_start|>user
+    What is 2 + 2?<|im_end|>
+    <|im_start|>assistant
+    <think>
+    ";
+
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+        // Tokenise the hand-written prompt (it ends with an opened `<think>` tag) with `AddBos::Never`.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+        // The commit must report every prompt token that was fed.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+        // NOTE(review): the parse below passes `true`; the Qwen3.5 twin passes `false` — intentional?
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+        );
+        // Compare the streams with the parser only when it extracted some reasoning text.
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"
+            );
+        } else {
+            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+            assert_eq!(outcome.content_stream, parsed.content);
+        }
+        // Raw think tags must never show up in either classified stream.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(!outcome.reasoning_stream.contains(forbidden));
+            assert!(!outcome.content_stream.contains(forbidden));
+        }
+
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/reranker.rs b/llama-cpp-bindings-tests/tests/reranker.rs
deleted file mode 100644
index d08de7eb..00000000
--- a/llama-cpp-bindings-tests/tests/reranker.rs
+++ /dev/null
@@ -1,158 +0,0 @@
-use std::time::Duration;
-
-use anyhow::{Context, Result, bail};
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn normalize(input: &[f32]) -> Vec<f32> {
-    let magnitude = input
-        .iter()
-        .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
-        .sqrt();
-
-    input.iter().map(|&value| value / magnitude).collect()
-}
-
-fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
-    vec_a
-        .iter()
-        .zip(vec_b.iter())
-        .map(|(left, right)| left * right)
-        .sum::<f32>()
-}
-
-#[llama_test(
-    model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-    n_seq_max = 2,
-    n_threads_batch = 8,
-    embeddings = true,
-)]
-fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-
-    let query = "What is machine learning?";
-    let documents = [
-        "Machine learning is a subset of artificial intelligence.",
-        "The weather today is sunny and warm.",
-    ];
-
-    let document_count = documents.len();
-    assert_eq!(
-        u32::try_from(document_count)?,
-        fixture.context_params.n_seq_max,
-        "attribute n_seq_max must match the document count this trial expects",
-    );
-
-    let mut ctx = LlamaContext::from_model(
-        model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )
-    .with_context(|| "unable to create context")?;
-
-    let prompt_lines: Vec<String> = documents
-        .iter()
-        .map(|document| format!("{query}</s><s>{document}"))
-        .collect();
-
-    let tokens_lines_list = prompt_lines
-        .iter()
-        .map(|line| model.str_to_token(line, AddBos::Always))
-        .collect::<std::result::Result<Vec<_>, _>>()
-        .with_context(|| "failed to tokenize prompts")?;
-
-    let n_ctx = usize::try_from(ctx.n_ctx())?;
-
-    if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
-        bail!("one of the provided prompts exceeds the size of the context window");
-    }
-
-    let mut classifier = model.sampled_token_classifier();
-    let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
-    let t_main_start = ggml_time_us();
-
-    for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
-        classifier.feed_prompt_sequence_to_batch(
-            &mut batch,
-            tokens,
-            i32::try_from(sequence_index)?,
-            false,
-        )?;
-    }
-
-    let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
-    let total_token_count = u64::try_from(total_tokens)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
-    assert_eq!(classifier.usage().prompt_tokens, 0);
-
-    ctx.clear_kv_cache();
-    ctx.decode(&mut batch)
-        .with_context(|| "llama_decode() failed")?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, total_token_count);
-
-    let mut embeddings = Vec::with_capacity(document_count);
-
-    for sequence_index in 0..document_count {
-        let raw_embedding = ctx
-            .embeddings_seq_ith(i32::try_from(sequence_index)?)
-            .with_context(|| "failed to get sequence embeddings")?;
-        embeddings.push(normalize(raw_embedding));
-    }
-
-    let t_main_end = ggml_time_us();
-    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-
-    #[expect(
-        clippy::cast_precision_loss,
-        reason = "logged throughput tolerates f32 precision"
-    )]
-    let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
-
-    eprintln!(
-        "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
-        duration.as_secs_f32(),
-    );
-
-    assert_eq!(
-        embeddings.len(),
-        document_count,
-        "should produce one embedding per document"
-    );
-
-    for (index, embedding) in embeddings.iter().enumerate() {
-        assert!(
-            !embedding.is_empty(),
-            "embedding {index} should not be empty"
-        );
-    }
-
-    let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
-    eprintln!("cosine similarity between document embeddings: {similarity:.4}");
-
-    assert!(
-        similarity.is_finite(),
-        "cosine similarity should be a finite number"
-    );
-
-    let usage = classifier.into_usage();
-    assert_eq!(usage.prompt_tokens, total_token_count);
-    assert_eq!(usage.completion_tokens(), 0);
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs b/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs
deleted file mode 100644
index 4127fc58..00000000
--- a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs
+++ /dev/null
@@ -1,513 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
-use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
-use llama_cpp_bindings::streaming_markers::StreamingMarkers;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn classifier_starts_in_pending_section_for_default_fixture(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let classifier = fixture.model.sampled_token_classifier();
-
-    assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let first = fixture.model.sampled_token_classifier();
-    let second = fixture.model.sampled_token_classifier();
-
-    assert_eq!(first.current_section(), second.current_section());
-    assert_eq!(first.usage(), second.usage());
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
-    let outcomes = classifier.ingest(model.token_bos());
-
-    assert_eq!(outcomes.len(), 1);
-    let outcome = &outcomes[0];
-    assert!(matches!(
-        outcome.sampled_token,
-        SampledToken::Undeterminable(_)
-    ));
-    assert_eq!(outcome.visible_piece, outcome.raw_piece);
-    assert_eq!(classifier.usage().undeterminable_tokens, 1);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn ingest_with_no_markers_decodes_each_token_independently(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
-    let _ = classifier.ingest(model.token_bos());
-    let _ = classifier.ingest(model.token_eos());
-
-    assert_eq!(classifier.usage().undeterminable_tokens, 2);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-    let usage_before = *classifier.usage();
-
-    classifier.ingest_prompt_token(model.token_bos());
-    classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]);
-
-    assert_eq!(*classifier.usage(), usage_before);
-    assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-    let mut batch = LlamaBatch::new(8, 1)?;
-
-    classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-    classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), 2);
-    assert_eq!(batch.n_tokens(), 2);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-    let mut batch = LlamaBatch::new(8, 1)?;
-
-    let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()];
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), 3);
-    assert_eq!(batch.n_tokens(), 3);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-    let mut batch = LlamaBatch::new(8, 1)?;
-
-    classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-    classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-
-    assert_eq!(promoted, 2);
-    assert_eq!(classifier.pending_prompt_tokens(), 0);
-    assert_eq!(classifier.usage().prompt_tokens, 2);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let model = fixture.model;
-    let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-    let mut batch = LlamaBatch::new(8, 1)?;
-
-    classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-
-    let discarded = classifier.discard_pending_prompt_tokens();
-
-    assert_eq!(discarded, 1);
-    assert_eq!(classifier.pending_prompt_tokens(), 0);
-    assert_eq!(classifier.usage().prompt_tokens, 0);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 128,
-    n_ubatch = 64,
-)]
-fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?;
-    let _ = left;
-    let _ = right;
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampling.rs b/llama-cpp-bindings-tests/tests/sampling.rs
deleted file mode 100644
index d03e965e..00000000
--- a/llama-cpp-bindings-tests/tests/sampling.rs
+++ /dev/null
@@ -1,429 +0,0 @@
-#![expect(
-    clippy::unnecessary_wraps,
-    reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::GrammarError;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings::token::LlamaToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
-    let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let breakers: Vec<&[u8]> = vec![b"hello\0world"];
-    let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers);
-
-    assert!(result.is_err());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
-
-    assert!(sampler.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let trigger_words: Vec<&[u8]> = vec![b"function"];
-    let sampler = LlamaSampler::grammar_lazy(
-        fixture.model,
-        "root ::= \"hello\"",
-        "root",
-        trigger_words,
-        &[],
-    );
-
-    assert!(sampler.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let patterns = vec!["\\{.*".to_owned()];
-    let sampler = LlamaSampler::grammar_lazy_patterns(
-        fixture.model,
-        "root ::= \"hello\"",
-        "root",
-        &patterns,
-        &[],
-    );
-
-    assert!(sampler.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let trigger_words: Vec<&[u8]> = vec![b"function"];
-    let result = LlamaSampler::grammar_lazy(
-        fixture.model,
-        "expr ::= \"hello\"",
-        "root",
-        trigger_words,
-        &[],
-    );
-
-    assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"];
-    let result = LlamaSampler::grammar_lazy(
-        fixture.model,
-        "root ::= \"hello\"",
-        "root",
-        trigger_words,
-        &[],
-    );
-
-    assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_))));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_root_not_found_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let patterns = vec!["\\{.*".to_owned()];
-    let result = LlamaSampler::grammar_lazy_patterns(
-        fixture.model,
-        "expr ::= \"hello\"",
-        "root",
-        &patterns,
-        &[],
-    );
-
-    assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let patterns = vec!["hel\0lo".to_owned()];
-    let result = LlamaSampler::grammar_lazy_patterns(
-        fixture.model,
-        "root ::= \"hello\"",
-        "root",
-        &patterns,
-        &[],
-    );
-
-    assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_))));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let patterns = vec!["[".to_owned()];
-    let result = LlamaSampler::grammar_lazy_patterns(
-        fixture.model,
-        "root ::= \"hello\"",
-        "root",
-        &patterns,
-        &[],
-    );
-
-    assert!(matches!(
-        result,
-        Err(GrammarError::InvalidTriggerPattern { .. }),
-    ));
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> {
-    let result = LlamaSampler::logit_bias(0, &[]);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn dry_sampler_with_root_not_found_grammar_does_not_apply(
-    fixture: &LlamaFixture<'_>,
-) -> Result<()> {
-    let breakers: Vec<&[u8]> = vec![b"\n"];
-    let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-    let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()];
-
-    sampler.accept_many(&tokens)?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-    let tokens = [fixture.model.token_bos(), fixture.model.token_eos()];
-
-    let _consumed = sampler.with_tokens(tokens.iter().copied())?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-
-    sampler.accept(fixture.model.token_bos())?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-
-    sampler.try_accept(LlamaToken::new(0))?;
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-
-    let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?;
-    let sampler = LlamaSampler::greedy();
-    sampler.apply(&mut data_array);
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 2048,
-    n_ubatch = 512,
-)]
-fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let mut context = LlamaContext::from_model(
-        fixture.model,
-        fixture.backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-    let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
-    let mut batch = LlamaBatch::new(512, 1)?;
-    batch.add_sequence(&tokens, 0, false)?;
-    context.decode(&mut batch)?;
-    let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-    let result = sampler.sample(&context, batch.n_tokens() - 1);
-
-    assert!(result.is_ok());
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
new file mode 100644
index 00000000..dc9395aa
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
@@ -0,0 +1,2518 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_sampling {
+    use anyhow::Result;
+    use llama_cpp_bindings::SampledToken;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::json_schema_to_grammar;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 256,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn sample_returns_result_and_succeeds_with_valid_index(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        // Tokenize a short prompt, add it to a batch and decode it.
+        let prompt_tokens = model.str_to_token("Hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Sample at `n_tokens() - 1`, the index this test treats as valid.
+        let last_index = batch.n_tokens() - 1;
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+        let sampled = sampler.sample(&context, last_index);
+
+        assert!(sampled.is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(model, fixture.backend, context_params)?;
+
+        // Decode a chat-style prompt whose assistant turn opens with an empty think block.
+        let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        let prompt_tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // The grammar only admits case-insensitive spellings of "yes" or "no".
+        let yes_no_grammar =
+            LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?;
+        let mut sampler = LlamaSampler::chain_simple([
+            yes_no_grammar,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        // Sample a single token through the classifier, then flush to finalise outcomes.
+        let mut classifier = model.sampled_token_classifier();
+        let last_index = batch.n_tokens() - 1;
+        let (raw_token, mut outcomes) = classifier.sample(&mut sampler, &context, last_index)?;
+        outcomes.extend(classifier.flush());
+
+        assert_eq!(
+            outcomes.len(),
+            1,
+            "expected one finalised outcome after flush"
+        );
+
+        assert!(
+            !model.is_eog_token(&SampledToken::Content(raw_token)),
+            "Grammar sampler should not allow EOS as first token"
+        );
+
+        // Compare on the lowercase form of the first character of the finalised piece.
+        let piece = &outcomes[0].raw_piece;
+        let leading = piece
+            .chars()
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))?;
+        let first_char = leading
+            .to_lowercase()
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
+
+        assert!(
+            matches!(first_char, 'y' | 'n'),
+            "Grammar should constrain first token to start with y/n, got: '{piece}'"
+        );
+        assert_eq!(
+            classifier.usage().completion_tokens(),
+            1,
+            "exactly one completion token sampled"
+        );
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn json_schema_grammar_sampler_constrains_output_to_json(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        // Decode a chat-style prompt whose assistant turn opens with an empty think block.
+        let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        let prompt_tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Derive the grammar from a JSON schema for an object with a required string `answer`.
+        let grammar_str = json_schema_to_grammar(
+            r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
+        )?;
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::grammar(model, &grammar_str, "root")?,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        // Sample a single token through the classifier, then flush to finalise outcomes.
+        let mut classifier = model.sampled_token_classifier();
+        let last_index = batch.n_tokens() - 1;
+        let (raw_token, mut outcomes) = classifier.sample(&mut sampler, &context, last_index)?;
+        outcomes.extend(classifier.flush());
+
+        assert_eq!(
+            outcomes.len(),
+            1,
+            "expected one finalised outcome after flush"
+        );
+
+        // The first sampled token must not be an end-of-generation token.
+        assert!(
+            !model.is_eog_token(&SampledToken::Content(raw_token)),
+            "Grammar sampler should not allow EOS as first token"
+        );
+
+        // The finalised piece must open a JSON object.
+        let piece = &outcomes[0].raw_piece;
+        assert!(
+            piece.starts_with('{'),
+            "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
+        );
+
+        assert_eq!(
+            classifier.usage().completion_tokens(),
+            1,
+            "exactly one completion token sampled"
+        );
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn sample_with_grammar_produces_constrained_output_in_loop(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(model, fixture.backend, context_params)?;
+
+        // Feed the prompt through the classifier, decode it, then commit the prompt tokens.
+        let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        let prompt_tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        let mut classifier = model.sampled_token_classifier();
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+        classifier.commit_prompt_tokens();
+
+        let yes_no_grammar = LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?;
+        let mut sampler = LlamaSampler::chain_simple([
+            yes_no_grammar,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        // Run the sampling loop from `batch.n_tokens()` with a budget of ten generated tokens.
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 10,
+        }
+        .run()?;
+
+        let generated = outcome.generated_raw.to_lowercase();
+        assert!(
+            matches!(generated.as_str(), "yes" | "no"),
+            "Grammar loop should produce 'yes' or 'no', got: '{}'",
+            outcome.generated_raw
+        );
+        assert!(
+            outcome.eog_seen,
+            "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
+        );
+
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(outcome.observed_tool_call, 0);
+        assert!(outcome.observed_content > 0);
+
+        // The classifier's usage counters must agree with what the loop observed.
+        let usage = classifier.into_usage();
+        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        // Decode the chat-style prompt; sampling below runs without any grammar.
+        let prompt =
+            "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
+        let prompt_tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+        let mut classifier = model.sampled_token_classifier();
+
+        // Sample at most five tokens, decoding each at the next position; stop early on EOG.
+        let mut sampled_count: u64 = 0;
+        let first_position = batch.n_tokens();
+        for position in (first_position..).take(5) {
+            let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?;
+            let sampled = SampledToken::Content(raw_token);
+            if model.is_eog_token(&sampled) {
+                break;
+            }
+
+            sampled_count += 1;
+
+            // Replace the batch contents with just the new token and decode it.
+            batch.clear();
+            batch.add(&sampled, position, &[0], true)?;
+            context.decode(&mut batch)?;
+        }
+
+        // Flush the classifier; whatever it returns is deliberately ignored here.
+        let _ = classifier.flush();
+
+        assert!(
+            sampled_count > 0,
+            "Should produce at least one token without grammar"
+        );
+        let usage = classifier.into_usage();
+        assert!(
+            usage.completion_tokens() >= sampled_count,
+            "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
+            usage.completion_tokens()
+        );
+
+        Ok(())
+    }
+}
+
+mod sampling {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::GrammarError;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings::token::LlamaToken;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
+        // `dry` returns a `Result`; assert on it so a construction failure fails the test.
+        assert!(LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers).is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A sequence breaker with an interior NUL byte is expected to be rejected.
+        let nul_breakers: Vec<&[u8]> = vec![b"hello\0world"];
+        let outcome = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, nul_breakers);
+        assert!(outcome.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // A grammar that defines the requested `root` rule should yield a sampler.
+        let built = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
+        assert!(built.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A valid grammar plus a single trigger word should yield a sampler.
+        let trigger_words: Vec<&[u8]> = vec![b"function"];
+        let built = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            trigger_words,
+            &[],
+        );
+        assert!(built.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // A valid grammar plus a single trigger pattern should yield a sampler.
+        let trigger_patterns = vec!["\\{.*".to_owned()];
+        let built = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            &trigger_patterns,
+            &[],
+        );
+        assert!(built.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // The grammar only defines `expr`, so asking for `root` must fail.
+        let trigger_words: Vec<&[u8]> = vec![b"function"];
+        let outcome = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "expr ::= \"hello\"",
+            "root",
+            trigger_words,
+            &[],
+        );
+        assert!(matches!(outcome, Err(GrammarError::RootNotFound)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The trigger word carries an interior NUL byte, which must be reported as such.
+        let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"];
+        let outcome = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            trigger_words,
+            &[],
+        );
+        assert!(matches!(outcome, Err(GrammarError::TriggerWordNullBytes(_))));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_patterns_with_root_not_found_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The grammar only defines `expr`, so asking for `root` must fail.
+        let trigger_patterns = vec!["\\{.*".to_owned()];
+        let outcome = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "expr ::= \"hello\"",
+            "root",
+            &trigger_patterns,
+            &[],
+        );
+        assert!(matches!(outcome, Err(GrammarError::RootNotFound)));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // NOTE(review): the NUL in the pattern is expected to surface as `GrammarNullBytes`.
+        let trigger_patterns = vec!["hel\0lo".to_owned()];
+        let outcome = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            &trigger_patterns,
+            &[],
+        );
+        assert!(matches!(outcome, Err(GrammarError::GrammarNullBytes(_))));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // "[" opens a character class that is never closed, so it is not a valid pattern.
+        let trigger_patterns = vec!["[".to_owned()];
+        let outcome = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            &trigger_patterns,
+            &[],
+        );
+        assert!(matches!(
+            outcome,
+            Err(GrammarError::InvalidTriggerPattern { .. }),
+        ));
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // The "regex" grammar kind with a simple alternation should be accepted.
+        let built = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
+        assert!(built.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> {
+        // An empty bias slice must still produce a sampler.
+        let built = LlamaSampler::logit_bias(0, &[]);
+        assert!(built.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn dry_sampler_with_root_not_found_grammar_does_not_apply(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let breakers: Vec<&[u8]> = vec![b"\n"];
+        // NOTE(review): despite the name, no grammar is involved; only DRY construction runs.
+        assert!(LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers).is_ok());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Feed the model's BOS and EOS tokens to a greedy chain by reference.
+        let bos_and_eos = vec![fixture.model.token_bos(), fixture.model.token_eos()];
+        let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+
+        chain.accept_many(&bos_and_eos)?;
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn with_tokens_returns_self_after_accepting_each_token(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // `with_tokens` is given an iterator of copied tokens; only its success is checked.
+        let bos_and_eos = [fixture.model.token_bos(), fixture.model.token_eos()];
+        let chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+        let _returned = chain.with_tokens(bos_and_eos.iter().copied())?;
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Accepting the model's BOS token on a greedy chain must not error.
+        let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+        chain.accept(fixture.model.token_bos())?;
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Token id 0 stands in for a valid token; the call must return `Ok`.
+        let mut chain = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+        chain.try_accept(LlamaToken::new(0))?;
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // Decode a tiny prompt, then fetch the token data array for index `n_tokens() - 1`.
+        let prompt_tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Smoke test: only checks that `apply` runs; nothing is asserted on the result.
+        let mut candidates = context.token_data_array_ith(batch.n_tokens() - 1)?;
+        let greedy = LlamaSampler::greedy();
+        greedy.apply(&mut candidates);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+        // Tokenize a short prompt, add it to a batch and decode it.
+        let prompt_tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        // Sample with a temperature + greedy chain at index `n_tokens() - 1`.
+        let last_index = batch.n_tokens() - 1;
+        let mut chain =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+        assert!(chain.sample(&context, last_index).is_ok());
+
+        Ok(())
+    }
+}
+
+mod text_generation {
+    use std::io::Write;
+    use std::time::Duration;
+
+    use anyhow::Context as _;
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::ggml_time_us;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_bindings::sampled_token::SampledToken;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Times a raw-prompt completion and reconciles classifier usage with the loop's tallies.
+        let model = fixture.model;
+        let mut ctx = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .context("unable to create context")?;
+
+        let prompt = "Hello my name is";
+        let max_generated_tokens: i32 = 64;
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model
+            .str_to_token(prompt, AddBos::Always)
+            .with_context(|| format!("failed to tokenize {prompt}"))?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        // Echo the prompt to stderr, one decoded piece per token, before generating.
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
+        for token in &prompt_tokens {
+            let piece =
+                model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?;
+            eprint!("{piece}");
+        }
+        std::io::stderr().flush()?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        // Until the prompt is committed it only counts as pending, not as usage.
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+
+        ctx.decode(&mut batch).context("llama_decode() failed")?;
+
+        // Committing reports how many tokens moved from pending into usage.
+        let committed = classifier.commit_prompt_tokens();
+        assert_eq!(committed, prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
+
+        let mut sampler_chain =
+            LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
+        let initial_position = batch.n_tokens();
+        let started_at_us = ggml_time_us();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler_chain,
+            context: &mut ctx,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens,
+        }
+        .run()?;
+        let finished_at_us = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(finished_at_us - started_at_us)?);
+        let total_observed =
+            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+
+        #[expect(
+            clippy::cast_precision_loss,
+            reason = "logged throughput tolerates f32 precision"
+        )]
+        let tokens_per_second = total_observed as f32 / duration.as_secs_f32();
+
+        eprintln!(
+            "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+            duration.as_secs_f32(),
+        );
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "model should generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_tool_call, 0,
+            "raw prompt without tool-call markers must not produce ToolCall tokens; \
+             outcome={outcome:?}"
+        );
+        assert!(
+            total_observed > 0,
+            "model must produce at least one classified token; outcome={outcome:?}"
+        );
+
+        // The final usage counters must agree with what the sample loop observed.
+        let usage = classifier.into_usage();
+        assert_eq!(
+            usage.prompt_tokens, prompt_token_count,
+            "prompt_tokens must equal the tokenizer's prompt length"
+        );
+        assert_eq!(
+            usage.content_tokens, outcome.observed_content,
+            "content_tokens must equal observed Content variants"
+        );
+        assert_eq!(
+            usage.reasoning_tokens, outcome.observed_reasoning,
+            "reasoning_tokens must equal observed Reasoning variants"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, outcome.observed_undeterminable,
+            "undeterminable_tokens must equal observed Undeterminable variants"
+        );
+        assert_eq!(
+            usage.tool_call_tokens, outcome.observed_tool_call,
+            "tool_call_tokens must equal observed ToolCall variants"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            total_observed,
+            "completion_tokens must equal Content + Reasoning + Undeterminable"
+        );
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Chat-template completion without tools; usage must mirror what the sample loop saw.
+        let model = fixture.model;
+        let mut llama_context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let template = model.chat_template(None)?;
+        let hello = LlamaChatMessage::new("user".to_owned(), "Hello! How are you?".to_owned())?;
+        let messages = vec![hello];
+        let prompt = model.apply_chat_template(&template, &messages, true)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(&prompt, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        // Until the prompt is committed it only counts as pending, not as usage.
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+
+        llama_context.decode(&mut batch)?;
+
+        let committed = classifier.commit_prompt_tokens();
+        assert_eq!(committed, prompt_token_count);
+
+        let mut greedy = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut greedy,
+            context: &mut llama_context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        }
+        .run()?;
+
+        println!();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "model should generate at least one token"
+        );
+        let total_observed =
+            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+        assert!(
+            total_observed > 0,
+            "model must produce at least one classified token; outcome={outcome:?}"
+        );
+        assert_eq!(
+            outcome.observed_tool_call, 0,
+            "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
+        );
+
+        // The final usage counters must agree with what the sample loop observed.
+        let usage = classifier.into_usage();
+
+        assert_eq!(
+            usage.prompt_tokens, prompt_token_count,
+            "prompt_tokens must equal the tokenizer's prompt length"
+        );
+        assert_eq!(
+            usage.content_tokens, outcome.observed_content,
+            "content_tokens must equal observed Content variants"
+        );
+        assert_eq!(
+            usage.reasoning_tokens, outcome.observed_reasoning,
+            "reasoning_tokens must equal observed Reasoning variants"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, outcome.observed_undeterminable,
+            "undeterminable_tokens must equal observed Undeterminable variants"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            total_observed,
+            "completion_tokens must equal Content + Reasoning + Undeterminable"
+        );
+        assert_eq!(
+            usage.tool_call_tokens, outcome.observed_tool_call,
+            "tool_call_tokens must equal observed ToolCall variants"
+        );
+
+        Ok(())
+    }
+}
+
+mod constrained_decoding {
+    use std::io::Write;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampled_token::SampledToken;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Output sampled under a JSON-schema grammar must parse and expose both schema keys.
+        let model = fixture.model;
+        let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
+
+        let mut llama_context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        // Queue every prompt token at its own position, flagging only the final one.
+        let prompt_tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        let last_index = i32::try_from(prompt_tokens.len())? - 1;
+
+        for (position, token) in (0_i32..).zip(&prompt_tokens) {
+            let is_last = position == last_index;
+            batch.add(&SampledToken::Content(*token), position, &[0], is_last)?;
+        }
+
+        llama_context.decode(&mut batch)?;
+
+        let schema = r#"{
+      "type": "object",
+      "properties": {
+        "city": { "type": "string" },
+        "temperature": { "type": "number" }
+      },
+      "required": ["city", "temperature"]
+    }"#;
+
+        let grammar = LlamaSampler::llguidance(model, "json", schema)?;
+        let mut sampler = LlamaSampler::chain_simple([grammar, LlamaSampler::greedy()]);
+
+        let mut next_position = batch.n_tokens();
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
+        let mut generated = String::new();
+
+        // Generate until an end-of-generation token or until the position counter passes 128.
+        while next_position <= 128 {
+            let sampled = sampler.sample(&llama_context, batch.n_tokens() - 1)?;
+            let token = SampledToken::Content(sampled);
+            if model.is_eog_token(&token) {
+                break;
+            }
+
+            let piece = model.token_to_piece(&token, &mut decoder, true, None)?;
+            generated.push_str(&piece);
+            print!("{piece}");
+            std::io::stdout().flush()?;
+
+            // Feed the sampled token back as a single-token batch for the next step.
+            batch.clear();
+            batch.add(&token, next_position, &[0], true)?;
+            next_position += 1;
+            llama_context.decode(&mut batch)?;
+        }
+
+        println!();
+
+        // Only the first JSON value in the generated text is inspected.
+        let mut values =
+            serde_json::Deserializer::from_str(&generated).into_iter::<serde_json::Value>();
+        let Some(first_value) = values.next() else {
+            anyhow::bail!("model produced no JSON value");
+        };
+        let parsed = first_value?;
+
+        assert!(parsed.get("city").is_some());
+        assert!(parsed.get("temperature").is_some());
+
+        Ok(())
+    }
+}
+
+mod llguidance {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use std::ffi::CStr;
+    use std::sync::Arc;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings::token::LlamaToken;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+    // Grammar sources fed to `create_llg_sampler` by the trials in this module.
+    const JSON_SCHEMA: &str =
+        r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
+    const REGEX_GRAMMAR: &str = r"yes|no";
+    const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // A well-formed JSON schema must produce a sampler whose raw pointer is non-null.
+        let model = fixture.model;
+        let sampler = create_llg_sampler(model, "json", JSON_SCHEMA)?;
+        assert!(!sampler.sampler.is_null());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // A well-formed regex grammar must produce a sampler whose raw pointer is non-null.
+        let model = fixture.model;
+        let sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+        assert!(!sampler.sampler.is_null());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // A well-formed Lark grammar must produce a sampler whose raw pointer is non-null.
+        let model = fixture.model;
+        let sampler = create_llg_sampler(model, "lark", LARK_GRAMMAR)?;
+        assert!(!sampler.sampler.is_null());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let result = create_llg_sampler(model, "not_a_real_kind", "anything");
+        assert!(result.is_err()); // an unrecognised grammar kind must be rejected
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let result = create_llg_sampler(model, "json", "{this is not valid json");
+        assert!(result.is_err()); // a schema that is not valid JSON must be rejected
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let result = create_llg_sampler(model, "regex", "[invalid");
+        assert!(result.is_err()); // an unterminated character class must be rejected
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // The FFI name query must report "llguidance" for a sampler from `create_llg_sampler`.
+        let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+        // SAFETY: the pointer comes from `sampler`, which stays alive for the whole function.
+        let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) };
+        assert!(!name_ptr.is_null());
+        // SAFETY: non-null was just asserted; NOTE(review): assumes NUL termination - confirm.
+        let reported = unsafe { CStr::from_ptr(name_ptr) }.to_str()?;
+        assert_eq!(reported, "llguidance");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Cloning through the raw FFI must return a non-null sampler, which is freed here.
+        let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+        let original = sampler.sampler;
+        // SAFETY: `original` comes from `sampler`, which stays alive for the whole function.
+        let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(original) };
+        assert!(!cloned.is_null());
+        // SAFETY: `cloned` was just asserted non-null and is never used after this free.
+        unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) };
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Sampling one token through a regex-constrained chain, then accepting it, must succeed.
+        let model = fixture.model;
+        let mut llama_context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt_tokens = model.str_to_token("Answer yes or no:", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        llama_context.decode(&mut batch)?;
+
+        // The regex grammar sampler sits first in the chain, followed by the greedy sampler.
+        let grammar = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+        let mut sampler_chain = LlamaSampler::chain_simple([grammar, LlamaSampler::greedy()]);
+        let last_position = batch.n_tokens() - 1;
+        let sampled = sampler_chain.sample(&llama_context, last_position)?;
+        sampler_chain.accept(sampled)?;
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Accepting a token id just below `i32::MAX` may fail, but it must not panic.
+        let mut regex_sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+        let huge_token_id = LlamaToken(i32::MAX - 1);
+        // The result is discarded on purpose: only the absence of a panic is under test.
+        let _ = regex_sampler.accept(huge_token_id);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Two lookups on the same model must return pointer-equal `Arc`s.
+        let model = fixture.model;
+        let first = model.approximate_tok_env();
+        let second = model.approximate_tok_env();
+        assert!(Arc::ptr_eq(&first, &second));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn approximate_tok_env_drives_consistent_grammar_constraint(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Building two samplers from the same model and grammar must succeed both times.
+        let model = fixture.model;
+        let first = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+        let second = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+        assert!(!first.sampler.is_null());
+        assert!(!second.sampler.is_null());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Sampling through a chain that starts with the regex grammar sampler must not panic.
+        let model = fixture.model;
+        let mut llama_context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt_tokens = model.str_to_token("Answer:", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&prompt_tokens, 0, false)?;
+        llama_context.decode(&mut batch)?;
+
+        let grammar = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+        let mut sampler_chain = LlamaSampler::chain_simple([grammar, LlamaSampler::greedy()]);
+        // The outcome is ignored on purpose: this trial only guards against a panic.
+        let _ = sampler_chain.sample(&llama_context, batch.n_tokens() - 1);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+        // Feed an implausibly large token id first; its outcome is deliberately ignored.
+        let _ = sampler.accept(LlamaToken(i32::MAX - 1));
+        sampler.reset();
+
+        // Smoke check only: calling `accept` after `reset` must not panic.
+        // Both `Ok` and `Err` are acceptable, so the result is dropped instead of asserted on
+        // (the former `is_ok() || is_err()` assertion could never fail).
+        let _ = sampler.accept(LlamaToken(0));
+        Ok(())
+    }
+}
+
+mod sampled_token_classifier_markers {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::SampledToken;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
+    use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
+    use llama_cpp_bindings::streaming_markers::StreamingMarkers;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn classifier_starts_in_pending_section_for_default_fixture(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Query the section of a classifier taken straight from the model, before any ingest.
+        let initial_section = fixture.model.sampled_token_classifier().current_section();
+        assert_eq!(initial_section, SampledTokenSection::Pending);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn classifier_construction_is_idempotent_across_calls(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let earlier = fixture.model.sampled_token_classifier();
+        let later = fixture.model.sampled_token_classifier();
+        // Classifiers built back to back must agree on both their section and their usage.
+        assert_eq!(earlier.current_section(), later.current_section());
+        assert_eq!(earlier.usage(), later.usage());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let no_markers = StreamingMarkers::default();
+        let mut classifier = SampledTokenClassifier::new(model, no_markers);
+
+        // Ingesting a single token must produce exactly one outcome.
+        let emitted = classifier.ingest(model.token_bos());
+        assert_eq!(emitted.len(), 1);
+
+        let only = &emitted[0];
+        let is_undeterminable = matches!(only.sampled_token, SampledToken::Undeterminable(_));
+        assert!(is_undeterminable);
+        assert_eq!(only.visible_piece, only.raw_piece);
+        assert_eq!(classifier.usage().undeterminable_tokens, 1);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn ingest_with_no_markers_decodes_each_token_independently(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        // The per-token outcomes are discarded; only the usage counter is inspected afterwards.
+        for token in [model.token_bos(), model.token_eos()] {
+            let _ = classifier.ingest(token);
+        }
+        assert_eq!(classifier.usage().undeterminable_tokens, 2);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        let baseline_usage = *classifier.usage();
+        let trailing_tokens = [model.token_eos(), model.token_nl()];
+        // Exercise both the single-token and the slice entry points.
+        classifier.ingest_prompt_token(model.token_bos());
+        classifier.ingest_prompt_tokens(&trailing_tokens);
+        assert_eq!(*classifier.usage(), baseline_usage);
+        assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn feed_prompt_to_batch_increments_pending_prompt_tokens(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        let mut batch = LlamaBatch::new(8, 1)?;
+
+        // Stage two prompt tokens, one call per token.
+        for (nth, token) in [(0, model.token_bos()), (1, model.token_eos())] {
+            classifier.feed_prompt_to_batch(&mut batch, token, nth, &[0], false)?;
+        }
+        assert_eq!(classifier.pending_prompt_tokens(), 2);
+        assert_eq!(batch.n_tokens(), 2);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let prompt = vec![model.token_bos(), model.token_eos(), model.token_nl()];
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        let mut batch = LlamaBatch::new(8, 1)?;
+
+        // One call stages the whole three-token prompt.
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt, 0, false)?;
+        assert_eq!(classifier.pending_prompt_tokens(), 3);
+        assert_eq!(batch.n_tokens(), 3);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        let mut batch = LlamaBatch::new(8, 1)?;
+
+        // Stage two prompt tokens so there is a pending count to commit.
+        for (nth, token) in [(0, model.token_bos()), (1, model.token_eos())] {
+            classifier.feed_prompt_to_batch(&mut batch, token, nth, &[0], false)?;
+        }
+
+        // Committing must report the staged count, zero the pending counter, and record usage.
+        assert_eq!(classifier.commit_prompt_tokens(), 2);
+        assert_eq!(classifier.pending_prompt_tokens(), 0);
+        assert_eq!(classifier.usage().prompt_tokens, 2);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+        let mut batch = LlamaBatch::new(8, 1)?;
+
+        // Stage exactly one prompt token, then throw the pending count away.
+        classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
+        let dropped = classifier.discard_pending_prompt_tokens();
+        assert_eq!(dropped, 1);
+
+        // Discarding must empty the pending counter without crediting any prompt usage.
+        assert_eq!(classifier.pending_prompt_tokens(), 0);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Only the successful call and the two-element shape of its result are checked.
+        let rendered_pair = fixture.model.diagnose_tool_call_synthetic_renders()?;
+        let (_first_render, _second_render) = rendered_pair;
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/text_generation.rs b/llama-cpp-bindings-tests/tests/text_generation.rs
deleted file mode 100644
index 57fd54d7..00000000
--- a/llama-cpp-bindings-tests/tests/text_generation.rs
+++ /dev/null
@@ -1,298 +0,0 @@
-use std::io::Write;
-use std::time::Duration;
-
-use anyhow::Context as _;
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 512,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mut ctx = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )
-    .with_context(|| "unable to create context")?;
-
-    let prompt = "Hello my name is";
-    let max_generated_tokens: i32 = 64;
-
-    let mut classifier = model.sampled_token_classifier();
-    let tokens_list = model
-        .str_to_token(prompt, AddBos::Always)
-        .with_context(|| format!("failed to tokenize {prompt}"))?;
-    let prompt_token_count = u64::try_from(tokens_list.len())?;
-
-    let mut decoder = encoding_rs::UTF_8.new_decoder();
-
-    for token in &tokens_list {
-        eprint!(
-            "{}",
-            model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?
-        );
-    }
-    std::io::stderr().flush()?;
-
-    let mut batch = LlamaBatch::new(512, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-    assert_eq!(classifier.usage().prompt_tokens, 0);
-
-    ctx.decode(&mut batch)
-        .with_context(|| "llama_decode() failed")?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-    assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
-
-    let mut sampler =
-        LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
-    let initial_position = batch.n_tokens();
-    let t_main_start = ggml_time_us();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut ctx,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens,
-    }
-    .run()?;
-    let t_main_end = ggml_time_us();
-    let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-    let total_observed =
-        outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
-
-    #[expect(
-        clippy::cast_precision_loss,
-        reason = "logged throughput tolerates f32 precision"
-    )]
-    let tokens_per_second = total_observed as f32 / duration.as_secs_f32();
-
-    eprintln!(
-        "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
-        duration.as_secs_f32(),
-    );
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "model should generate at least one token"
-    );
-    assert_eq!(
-        outcome.observed_tool_call, 0,
-        "raw prompt without tool-call markers must not produce ToolCall tokens; \
-         outcome={outcome:?}"
-    );
-    assert!(
-        total_observed > 0,
-        "model must produce at least one classified token; outcome={outcome:?}"
-    );
-
-    let usage = classifier.into_usage();
-    assert_eq!(
-        usage.prompt_tokens, prompt_token_count,
-        "prompt_tokens must equal the tokenizer's prompt length"
-    );
-    assert_eq!(
-        usage.content_tokens, outcome.observed_content,
-        "content_tokens must equal observed Content variants"
-    );
-    assert_eq!(
-        usage.reasoning_tokens, outcome.observed_reasoning,
-        "reasoning_tokens must equal observed Reasoning variants"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, outcome.observed_undeterminable,
-        "undeterminable_tokens must equal observed Undeterminable variants"
-    );
-    assert_eq!(
-        usage.tool_call_tokens, outcome.observed_tool_call,
-        "tool_call_tokens must equal observed ToolCall variants"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        total_observed,
-        "completion_tokens must equal Content + Reasoning + Undeterminable"
-    );
-
-    Ok(())
-}
-
-#[llama_test(
-    model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-#[llama_test(
-    model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
-    n_gpu_layers = 999,
-    use_mmap = true,
-    use_mlock = false,
-    n_ctx = 2048,
-    n_batch = 512,
-    n_ubatch = 128,
-)]
-fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> {
-    let model = fixture.model;
-    let backend = fixture.backend;
-    let mut context = LlamaContext::from_model(
-        model,
-        backend,
-        (*fixture.context_params).into_llama_context_params(),
-    )?;
-
-    let chat_template = model.chat_template(None)?;
-    let messages = vec![LlamaChatMessage::new(
-        "user".to_string(),
-        "Hello! How are you?".to_string(),
-    )?];
-    let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
-    let mut classifier = model.sampled_token_classifier();
-    let tokens = model.str_to_token(&prompt, AddBos::Always)?;
-    let prompt_token_count = u64::try_from(tokens.len())?;
-
-    let mut batch = LlamaBatch::new(512, 1)?;
-    classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
-    assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
-    assert_eq!(classifier.usage().prompt_tokens, 0);
-
-    context.decode(&mut batch)?;
-
-    let promoted = classifier.commit_prompt_tokens();
-    assert_eq!(promoted, prompt_token_count);
-
-    let mut sampler = LlamaSampler::greedy();
-    let initial_position = batch.n_tokens();
-    let outcome = ClassifySampleLoop {
-        model,
-        classifier: &mut classifier,
-        sampler: &mut sampler,
-        context: &mut context,
-        batch: &mut batch,
-        initial_position,
-        max_generated_tokens: 1024,
-    }
-    .run()?;
-
-    println!();
-
-    assert!(
-        !outcome.generated_raw.is_empty(),
-        "model should generate at least one token"
-    );
-    let total_observed =
-        outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
-    assert!(
-        total_observed > 0,
-        "model must produce at least one classified token; outcome={outcome:?}"
-    );
-    assert_eq!(
-        outcome.observed_tool_call, 0,
-        "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
-    );
-
-    let usage = classifier.into_usage();
-
-    assert_eq!(
-        usage.prompt_tokens, prompt_token_count,
-        "prompt_tokens must equal the tokenizer's prompt length"
-    );
-    assert_eq!(
-        usage.content_tokens, outcome.observed_content,
-        "content_tokens must equal observed Content variants"
-    );
-    assert_eq!(
-        usage.reasoning_tokens, outcome.observed_reasoning,
-        "reasoning_tokens must equal observed Reasoning variants"
-    );
-    assert_eq!(
-        usage.undeterminable_tokens, outcome.observed_undeterminable,
-        "undeterminable_tokens must equal observed Undeterminable variants"
-    );
-    assert_eq!(
-        usage.completion_tokens(),
-        total_observed,
-        "completion_tokens must equal Content + Reasoning + Undeterminable"
-    );
-    assert_eq!(
-        usage.tool_call_tokens, outcome.observed_tool_call,
-        "tool_call_tokens must equal observed ToolCall variants"
-    );
-
-    Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
new file mode 100644
index 00000000..7b26c7ee
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
@@ -0,0 +1,1978 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_properties {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Sanity check: every headline dimension of the loaded model must be strictly positive.
+        let loaded = fixture.model;
+        assert!(loaded.n_vocab() > 0);
+        assert!(loaded.n_embd() > 0);
+        assert!(loaded.n_params() > 0);
+        let trained_context = loaded.n_ctx_train()?; // the only fallible getter in this group
+        assert!(trained_context > 0);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(fixture.model.n_layer()? > 0); // `?` turns a failed n_layer() lookup into a test error
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(fixture.model.n_head()? > 0); // `?` turns a failed n_head() lookup into a test error
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(fixture.model.n_head_kv()? > 0); // `?` turns a failed n_head_kv() lookup into a test error
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(fixture.model.size() > 0); // size() is read without `?`; its unit is not visible from this file
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(!fixture.model.is_recurrent()); // NOTE(review): also runs on the Qwen GGUFs asserted hybrid elsewhere; presumably hybrid != recurrent — confirm
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_hybrid_returns_false_for_non_hybrid_default_models(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Attached to the DeepSeek and GLM fixtures only; neither may report itself as hybrid.
+        let reported_hybrid = fixture.model.is_hybrid();
+        let complaint = "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true";
+        assert!(!reported_hybrid, "{complaint}");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Attached to the two Qwen fixtures only; both are expected to report themselves as hybrid.
+        let reported_hybrid = fixture.model.is_hybrid();
+        let complaint = "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false";
+        assert!(reported_hybrid, "{complaint}");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::model::rope_type::RopeType;
+        // Attached to the DeepSeek and GLM fixtures only: rope_type() must be `Some` and must
+        // carry one of the four variants listed below; `None` fails the trial.
+        let rope = fixture.model.rope_type();
+        let known = matches!(
+            rope,
+            Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
+        );
+        assert!(known, "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let rope = fixture.model.rope_type(); // an Option; this trial expects None for both Qwen fixtures
+        assert!(
+            rope.is_none(), // NOTE(review): None might also mean a rope kind the binding cannot map — confirm
+            "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
+        );
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        use llama_cpp_bindings::model::vocab_type::VocabType;
+
+        // A failed vocab_type() lookup is propagated; any variant other than BPE or SPM fails.
+        let vocab = fixture.model.vocab_type()?;
+        let known = matches!(vocab, VocabType::BPE | VocabType::SPM);
+        assert!(known, "vocab_type must be a known variant; got {vocab:?}");
+        Ok(())
+    }
+}
+
+mod model_metadata_kv {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+        assert!(fixture.model.meta_count() > 0); // at least one entry; the index-0 lookups in sibling trials presumably rely on this
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let first_key = fixture.model.meta_key_by_index(0)?; // index 0: the first metadata entry
+        assert!(!first_key.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let first_value = fixture.model.meta_val_str_by_index(0)?; // string value of the first metadata entry
+        assert!(!first_value.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let lookup = fixture.model.meta_key_by_index(999_999); // kept as a Result: the Err is the expectation
+        assert!(lookup.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let lookup = fixture.model.meta_val_str_by_index(999_999); // kept as a Result: the Err is the expectation
+        assert!(lookup.is_err());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Round trip: fetch the key stored at index 0, then look its value up by that key.
+        let known_key = fixture.model.meta_key_by_index(0)?;
+        let looked_up = fixture.model.meta_val_str(&known_key)?;
+        assert!(!looked_up.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_val_str_with_long_value_triggers_buffer_resize(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Reads every metadata key and string value; a failed read panics naming the index and
+        // the returned error. NOTE(review): nothing here checks that a long value is present.
+        let model = fixture.model;
+        for index in 0..model.meta_count() {
+            let key = model.meta_key_by_index(index);
+            let value = model.meta_val_str_by_index(index);
+            assert!(key.is_ok(), "meta_key_by_index({index}) failed: {key:?}");
+            assert!(value.is_ok(), "meta_val_str_by_index({index}) failed: {value:?}");
+        }
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let lookup = fixture.model.meta_val_str("key\0with_null"); // key carries an interior NUL byte
+        assert!(lookup.is_err());
+        Ok(())
+    }
+}
+
+mod model_params {
+    #![expect(
+        clippy::similar_names,
+        reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
+    )]
+
+    use std::ffi::CString;
+    use std::pin::pin;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::params::LlamaContextParams;
+    use llama_cpp_bindings::max_devices;
+    use llama_cpp_bindings::model::params::LlamaModelParams;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model_path_str = fixture
+            .model_path
+            .to_str()
+            .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?;
+        let model_path_cstr = CString::new(model_path_str)?; // errors if the path holds an interior NUL
+        // Fresh default params; the model params are pinned and reached through `as_mut()` below.
+        let mut params = pin!(LlamaModelParams::default());
+        let mut context_params = LlamaContextParams::default();
+        let mut margins = vec![0usize; max_devices()]; // max_devices() zeroed entries
+        // NOTE(review): 512 is presumably a minimum context size and margins per-device headroom — confirm.
+        let result = params.as_mut().fit_params(
+            &model_path_cstr,
+            &mut context_params,
+            &mut margins,
+            512,
+            llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
+        );
+        // The error is rendered with `{:?}` into an anyhow error instead of a bare `?`.
+        let fit =
+            result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
+        assert!(fit.n_ctx > 0); // the fitted context size must be non-zero
+
+        Ok(())
+    }
+}
+
+mod model_special_tokens {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::SampledToken;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // BOS and EOS must be distinct tokens, and EOS wrapped as Content must pass is_eog_token.
+        let begin_token = fixture.model.token_bos();
+        let end_token = fixture.model.token_eos();
+        assert_ne!(begin_token, end_token);
+        let wrapped_end = SampledToken::Content(end_token);
+        assert!(fixture.model.is_eog_token(&wrapped_end));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let newline = fixture.model.token_nl();
+        assert!(newline.0 >= 0); // field 0 is presumably the raw token id; a negative one fails the trial
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // The EOS token must still pass is_eog_token when wrapped in the Reasoning variant.
+        let wrapped_eos = SampledToken::Reasoning(fixture.model.token_eos());
+        assert!(fixture.model.is_eog_token(&wrapped_eos));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // EOS wrapped in the ToolCall variant must still satisfy is_eog_token.
+        let wrapped_eos = SampledToken::ToolCall(fixture.model.token_eos());
+        assert!(fixture.model.is_eog_token(&wrapped_eos));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // EOS wrapped in the Undeterminable variant must still satisfy is_eog_token.
+        let wrapped_eos = SampledToken::Undeterminable(fixture.model.token_eos());
+        assert!(fixture.model.is_eog_token(&wrapped_eos));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Check the id's range first, then call again to confirm the result is stable.
+        let token = fixture.model.decode_start_token();
+        let n_vocab = fixture.model.n_vocab();
+        assert!(
+            (0..n_vocab).contains(&token.0) || token.0 == -1,
+            "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
+        );
+        assert_eq!(
+            token,
+            fixture.model.decode_start_token(),
+            "decode_start_token must be deterministic across calls"
+        );
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Check the id's range first, then call again to confirm the result is stable.
+        let token = fixture.model.token_sep();
+        let n_vocab = fixture.model.n_vocab();
+        assert!(
+            (0..n_vocab).contains(&token.0) || token.0 == -1,
+            "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
+        );
+        assert_eq!(
+            token,
+            fixture.model.token_sep(),
+            "token_sep must be deterministic across calls"
+        );
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Look up the attributes of the token returned by token_bos().
+        let bos = fixture.model.token_bos();
+        let bos_attrs = fixture.model.token_attr(bos)?;
+        // Render the dereferenced attributes through Debug and require some output.
+        let rendered = format!("{:?}", *bos_attrs);
+        let has_output = !rendered.is_empty();
+        assert!(has_output, "token_attr(bos) must produce Debug output");
+
+        Ok(())
+    }
+}
+
+mod model_str_to_token {
+    use anyhow::Result;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        // Tokenize with AddBos::Never and require at least one token.
+        let tokens = model.str_to_token("hello world", AddBos::Never)?;
+        assert!(!tokens.is_empty());
+
+        // Decode only the leading token, wrapped as Content, back into a piece.
+        let leading = llama_cpp_bindings::SampledToken::Content(tokens[0]);
+        let mut utf8_decoder = encoding_rs::UTF_8.new_decoder();
+        let piece = model.token_to_piece(&leading, &mut utf8_decoder, false, None)?;
+
+        // The decoded piece must not be empty.
+        assert!(!piece.is_empty());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn str_to_token_grows_buffer_when_initial_estimation_too_small(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Twelve single letters separated by spaces, tokenized with AddBos::Always.
+        let spaced_letters = "a b c d e f g h i j k l";
+        let model = fixture.model;
+        let tokens = model.str_to_token(spaced_letters, AddBos::Always)?;
+
+        // More than 8 tokens must come back for this input.
+        assert!(
+            tokens.len() > 8,
+            "expected regrow; got {} tokens",
+            tokens.len()
+        );
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Tokenize the same text under both BOS policies and keep only the lengths.
+        let with_bos = fixture.model.str_to_token("hello", AddBos::Always)?.len();
+        let without_bos = fixture.model.str_to_token("hello", AddBos::Never)?.len();
+
+        // AddBos::Never must not yield more tokens than AddBos::Always.
+        assert!(without_bos <= with_bos);
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn str_to_token_with_many_tokens_triggers_buffer_resize(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use std::fmt::Write;
+
+        // Append "0 ", "1 ", ... "1999 " to one buffer.
+        let mut many_numbers = String::new();
+        for number in 0..2000 {
+            let _ = write!(many_numbers, "{number} ");
+        }
+
+        let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?;
+        let half_input_len = many_numbers.len() / 2;
+        assert!(tokens.len() > half_input_len);
+        Ok(())
+    }
+}
+
+mod model_token_to_piece {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use std::num::NonZeroU16;
+
+    use anyhow::Result;
+    use llama_cpp_bindings::SampledToken;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_bytes_returns_bytes_for_known_token(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        // Take the first token of "hello" and request its bytes with a size argument of 32.
+        let first_token = model.str_to_token("hello", AddBos::Never)?[0];
+        let piece_bytes = model.token_to_piece_bytes(first_token, 32, false, None)?;
+        assert!(!piece_bytes.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_handles_large_token_requiring_buffer_resize(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
+        // Decode the first 200 entries of tokens(true); a failure names the token and error.
+        for (token, _) in model.tokens(true).take(200) {
+            let result =
+                model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
+            assert!(result.is_ok(), "token {token}: {result:?}");
+        }
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_bytes_insufficient_buffer_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let tokens = model.str_to_token("hello", AddBos::Never)?;
+
+        // Size argument 1 must be rejected; unwrap_err panics if the call succeeds.
+        let error_text = model
+            .token_to_piece_bytes(tokens[0], 1, false, None)
+            .unwrap_err()
+            .to_string();
+
+        assert!(error_text.contains("Insufficient Buffer Space"));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let mut decoder = encoding_rs::UTF_8.new_decoder();
+        let tokens = model.str_to_token("hello", AddBos::Never)?;
+        let result = model.token_to_piece(
+            &SampledToken::Content(tokens[0]),
+            &mut decoder,
+            false,
+            NonZeroU16::new(1),
+        );
+        // Only success is required here; the panic message carries the error if it fails.
+        assert!(result.is_ok(), "lstrip = 1 decode failed: {result:?}");
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Exercises token_to_piece with a SampledToken::Reasoning input.
+        let model = fixture.model;
+        let mut utf8_decoder = encoding_rs::UTF_8.new_decoder();
+
+        // Wrap the first token of "hi" in the Reasoning variant before decoding it.
+        let tokens = model.str_to_token("hi", AddBos::Never)?;
+        let reasoning_token = SampledToken::Reasoning(tokens[0]);
+        let piece = model.token_to_piece(&reasoning_token, &mut utf8_decoder, true, None)?;
+
+        // Decoding through this variant must still produce a non-empty piece.
+        assert!(!piece.is_empty());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Exercises token_to_piece with a SampledToken::ToolCall input.
+        let model = fixture.model;
+        let mut utf8_decoder = encoding_rs::UTF_8.new_decoder();
+        let tokens = model.str_to_token("hi", AddBos::Never)?;
+        let tool_call_token = SampledToken::ToolCall(tokens[0]);
+        let piece = model.token_to_piece(&tool_call_token, &mut utf8_decoder, true, None)?;
+
+        assert!(!piece.is_empty());
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Exercises token_to_piece with a SampledToken::Undeterminable input.
+        let model = fixture.model;
+        let mut utf8_decoder = encoding_rs::UTF_8.new_decoder();
+
+        // Wrap the first token of "hi" in the Undeterminable variant before decoding it.
+        let tokens = model.str_to_token("hi", AddBos::Never)?;
+        let undeterminable_token = SampledToken::Undeterminable(tokens[0]);
+        let piece = model.token_to_piece(&undeterminable_token, &mut utf8_decoder, true, None)?;
+
+        // Decoding through this variant must still produce a non-empty piece.
+        assert!(!piece.is_empty());
+
+        Ok(())
+    }
+}
+
+mod model_tokens_iterator {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        // Walk at most the first 100 entries of tokens(false); the pieces are ignored.
+        let mut visited = 0;
+
+        for (index, (token, _piece_result)) in model.tokens(false).take(100).enumerate() {
+            // Each yielded token id must be non-negative.
+            assert!(token.0 >= 0);
+            visited = index + 1;
+        }
+
+        // Fewer than 100 visits would mean the iterator ran out early.
+        assert_eq!(visited, 100);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let reported_vocab = fixture.model.n_vocab();
+        let yielded = fixture.model.tokens(false).count();
+
+        // n_vocab() is converted to usize so it can be compared with the iterator length.
+        assert_eq!(yielded, usize::try_from(reported_vocab)?);
+        Ok(())
+    }
+}
+
+mod model_helpers {
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Render the model through Debug, then look for the two expected substrings.
+        let rendered = format!("{:?}", fixture.model);
+
+        assert!(rendered.contains("LlamaModel"));
+        assert!(rendered.contains("model"));
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let initial_env = model.approximate_tok_env();
+        let repeated_env = model.approximate_tok_env();
+        // Both calls must hand back pointers to the same Arc allocation.
+        assert!(std::sync::Arc::ptr_eq(&initial_env, &repeated_env));
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-test-harness/Cargo.toml b/llama-cpp-test-harness/Cargo.toml
index 041ea779..477362da 100644
--- a/llama-cpp-test-harness/Cargo.toml
+++ b/llama-cpp-test-harness/Cargo.toml
@@ -13,6 +13,7 @@ inventory = { workspace = true }
 libtest-mimic = { workspace = true }
 llama-cpp-bindings = { workspace = true }
 llama-cpp-test-harness-macros = { workspace = true }
+thiserror = { workspace = true }
 
 [features]
 cuda = ["llama-cpp-bindings/cuda"]
diff --git a/llama-cpp-test-harness/src/deterministic_arguments.rs b/llama-cpp-test-harness/src/deterministic_arguments.rs
deleted file mode 100644
index 353053dd..00000000
--- a/llama-cpp-test-harness/src/deterministic_arguments.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use libtest_mimic::Arguments;
-
-const fn build_deterministic_arguments(mut arguments: Arguments) -> Arguments {
-    arguments.test_threads = Some(1);
-    arguments
-}
-
-#[must_use]
-pub fn deterministic_arguments_from_cli() -> Arguments {
-    build_deterministic_arguments(Arguments::from_args())
-}
-
-#[cfg(test)]
-mod tests {
-    use libtest_mimic::Arguments;
-
-    use super::build_deterministic_arguments;
-
-    #[test]
-    fn build_deterministic_arguments_forces_test_threads_to_one() {
-        let input = Arguments {
-            test_threads: Some(8),
-            ..Arguments::default()
-        };
-        let output = build_deterministic_arguments(input);
-
-        assert_eq!(output.test_threads, Some(1));
-    }
-
-    #[test]
-    fn build_deterministic_arguments_overrides_unset_test_threads() {
-        let input = Arguments::default();
-        let output = build_deterministic_arguments(input);
-
-        assert_eq!(output.test_threads, Some(1));
-    }
-
-    #[test]
-    fn build_deterministic_arguments_preserves_other_settings() {
-        let input = Arguments {
-            list: true,
-            filter: Some("foo".to_owned()),
-            ..Arguments::default()
-        };
-        let output = build_deterministic_arguments(input);
-
-        assert!(output.list);
-        assert_eq!(output.filter.as_deref(), Some("foo"));
-    }
-}
diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs
index 52f6dd4c..927c87a8 100644
--- a/llama-cpp-test-harness/src/execution_plan.rs
+++ b/llama-cpp-test-harness/src/execution_plan.rs
@@ -16,10 +16,10 @@
 use std::collections::BTreeMap;
 use std::sync::Arc;
 
+use libtest_mimic::Arguments;
 use libtest_mimic::Conclusion;
 use llama_cpp_bindings::llama_backend::LlamaBackend;
 
-use crate::deterministic_arguments::deterministic_arguments_from_cli;
 use crate::execution_phase::ExecutionPhase;
 use crate::llama_test_registration::LlamaTestRegistration;
 
@@ -65,13 +65,12 @@ impl ExecutionPlan {
     }
 
     #[must_use]
-    pub fn run(&self, backend: &Arc<LlamaBackend>) -> Vec<Conclusion> {
-        let arguments = deterministic_arguments_from_cli();
+    pub fn run(&self, backend: &Arc<LlamaBackend>, arguments: &Arguments) -> Vec<Conclusion> {
         let total = self.phases.len();
         let mut conclusions = Vec::with_capacity(total);
         for (index, phase) in self.phases.iter().enumerate() {
             phase.print_header(index, total);
-            conclusions.push(phase.run(backend, &arguments));
+            conclusions.push(phase.run(backend, arguments));
         }
         conclusions
     }
diff --git a/llama-cpp-test-harness/src/harness_arguments_error.rs b/llama-cpp-test-harness/src/harness_arguments_error.rs
new file mode 100644
index 00000000..53db2279
--- /dev/null
+++ b/llama-cpp-test-harness/src/harness_arguments_error.rs
@@ -0,0 +1,9 @@
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum HarnessArgumentsError {
+    #[error(
+        "the test harness requires --test-threads=1 (or unset); got --test-threads={requested}"
+    )]
+    ConflictingTestThreads { requested: usize },
+}
diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs
index fb0c1230..8f112b9f 100644
--- a/llama-cpp-test-harness/src/lib.rs
+++ b/llama-cpp-test-harness/src/lib.rs
@@ -8,10 +8,10 @@
 //! See the workspace README and `tests/` directory for usage examples.
 
 pub mod context_params;
-pub mod deterministic_arguments;
 pub mod download_model;
 pub mod execution_phase;
 pub mod execution_plan;
+pub mod harness_arguments_error;
 pub mod llama_fixture;
 pub mod llama_test_fn;
 pub mod llama_test_registration;
@@ -21,6 +21,7 @@ pub mod mmproj_source;
 pub mod model_load_params;
 pub mod model_source;
 pub mod no_op;
+pub mod parse_harness_arguments;
 pub mod phase_state;
 pub mod run;
 pub mod run_to_conclusions;
diff --git a/llama-cpp-test-harness/src/parse_harness_arguments.rs b/llama-cpp-test-harness/src/parse_harness_arguments.rs
new file mode 100644
index 00000000..b4b3ce72
--- /dev/null
+++ b/llama-cpp-test-harness/src/parse_harness_arguments.rs
@@ -0,0 +1,82 @@
+use libtest_mimic::Arguments;
+
+use crate::harness_arguments_error::HarnessArgumentsError;
+
+fn validate(mut arguments: Arguments) -> Result<Arguments, HarnessArgumentsError> {
+    // Only an explicit thread count other than one conflicts with the harness.
+    if let Some(requested) = arguments.test_threads.filter(|&threads| threads != 1) {
+        return Err(HarnessArgumentsError::ConflictingTestThreads { requested });
+    }
+    // Unset and explicit `1` both end up pinned to a single thread.
+    arguments.test_threads = Some(1);
+    Ok(arguments)
+}
+
+/// Reads the test-binary command line into [`libtest_mimic::Arguments`] while enforcing
+/// the harness's single-thread rule.
+///
+/// An absent `--test-threads` becomes `1`; an explicit `--test-threads=1` passes as is.
+///
+/// # Errors
+///
+/// Yields [`HarnessArgumentsError::ConflictingTestThreads`] for any other `--test-threads`
+/// value: the harness batches phases itself and cannot share that with `libtest_mimic`'s pool.
+pub fn parse_harness_arguments() -> Result<Arguments, HarnessArgumentsError> {
+    let cli_arguments = Arguments::from_args();
+    validate(cli_arguments)
+}
+
+#[cfg(test)]
+mod tests {
+    use libtest_mimic::Arguments;
+
+    use crate::harness_arguments_error::HarnessArgumentsError;
+
+    use super::validate;
+
+    #[test]
+    fn validate_accepts_unset_test_threads_and_defaults_to_one() {
+        // Default-constructed arguments stand in for a CLI without `--test-threads`.
+        let validated = validate(Arguments::default()).expect("unset must be accepted");
+
+        assert_eq!(validated.test_threads, Some(1));
+    }
+
+    #[test]
+    fn validate_accepts_explicit_single_thread() {
+        let validated = validate(Arguments {
+            test_threads: Some(1),
+            ..Arguments::default()
+        })
+        .expect("--test-threads=1 must be accepted");
+
+        assert_eq!(validated.test_threads, Some(1));
+    }
+
+    #[test]
+    fn validate_rejects_non_one_test_threads() {
+        let outcome = validate(Arguments {
+            test_threads: Some(8),
+            ..Arguments::default()
+        });
+        let error = outcome.expect_err("--test-threads=8 must be rejected");
+        let is_expected_conflict = matches!(
+            error,
+            HarnessArgumentsError::ConflictingTestThreads { requested: 8 }
+        );
+        assert!(is_expected_conflict);
+    }
+
+    #[test]
+    fn validate_preserves_other_settings() {
+        let validated = validate(Arguments {
+            list: true,
+            filter: Some(String::from("foo")),
+            ..Arguments::default()
+        })
+        .expect("default test_threads must pass");
+
+        assert!(validated.list);
+        assert_eq!(validated.filter.as_deref(), Some("foo"));
+    }
+}
diff --git a/llama-cpp-test-harness/src/run.rs b/llama-cpp-test-harness/src/run.rs
index 6d13b1b4..376cbbae 100644
--- a/llama-cpp-test-harness/src/run.rs
+++ b/llama-cpp-test-harness/src/run.rs
@@ -5,6 +5,7 @@ use libtest_mimic::Conclusion;
 use llama_cpp_bindings::llama_backend::LlamaBackend;
 
 use crate::execution_plan::ExecutionPlan;
+use crate::parse_harness_arguments::parse_harness_arguments;
 
 fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode {
     if conclusions.iter().any(Conclusion::has_failed) {
@@ -16,6 +17,13 @@ fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode {
 
 #[must_use]
 pub fn run() -> ExitCode {
+    let arguments = match parse_harness_arguments() {
+        Ok(arguments) => arguments,
+        Err(error) => {
+            eprintln!("llama-cpp-test-harness: {error}");
+            return ExitCode::from(2);
+        }
+    };
     let mut backend = match LlamaBackend::init() {
         Ok(backend) => backend,
         Err(error) => {
@@ -28,7 +36,7 @@ pub fn run() -> ExitCode {
         backend.void_logs();
     }
     let backend = Arc::new(backend);
-    aggregate_exit_code(&plan.run(&backend))
+    aggregate_exit_code(&plan.run(&backend, &arguments))
 }
 
 #[cfg(test)]
diff --git a/llama-cpp-test-harness/src/run_to_conclusions.rs b/llama-cpp-test-harness/src/run_to_conclusions.rs
index 8de67e11..67c90003 100644
--- a/llama-cpp-test-harness/src/run_to_conclusions.rs
+++ b/llama-cpp-test-harness/src/run_to_conclusions.rs
@@ -4,6 +4,7 @@ use libtest_mimic::Conclusion;
 use llama_cpp_bindings::llama_backend::LlamaBackend;
 
 use crate::execution_plan::ExecutionPlan;
+use crate::parse_harness_arguments::parse_harness_arguments;
 
 /// Runs every registered test against its declared model and returns one [`Conclusion`] per phase.
 ///
@@ -13,10 +14,15 @@ use crate::execution_plan::ExecutionPlan;
 ///
 /// # Panics
 ///
-/// Panics if [`LlamaBackend::init`] fails. The harness is meaningless without a backend; a
-/// crash is the loudest possible failure signal.
+/// Panics if [`LlamaBackend::init`] fails or if the CLI arguments conflict with the harness's
+/// single-thread requirement. The harness is meaningless without a backend or with conflicting
+/// thread-count flags; a crash is the loudest possible failure signal.
 #[must_use]
 pub fn run_to_conclusions() -> Vec<Conclusion> {
+    let arguments = match parse_harness_arguments() {
+        Ok(arguments) => arguments,
+        Err(error) => panic!("llama-cpp-test-harness: {error}"),
+    };
     let mut backend = match LlamaBackend::init() {
         Ok(backend) => backend,
         Err(error) => panic!("llama-cpp-test-harness: backend init failed: {error}"),
@@ -26,7 +32,7 @@ pub fn run_to_conclusions() -> Vec<Conclusion> {
         backend.void_logs();
     }
     let backend = Arc::new(backend);
-    plan.run(&backend)
+    plan.run(&backend, &arguments)
 }
 
 #[cfg(test)]