diff --git a/Cargo.lock b/Cargo.lock
index f9c99776..05880f38 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1234,6 +1234,7 @@ dependencies = [
"libtest-mimic",
"llama-cpp-bindings",
"llama-cpp-test-harness-macros",
+ "thiserror",
]
[[package]]
diff --git a/llama-cpp-bindings-tests/Cargo.toml b/llama-cpp-bindings-tests/Cargo.toml
index c17b881d..cba73b08 100644
--- a/llama-cpp-bindings-tests/Cargo.toml
+++ b/llama-cpp-bindings-tests/Cargo.toml
@@ -15,135 +15,19 @@ llama-cpp-test-harness = { workspace = true }
serde_json = { workspace = true }
[[test]]
-name = "context"
+name = "backend_initialization"
harness = false
[[test]]
-name = "llama_backend"
+name = "chat_template_and_message_parsing"
harness = false
[[test]]
-name = "context_kv_cache"
+name = "embedding_and_encoder"
harness = false
[[test]]
-name = "deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_gemma_paired_quote"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_glm_key_value_tags"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_mistral_bracketed_json"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_duck_types_qwen_xml"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested"
-harness = false
-
-[[test]]
-name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested"
-harness = false
-
-[[test]]
-name = "context_session"
-harness = false
-
-[[test]]
-name = "embeddings"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "gemma4_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "gemma4_template_override_returns_full_markers"
-harness = false
-
-[[test]]
-name = "glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "glm47_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "glm47_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "glm47_template_override_returns_full_markers"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "mistral3_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "eval_multimodal_chunks_records_exact_token_counts"
-harness = false
-
-[[test]]
-name = "ingest_prompt_chunk"
-harness = false
-
-[[test]]
-name = "llguidance"
-harness = false
-
-[[test]]
-name = "model_chat_template"
-harness = false
-
-[[test]]
-name = "model_context_creation"
-harness = false
-
-[[test]]
-name = "model_helpers"
-harness = false
-
-[[test]]
-name = "model_params"
+name = "kv_cache_and_session"
harness = false
[[test]]
@@ -151,127 +35,19 @@ name = "model_loading_errors"
harness = false
[[test]]
-name = "model_lora_adapter_errors"
-harness = false
-
-[[test]]
-name = "model_metadata_kv"
-harness = false
-
-[[test]]
-name = "model_properties"
-harness = false
-
-[[test]]
-name = "model_sampling"
-harness = false
-
-[[test]]
-name = "model_special_tokens"
-harness = false
-
-[[test]]
-name = "model_str_to_token"
-harness = false
-
-[[test]]
-name = "model_token_to_piece"
-harness = false
-
-[[test]]
-name = "model_tokens_iterator"
-harness = false
-
-[[test]]
-name = "mtmd_bitmap"
-harness = false
-
-[[test]]
-name = "mtmd_chunk_operations"
-harness = false
-
-[[test]]
-name = "mtmd_chunk_structure"
-harness = false
-
-[[test]]
-name = "mtmd_context"
-harness = false
-
-[[test]]
-name = "mtmd_evaluation"
-harness = false
-
-[[test]]
-name = "mtmd_tokenization"
-harness = false
-
-[[test]]
-name = "multimodal"
-harness = false
-
-[[test]]
-name = "parse_chat_message"
-harness = false
-
-[[test]]
-name = "qwen35_chat_inference_emits_reasoning_when_template_auto_opens"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "qwen35_parses_constrained_schema_payload"
-harness = false
-
-[[test]]
-name = "qwen35_parses_tool_call_payload"
-harness = false
-
-[[test]]
-name = "qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested"
-harness = false
-
-[[test]]
-name = "qwen36_chat_inference_emits_reasoning_when_template_auto_opens"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_emits_reasoning"
-harness = false
-
-[[test]]
-name = "qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt"
-harness = false
-
-[[test]]
-name = "reranker"
+name = "multimodal_vision"
harness = false
[[test]]
-name = "sampled_token_classifier_markers"
+name = "reasoning_markers_and_tool_calls"
harness = false
[[test]]
-name = "sampling"
+name = "sampling_and_constrained_decoding"
harness = false
[[test]]
-name = "text_generation"
+name = "vocabulary_and_metadata"
harness = false
[features]
diff --git a/llama-cpp-bindings-tests/tests/llama_backend.rs b/llama-cpp-bindings-tests/tests/backend_initialization.rs
similarity index 100%
rename from llama-cpp-bindings-tests/tests/llama_backend.rs
rename to llama-cpp-bindings-tests/tests/backend_initialization.rs
diff --git a/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
new file mode 100644
index 00000000..a7e18245
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/chat_template_and_message_parsing.rs
@@ -0,0 +1,567 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_chat_template {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::ChatTemplateError;
+ use llama_cpp_bindings::model::LlamaChatMessage;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Requests the chat template with no name (`None`); only success is checked, not contents.
+ fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
+     assert!(fixture.model.chat_template(None).is_ok());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Renders one user message ("hello") with the template looked up via `None`; the prompt must
+ // be non-empty. Errors are propagated with `?` so a failure reports the underlying error.
+ fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let template = model.chat_template(None)?;
+     let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
+     let prompt = model.apply_chat_template(&template, &[message], true)?;
+     assert!(!prompt.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Applies the chat template to a single 2000-character user message (per the test name,
+ // meant to exercise buffer resizing). The rendered prompt must be non-empty; errors are
+ // propagated with `?` so a failure reports the underlying error.
+ fn apply_chat_template_buffer_resize_with_long_messages(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let template = model.chat_template(None)?;
+     let message = LlamaChatMessage::new("user".to_string(), "a".repeat(2000))?;
+     let prompt = model.apply_chat_template(&template, &[message], true)?;
+     assert!(!prompt.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // A template name that is not expected to exist must yield `MissingTemplate`.
+ fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let lookup = model.chat_template(Some("nonexistent_template_name_xyz"));
+     assert_eq!(lookup.unwrap_err(), ChatTemplateError::MissingTemplate);
+     Ok(())
+ }
+}
+
+mod parse_chat_message {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use anyhow::bail;
+ use llama_cpp_bindings::ChatMessageParseOutcome;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Parses plain text ("hello world") with an empty tools array ("[]"). The outcome must be
+ // `Recognized`, carry no tool calls, be non-empty, and keep the text in `content`.
+ fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let parse_outcome = model.parse_chat_message("[]", "hello world", false)?;
+     let ChatMessageParseOutcome::Recognized(message) = parse_outcome else {
+         bail!("expected Recognized for plain content; got Unrecognized");
+     };
+
+     assert!(message.tool_calls.is_empty());
+     assert!(!message.is_empty());
+     assert!(message.content.contains("hello world"));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Parses two paragraphs of plain text and requires "step" in the content or the reasoning.
+ // NOTE(review): `input` has no explicit reasoning markers; confirm that is intended.
+ fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let input = "step one, step two\n\nactual response";
+     let parse_outcome = fixture.model.parse_chat_message("[]", input, false)?;
+     let ChatMessageParseOutcome::Recognized(message) = parse_outcome else {
+         bail!("expected Recognized for reasoning section; got Unrecognized");
+     };
+     assert!(
+         message.content.contains("step") || message.reasoning_content.contains("step"),
+         "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
+         message.content,
+         message.reasoning_content
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Parses an empty string with an empty tools array. Only `Recognized` and the absence of
+ // tool calls are asserted; no other field of the parsed message is inspected.
+ fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let parse_outcome = fixture.model.parse_chat_message("[]", "", false)?;
+     let ChatMessageParseOutcome::Recognized(message) = parse_outcome else {
+         bail!("expected Recognized for empty input; got Unrecognized");
+     };
+     assert!(message.tool_calls.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tools JSON that is not parseable ("not_a_json[}") must be rejected with
+ // `ParseChatMessageError::ToolsJsonInvalid`.
+ fn parses_malformed_tools_json_returns_tools_json_invalid_error(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use llama_cpp_bindings::ParseChatMessageError;
+
+     let model = fixture.model;
+     let parsed = model.parse_chat_message("not_a_json[}", "hello", false);
+     assert!(matches!(
+         parsed,
+         Err(ParseChatMessageError::ToolsJsonInvalid(_))
+     ));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tools JSON that is a JSON object rather than an array must yield `ToolsJsonNotArray`.
+ fn parses_non_array_tools_json_returns_tools_json_not_array_error(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use llama_cpp_bindings::ParseChatMessageError;
+     let model = fixture.model;
+     let parsed = model.parse_chat_message("{\"foo\": 1}", "hello", false);
+     assert!(matches!(
+         parsed,
+         Err(ParseChatMessageError::ToolsJsonNotArray)
+     ));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tools JSON containing a NUL byte ("[]\0extra") must be rejected with
+ // `ParseChatMessageError::ToolsJsonInvalid`.
+ fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use llama_cpp_bindings::ParseChatMessageError;
+
+     let model = fixture.model;
+     let parsed = model.parse_chat_message("[]\0extra", "hello", false);
+     assert!(matches!(
+         parsed,
+         Err(ParseChatMessageError::ToolsJsonInvalid(_))
+     ));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // A NUL byte inside the input text ("hello\0world") must yield `ToolsSerialization`.
+ fn parses_with_input_null_byte_returns_tools_serialization_error(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use llama_cpp_bindings::ParseChatMessageError;
+     let model = fixture.model;
+     let parsed = model.parse_chat_message("[]", "hello\0world", false);
+     assert!(matches!(
+         parsed,
+         Err(ParseChatMessageError::ToolsSerialization(_))
+     ));
+     Ok(())
+ }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/constrained_decoding.rs b/llama-cpp-bindings-tests/tests/constrained_decoding.rs
deleted file mode 100644
index 533981c9..00000000
--- a/llama-cpp-bindings-tests/tests/constrained_decoding.rs
+++ /dev/null
@@ -1,124 +0,0 @@
-use std::io::Write;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
-
- let mut ctx = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let tokens_list = model.str_to_token(prompt, AddBos::Always)?;
-
- let mut batch = LlamaBatch::new(512, 1)?;
- let last_index = i32::try_from(tokens_list.len())? - 1;
-
- for (index, token) in (0_i32..).zip(&tokens_list) {
- batch.add(
- &SampledToken::Content(*token),
- index,
- &[0],
- index == last_index,
- )?;
- }
-
- ctx.decode(&mut batch)?;
-
- let schema = r#"{
- "type": "object",
- "properties": {
- "city": { "type": "string" },
- "temperature": { "type": "number" }
- },
- "required": ["city", "temperature"]
-}"#;
-
- let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?;
- let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
- let mut n_cur = batch.n_tokens();
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let mut generated = String::new();
-
- while n_cur <= 128 {
- let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?);
-
- if model.is_eog_token(&token) {
- break;
- }
-
- let output_string = model.token_to_piece(&token, &mut decoder, true, None)?;
- generated.push_str(&output_string);
- print!("{output_string}");
- std::io::stdout().flush()?;
-
- batch.clear();
- batch.add(&token, n_cur, &[0], true)?;
- n_cur += 1;
- ctx.decode(&mut batch)?;
- }
-
- println!();
-
- let parsed = serde_json::Deserializer::from_str(&generated)
- .into_iter::<serde_json::Value>()
- .next()
- .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??;
-
- assert!(parsed.get("city").is_some());
- assert!(parsed.get("temperature").is_some());
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context.rs b/llama-cpp-bindings-tests/tests/context.rs
deleted file mode 100644
index 1e3a6b08..00000000
--- a/llama-cpp-bindings-tests/tests/context.rs
+++ /dev/null
@@ -1,917 +0,0 @@
-use std::ptr::NonNull;
-use std::sync::Arc;
-use std::sync::atomic::AtomicBool;
-
-use anyhow::Result;
-use llama_cpp_bindings::DecodeError;
-use llama_cpp_bindings::LogitsError;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaLoraAdapter;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-// =========================================================================================
-// Group A: default Qwen model, embeddings=false. Most context tests fall here.
-// =========================================================================================
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- assert!(context.n_ctx() > 0);
- assert!(context.n_batch() > 0);
- assert!(context.n_ubatch() > 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let decode_result = context.decode(&mut batch);
- assert!(decode_result.is_ok());
-
- let logits = context.get_logits()?;
- assert!(!logits.is_empty());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.reset_timings();
- let timings = context.timings();
- assert!(timings.t_start_ms() >= 0.0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let token_data_array = context.token_data_array()?;
-
- assert!(!token_data_array.data.is_empty());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let last_index = i32::try_from(tokens.len() - 1)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let logits = context.get_logits_ith(last_index)?;
-
- assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let last_index = i32::try_from(tokens.len() - 1)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let token_data_array = context.token_data_array_ith(last_index)?;
-
- assert_eq!(
- token_data_array.data.len(),
- usize::try_from(fixture.model.n_vocab())?
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let result = context.embeddings_ith(0);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let result = context.embeddings_seq_ith(0);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let count = context.candidates()?.count();
-
- assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let debug_output = format!("{context:?}");
-
- assert!(debug_output.contains("LlamaContext"));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let last_index = i32::try_from(tokens.len() - 1)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let count = context.candidates_ith(last_index)?.count();
-
- assert_eq!(count, usize::try_from(fixture.model.n_vocab())?);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let mut adapter = LlamaLoraAdapter {
- lora_adapter: NonNull::dangling(),
- };
-
- let result = context.lora_adapter_remove(&mut adapter);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.encode(&mut batch);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let mut adapter = LlamaLoraAdapter {
- lora_adapter: NonNull::dangling(),
- };
-
- let result = context.lora_adapter_set(&mut adapter, 1.0);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let result = context.embeddings_seq_ith(999);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- let result = context.decode(&mut batch);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let abort_flag = Arc::new(AtomicBool::new(true));
- context.set_abort_flag(abort_flag);
-
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.decode(&mut batch);
-
- assert_eq!(result, Err(DecodeError::Aborted));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let abort_flag = Arc::new(AtomicBool::new(false));
- context.set_abort_flag(abort_flag);
-
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.decode(&mut batch);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let abort_flag = Arc::new(AtomicBool::new(true));
- context.set_abort_flag(abort_flag);
- context.clear_abort_callback();
-
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.decode(&mut batch);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.synchronize();
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.detach_threadpool();
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let result = context.get_logits_ith(7);
-
- assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7))));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 64,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let huge_index = i32::try_from(context.n_ctx())?;
- context.mark_logits_initialized(huge_index);
- let result = context.get_logits_ith(huge_index);
-
- assert!(matches!(
- result,
- Err(LogitsError::TokenIndexExceedsContext { .. })
- ));
-
- Ok(())
-}
-
-// =========================================================================================
-// Group B: Qwen embedding model, embeddings=true. Six embedding-specific tests.
-// =========================================================================================
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.decode(&mut batch);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let embeddings = context.embeddings_seq_ith(0)?;
-
- assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- n_seq_max = 4,
- embeddings = true,
-)]
-fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let inputs = [
- "alpha is here",
- "beta runs fast",
- "gamma waits",
- "delta jumps",
- ];
- let mut batch = LlamaBatch::new(64, 4)?;
-
- for (sequence_index, text) in inputs.iter().enumerate() {
- let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
- let sequence_id = i32::try_from(sequence_index)?;
-
- batch.add_sequence(&tokens, sequence_id, true)?;
- }
-
- context.decode(&mut batch)?;
-
- let n_embd = usize::try_from(fixture.model.n_embd())?;
- let mut collected: Vec> = Vec::with_capacity(inputs.len());
-
- for sequence_index in 0..inputs.len() {
- let sequence_id = i32::try_from(sequence_index)?;
- let embedding = context.embeddings_seq_ith(sequence_id)?;
-
- assert_eq!(
- embedding.len(),
- n_embd,
- "sequence {sequence_index} embedding length mismatch"
- );
-
- collected.push(embedding.to_vec());
- }
-
- for (left_index, left) in collected.iter().enumerate() {
- for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
- assert_ne!(
- left, right,
- "embedding for sequence {left_index} must differ from sequence {right_index}",
- );
- }
- }
-
- Ok(())
-}
-
-/// Reproduces paddler's embedding batching loop exactly with the document strings, batch
-/// shape, and iteration pattern from the failing harness test
-/// `agent_embedding_batch_distribution_independent_of_context_size`.
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- n_seq_max = 4,
- embeddings = true,
-)]
-fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let iterations = [
- [
- "This is the first document with enough content to contribute meaningfully to the batch size calculation",
- "This is the second document that should be processed in a potentially different batch from the first",
- ],
- [
- "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
- "This is the fourth document which should demonstrate that batching distributes across agent requests",
- ],
- ];
-
- let n_embd = usize::try_from(fixture.model.n_embd())?;
- let mut batch = LlamaBatch::new(64, 4)?;
- let mut collected: Vec> = Vec::new();
-
- for iteration_inputs in iterations {
- for (sequence_index, text) in iteration_inputs.iter().enumerate() {
- let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
- let sequence_id = i32::try_from(sequence_index)?;
-
- batch.add_sequence(&tokens, sequence_id, true)?;
- }
-
- context.clear_kv_cache();
- context.decode(&mut batch)?;
-
- for sequence_index in 0..iteration_inputs.len() {
- let sequence_id = i32::try_from(sequence_index)?;
- let embedding = context.embeddings_seq_ith(sequence_id)?;
-
- assert_eq!(
- embedding.len(),
- n_embd,
- "iteration sequence {sequence_index} embedding length mismatch"
- );
-
- collected.push(embedding.to_vec());
- }
-
- batch.clear();
- }
-
- assert_eq!(
- collected.len(),
- iterations.iter().flatten().count(),
- "expected one embedding per input across every iteration"
- );
-
- for (left_index, left) in collected.iter().enumerate() {
- for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
- assert_ne!(
- left, right,
- "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
- );
- }
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
- let last_index = i32::try_from(tokens.len() - 1)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let embeddings = context.embeddings_ith(last_index)?;
-
- assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let result = context.embeddings_ith(999);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-// =========================================================================================
-// Group C: t5-small encoder model, embeddings=true. Single trial.
-// =========================================================================================
-
-#[llama_test(
- model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- embeddings = true,
-)]
-fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
-
- let result = context.encode(&mut batch);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context_kv_cache.rs b/llama-cpp-bindings-tests/tests/context_kv_cache.rs
deleted file mode 100644
index 467a2aa4..00000000
--- a/llama-cpp-bindings-tests/tests/context_kv_cache.rs
+++ /dev/null
@@ -1,961 +0,0 @@
-use std::num::NonZeroU8;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
-use llama_cpp_bindings::error::KvCacheSeqAddError;
-use llama_cpp_bindings::error::KvCacheSeqDivError;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result> {
- Ok(LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?)
-}
-
-fn decode_hello_world(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> {
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- context.clear_kv_cache();
- assert_eq!(context.kv_cache_seq_pos_max(0), -1);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1));
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let result = context.copy_kv_cache_seq(0, 1, None, None);
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let pos_max = context.kv_cache_seq_pos_max(0);
- context.copy_cache(0, 1, pos_max + 1);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let result = context.kv_cache_seq_add(0, Some(0), None, 1);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqAddError::IncompatibleRopeType,
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
- let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqDivError::IncompatibleRopeType,
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- context.kv_cache_seq_keep(0);
-
- assert!(context.kv_cache_seq_pos_max(0) >= 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1));
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let result = context.kv_cache_seq_add(0, Some(0), None, 1);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- decode_hello_world(fixture, &mut context)?;
-
- let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
- let result = context.kv_cache_seq_div(0, Some(0), None, divisor);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = build_context(fixture)?;
-
- let result = context.kv_cache_seq_pos_max(999);
-
- assert_eq!(result, -1);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheConversionError::P0TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX));
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheConversionError::P1TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheConversionError::SeqIdTooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheConversionError::P0TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX));
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheConversionError::P1TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqAddError::P0TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqAddError::P1TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
- let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqDivError::P0TooLarge(_),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
- let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor);
-
- assert!(matches!(
- result.unwrap_err(),
- KvCacheSeqDivError::P1TooLarge(_),
- ));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/context_session.rs b/llama-cpp-bindings-tests/tests/context_session.rs
deleted file mode 100644
index d32f7ecf..00000000
--- a/llama-cpp-bindings-tests/tests/context_session.rs
+++ /dev/null
@@ -1,1162 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result> {
- Ok(LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?)
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_session.bin");
- context.state_save_file(&session_path, &tokens)?;
-
- let loaded_tokens = context.state_load_file(&session_path, 512)?;
- assert_eq!(loaded_tokens, tokens);
-
- std::fs::remove_file(&session_path)?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = build_context(fixture)?;
-
- assert!(context.get_state_size() > 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_seq_state.bin");
- let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
- assert!(bytes_written > 0);
-
- let (loaded_tokens, bytes_read) = context.state_seq_load_file(&session_path, 0, 512)?;
- assert_eq!(loaded_tokens, tokens);
- assert!(bytes_read > 0);
-
- std::fs::remove_file(&session_path)?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let state_size = context.get_state_size();
- let mut state_data = vec![0u8; state_size];
- let bytes_copied = unsafe { context.copy_state_data(&mut state_data) };
- assert!(bytes_copied > 0);
-
- let bytes_read = unsafe { context.set_state_data(&state_data) };
- assert!(bytes_read > 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.state_load_file("/nonexistent/session.bin", 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_load_file_with_nonexistent_file_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_save_file_to_invalid_directory_returns_failed_to_save(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = build_context(fixture)?;
-
- let result = context.state_save_file("/nonexistent_dir/session.bin", &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = build_context(fixture)?;
-
- let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
- context.state_save_file(&session_path, &tokens)?;
-
- let result = context.state_load_file(&session_path, 0);
-
- assert!(result.is_err());
- let _ = std::fs::remove_file(&session_path);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_load_file_with_zero_max_tokens_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
- context.state_seq_save_file(&session_path, 0, &tokens)?;
-
- let result = context.state_seq_load_file(&session_path, 0, 0);
-
- assert!(result.is_err());
- let _ = std::fs::remove_file(&session_path);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_load_file_with_insufficient_max_tokens_returns_length_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token(
- "Hello world this is a longer string for more tokens",
- AddBos::Always,
- )?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
- context.state_save_file(&session_path, &tokens)?;
-
- let result = context.state_load_file(&session_path, 1);
-
- assert!(result.is_err());
- let _ = std::fs::remove_file(&session_path);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token(
- "Hello world this is a longer string for more tokens",
- AddBos::Always,
- )?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
- context.state_seq_save_file(&session_path, 0, &tokens)?;
-
- let result = context.state_seq_load_file(&session_path, 0, 1);
-
- assert!(result.is_err());
- let _ = std::fs::remove_file(&session_path);
-
- Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::ffi::OsStr;
- use std::os::unix::ffi::OsStrExt;
-
- let context = build_context(fixture)?;
-
- let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
- let result = context.state_save_file(non_utf8_path, &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::ffi::OsStr;
- use std::os::unix::ffi::OsStrExt;
-
- let mut context = build_context(fixture)?;
-
- let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
- let result = context.state_load_file(non_utf8_path, 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::ffi::OsStr;
- use std::os::unix::ffi::OsStrExt;
-
- let context = build_context(fixture)?;
-
- let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
- let result = context.state_seq_save_file(non_utf8_path, 0, &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::ffi::OsStr;
- use std::os::unix::ffi::OsStrExt;
-
- let mut context = build_context(fixture)?;
-
- let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
- let result = context.state_seq_load_file(non_utf8_path, 0, 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = build_context(fixture)?;
-
- let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
- let result = context.state_save_file(path_with_null, &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
- let result = context.state_load_file(path_with_null, 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_save_file_with_null_byte_in_path_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let context = build_context(fixture)?;
-
- let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
- let result = context.state_seq_save_file(path_with_null, 0, &[]);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_load_file_with_null_byte_in_path_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mut context = build_context(fixture)?;
-
- let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin");
- let result = context.state_seq_load_file(path_with_null, 0, 512);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let flags = LlamaStateSeqFlags::empty();
- let size = context.state_seq_get_size_ext(0, &flags);
-
- assert!(size > 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> {
- use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
-
- let mut context = build_context(fixture)?;
-
- let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let flags = LlamaStateSeqFlags::empty();
- let size = context.state_seq_get_size_ext(0, &flags);
- let mut buffer = vec![0u8; size];
- let bytes_written = unsafe { context.state_seq_get_data_ext(&mut buffer, 0, &flags) };
-
- assert!(bytes_written > 0);
-
- let bytes_read = unsafe { context.state_seq_set_data_ext(&buffer, 0, &flags) };
-
- assert!(bytes_read > 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 712397df..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,126 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\
-<|User|>What is 2 + 2?<|Assistant|>
-
-
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens =
- model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "DeepSeek-R1-8B: must generate at least one token"
- );
- assert_eq!(
- outcome.observed_reasoning, 0,
- "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
- when the prompt closes the think block before generation begins; \
- generated={:?}",
- outcome.generated_raw
- );
- assert_eq!(
- outcome.observed_undeterminable, 0,
- "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
- before generation, so no Undeterminable tokens may be emitted; \
- generated={:?}",
- outcome.generated_raw
- );
- assert_eq!(
- usage.reasoning_tokens, 0,
- "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
- );
- assert_eq!(
- usage.undeterminable_tokens, 0,
- "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
- );
- assert!(
- outcome.observed_content > 0,
- "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
- );
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content,
- "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
- );
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(
- !outcome.content_stream.contains(forbidden),
- "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
- content_stream={:?}",
- outcome.content_stream
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs
deleted file mode 100644
index 6bed6bbe..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,151 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-// DeepSeek-R1-Distill-Llama-8B uses `<think>...</think>` reasoning markers
-// and full-width-bar role tokens `<｜User｜>` / `<｜Assistant｜>` (U+FF5C,
-// not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends
-// `<｜Assistant｜><think>\n` — DeepSeek-R1 is a pure reasoner with no
-// thinking-disabled mode — so the model resumes generation already inside
-// the reasoning block.
-const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\
-<|User|>What is 2 + 2?<|Assistant|>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "DeepSeek-R1-8B: must generate at least one token"
- );
- assert!(
- outcome.observed_reasoning > 0,
- "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
- opens a block; outcome={outcome:?}",
- );
- assert!(
- usage.reasoning_tokens > 0,
- "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
- block; usage was {usage:?}"
- );
- assert_eq!(
- outcome.observed_undeterminable, 0,
- "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
- so no Undeterminable tokens may be emitted; outcome={outcome:?}"
- );
- assert_eq!(
- usage.undeterminable_tokens, 0,
- "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
- );
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning,
- "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
- );
-
- if parsed.reasoning_content.is_empty() {
- eprintln!(
- "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
- tokens — skipping strict parser-equality assertions"
- );
- } else {
- assert_eq!(
- outcome.reasoning_stream, parsed.reasoning_content,
- "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
- (any difference means a marker leaked into the user-visible stream)",
- );
- assert_eq!(
- outcome.content_stream, parsed.content,
- "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
- (any difference means a marker leaked into the user-visible stream)",
- );
- }
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(
- !outcome.reasoning_stream.contains(forbidden),
- "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
- reasoning_stream={:?}",
- outcome.reasoning_stream
- );
- assert!(
- !outcome.content_stream.contains(forbidden),
- "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
- content_stream={:?}",
- outcome.content_stream
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs
deleted file mode 100644
index ce2b922d..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome =
- fixture
- .model
- .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "duck-type pass must recognise Gemma paired-quote on a model with no registered \
- template; got Unrecognized"
- );
- };
- assert_eq!(
- parsed.tool_calls.len(),
- 1,
- "expected one tool call; got {:?}",
- parsed.tool_calls
- );
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs
deleted file mode 100644
index 7b9e052b..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const GLM_KEY_VALUE_PAYLOAD: &str = "get_weather\
-location\
-Paris\
-";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "duck-type pass must recognise GLM key-value tags on a model with no registered \
- template; got Unrecognized"
- );
- };
- assert_eq!(
- parsed.tool_calls.len(),
- 1,
- "expected one tool call; got {:?}",
- parsed.tool_calls
- );
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs
deleted file mode 100644
index 66b4caab..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs
+++ /dev/null
@@ -1,70 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome =
- fixture
- .model
- .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
- template; got Unrecognized"
- );
- };
- assert_eq!(
- parsed.tool_calls.len(),
- 1,
- "expected one tool call; got {:?}",
- parsed.tool_calls
- );
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs
deleted file mode 100644
index 203ae0e8..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs
+++ /dev/null
@@ -1,75 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const QWEN_XML_PAYLOAD: &str = "\n\
-\n\
-\n\
-Paris\n\
-\n\
-\n\
-";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "duck-type pass must recognise Qwen XML on a model with no registered template; \
- got Unrecognized"
- );
- };
- assert_eq!(
- parsed.tool_calls.len(),
- 1,
- "expected one tool call; got {:?}",
- parsed.tool_calls
- );
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
deleted file mode 100644
index 2921b3d6..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "plain content with tools requested must produce Recognized (with empty tool_calls); \
- got Unrecognized"
- );
- };
- assert!(
- parsed.tool_calls.is_empty(),
- "expected no tool calls; got {:?}",
- parsed.tool_calls
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs
deleted file mode 100644
index cc48350f..00000000
--- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs
+++ /dev/null
@@ -1,38 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const PLAIN_CONTENT: &str = "Hello there.";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message("[]", PLAIN_CONTENT, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
- };
- assert!(
- parsed.tool_calls.is_empty(),
- "expected no tool calls; got {:?}",
- parsed.tool_calls
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
new file mode 100644
index 00000000..cebd47c1
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/embedding_and_encoder.rs
@@ -0,0 +1,707 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod embeddings {
+ use std::time::Duration;
+
+ use anyhow::{Context, Result};
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::ggml_time_us;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+    /// L2-normalizes `input` (an all-zero input divides by 0.0 and yields NaNs).
+    fn normalize(input: &[f32]) -> Vec<f32> {
+        let magnitude = input
+            .iter()
+            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
+            .sqrt();
+        input.iter().map(|&value| value / magnitude).collect()
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_threads_batch = 8,
+        embeddings = true,
+    )]
+    fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        // Trial: embed one prompt, then check the unit-normalized vector and token usage.
+        let mut ctx = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .with_context(|| "unable to create context")?;
+
+        let prompt = "Hello my name is";
+        let tokens = model
+            .str_to_token(prompt, AddBos::Always)
+            .with_context(|| format!("failed to tokenize {prompt}"))?;
+        let prompt_token_count = u64::try_from(tokens.len())?;
+
+        let n_ctx = usize::try_from(ctx.n_ctx())?;
+        assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
+
+        let t_main_start = ggml_time_us();
+
+        let mut classifier = model.sampled_token_classifier();
+        let mut batch = LlamaBatch::new(n_ctx, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+        // Fed tokens stay pending (asserted above) until committed after the decode.
+        ctx.clear_kv_cache();
+        ctx.decode(&mut batch)
+            .with_context(|| "llama_decode() failed")?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let embedding = ctx
+            .embeddings_seq_ith(0)
+            .with_context(|| "failed to get embeddings")?;
+        let normalized = normalize(embedding);
+
+        let t_main_end = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+
+        eprintln!(
+            "created embedding with {} dimensions in {:.2} s",
+            normalized.len(),
+            duration.as_secs_f32()
+        );
+
+        assert!(
+            !normalized.is_empty(),
+            "embedding should have at least one dimension"
+        );
+        // Recompute the Euclidean norm independently of `normalize` to verify unit length.
+        let magnitude: f32 = normalized
+            .iter()
+            .map(|value| value * value)
+            .sum::<f32>()
+            .sqrt();
+        assert!(
+            (magnitude - 1.0).abs() < 0.01,
+            "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
+        );
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, prompt_token_count);
+        assert_eq!(usage.completion_tokens(), 0);
+
+        Ok(())
+    }
+}
+
+mod reranker {
+ use std::time::Duration;
+
+ use anyhow::{Context, Result, bail};
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::ggml_time_us;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+    /// L2-normalizes `input` (an all-zero input divides by 0.0 and yields NaNs).
+    fn normalize(input: &[f32]) -> Vec<f32> {
+        let magnitude = input
+            .iter()
+            .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
+            .sqrt();
+        input.iter().map(|&value| value / magnitude).collect()
+    }
+
+    /// Dot product of `vec_a` and `vec_b`; equal to the cosine similarity only when both
+    /// are unit-length (the caller passes `normalize` output). `zip` drops any surplus tail.
+    fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
+        let products = vec_a.iter().zip(vec_b).map(|(left, right)| left * right);
+
+        products.sum::<f32>()
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 2,
+        n_threads_batch = 8,
+        embeddings = true,
+    )]
+    fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        // Each document is embedded as its own sequence; n_seq_max must equal their count.
+        let query = "What is machine learning?";
+        let documents = [
+            "Machine learning is a subset of artificial intelligence.",
+            "The weather today is sunny and warm.",
+        ];
+
+        let document_count = documents.len();
+        assert_eq!(
+            u32::try_from(document_count)?,
+            fixture.context_params.n_seq_max,
+            "attribute n_seq_max must match the document count this trial expects",
+        );
+
+        let mut ctx = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .with_context(|| "unable to create context")?;
+        // NOTE(review): query and document are joined with no separator token — confirm intended.
+        let prompt_lines: Vec<String> = documents
+            .iter()
+            .map(|document| format!("{query}{document}"))
+            .collect();
+
+        let tokens_lines_list = prompt_lines
+            .iter()
+            .map(|line| model.str_to_token(line, AddBos::Always))
+            .collect::<Result<Vec<_>, _>>()
+            .with_context(|| "failed to tokenize prompts")?;
+
+        let n_ctx = usize::try_from(ctx.n_ctx())?;
+
+        if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
+            bail!("one of the provided prompts exceeds the size of the context window");
+        }
+
+        let mut classifier = model.sampled_token_classifier();
+        let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
+        let t_main_start = ggml_time_us();
+
+        for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
+            classifier.feed_prompt_sequence_to_batch(
+                &mut batch,
+                tokens,
+                i32::try_from(sequence_index)?,
+                false,
+            )?;
+        }
+
+        let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
+        let total_token_count = u64::try_from(total_tokens)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, 0);
+
+        ctx.clear_kv_cache();
+        ctx.decode(&mut batch)
+            .with_context(|| "llama_decode() failed")?;
+
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, total_token_count);
+
+        let mut embeddings = Vec::with_capacity(document_count);
+
+        for sequence_index in 0..document_count {
+            let raw_embedding = ctx
+                .embeddings_seq_ith(i32::try_from(sequence_index)?)
+                .with_context(|| "failed to get sequence embeddings")?;
+            embeddings.push(normalize(raw_embedding));
+        }
+
+        let t_main_end = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
+
+        #[expect(
+            clippy::cast_precision_loss,
+            reason = "logged throughput tolerates f32 precision"
+        )]
+        let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
+
+        eprintln!(
+            "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+            duration.as_secs_f32(),
+        );
+
+        assert_eq!(
+            embeddings.len(),
+            document_count,
+            "should produce one embedding per document"
+        );
+
+        for (index, embedding) in embeddings.iter().enumerate() {
+            assert!(
+                !embedding.is_empty(),
+                "embedding {index} should not be empty"
+            );
+        }
+        // Both vectors were unit-normalized above, so their dot product is the cosine.
+        let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
+        eprintln!("cosine similarity between document embeddings: {similarity:.4}");
+
+        assert!(
+            similarity.is_finite(),
+            "cosine similarity should be a finite number"
+        );
+
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, total_token_count);
+        assert_eq!(usage.completion_tokens(), 0);
+
+        Ok(())
+    }
+}
+
+mod context_embedding_and_encoder {
+
+ use anyhow::Result;
+
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // =========================================================================================
+    // Embedding/encoder trials: all use embeddings = true (Qwen3-Embedding; t5-small for encode).
+ // =========================================================================================
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        // Build a single-sequence batch from the tokenized prompt.
+        let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+        // Decoding must succeed on a context created with embeddings = true.
+        let decode_outcome = ctx.decode(&mut prompt_batch);
+
+        assert!(decode_outcome.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+        ctx.decode(&mut prompt_batch)?;
+
+        // The embedding returned for sequence 0 must have exactly n_embd elements.
+        let sequence_embedding = ctx.embeddings_seq_ith(0)?;
+        let expected_len = usize::try_from(fixture.model.n_embd())?;
+
+        assert_eq!(sequence_embedding.len(), expected_len);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 4,
+        embeddings = true,
+    )]
+    fn multi_sequence_embeddings_returns_one_embedding_per_sequence(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        // One short text per sequence id 0..=3 (the attribute sets n_seq_max = 4).
+        let inputs = [
+            "alpha is here",
+            "beta runs fast",
+            "gamma waits",
+            "delta jumps",
+        ];
+        let mut batch = LlamaBatch::new(64, 4)?;
+
+        for (sequence_index, text) in inputs.iter().enumerate() {
+            let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+            let sequence_id = i32::try_from(sequence_index)?;
+
+            batch.add_sequence(&tokens, sequence_id, true)?;
+        }
+
+        context.decode(&mut batch)?;
+
+        let n_embd = usize::try_from(fixture.model.n_embd())?;
+        let mut collected: Vec<Vec<f32>> = Vec::with_capacity(inputs.len());
+
+        for sequence_index in 0..inputs.len() {
+            let sequence_id = i32::try_from(sequence_index)?;
+            let embedding = context.embeddings_seq_ith(sequence_id)?;
+
+            assert_eq!(
+                embedding.len(),
+                n_embd,
+                "sequence {sequence_index} embedding length mismatch"
+            );
+
+            collected.push(embedding.to_vec());
+        }
+        // Pairwise check: different inputs must not come back with identical embeddings.
+        for (left_index, left) in collected.iter().enumerate() {
+            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+                assert_ne!(
+                    left, right,
+                    "embedding for sequence {left_index} must differ from sequence {right_index}",
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        n_seq_max = 4,
+        embeddings = true,
+    )]
+    fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let iterations = [
+            [
+                "This is the first document with enough content to contribute meaningfully to the batch size calculation",
+                "This is the second document that should be processed in a potentially different batch from the first",
+            ],
+            [
+                "This is the third document adding more content to ensure the total exceeds the configured chunk limit",
+                "This is the fourth document which should demonstrate that batching distributes across agent requests",
+            ],
+        ];
+        // A single batch is reused for both iterations and cleared after each one.
+        let n_embd = usize::try_from(fixture.model.n_embd())?;
+        let mut batch = LlamaBatch::new(64, 4)?;
+        let mut collected: Vec<Vec<f32>> = Vec::new();
+
+        for iteration_inputs in iterations {
+            for (sequence_index, text) in iteration_inputs.iter().enumerate() {
+                let tokens = fixture.model.str_to_token(text, AddBos::Always)?;
+                let sequence_id = i32::try_from(sequence_index)?;
+
+                batch.add_sequence(&tokens, sequence_id, true)?;
+            }
+
+            context.clear_kv_cache();
+            context.decode(&mut batch)?;
+
+            for sequence_index in 0..iteration_inputs.len() {
+                let sequence_id = i32::try_from(sequence_index)?;
+                let embedding = context.embeddings_seq_ith(sequence_id)?;
+
+                assert_eq!(
+                    embedding.len(),
+                    n_embd,
+                    "iteration sequence {sequence_index} embedding length mismatch"
+                );
+
+                collected.push(embedding.to_vec());
+            }
+            // Empty the batch so the next iteration refills it from scratch.
+            batch.clear();
+        }
+
+        assert_eq!(
+            collected.len(),
+            iterations.iter().flatten().count(),
+            "expected one embedding per input across every iteration"
+        );
+
+        for (left_index, left) in collected.iter().enumerate() {
+            for (right_index, right) in collected.iter().enumerate().skip(left_index + 1) {
+                assert_ne!(
+                    left, right,
+                    "embedding {left_index} must differ from embedding {right_index} across reused-batch iterations",
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+        // Index of the final prompt token; its embedding is the one requested below.
+        let final_token_index = i32::try_from(prompt_tokens.len() - 1)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+        ctx.decode(&mut prompt_batch)?;
+
+        let token_embedding = ctx.embeddings_ith(final_token_index)?;
+        let expected_len = usize::try_from(fixture.model.n_embd())?;
+
+        assert_eq!(token_embedding.len(), expected_len);
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        // No batch is ever decoded on this context, so there is no output for
+        // any token index; asking for index 999 is expected to report an error
+        // rather than return data.
+        let lookup = ctx.embeddings_ith(999);
+
+        assert!(lookup.is_err());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 2048,
+        n_ubatch = 512,
+        embeddings = true,
+    )]
+    fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut ctx = LlamaContext::from_model(fixture.model, fixture.backend, context_params)?;
+
+        // The prompt is tokenized without a BOS token for this model.
+        let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Never)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+        // `encode` (not `decode`) is the call under test here.
+        let encode_outcome = ctx.encode(&mut prompt_batch);
+
+        assert!(encode_outcome.is_ok());
+
+        Ok(())
+    }
+}
+
+mod context_kv_cache_embedding {
+ use std::num::NonZeroU8;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+    /// Creates a `LlamaContext` from the fixture's model, backend and context
+    /// parameters, converting any creation error into `anyhow::Error`.
+    fn build_context<'context>(
+        fixture: &'context LlamaFixture<'_>,
+    ) -> Result<LlamaContext<'context>> {
+        let params = (*fixture.context_params).into_llama_context_params();
+
+        Ok(LlamaContext::from_model(fixture.model, fixture.backend, params)?)
+    }
+
+    fn decode_hello_world(
+        fixture: &LlamaFixture<'_>,
+        context: &mut LlamaContext<'_>,
+    ) -> Result<()> {
+        let greeting = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut greeting_batch = LlamaBatch::new(512, 1)?;
+        greeting_batch.add_sequence(&greeting, 0, false)?; // whole prompt as sequence 0
+        context.decode(&mut greeting_batch)?; // any decode failure is returned to the caller
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut ctx = build_context(fixture)?;
+        decode_hello_world(fixture, &mut ctx)?;
+
+        // After a prompt has been decoded, the seq_add call is expected to return Ok.
+        let shift_outcome = ctx.kv_cache_seq_add(0, Some(0), None, 1);
+
+        assert!(shift_outcome.is_ok());
+
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let mut ctx = build_context(fixture)?;
+        decode_hello_world(fixture, &mut ctx)?;
+
+        // `NonZeroU8::new` is only `None` for 0, so the error branch is not taken for 2.
+        let two = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+        let division_outcome = ctx.kv_cache_seq_div(0, Some(0), None, two);
+
+        assert!(division_outcome.is_ok());
+
+        Ok(())
+    }
+}
+
+mod model_helpers_embedding {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn embedding_model_tool_call_markers_call_does_not_panic(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Passing only requires the call to return; its value is deliberately unused.
+        let _unused_markers = fixture.model.tool_call_markers();
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // An `Err` from `streaming_markers` propagates via `?` and fails the trial.
+        let _unused_markers = fixture.model.streaming_markers()?;
+        Ok(())
+    }
+
+    #[llama_test(
+        model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128
+    )]
+    fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Two consecutive lookups must hand back the very same `Arc` allocation.
+        let first_env = fixture.model.approximate_tok_env();
+        let second_env = fixture.model.approximate_tok_env();
+        assert!(
+            std::sync::Arc::ptr_eq(&first_env, &second_env),
+            "approximate_tok_env must return the same cached Arc for any model, including \
+             the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
+        );
+
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/embeddings.rs b/llama-cpp-bindings-tests/tests/embeddings.rs
deleted file mode 100644
index 7e531cec..00000000
--- a/llama-cpp-bindings-tests/tests/embeddings.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-use std::time::Duration;
-
-use anyhow::{Context, Result};
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn normalize(input: &[f32]) -> Vec {
- let magnitude = input
- .iter()
- .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
- .sqrt();
-
- input.iter().map(|&value| value / magnitude).collect()
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- n_threads_batch = 8,
- embeddings = true,
-)]
-fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
-
- let mut ctx = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )
- .with_context(|| "unable to create context")?;
-
- let prompt = "Hello my name is";
- let tokens = model
- .str_to_token(prompt, AddBos::Always)
- .with_context(|| format!("failed to tokenize {prompt}"))?;
- let prompt_token_count = u64::try_from(tokens.len())?;
-
- let n_ctx = usize::try_from(ctx.n_ctx())?;
- assert!(tokens.len() <= n_ctx, "prompt exceeds context window size");
-
- let t_main_start = ggml_time_us();
-
- let mut classifier = model.sampled_token_classifier();
- let mut batch = LlamaBatch::new(n_ctx, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
- assert_eq!(classifier.usage().prompt_tokens, 0);
-
- ctx.clear_kv_cache();
- ctx.decode(&mut batch)
- .with_context(|| "llama_decode() failed")?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let embedding = ctx
- .embeddings_seq_ith(0)
- .with_context(|| "failed to get embeddings")?;
- let normalized = normalize(embedding);
-
- let t_main_end = ggml_time_us();
- let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-
- eprintln!(
- "created embedding with {} dimensions in {:.2} s",
- normalized.len(),
- duration.as_secs_f32()
- );
-
- assert!(
- !normalized.is_empty(),
- "embedding should have at least one dimension"
- );
-
- let magnitude: f32 = normalized
- .iter()
- .map(|value| value * value)
- .sum::()
- .sqrt();
- assert!(
- (magnitude - 1.0).abs() < 0.01,
- "normalized embedding magnitude should be approximately 1.0, got {magnitude}"
- );
-
- let usage = classifier.into_usage();
- assert_eq!(usage.prompt_tokens, prompt_token_count);
- assert_eq!(usage.completion_tokens(), 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs b/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs
deleted file mode 100644
index dcef4ded..00000000
--- a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs
+++ /dev/null
@@ -1,185 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::TokenUsage;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputChunks;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const PROMPT_QUESTION: &str = "What animals do you see in this image?";
-
-struct ExpectedChunkTotals {
- text: u64,
- image: u64,
- audio: u64,
-}
-
-fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result {
- let mut totals = ExpectedChunkTotals {
- text: 0,
- image: 0,
- audio: 0,
- };
- for index in 0..chunks.len() {
- let chunk = chunks
- .get(index)
- .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?;
- let n_tokens = u64::try_from(chunk.n_tokens())?;
- match chunk.chunk_type()? {
- MtmdInputChunkType::Text => {
- totals.text = totals.text.saturating_add(n_tokens);
- }
- MtmdInputChunkType::Image => {
- totals.image = totals.image.saturating_add(n_tokens);
- }
- MtmdInputChunkType::Audio => {
- totals.audio = totals.audio.saturating_add(n_tokens);
- }
- }
- }
- Ok(totals)
-}
-
-fn build_multimodal_chunks_and_eval_into_usage(
- fixture: &LlamaFixture<'_>,
-) -> Result<(TokenUsage, ExpectedChunkTotals)> {
- let model = fixture.model;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
- let prompt = format!("{marker}{PROMPT_QUESTION}");
-
- let input_text = MtmdInputText {
- text: prompt,
- add_special: false,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
- let expected = sum_chunk_token_counts_by_type(&chunks)?;
-
- let context_params = (*fixture.context_params).into_llama_context_params();
- let context = LlamaContext::from_model(model, fixture.backend, context_params)?;
-
- let mut classifier = model.sampled_token_classifier();
- classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
- Ok((classifier.into_usage(), expected))
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
- let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
- if usage.prompt_tokens != expected.text {
- anyhow::bail!(
- "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
- expected.text,
- usage.prompt_tokens
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
- let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
- if usage.input_image_tokens != expected.image {
- anyhow::bail!(
- "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
- expected.image,
- usage.input_image_tokens
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
- let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
- if expected.audio != 0 {
- anyhow::bail!(
- "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
- expected.audio
- );
- }
- if usage.input_audio_tokens != 0 {
- anyhow::bail!(
- "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
- usage.input_audio_tokens
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn completion_tokens_are_zero_after_eval_before_generation(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
-
- if usage.completion_tokens() != 0 {
- anyhow::bail!(
- "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
- usage.completion_tokens()
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index e20b99a2..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,115 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\
-user\nReply with the single word: four. Do not explain.\n\
-model\n<|channel>thought\n\n";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "Gemma 4 must generate at least one token"
- );
- assert_eq!(
- outcome.observed_reasoning, 0,
- "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
- when the prompt closes the thought channel before generation begins; \
- generated={:?}",
- outcome.generated_raw
- );
- assert_eq!(
- outcome.observed_undeterminable, 0,
- "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
- before generation, so no Undeterminable tokens may be emitted; \
- generated={:?}",
- outcome.generated_raw
- );
- assert_eq!(
- usage.reasoning_tokens, 0,
- "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
- );
- assert_eq!(
- usage.undeterminable_tokens, 0,
- "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
- );
- assert!(
- outcome.observed_content > 0,
- "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
- );
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content,
- "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
- );
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(
- !outcome.content_stream.contains(forbidden),
- "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
- content_stream={:?}",
- outcome.content_stream
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs
deleted file mode 100644
index 6a7aaba0..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,124 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const GEMMA4_THINKING_PROMPT: &str = "\
-user\nReply with the single word: four. Do not explain.\n\
-model\n<|channel>thought\n";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "Gemma 4 must generate at least one token"
- );
- assert!(
- outcome.observed_reasoning > 0,
- "Gemma 4 classifier must emit at least one Reasoning token when the model \
- emits a `<|channel>thought` block; outcome={outcome:?}",
- );
- assert!(
- usage.reasoning_tokens > 0,
- "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
- reasoning block; usage was {usage:?}"
- );
- assert_eq!(
- outcome.observed_undeterminable, 0,
- "Gemma 4: classifier must not emit Undeterminable when the model emits a \
- detected `<|channel>thought` marker; outcome={outcome:?}"
- );
- assert_eq!(
- usage.undeterminable_tokens, 0,
- "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
- );
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning,
- "Gemma 4: completion tokens must equal observed Content + Reasoning"
- );
- assert!(
- !parsed.reasoning_content.is_empty(),
- "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
- increase the budget or pick a more direct prompt. generated={:?}",
- outcome.generated_raw,
- );
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(
- !outcome.reasoning_stream.contains(forbidden),
- "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
- reasoning_stream={:?}",
- outcome.reasoning_stream
- );
- assert!(
- !outcome.content_stream.contains(forbidden),
- "Gemma 4: content_stream leaked marker {forbidden:?}; \
- content_stream={:?}",
- outcome.content_stream
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index e810ca3e..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
-)]
-fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
- let prompt = format!(
- "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n"
- );
-
- let input_text = MtmdInputText {
- text: prompt,
- add_special: false,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- let mut classifier = model.sampled_token_classifier();
- let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position: n_past,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- if outcome.observed_reasoning == 0 {
- anyhow::bail!(
- "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
- when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
- );
- }
- if usage.reasoning_tokens == 0 {
- anyhow::bail!(
- "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs
deleted file mode 100644
index 2f3d3eaa..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,68 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str =
- "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome =
- fixture
- .model
- .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized");
- };
- assert_eq!(
- parsed.tool_calls.len(),
- 1,
- "expected one tool call; got {:?}",
- parsed.tool_calls
- );
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs
deleted file mode 100644
index dc8099d7..00000000
--- a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ToolCallArgsShape;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let template = model
- .chat_template(None)
- .expect("Gemma 4 chat template must be present");
- let template_str = template.to_str().expect("template must be valid UTF-8");
- assert!(
- template_str.contains("<|tool_call>call:"),
- "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
- template starts with: {:?}",
- &template_str[..template_str.len().min(200)],
- );
-
- let markers = model
- .tool_call_markers()
- .expect("Gemma 4 must produce ToolCallMarkers via override registry");
-
- assert_eq!(markers.open, "<|tool_call>call:");
- assert_eq!(markers.close, "}");
- let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else {
- panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
- };
- assert_eq!(shape.name_args_separator, "{");
- assert_eq!(shape.value_quote.open, "<|\"|>");
- assert_eq!(shape.value_quote.close, "<|\"|>");
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 7b614ef9..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const GLM47_THINKING_DISABLED_PROMPT: &str = "\
-<|user|>
-What is 2 + 2?
-<|assistant|>
-
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(!outcome.generated_raw.is_empty());
- assert_eq!(outcome.observed_reasoning, 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.reasoning_tokens, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs
deleted file mode 100644
index d4677a14..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const GLM47_THINKING_PROMPT: &str = "\
-<|user|>
-What is 2 + 2?
-<|assistant|>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(usage.reasoning_tokens > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning
- );
-
- if parsed.reasoning_content.is_empty() {
- eprintln!(
- "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
- skipping strict parser-equality assertions"
- );
- } else {
- assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
- assert_eq!(outcome.content_stream, parsed.content);
- }
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.reasoning_stream.contains(forbidden));
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs
deleted file mode 100644
index 8f31901e..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,66 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\
-location\
-Paris\
-";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
- );
- };
- assert_eq!(parsed.tool_calls.len(), 1);
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs
deleted file mode 100644
index 491c46c4..00000000
--- a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs
+++ /dev/null
@@ -1,49 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ToolCallArgsShape;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let template = model
- .chat_template(None)
- .expect("GLM-4.7 chat template must be present");
- let template_str = template.to_str().expect("template must be valid UTF-8");
- assert!(template_str.contains(""));
-
- let markers = model
- .tool_call_markers()
- .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
-
- assert_eq!(markers.open, "");
- assert_eq!(markers.close, "");
- let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else {
- panic!(
- "expected KeyValueXmlTags variant, got {:?}",
- markers.args_shape
- );
- };
- assert_eq!(shape.key_open, "");
- assert_eq!(shape.key_close, "");
- assert_eq!(shape.value_open, "");
- assert_eq!(shape.value_close, "");
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs b/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs
deleted file mode 100644
index 24045f7c..00000000
--- a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs
+++ /dev/null
@@ -1,181 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let input_text = MtmdInputText {
- text: "hello world".to_owned(),
- add_special: false,
- parse_special: false,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
- let text_chunk = (0..chunks.len())
- .filter_map(|index| chunks.get(index))
- .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Text))
- .ok_or_else(|| {
- anyhow::anyhow!("text-only tokenization should produce at least one text chunk")
- })?;
-
- let n_tokens = u64::try_from(text_chunk.n_tokens())?;
-
- let mut classifier = model.sampled_token_classifier();
-
- ingest_prompt_chunk(&mut classifier, &text_chunk)?;
-
- let usage = classifier.usage();
- if usage.prompt_tokens != n_tokens {
- anyhow::bail!(
- "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
- usage.prompt_tokens
- );
- }
- if usage.input_image_tokens != 0 {
- anyhow::bail!(
- "text chunk must not bump input_image_tokens; got {}",
- usage.input_image_tokens
- );
- }
- if usage.input_audio_tokens != 0 {
- anyhow::bail!(
- "text chunk must not bump input_audio_tokens; got {}",
- usage.input_audio_tokens
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
- let input_text = MtmdInputText {
- text: marker.to_owned(),
- add_special: false,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- let image_chunk = (0..chunks.len())
- .filter_map(|index| chunks.get(index))
- .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image))
- .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image chunk"))?;
-
- let n_tokens = u64::try_from(image_chunk.n_tokens())?;
- if n_tokens == 0 {
- anyhow::bail!("image chunk should report at least one token");
- }
-
- let mut classifier = model.sampled_token_classifier();
-
- ingest_prompt_chunk(&mut classifier, &image_chunk)?;
-
- let usage = classifier.usage();
- if usage.input_image_tokens != n_tokens {
- anyhow::bail!(
- "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
- usage.input_image_tokens
- );
- }
- if usage.prompt_tokens != 0 {
- anyhow::bail!(
- "image chunk must not bump prompt_tokens; got {}",
- usage.prompt_tokens
- );
- }
- if usage.input_audio_tokens != 0 {
- anyhow::bail!(
- "image chunk must not bump input_audio_tokens; got {}",
- usage.input_audio_tokens
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let input_text = MtmdInputText {
- text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(),
- add_special: false,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[])?;
-
- let mut classifier = model.sampled_token_classifier();
-
- for index in 0..chunks.len() {
- let chunk = chunks
- .get(index)
- .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
- ingest_prompt_chunk(&mut classifier, &chunk)?;
- }
-
- if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning {
- anyhow::bail!(
- "text chunk replay must transition the classifier section to Reasoning when the \
- prompt opens a `` block; got {:?}",
- classifier.current_section()
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
new file mode 100644
index 00000000..de316e42
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/kv_cache_and_session.rs
@@ -0,0 +1,2836 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_context_creation {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Creates a context from the fixture's model, backend and context params and checks
+ // that the context reports a non-zero `n_ctx`.
+ fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ assert!(context.n_ctx() > 0);
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 4294967295,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 4294967295,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 4294967295,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 4294967295,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Attempts to create a context for the cases declared above (each sets
+ // `n_ctx = 4294967295`, presumably carried into `fixture.context_params`) and expects
+ // `LlamaContext::from_model` to return an error. This test never propagates with `?`.
+ fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let result = LlamaContext::from_model(fixture.model, fixture.backend, params);
+
+ assert!(result.is_err());
+ Ok(())
+ }
+}
+
+mod context {
+ use std::ptr::NonNull;
+ use std::sync::Arc;
+ use std::sync::atomic::AtomicBool;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::DecodeError;
+ use llama_cpp_bindings::LogitsError;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::model::LlamaLoraAdapter;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // =========================================================================================
+ // Group A: default Qwen model, embeddings=false unless a test's attribute sets `embeddings = true`. Most context tests fall here.
+ // =========================================================================================
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Creates a context and checks that `n_ctx`, `n_batch` and `n_ubatch` are all non-zero.
+ fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ assert!(context.n_ctx() > 0);
+ assert!(context.n_batch() > 0);
+ assert!(context.n_ubatch() > 0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Tokenizes "hello", decodes it through a fresh context and checks that the context
+ // then exposes non-empty logits.
+ fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut batch = LlamaBatch::new(512, 1)?;
+ batch.add_sequence(&tokens, 0, false)?;
+
+ // Propagate a decode failure with `?` (as the other tests in this module do) so the
+ // actual `DecodeError` is reported instead of a bare "assertion failed".
+ context.decode(&mut batch)?;
+
+ let logits = context.get_logits()?;
+ assert!(!logits.is_empty());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Resets the context's timings and checks that the reported start time is not negative.
+ fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+ context.reset_timings();
+
+ let timings = context.timings();
+ assert!(timings.t_start_ms() >= 0.0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Decodes "hello" and checks that `token_data_array` yields at least one entry.
+ fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let token_data_array = context.token_data_array()?;
+
+ assert!(!token_data_array.data.is_empty());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Decodes "hello" and checks that the logits fetched for the last token index contain
+ // exactly `n_vocab` entries.
+ fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let last_index = i32::try_from(prompt_tokens.len() - 1)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let logits = context.get_logits_ith(last_index)?;
+ let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+
+ assert_eq!(logits.len(), vocab_size);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Decodes "hello" and checks that `token_data_array_ith` for the last token index holds
+ // exactly `n_vocab` entries.
+ fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let last_index = i32::try_from(prompt_tokens.len() - 1)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let token_data_array = context.token_data_array_ith(last_index)?;
+ let entry_count = token_data_array.data.len();
+ let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+
+ assert_eq!(entry_count, vocab_size);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Calls `embeddings_ith(0)` on a freshly created context (this case does not set
+ // `embeddings = true`) and expects an error.
+ fn embeddings_ith_returns_error_when_embeddings_disabled(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let result = context.embeddings_ith(0);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Calls `embeddings_seq_ith(0)` on a freshly created context (this case does not set
+ // `embeddings = true`) and expects an error.
+ fn embeddings_seq_ith_returns_error_when_embeddings_disabled(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let result = context.embeddings_seq_ith(0);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Decodes "hello" and checks that iterating `candidates()` yields exactly `n_vocab` items.
+ fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let candidate_total = context.candidates()?.count();
+ let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+
+ assert_eq!(candidate_total, vocab_size);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Formats a context with `{:?}` and checks that the output mentions "LlamaContext".
+ fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let debug_output = format!("{context:?}");
+
+ assert!(debug_output.contains("LlamaContext"));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Decodes "hello" and checks that `candidates_ith` for the last token index yields
+ // exactly `n_vocab` items.
+ fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let last_index = i32::try_from(prompt_tokens.len() - 1)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let candidate_total = context.candidates_ith(last_index)?.count();
+ let vocab_size = usize::try_from(fixture.model.n_vocab())?;
+
+ assert_eq!(candidate_total, vocab_size);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Calls `lora_adapter_remove` with an adapter that was never loaded and expects `Ok`.
+ // NOTE(review): the adapter wraps `NonNull::dangling()`; whether the underlying call
+ // ever dereferences that pointer is not visible here — confirm this is sound.
+ fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let context = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ // Placeholder adapter: the pointer is non-null but does not point at a real adapter.
+ let mut adapter = LlamaLoraAdapter {
+ lora_adapter: NonNull::dangling(),
+ };
+
+ let result = context.lora_adapter_remove(&mut adapter);
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ let tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut batch = LlamaBatch::new(512, 1)?;
+ batch.add_sequence(&tokens, 0, false)?;
+
+ let result = context.encode(&mut batch);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Calls `lora_adapter_set` with scale `1.0` on an adapter wrapping a dangling pointer.
+ // NOTE(review): the name says "succeeds_or_errors", but the assertion below only
+ // accepts `Ok` — confirm which outcome is intended.
+ // NOTE(review): whether the underlying call dereferences the dangling pointer is not
+ // visible here — confirm this is sound.
+ fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let context = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ // Placeholder adapter: the pointer is non-null but does not point at a real adapter.
+ let mut adapter = LlamaLoraAdapter {
+ lora_adapter: NonNull::dangling(),
+ };
+
+ let result = context.lora_adapter_set(&mut adapter, 1.0);
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ embeddings = true,
+ )]
+ // With `embeddings = true` set on the case above, decodes "hello" and then requests
+ // embeddings for sequence index 999, expecting an error.
+ fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+ context.decode(&mut prompt_batch)?;
+
+ let result = context.embeddings_seq_ith(999);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ let mut batch = LlamaBatch::new(512, 1)?;
+
+ let result = context.decode(&mut batch);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Installs an abort flag that is already `true` and expects `decode` to return
+ // `Err(DecodeError::Aborted)`.
+ fn set_abort_flag_aborts_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+ context.set_abort_flag(Arc::new(AtomicBool::new(true)));
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+ let decode_outcome = context.decode(&mut prompt_batch);
+
+ assert_eq!(decode_outcome, Err(DecodeError::Aborted));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Installs an abort flag that is `false` and expects `decode` to succeed.
+ fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+ context.set_abort_flag(Arc::new(AtomicBool::new(false)));
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+ let result = context.decode(&mut prompt_batch);
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Installs an abort flag that is `true`, then clears the abort callback, and expects
+ // `decode` to succeed.
+ fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+ context.set_abort_flag(Arc::new(AtomicBool::new(true)));
+ context.clear_abort_callback();
+
+ let prompt_tokens = fixture.model.str_to_token("hello", AddBos::Always)?;
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+ let result = context.decode(&mut prompt_batch);
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Smoke test: calls `synchronize` on a fresh context; passing means it returned.
+ fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ context.synchronize();
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Smoke test: calls `detach_threadpool` on a fresh context; passing means it returned.
+ fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ context.detach_threadpool();
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Requests logits for index 7 on a context that has decoded nothing and expects
+ // `LogitsError::TokenNotInitialized(7)`.
+ fn get_logits_ith_returns_token_not_initialized_for_unknown_index(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ let result = context.get_logits_ith(7);
+
+ assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7))));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 64,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+ // Marks the index equal to `n_ctx` as initialized, then requests its logits and expects
+ // `LogitsError::TokenIndexExceedsContext`.
+ fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let params = (*fixture.context_params).into_llama_context_params();
+ let mut context = LlamaContext::from_model(fixture.model, fixture.backend, params)?;
+
+ // The index is exactly the context size reported by the context.
+ let index_at_capacity = i32::try_from(context.n_ctx())?;
+ context.mark_logits_initialized(index_at_capacity);
+
+ let result = context.get_logits_ith(index_at_capacity);
+
+ assert!(matches!(
+ result,
+ Err(LogitsError::TokenIndexExceedsContext { .. })
+ ));
+
+ Ok(())
+ }
+}
+
+mod context_kv_cache {
+ use std::num::NonZeroU8;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::context::kv_cache::KvCacheConversionError;
+ use llama_cpp_bindings::error::KvCacheSeqAddError;
+ use llama_cpp_bindings::error::KvCacheSeqDivError;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // Creates a fresh `LlamaContext` from the fixture's model, backend and context params.
+ // A creation failure is converted into this module's `Result` error type by `?`.
+ // The returned context is tied to the `'context` borrow of the fixture.
+ fn build_context<'context>(
+ fixture: &'context LlamaFixture<'_>,
+ ) -> Result<LlamaContext<'context>> {
+ Ok(LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?)
+ }
+
+ // Tokenizes "Hello world", adds the tokens to a fresh batch and decodes it on `context`.
+ // Any tokenization, batch or decode error is propagated to the caller.
+ fn decode_hello_world(
+ fixture: &LlamaFixture<'_>,
+ context: &mut LlamaContext<'_>,
+ ) -> Result<()> {
+ let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+
+ let mut prompt_batch = LlamaBatch::new(512, 1)?;
+ prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+
+ context.decode(&mut prompt_batch)?;
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, clears the KV cache, and checks that `kv_cache_seq_pos_max(0)`
+ // then reports -1.
+ fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+ decode_hello_world(fixture, &mut context)?;
+
+ context.clear_kv_cache();
+
+ let pos_max_after_clear = context.kv_cache_seq_pos_max(0);
+ assert_eq!(pos_max_after_clear, -1);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt and checks that `kv_cache_seq_pos_max(0)` is zero or greater.
+ fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ assert!(context.kv_cache_seq_pos_max(0) >= 0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, then calls `clear_kv_cache_seq(Some(0), Some(0), Some(1))` and
+ // expects `Ok`. Only the success of the call is checked, not the resulting cache state.
+ fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1));
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, then calls `copy_kv_cache_seq(0, 1, None, None)` and expects `Ok`.
+ fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ let result = context.copy_kv_cache_seq(0, 1, None, None);
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Smoke test: decodes a prompt, then calls `copy_cache(0, 1, pos_max + 1)` where
+ // `pos_max` is `kv_cache_seq_pos_max(0)`. Passing means the call returned.
+ fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+ decode_hello_world(fixture, &mut context)?;
+
+ let last_position = context.kv_cache_seq_pos_max(0);
+ let copy_size = last_position + 1;
+ context.copy_cache(0, 1, copy_size);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, then calls `kv_cache_seq_add(0, Some(0), None, 1)` and expects
+ // `KvCacheSeqAddError::IncompatibleRopeType`. Only the two Qwen cases above run this
+ // test (presumably the M-RoPE models the test name refers to — confirm).
+ fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ let result = context.kv_cache_seq_add(0, Some(0), None, 1);
+
+ // `unwrap_err` panics if the call unexpectedly succeeded.
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqAddError::IncompatibleRopeType,
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, then calls `kv_cache_seq_div` with a divisor of 2 and expects
+ // `KvCacheSeqDivError::IncompatibleRopeType`. Only the two Qwen cases above run this
+ // test (presumably the M-RoPE models the test name refers to — confirm).
+ fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+ decode_hello_world(fixture, &mut context)?;
+
+ let two = NonZeroU8::new(2);
+ let halving_divisor = two.ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+
+ let result = context.kv_cache_seq_div(0, Some(0), None, halving_divisor);
+
+ // `unwrap_err` panics if the call unexpectedly succeeded.
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqDivError::IncompatibleRopeType,
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, calls `kv_cache_seq_keep(0)`, and checks that
+ // `kv_cache_seq_pos_max(0)` is still zero or greater afterwards.
+ fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ context.kv_cache_seq_keep(0);
+
+ assert!(context.kv_cache_seq_pos_max(0) >= 0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a prompt, then calls `copy_kv_cache_seq(0, 2, Some(0), Some(1))` and expects
+ // `Ok`. Only the success of the call is checked, not the copied contents.
+ fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ decode_hello_world(fixture, &mut context)?;
+
+ let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1));
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // On a context that has decoded nothing, checks that `kv_cache_seq_pos_max(999)`
+ // reports -1.
+ fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let context = build_context(fixture)?;
+
+ assert_eq!(context.kv_cache_seq_pos_max(999), -1);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheConversionError::P0TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX));
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheConversionError::P1TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheConversionError::SeqIdTooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheConversionError::P0TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX));
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheConversionError::P1TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqAddError::P0TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqAddError::P1TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+ let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqDivError::P0TooLarge(_),
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?;
+ let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor);
+
+ assert!(matches!(
+ result.unwrap_err(),
+ KvCacheSeqDivError::P1TooLarge(_),
+ ));
+
+ Ok(())
+ }
+}
+
+mod context_session {
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+    /// Creates a `LlamaContext` for the fixture's model and backend from the fixture's context
+    /// parameters; the return type ties the context to the `'context` borrow of the fixture.
+    fn build_context<'context>(
+        fixture: &'context LlamaFixture<'_>,
+    ) -> Result<LlamaContext<'context>> {
+        let params = (*fixture.context_params).into_llama_context_params();
+
+        Ok(LlamaContext::from_model(fixture.model, fixture.backend, params)?)
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Round-trips the prompt tokens through `state_save_file` / `state_load_file`.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_session.bin");
+        context.state_save_file(&session_path, &tokens)?;
+
+        // Delete the temp file before checking, so a failed load or assertion cannot leak it.
+        let loaded_tokens = context.state_load_file(&session_path, 512);
+        std::fs::remove_file(&session_path)?;
+        assert_eq!(loaded_tokens?, tokens);
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let context = build_context(fixture)?;
+
+ assert!(context.get_state_size() > 0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+        // Round-trips sequence 0 through `state_seq_save_file` / `state_seq_load_file`.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_seq_state.bin");
+        let bytes_written = context.state_seq_save_file(&session_path, 0, &tokens)?;
+        // Delete the temp file before checking, so a failed load or assertion cannot leak it.
+        let loaded = context.state_seq_load_file(&session_path, 0, 512);
+        std::fs::remove_file(&session_path)?;
+        assert!(bytes_written > 0);
+        let (loaded_tokens, bytes_read) = loaded?;
+        assert_eq!(loaded_tokens, tokens);
+        assert!(bytes_read > 0);
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+ let mut batch = LlamaBatch::new(512, 1)?;
+ batch.add_sequence(&tokens, 0, false)?;
+ context.decode(&mut batch)?;
+
+ let state_size = context.get_state_size();
+ let mut state_data = vec![0u8; state_size];
+ let bytes_copied = unsafe { context.copy_state_data(&mut state_data) };
+ assert!(bytes_copied > 0);
+
+ let bytes_read = unsafe { context.set_state_data(&state_data) };
+ assert!(bytes_read > 0);
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_load_file_with_nonexistent_file_returns_error(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.state_load_file("/nonexistent/session.bin", 512);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_seq_load_file_with_nonexistent_file_returns_error(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mut context = build_context(fixture)?;
+
+ let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_save_file_to_invalid_directory_returns_failed_to_save(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let context = build_context(fixture)?;
+
+ let result = context.state_save_file("/nonexistent_dir/session.bin", &[]);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_seq_save_file_to_invalid_directory_returns_failed_to_save(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let context = build_context(fixture)?;
+
+ let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn state_load_file_with_zero_max_tokens_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Loading a saved session with a token capacity of 0 is expected to fail.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_session_zero_max.bin");
+        context.state_save_file(&session_path, &tokens)?;
+
+        let result = context.state_load_file(&session_path, 0);
+        // Best-effort cleanup runs first, so a failing assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn state_seq_load_file_with_zero_max_tokens_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Loading a saved sequence state with a token capacity of 0 is expected to fail.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_seq_state_zero_max.bin");
+        context.state_seq_save_file(&session_path, 0, &tokens)?;
+
+        let result = context.state_seq_load_file(&session_path, 0, 0);
+        // Best-effort cleanup runs first, so a failing assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn state_load_file_with_insufficient_max_tokens_returns_length_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Loading a multi-token session with a token capacity of 1 is expected to fail.
+        // NOTE(review): only `is_err()` is asserted, not the specific length-error variant.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token(
+            "Hello world this is a longer string for more tokens",
+            AddBos::Always,
+        )?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_session_insuf.bin");
+        context.state_save_file(&session_path, &tokens)?;
+        let result = context.state_load_file(&session_path, 1);
+        // Best-effort cleanup runs first, so a failing assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // Loading a multi-token sequence state with a token capacity of 1 is expected to fail.
+        // NOTE(review): only `is_err()` is asserted, not the specific length-error variant.
+        let mut context = build_context(fixture)?;
+        let tokens = fixture.model.str_to_token(
+            "Hello world this is a longer string for more tokens",
+            AddBos::Always,
+        )?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let session_path = std::env::temp_dir().join("llama_test_seq_state_insuf.bin");
+        context.state_seq_save_file(&session_path, 0, &tokens)?;
+        let result = context.state_seq_load_file(&session_path, 0, 1);
+        // Best-effort cleanup runs first, so a failing assertion cannot leak the temp file.
+        let _ = std::fs::remove_file(&session_path);
+        assert!(result.is_err());
+
+        Ok(())
+    }
+
+ #[cfg(unix)]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ use std::ffi::OsStr;
+ use std::os::unix::ffi::OsStrExt;
+
+ let context = build_context(fixture)?;
+
+ let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
+ let result = context.state_save_file(non_utf8_path, &[]);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+ #[cfg(unix)]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ use std::ffi::OsStr;
+ use std::os::unix::ffi::OsStrExt;
+
+ let mut context = build_context(fixture)?;
+
+ let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
+ let result = context.state_load_file(non_utf8_path, 512);
+
+ assert!(result.is_err());
+
+ Ok(())
+ }
+
+    // `state_seq_save_file` must return an error for a path that is not valid UTF-8 (Unix only).
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_file_with_non_utf8_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // Bytes 0xFF and 0xFE never occur in valid UTF-8, so this path has no `&str` form.
+        let invalid_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
+        let ctx = build_context(fixture)?;
+
+        // `0` is presumably the sequence id; `&[]` passes no tokens. Only failure is asserted.
+        assert!(ctx.state_seq_save_file(invalid_path, 0, &[]).is_err());
+        Ok(())
+    }
+
+    // `state_seq_load_file` must return an error for a path that is not valid UTF-8 (Unix only).
+    #[cfg(unix)]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_non_utf8_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use std::ffi::OsStr;
+        use std::os::unix::ffi::OsStrExt;
+
+        // Bytes 0xFF and 0xFE never occur in valid UTF-8, so this path has no `&str` form.
+        let invalid_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin"));
+        let mut ctx = build_context(fixture)?;
+
+        // NOTE(review): the test never creates this file, so a missing-file error also passes.
+        assert!(ctx.state_seq_load_file(invalid_path, 0, 512).is_err());
+        Ok(())
+    }
+
+    // `state_save_file` must return an error when the path contains an interior NUL byte.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_save_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The NUL byte sits inside the file name rather than at its end.
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+        let ctx = build_context(fixture)?;
+
+        // `&[]` passes no tokens; only failure is asserted, not the error value.
+        assert!(ctx.state_save_file(nul_path, &[]).is_err());
+        Ok(())
+    }
+
+    // `state_load_file` must return an error when the path contains an interior NUL byte.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_load_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The NUL byte sits inside the file name rather than at its end.
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+        let mut ctx = build_context(fixture)?;
+
+        // NOTE(review): the test never creates this file, so a missing-file error also passes.
+        assert!(ctx.state_load_file(nul_path, 512).is_err());
+        Ok(())
+    }
+
+    // `state_seq_save_file` must return an error when the path contains an interior NUL byte.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_save_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The NUL byte sits inside the file name rather than at its end.
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+        let ctx = build_context(fixture)?;
+
+        // `0` is presumably the sequence id; `&[]` passes no tokens. Only failure is asserted.
+        assert!(ctx.state_seq_save_file(nul_path, 0, &[]).is_err());
+        Ok(())
+    }
+
+    // `state_seq_load_file` must return an error when the path contains an interior NUL byte.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_load_file_with_null_byte_in_path_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        // The NUL byte sits inside the file name rather than at its end.
+        let nul_path = std::path::Path::new("/tmp/foo\0bar.bin");
+        let mut ctx = build_context(fixture)?;
+
+        // NOTE(review): the test never creates this file, so a missing-file error also passes.
+        assert!(ctx.state_seq_load_file(nul_path, 0, 512).is_err());
+        Ok(())
+    }
+
+    // After decoding a prompt, `state_seq_get_size_ext` must report a non-zero size.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_get_size_ext_returns_size_for_decoded_sequence(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+        let mut ctx = build_context(fixture)?;
+
+        // Decode a short prompt under id 0 so there is something to measure.
+        let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+        ctx.decode(&mut prompt_batch)?;
+
+        // Query the size for id 0 with an empty flag set.
+        let no_flags = LlamaStateSeqFlags::empty();
+        assert!(ctx.state_seq_get_size_ext(0, &no_flags) > 0);
+        Ok(())
+    }
+
+    // Exports the state stored under id 0 into a buffer, then imports that same buffer again.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn state_seq_get_data_ext_and_set_data_ext_round_trip(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags;
+
+        let mut ctx = build_context(fixture)?;
+
+        // Decode a short prompt under id 0 so there is state to export.
+        let prompt_tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?;
+        let mut prompt_batch = LlamaBatch::new(512, 1)?;
+        prompt_batch.add_sequence(&prompt_tokens, 0, false)?;
+        ctx.decode(&mut prompt_batch)?;
+
+        let no_flags = LlamaStateSeqFlags::empty();
+        let mut state_bytes = vec![0u8; ctx.state_seq_get_size_ext(0, &no_flags)];
+        // SAFETY: (assumed contract) `state_bytes` is exactly as long as the size reported above.
+        let written = unsafe { ctx.state_seq_get_data_ext(&mut state_bytes, 0, &no_flags) };
+        assert!(written > 0);
+
+        // SAFETY: (assumed contract) `state_bytes` was just filled by the export, same id and flags.
+        let read_back = unsafe { ctx.state_seq_set_data_ext(&state_bytes, 0, &no_flags) };
+        assert!(read_back > 0);
+        Ok(())
+    }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/llguidance.rs b/llama-cpp-bindings-tests/tests/llguidance.rs
deleted file mode 100644
index 74bd229a..00000000
--- a/llama-cpp-bindings-tests/tests/llguidance.rs
+++ /dev/null
@@ -1,686 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::ffi::CStr;
-use std::sync::Arc;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings::token::LlamaToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const JSON_SCHEMA: &str =
- r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
-const REGEX_GRAMMAR: &str = r"yes|no";
-const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?;
-
- assert!(!sampler.sampler.is_null());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
- assert!(!sampler.sampler.is_null());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?;
-
- assert!(!sampler.sampler.is_null());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything");
-
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = create_llg_sampler(fixture.model, "json", "{this is not valid json");
-
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = create_llg_sampler(fixture.model, "regex", "[invalid");
-
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
- let name_ptr = unsafe { llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) };
- assert!(!name_ptr.is_null());
- let name = unsafe { CStr::from_ptr(name_ptr) }.to_str()?;
-
- assert_eq!(name, "llguidance");
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
- let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) };
-
- assert!(!cloned.is_null());
-
- unsafe { llama_cpp_bindings_sys::llama_sampler_free(cloned) };
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let prompt = "Answer yes or no:";
- let tokens = model.str_to_token(prompt, AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
- let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
-
- let token = chain.sample(&context, batch.n_tokens() - 1)?;
- chain.accept(token)?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
- let huge_token = LlamaToken(i32::MAX - 1);
- let _ = sampler.accept(huge_token);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
- let first = fixture.model.approximate_tok_env();
- let second = fixture.model.approximate_tok_env();
-
- assert!(Arc::ptr_eq(&first, &second));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn approximate_tok_env_drives_consistent_grammar_constraint(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let first = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
- let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
-
- assert!(!first.sampler.is_null());
- assert!(!second.sampler.is_null());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let tokens = model.str_to_token("Answer:", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let llg_sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
- let mut chain = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
- let _ = chain.sample(&context, batch.n_tokens() - 1);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
- let huge_token = LlamaToken(i32::MAX - 1);
- let _ = sampler.accept(huge_token);
- sampler.reset();
- let after = sampler.accept(LlamaToken(0));
- assert!(
- after.is_ok() || after.is_err(),
- "after reset, sampler.accept must return Ok or Err (not panic)"
- );
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 6ae1d9cd..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\
-[INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]";
-
-const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(!outcome.generated_raw.is_empty());
- assert_eq!(outcome.observed_reasoning, 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.reasoning_tokens, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs
deleted file mode 100644
index 296ad348..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 768;
-
-const MISTRAL3_THINKING_PROMPT: &str = "\
-[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
-First draft your thinking process (inner monologue) until you arrive at a response. \
-Format your response using Markdown, and use LaTeX for any mathematical equations. \
-Write both your thoughts and the response in the same language as the input.\n\n\
-Your thinking process must follow the template below:\
-[THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
-Be as casual and as long as you want until you are confident to generate the response \
-to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
-[INST]Reply with the single word: four. Do not explain.[/INST]";
-
-const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(usage.reasoning_tokens > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning,
- );
- assert!(!parsed.reasoning_content.is_empty());
- assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
- assert_eq!(outcome.content_stream, parsed.content);
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.reasoning_stream.contains(forbidden));
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index abb5c39f..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 768;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
-)]
-fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
- let prompt = format!(
- "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
- First draft your thinking process (inner monologue) until you arrive at a response. \
- Format your response using Markdown, and use LaTeX for any mathematical equations. \
- Write both your thoughts and the response in the same language as the input.\n\n\
- Your thinking process must follow the template below:\
- [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
- Be as casual and as long as you want until you are confident to generate the response \
- to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
- [INST]{marker}What animals do you see in this image?[/INST]"
- );
-
- let input_text = MtmdInputText {
- text: prompt,
- add_special: true,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- let mut classifier = model.sampled_token_classifier();
- let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
- let mut sampler = LlamaSampler::greedy();
- let mut batch = LlamaBatch::new(2048, 1)?;
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position: n_past,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- if outcome.observed_reasoning == 0 {
- anyhow::bail!(
- "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
- when the model opens a `[THINK]` block; outcome={outcome:?}"
- );
- }
- if usage.reasoning_tokens == 0 {
- anyhow::bail!(
- "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs
deleted file mode 100644
index b67e0765..00000000
--- a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str =
- r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome =
- fixture
- .model
- .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
- );
- };
- assert_eq!(parsed.tool_calls.len(), 1);
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_chat_template.rs b/llama-cpp-bindings-tests/tests/model_chat_template.rs
deleted file mode 100644
index 88511471..00000000
--- a/llama-cpp-bindings-tests/tests/model_chat_template.rs
+++ /dev/null
@@ -1,194 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::ChatTemplateError;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> Result<()> {
- let template = fixture.model.chat_template(None);
- assert!(template.is_ok());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let template = model.chat_template(None)?;
- let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?;
- let prompt = model.apply_chat_template(&template, &[message], true);
-
- assert!(prompt.is_ok());
- assert!(!prompt?.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let template = model.chat_template(None)?;
- let long_content = "a".repeat(2000);
- let message = LlamaChatMessage::new("user".to_string(), long_content)?;
- let prompt = model.apply_chat_template(&template, &[message], true);
-
- assert!(prompt.is_ok());
- assert!(!prompt?.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = fixture
- .model
- .chat_template(Some("nonexistent_template_name_xyz"));
- assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate);
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_context_creation.rs b/llama-cpp-bindings-tests/tests/model_context_creation.rs
deleted file mode 100644
index 300027ec..00000000
--- a/llama-cpp-bindings-tests/tests/model_context_creation.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> {
- let context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- assert!(context.n_ctx() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4294967295,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4294967295,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4294967295,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4294967295,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- );
-
- assert!(result.is_err());
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_helpers.rs b/llama-cpp-bindings-tests/tests/model_helpers.rs
deleted file mode 100644
index 3efeae82..00000000
--- a/llama-cpp-bindings-tests/tests/model_helpers.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128
-)]
-fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
- let formatted = format!("{:?}", fixture.model);
-
- assert!(formatted.contains("LlamaModel"));
- assert!(formatted.contains("model"));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128
-)]
-fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
- let _markers = fixture.model.tool_call_markers();
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128
-)]
-fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let _markers = fixture.model.streaming_markers()?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128
-)]
-fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
- let first = fixture.model.approximate_tok_env();
- let second = fixture.model.approximate_tok_env();
-
- assert!(std::sync::Arc::ptr_eq(&first, &second));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128
-)]
-fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let env = fixture.model.approximate_tok_env();
- let env_again = fixture.model.approximate_tok_env();
-
- assert!(
- std::sync::Arc::ptr_eq(&env, &env_again),
- "approximate_tok_env must return the same cached Arc for any model, including \
- the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)"
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_loading_errors.rs b/llama-cpp-bindings-tests/tests/model_loading_errors.rs
index cd36eb46..6cf63144 100644
--- a/llama-cpp-bindings-tests/tests/model_loading_errors.rs
+++ b/llama-cpp-bindings-tests/tests/model_loading_errors.rs
@@ -4,8 +4,10 @@
)]
use std::path::Path;
+use std::path::PathBuf;
use anyhow::Result;
+use llama_cpp_bindings::LlamaLoraAdapterInitError;
use llama_cpp_bindings::LlamaModelLoadError;
use llama_cpp_bindings::model::LlamaModel;
use llama_cpp_bindings::model::params::LlamaModelParams;
@@ -169,4 +171,151 @@ fn load_model_with_non_utf8_path_returns_path_to_str_error(
Ok(())
}
+#[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let result = fixture
+ .model
+ .lora_adapter_init("/nonexistent/path/lora.gguf");
+ assert_eq!(
+ result.unwrap_err(),
+ LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf"))
+ );
+ Ok(())
+}
+
+#[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+fn lora_adapter_init_with_invalid_gguf_returns_unloadable(
+ fixture: &LlamaFixture<'_>,
+) -> Result<()> {
+ let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf");
+ std::fs::write(&dummy_path, b"not a valid gguf")?;
+
+ let result = fixture.model.lora_adapter_init(&dummy_path);
+
+ assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable);
+ let _ = std::fs::remove_file(&dummy_path);
+ Ok(())
+}
+
+#[cfg(unix)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+#[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+)]
+fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ use std::ffi::OsStr;
+ use std::os::unix::ffi::OsStrExt;
+
+ let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf"));
+ let result = fixture.model.lora_adapter_init(non_utf8_path);
+
+ assert_eq!(
+ result.unwrap_err(),
+ LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf())
+ );
+ Ok(())
+}
+
llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs b/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs
deleted file mode 100644
index ae04dad8..00000000
--- a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::path::PathBuf;
-
-use anyhow::Result;
-use llama_cpp_bindings::LlamaLoraAdapterInitError;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = fixture
- .model
- .lora_adapter_init("/nonexistent/path/lora.gguf");
- assert_eq!(
- result.unwrap_err(),
- LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf"))
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn lora_adapter_init_with_invalid_gguf_returns_unloadable(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf");
- std::fs::write(&dummy_path, b"not a valid gguf")?;
-
- let result = fixture.model.lora_adapter_init(&dummy_path);
-
- assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable);
- let _ = std::fs::remove_file(&dummy_path);
- Ok(())
-}
-
-#[cfg(unix)]
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::ffi::OsStr;
- use std::os::unix::ffi::OsStrExt;
- use std::path::Path;
-
- let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf"));
- let result = fixture.model.lora_adapter_init(non_utf8_path);
-
- assert_eq!(
- result.unwrap_err(),
- LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf())
- );
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs b/llama-cpp-bindings-tests/tests/model_metadata_kv.rs
deleted file mode 100644
index 7d99b859..00000000
--- a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs
+++ /dev/null
@@ -1,355 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(fixture.model.meta_count() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
- let key = fixture.model.meta_key_by_index(0)?;
- assert!(!key.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
- let value = fixture.model.meta_val_str_by_index(0)?;
- assert!(!value.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = fixture.model.meta_key_by_index(999_999);
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = fixture.model.meta_val_str_by_index(999_999);
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let first_key = model.meta_key_by_index(0)?;
- let value = model.meta_val_str(&first_key)?;
- assert!(!value.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let count = model.meta_count();
-
- for index in 0..count {
- let key = model.meta_key_by_index(index);
- let value = model.meta_val_str_by_index(index);
- assert!(key.is_ok());
- assert!(value.is_ok());
- }
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = fixture.model.meta_val_str("key\0with_null");
- assert!(result.is_err());
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_params.rs b/llama-cpp-bindings-tests/tests/model_params.rs
deleted file mode 100644
index 6684625e..00000000
--- a/llama-cpp-bindings-tests/tests/model_params.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-#![expect(
- clippy::similar_names,
- reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
-)]
-
-use std::ffi::CString;
-use std::pin::pin;
-
-use anyhow::Result;
-use llama_cpp_bindings::context::params::LlamaContextParams;
-use llama_cpp_bindings::max_devices;
-use llama_cpp_bindings::model::params::LlamaModelParams;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model_path_str = fixture
- .model_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?;
- let model_path_cstr = CString::new(model_path_str)?;
-
- let mut params = pin!(LlamaModelParams::default());
- let mut context_params = LlamaContextParams::default();
- let mut margins = vec![0usize; max_devices()];
-
- let result = params.as_mut().fit_params(
- &model_path_cstr,
- &mut context_params,
- &mut margins,
- 512,
- llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
- );
-
- let fit = result.map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
- assert!(fit.n_ctx > 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_properties.rs b/llama-cpp-bindings-tests/tests/model_properties.rs
deleted file mode 100644
index bd33ef6b..00000000
--- a/llama-cpp-bindings-tests/tests/model_properties.rs
+++ /dev/null
@@ -1,421 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
-
- assert!(model.n_vocab() > 0);
- assert!(model.n_embd() > 0);
- assert!(model.n_params() > 0);
- assert!(model.n_ctx_train()? > 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(fixture.model.n_layer()? > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(fixture.model.n_head()? > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(fixture.model.n_head_kv()? > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(fixture.model.size() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(!fixture.model.is_recurrent());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(
- !fixture.model.is_hybrid(),
- "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
- assert!(
- fixture.model.is_hybrid(),
- "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- use llama_cpp_bindings::model::rope_type::RopeType;
- let rope = fixture.model.rope_type();
- assert!(
- matches!(
- rope,
- Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
- ),
- "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
- let rope = fixture.model.rope_type();
- assert!(
- rope.is_none(),
- "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- use llama_cpp_bindings::model::vocab_type::VocabType;
- let vocab = fixture.model.vocab_type()?;
- assert!(
- matches!(vocab, VocabType::BPE | VocabType::SPM),
- "vocab_type must be a known variant; got {vocab:?}"
- );
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_sampling.rs b/llama-cpp-bindings-tests/tests/model_sampling.rs
deleted file mode 100644
index d6b40ba4..00000000
--- a/llama-cpp-bindings-tests/tests/model_sampling.rs
+++ /dev/null
@@ -1,452 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::json_schema_to_grammar;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 256,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut context = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let tokens = model.str_to_token("Hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- batch.add_sequence(&tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
- let result = sampler.sample(&context, batch.n_tokens() - 1);
-
- assert!(result.is_ok());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut context = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
- let tokens = model.str_to_token(prompt, AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- batch.add_sequence(&tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?,
- LlamaSampler::temp(0.8),
- LlamaSampler::greedy(),
- ]);
-
- let mut classifier = model.sampled_token_classifier();
- let (raw_token, mut outcomes) =
- classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
- outcomes.extend(classifier.flush());
-
- assert_eq!(
- outcomes.len(),
- 1,
- "expected one finalised outcome after flush"
- );
- let outcome = &outcomes[0];
-
- let raw_as_sampled = SampledToken::Content(raw_token);
- assert!(
- !model.is_eog_token(&raw_as_sampled),
- "Grammar sampler should not allow EOS as first token"
- );
-
- let piece = &outcome.raw_piece;
- let first_char = piece
- .chars()
- .next()
- .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))?
- .to_lowercase()
- .next()
- .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
-
- assert!(
- first_char == 'y' || first_char == 'n',
- "Grammar should constrain first token to start with y/n, got: '{piece}'"
- );
- assert_eq!(
- classifier.usage().completion_tokens(),
- 1,
- "exactly one completion token sampled"
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut context = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
- let tokens = model.str_to_token(prompt, AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- batch.add_sequence(&tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let grammar_str = json_schema_to_grammar(
- r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
- )?;
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::grammar(model, &grammar_str, "root")?,
- LlamaSampler::temp(0.8),
- LlamaSampler::greedy(),
- ]);
-
- let mut classifier = model.sampled_token_classifier();
- let (raw_token, mut outcomes) =
- classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
- outcomes.extend(classifier.flush());
-
- assert_eq!(
- outcomes.len(),
- 1,
- "expected one finalised outcome after flush"
- );
- let outcome = &outcomes[0];
-
- let raw_as_sampled = SampledToken::Content(raw_token);
- assert!(
- !model.is_eog_token(&raw_as_sampled),
- "Grammar sampler should not allow EOS as first token"
- );
-
- let piece = &outcome.raw_piece;
-
- assert!(
- piece.starts_with('{'),
- "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
- );
- assert_eq!(
- classifier.usage().completion_tokens(),
- 1,
- "exactly one completion token sampled"
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn sample_with_grammar_produces_constrained_output_in_loop(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut context = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
- let tokens = model.str_to_token(prompt, AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- let mut classifier = model.sampled_token_classifier();
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- context.decode(&mut batch)?;
- classifier.commit_prompt_tokens();
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?,
- LlamaSampler::temp(0.8),
- LlamaSampler::greedy(),
- ]);
-
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: 10,
- }
- .run()?;
-
- let lowercase = outcome.generated_raw.to_lowercase();
- assert!(
- lowercase == "yes" || lowercase == "no",
- "Grammar loop should produce 'yes' or 'no', got: '{}'",
- outcome.generated_raw
- );
- assert!(
- outcome.eog_seen,
- "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
- );
- assert_eq!(outcome.observed_reasoning, 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(outcome.observed_tool_call, 0);
- assert!(outcome.observed_content > 0);
-
- let usage = classifier.into_usage();
- assert_eq!(usage.completion_tokens(), outcome.observed_content);
- assert_eq!(usage.reasoning_tokens, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut context = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let prompt =
- "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
- let tokens = model.str_to_token(prompt, AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
-
- batch.add_sequence(&tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
-
- let mut classifier = model.sampled_token_classifier();
- let mut sampled_count: u64 = 0;
-
- for (position, _) in (batch.n_tokens()..).zip(0..5) {
- let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?;
- let raw_as_sampled = SampledToken::Content(raw_token);
-
- if model.is_eog_token(&raw_as_sampled) {
- break;
- }
-
- sampled_count += 1;
-
- batch.clear();
- batch.add(&raw_as_sampled, position, &[0], true)?;
-
- context.decode(&mut batch)?;
- }
-
- let _ = classifier.flush();
-
- assert!(
- sampled_count > 0,
- "Should produce at least one token without grammar"
- );
- let usage = classifier.into_usage();
- assert!(
- usage.completion_tokens() >= sampled_count,
- "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
- usage.completion_tokens()
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_special_tokens.rs b/llama-cpp-bindings-tests/tests/model_special_tokens.rs
deleted file mode 100644
index c719501b..00000000
--- a/llama-cpp-bindings-tests/tests/model_special_tokens.rs
+++ /dev/null
@@ -1,381 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let bos = model.token_bos();
- let eos = model.token_eos();
-
- assert_ne!(bos, eos);
- assert!(model.is_eog_token(&SampledToken::Content(eos)));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let nl_token = fixture.model.token_nl();
- assert!(nl_token.0 >= 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let eos = model.token_eos();
- assert!(model.is_eog_token(&SampledToken::Reasoning(eos)));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let eos = model.token_eos();
- assert!(model.is_eog_token(&SampledToken::ToolCall(eos)));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let eos = model.token_eos();
- assert!(model.is_eog_token(&SampledToken::Undeterminable(eos)));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let token = model.decode_start_token();
- let n_vocab = model.n_vocab();
- assert!(
- token.0 == -1 || (0..n_vocab).contains(&token.0),
- "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
- );
- assert_eq!(
- token,
- model.decode_start_token(),
- "decode_start_token must be deterministic across calls"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let token = model.token_sep();
- let n_vocab = model.n_vocab();
- assert!(
- token.0 == -1 || (0..n_vocab).contains(&token.0),
- "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
- );
- assert_eq!(
- token,
- model.token_sep(),
- "token_sep must be deterministic across calls"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let bos = model.token_bos();
- let attrs = model.token_attr(bos)?;
- let bit_repr = format!("{:?}", *attrs);
- assert!(
- !bit_repr.is_empty(),
- "token_attr(bos) must produce Debug output"
- );
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_str_to_token.rs b/llama-cpp-bindings-tests/tests/model_str_to_token.rs
deleted file mode 100644
index ea8ebb9c..00000000
--- a/llama-cpp-bindings-tests/tests/model_str_to_token.rs
+++ /dev/null
@@ -1,210 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let tokens = model.str_to_token("hello world", AddBos::Never)?;
- assert!(!tokens.is_empty());
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let piece = model.token_to_piece(
- &llama_cpp_bindings::SampledToken::Content(tokens[0]),
- &mut decoder,
- false,
- None,
- )?;
-
- assert!(!piece.is_empty());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn str_to_token_grows_buffer_when_initial_estimation_too_small(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let many_short_chars = "a b c d e f g h i j k l";
- let tokens = fixture
- .model
- .str_to_token(many_short_chars, AddBos::Always)?;
-
- assert!(
- tokens.len() > 8,
- "expected regrow; got {} tokens",
- tokens.len()
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?;
- let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?;
-
- assert!(tokens_with_bos.len() >= tokens_without_bos.len());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> {
- use std::fmt::Write;
-
- let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| {
- let _ = write!(accumulator, "{number} ");
- accumulator
- });
-
- let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?;
-
- assert!(tokens.len() > many_numbers.len() / 2);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs b/llama-cpp-bindings-tests/tests/model_token_to_piece.rs
deleted file mode 100644
index b86d391b..00000000
--- a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs
+++ /dev/null
@@ -1,364 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use std::num::NonZeroU16;
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let tokens = model.str_to_token("hello", AddBos::Never)?;
- let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?;
-
- assert!(!bytes.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_handles_large_token_requiring_buffer_resize(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut decoder = encoding_rs::UTF_8.new_decoder();
-
- for (token, _) in model.tokens(true).take(200) {
- let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
- assert!(result.is_ok());
- }
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_bytes_insufficient_buffer_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let tokens = model.str_to_token("hello", AddBos::Never)?;
- let result = model.token_to_piece_bytes(tokens[0], 1, false, None);
-
- assert!(
- result
- .unwrap_err()
- .to_string()
- .contains("Insufficient Buffer Space")
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let tokens = model.str_to_token("hello", AddBos::Never)?;
- let result = model.token_to_piece(
- &SampledToken::Content(tokens[0]),
- &mut decoder,
- false,
- NonZeroU16::new(1),
- );
-
- assert!(result.is_ok());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let tokens = model.str_to_token("hi", AddBos::Never)?;
-
- let piece = model.token_to_piece(
- &SampledToken::Reasoning(tokens[0]),
- &mut decoder,
- true,
- None,
- )?;
-
- assert!(!piece.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let tokens = model.str_to_token("hi", AddBos::Never)?;
-
- let piece =
- model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?;
-
- assert!(!piece.is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut decoder = encoding_rs::UTF_8.new_decoder();
- let tokens = model.str_to_token("hi", AddBos::Never)?;
-
- let piece = model.token_to_piece(
- &SampledToken::Undeterminable(tokens[0]),
- &mut decoder,
- true,
- None,
- )?;
-
- assert!(!piece.is_empty());
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs b/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs
deleted file mode 100644
index 3f9ad9da..00000000
--- a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs
+++ /dev/null
@@ -1,109 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut count = 0;
-
- for (token, _piece_result) in model.tokens(false) {
- assert!(token.0 >= 0);
- count += 1;
-
- if count >= 100 {
- break;
- }
- }
-
- assert_eq!(count, 100);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let n_vocab = model.n_vocab();
- let count = model.tokens(false).count();
-
- assert_eq!(count, usize::try_from(n_vocab)?);
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs b/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs
deleted file mode 100644
index 3c66f82f..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let fixtures = test_model::fixtures_dir();
- let image_path = fixtures.join("llamas.jpg");
- let image_bytes = std::fs::read(&image_path)?;
- let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?;
-
- assert!(bitmap.nx() > 0);
- assert!(bitmap.ny() > 0);
- assert!(!bitmap.is_audio());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null");
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs
deleted file mode 100644
index 8a960774..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs
+++ /dev/null
@@ -1,147 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let input_text = MtmdInputText {
- text: "Hello <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- let copied = first_chunk.copy()?;
-
- assert!(copied.owned);
- assert_eq!(copied.n_tokens(), first_chunk.n_tokens());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let input_text = MtmdInputText {
- text: "Describe: <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- for chunk_index in 0..chunks.len() {
- let chunk = chunks
- .get(chunk_index)
- .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
- if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
- let result = mtmd_ctx.encode_chunk(&chunk);
- assert!(result.is_ok());
- return Ok(());
- }
- }
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let input_text = MtmdInputText {
- text: "Describe: <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
- for chunk_index in 0..chunks.len() {
- let chunk = chunks
- .get(chunk_index)
- .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
- if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
- let value = mtmd_ctx.decode_use_non_causal(&chunk);
- let printed = format!("{value:?}");
- assert!(
- !printed.is_empty(),
- "decode_use_non_causal must return a Debug-printable bool"
- );
- return Ok(());
- }
- }
- anyhow::bail!("tokenization should produce at least one Image chunk");
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs
deleted file mode 100644
index 1114af3c..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs
+++ /dev/null
@@ -1,242 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputChunkType;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn tokenize_synthetic(
- fixture: &LlamaFixture<'_>,
- prompt: &str,
-) -> Result {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let input_text = MtmdInputText {
- text: prompt.to_owned(),
- add_special: true,
- parse_special: true,
- };
- Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?)
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- let tokens = first_chunk.text_tokens();
- assert!(tokens.is_some());
- assert!(!tokens.expect("tokens should be some").is_empty());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- assert!(first_chunk.n_tokens() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- assert!(first_chunk.n_positions() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
- let first_chunk = chunks
- .get(0)
- .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
- assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text);
- assert!(first_chunk.id().is_none());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
- for chunk_index in 0..chunks.len() {
- let chunk = chunks
- .get(chunk_index)
- .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
- if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
- assert!(chunk.text_tokens().is_none());
- return Ok(());
- }
- }
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
- let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
- for chunk_index in 0..chunks.len() {
- let chunk = chunks
- .get(chunk_index)
- .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
- if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
- assert!(chunk.id().is_some());
- return Ok(());
- }
- }
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_context.rs b/llama-cpp-bindings-tests/tests/mtmd_context.rs
deleted file mode 100644
index 8595eb2b..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_context.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdContext;
-use llama_cpp_bindings::mtmd::MtmdContextParams;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- assert!(mtmd_ctx.support_vision());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn init_from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_params = MtmdContextParams::default();
- let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
-
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- assert!(
- mtmd_ctx.decode_use_mrope(),
- "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- assert!(
- !mtmd_ctx.support_audio(),
- "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
- );
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- assert!(
- mtmd_ctx.get_audio_sample_rate().is_none(),
- "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
- );
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs b/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs
deleted file mode 100644
index b6f30f1c..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs
+++ /dev/null
@@ -1,236 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdEvalError;
-use llama_cpp_bindings::mtmd::MtmdInputChunks;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; (width as usize) * (height as usize) * 3];
- let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?;
- let input_text = MtmdInputText {
- text: "Describe: <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
- let n_positions = chunks.total_positions();
- let required_n_ctx = u32::try_from(n_positions + 256)?;
- if fixture.context_params.n_ctx < required_n_ctx {
- anyhow::bail!(
- "fixture n_ctx ({}) below required ({}) for {}x{} image",
- fixture.context_params.n_ctx,
- required_n_ctx,
- width,
- height,
- );
- }
-
- let llama_ctx = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let n_batch = i32::try_from(llama_ctx.n_batch())?;
- chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?;
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 64,
- n_batch = 64,
- n_ubatch = 32,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 64,
- n_batch = 64,
- n_ubatch = 32,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let llama_ctx = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let chunks = MtmdInputChunks::new()?;
- let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?;
-
- let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false);
-
- assert!(matches!(
- result,
- Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let fixtures = test_model::fixtures_dir();
- let image_path = fixtures.join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
- let input_text = MtmdInputText {
- text: "What is in this image? <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
- let n_positions = chunks.total_positions();
- let required_n_ctx = u32::try_from(n_positions + 256)?;
- assert!(
- fixture.context_params.n_ctx >= required_n_ctx,
- "fixture n_ctx ({}) below required ({}); update the attribute literal",
- fixture.context_params.n_ctx,
- required_n_ctx,
- );
-
- let llama_ctx = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let n_batch = i32::try_from(llama_ctx.n_batch())?;
- let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
- let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
-
- for (width, height) in test_dimensions {
- let result = eval_synthetic_bitmap(fixture, width, height);
- assert!(
- result.is_ok(),
- "dimension {width}x{height} should succeed: {result:?}"
- );
- }
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> {
- let extreme_dimensions: [(u32, u32); 6] = [
- (1, 1),
- (7, 13),
- (3, 1000),
- (1000, 3),
- (1920, 1080),
- (4096, 4096),
- ];
-
- let mut any_reached_eval = false;
-
- for (width, height) in extreme_dimensions {
- match eval_synthetic_bitmap(fixture, width, height) {
- Ok(()) => any_reached_eval = true,
- Err(error) => eprintln!(" {width}x{height} failed: {error}"),
- }
- }
-
- assert!(
- any_reached_eval,
- "at least one extreme dimension should reach eval_chunks"
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs b/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs
deleted file mode 100644
index ae5f32c3..00000000
--- a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs
+++ /dev/null
@@ -1,121 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let input_text = MtmdInputText {
- text: "Describe this image: <__media__>".to_string(),
- add_special: true,
- parse_special: true,
- };
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- assert!(!chunks.is_empty());
- assert!(chunks.total_tokens() > 0);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let input_text = MtmdInputText {
- text: "No media markers here".to_string(),
- add_special: true,
- parse_special: true,
- };
- let image_data = vec![128u8; 64 * 64 * 3];
- let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
- let result = mtmd_ctx.tokenize(input_text, &[&bitmap]);
- assert!(result.is_err());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
- let input_text = MtmdInputText {
- text: "text\0null".to_string(),
- add_special: true,
- parse_special: true,
- };
- let result = mtmd_ctx.tokenize(input_text, &[]);
- assert!(result.is_err());
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/multimodal.rs b/llama-cpp-bindings-tests/tests/multimodal.rs
deleted file mode 100644
index c1108c4d..00000000
--- a/llama-cpp-bindings-tests/tests/multimodal.rs
+++ /dev/null
@@ -1,212 +0,0 @@
-use anyhow::{Context, Result};
-use llama_cpp_bindings::SampledTokenClassifier;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel};
-use llama_cpp_bindings::mtmd::{MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText};
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_sys::llama_pos;
-use llama_cpp_bindings_tests::test_model;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-struct ChunkTokenBreakdown {
- text: u64,
- image: u64,
- audio: u64,
-}
-
-fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
- let mut breakdown = ChunkTokenBreakdown {
- text: 0,
- image: 0,
- audio: 0,
- };
- for index in 0..chunks.len() {
- let chunk = chunks
- .get(index)
- .with_context(|| format!("chunk index {index} is missing"))?;
- let n_tokens = u64::try_from(chunk.n_tokens())?;
- match chunk.chunk_type()? {
- MtmdInputChunkType::Text => breakdown.text += n_tokens,
- MtmdInputChunkType::Image => breakdown.image += n_tokens,
- MtmdInputChunkType::Audio => breakdown.audio += n_tokens,
- }
- }
-
- Ok(breakdown)
-}
-
-fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
- let marker = llama_cpp_bindings::mtmd::mtmd_default_marker();
- let user_content = format!("{marker}{question}");
- let chat_template = model.chat_template(None)?;
- let messages = [LlamaChatMessage::new("user".to_string(), user_content)?];
-
- Ok(model.apply_chat_template(&chat_template, &messages, true)?)
-}
-
-struct SamplingTotals {
- generated: String,
- observed_content: u64,
- observed_reasoning: u64,
-}
-
-fn drive_sampling_loop(
- classifier: &mut SampledTokenClassifier,
- model: &LlamaModel,
- ctx: &mut LlamaContext,
- starting_position: llama_pos,
- max_tokens: usize,
-) -> Result<SamplingTotals> {
- let mut sampler = LlamaSampler::greedy();
- let mut totals = SamplingTotals {
- generated: String::new(),
- observed_content: 0,
- observed_reasoning: 0,
- };
- let mut batch = LlamaBatch::new(512, 1)?;
-
- for (current_position, _) in (starting_position..).zip(0..max_tokens) {
- let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?;
- for outcome in &outcomes {
- totals.generated.push_str(&outcome.raw_piece);
- match outcome.sampled_token {
- SampledToken::Content(_) => totals.observed_content += 1,
- SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
- SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
- }
- }
-
- let raw_as_sampled = SampledToken::Content(raw_token);
- if model.is_eog_token(&raw_as_sampled) {
- break;
- }
-
- batch.clear();
- batch.add(&raw_as_sampled, current_position, &[0], true)?;
-
- ctx.decode(&mut batch)
- .with_context(|| "failed to decode generated token")?;
- }
-
- for outcome in classifier.flush() {
- totals.generated.push_str(&outcome.raw_piece);
- match outcome.sampled_token {
- SampledToken::Content(_) => totals.observed_content += 1,
- SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
- SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
- }
- }
-
- Ok(totals)
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let mut ctx = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )
- .with_context(|| "unable to create llama context")?;
-
- assert!(
- mtmd_ctx.support_vision(),
- "model should support vision input"
- );
-
- let image_path = test_model::fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .with_context(|| "image path is not valid UTF-8")?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)
- .with_context(|| "failed to load image from file")?;
-
- let formatted_prompt =
- build_user_prompt_with_image_marker(model, "What animals do you see in this image?")?;
-
- let input_text = MtmdInputText {
- text: formatted_prompt,
- add_special: false,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx
- .tokenize(input_text, &[&bitmap])
- .with_context(|| "failed to tokenize multimodal input")?;
-
- assert!(
- !chunks.is_empty(),
- "tokenization should produce at least one chunk"
- );
-
- let expected = count_chunk_tokens_by_type(&chunks)?;
-
- eprintln!(
- "tokenized into {} chunks, text {} image {} audio {}",
- chunks.len(),
- expected.text,
- expected.image,
- expected.audio
- );
-
- assert!(
- expected.image > 0,
- "vision input must produce at least one image chunk"
- );
-
- let mut classifier = model.sampled_token_classifier();
- let n_past = classifier
- .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true)
- .with_context(|| "failed to evaluate chunks")?;
-
- eprintln!("evaluated chunks, n_past = {n_past}");
-
- {
- let usage = classifier.usage();
- assert_eq!(usage.prompt_tokens, expected.text);
- assert_eq!(usage.input_image_tokens, expected.image);
- assert_eq!(usage.input_audio_tokens, expected.audio);
- }
-
- let totals = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?;
-
- eprintln!("generated text: {}", totals.generated);
-
- assert!(
- !totals.generated.is_empty(),
- "model should generate at least one token from image input"
- );
-
- let usage = classifier.into_usage();
- assert_eq!(usage.prompt_tokens, expected.text);
- assert_eq!(usage.input_image_tokens, expected.image);
- assert_eq!(usage.input_audio_tokens, expected.audio);
- assert_eq!(usage.content_tokens, totals.observed_content);
- assert_eq!(usage.reasoning_tokens, totals.observed_reasoning);
- assert_eq!(
- usage.completion_tokens(),
- totals.observed_content + totals.observed_reasoning
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/multimodal_vision.rs b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
new file mode 100644
index 00000000..7e596be6
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/multimodal_vision.rs
@@ -0,0 +1,2001 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod mtmd_bitmap {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings_tests::test_model;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ // Read the llamas.jpg fixture into memory and build the bitmap from those file bytes.
+ let fixtures = test_model::fixtures_dir();
+ let image_path = fixtures.join("llamas.jpg");
+ let image_bytes = std::fs::read(&image_path)?;
+ let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?;
+ // The bitmap must report non-zero width/height and must not be flagged as audio.
+ assert!(bitmap.nx() > 0);
+ assert!(bitmap.ny() > 0);
+ assert!(!bitmap.is_audio());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null");
+ // The path literal contains an interior NUL byte; from_file must report an error for it.
+ assert!(result.is_err());
+
+ Ok(())
+ }
+}
+
+mod mtmd_chunk_operations {
+ use anyhow::Result;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let image_data = vec![128u8; 64 * 64 * 3]; // uniform synthetic buffer for a 64x64 bitmap (3 bytes per pixel, presumably RGB)
+ let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+ let input_text = MtmdInputText {
+ text: "Hello <__media__>".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ let copied = first_chunk.copy()?;
+ // The copy must be flagged as owned and report the same token count as its source chunk.
+ assert!(copied.owned);
+ assert_eq!(copied.n_tokens(), first_chunk.n_tokens());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let image_data = vec![128u8; 64 * 64 * 3];
+ let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+ let input_text = MtmdInputText {
+ text: "Describe: <__media__>".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+ // Encode the first Image chunk found; the test ends as soon as one has been checked.
+ for chunk_index in 0..chunks.len() {
+ let chunk = chunks
+ .get(chunk_index)
+ .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+ if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+ let result = mtmd_ctx.encode_chunk(&chunk);
+ assert!(result.is_ok());
+ return Ok(());
+ }
+ }
+ anyhow::bail!("tokenization should produce at least one Image chunk"); // never pass without asserting
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn decode_use_non_causal_returns_bool_for_image_chunk(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let image_data = vec![128u8; 64 * 64 * 3];
+ let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+ let input_text = MtmdInputText {
+ text: "Describe: <__media__>".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+ for chunk_index in 0..chunks.len() { // scan for the first Image chunk
+ let chunk = chunks
+ .get(chunk_index)
+ .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+ if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+ let value = mtmd_ctx.decode_use_non_causal(&chunk);
+ let printed = format!("{value:?}"); // smoke check only: the call returns and its result is Debug-printable
+ assert!(
+ !printed.is_empty(),
+ "decode_use_non_causal must return a Debug-printable bool"
+ );
+ return Ok(());
+ }
+ }
+ anyhow::bail!("tokenization should produce at least one Image chunk"); // reached only when no Image chunk was found
+ }
+}
+
+mod mtmd_chunk_structure {
+ use anyhow::Result;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ fn tokenize_synthetic(
+ fixture: &LlamaFixture<'_>,
+ prompt: &str,
+ ) -> Result<llama_cpp_bindings::mtmd::MtmdInputChunks> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let image_data = vec![128u8; 64 * 64 * 3]; // uniform synthetic buffer backing one 64x64 bitmap
+ let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?;
+ let input_text = MtmdInputText {
+ text: prompt.to_owned(),
+ add_special: true,
+ parse_special: true,
+ };
+ Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) // tokenize `prompt` together with that single bitmap
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); // the prompt starts with plain text, so chunk 0 is expected to be Text
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ let tokens = first_chunk.text_tokens();
+ assert!(tokens.is_some()); // the first chunk must expose its text tokens...
+ assert!(!tokens.expect("tokens should be some").is_empty()); // ...and there must be at least one
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ assert!(first_chunk.n_tokens() > 0); // the first chunk must report a non-zero token count
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ assert!(first_chunk.n_positions() > 0); // the first chunk must report a non-zero position count
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+ let first_chunk = chunks
+ .get(0)
+ .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?;
+ assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); // precondition: chunk 0 is the Text chunk
+ assert!(first_chunk.id().is_none()); // a Text chunk must not carry an id
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+ for chunk_index in 0..chunks.len() { // scan for the first Image chunk
+ let chunk = chunks
+ .get(chunk_index)
+ .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+ if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+ assert!(chunk.text_tokens().is_none()); // an Image chunk must not expose text tokens
+ return Ok(());
+ }
+ }
+ anyhow::bail!("tokenization should produce at least one Image chunk"); // never pass without asserting
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?;
+ for chunk_index in 0..chunks.len() { // scan for the first Image chunk
+ let chunk = chunks
+ .get(chunk_index)
+ .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?;
+ if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) {
+ assert!(chunk.id().is_some()); // an Image chunk must carry an id
+ return Ok(());
+ }
+ }
+ anyhow::bail!("tokenization should produce at least one Image chunk"); // never pass without asserting
+ }
+}
+
+mod mtmd_context {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::mtmd::MtmdContext;
+ use llama_cpp_bindings::mtmd::MtmdContextParams;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ assert!(mtmd_ctx.support_vision()); // the fixture's multimodal context must report vision support
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn init_from_file_with_null_byte_in_path_returns_error(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mtmd_params = MtmdContextParams::default();
+ let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params);
+ // The path literal contains an interior NUL byte; init_from_file must report an error for it.
+ assert!(result.is_err());
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ assert!( // both declared fixtures are Qwen models, so the flag is expected to be set
+ mtmd_ctx.decode_use_mrope(),
+ "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true"
+ );
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ assert!( // the fixture's multimodal context must not advertise audio support
+ !mtmd_ctx.support_audio(),
+ "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false"
+ );
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn get_audio_sample_rate_is_none_for_vision_only_mmproj(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ assert!( // with no audio support there must be no sample rate to report
+ mtmd_ctx.get_audio_sample_rate().is_none(),
+ "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None"
+ );
+ Ok(())
+ }
+}
+
+mod mtmd_evaluation {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdEvalError;
+ use llama_cpp_bindings::mtmd::MtmdInputChunks;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings_tests::test_model;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; // uniform buffer sized width * height * 3 bytes
+ let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?;
+ let input_text = MtmdInputText {
+ text: "Describe: <__media__>".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+ let n_positions = chunks.total_positions();
+ let required_n_ctx = u32::try_from(n_positions + 256)?; // tokenized positions plus a fixed 256-position margin
+ if fixture.context_params.n_ctx < required_n_ctx { // return an error instead of evaluating when the fixture context is too small
+ anyhow::bail!(
+ "fixture n_ctx ({}) below required ({}) for {}x{} image",
+ fixture.context_params.n_ctx,
+ required_n_ctx,
+ width,
+ height,
+ );
+ }
+ // Build a fresh llama context from the fixture parameters and evaluate the chunks with its n_batch.
+ let llama_ctx = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ let n_batch = i32::try_from(llama_ctx.n_batch())?;
+ chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?;
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 64,
+ n_batch = 64,
+ n_ubatch = 32,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 64,
+ n_batch = 64,
+ n_ubatch = 32,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch(
+ fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let llama_ctx = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ // Freshly constructed (presumably empty) chunk list and a batch size one above the context's n_batch.
+ let chunks = MtmdInputChunks::new()?;
+ let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?;
+
+ let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false);
+ // The oversized batch must be rejected with the BatchSizeExceedsContextLimit variant.
+ assert!(matches!(
+ result,
+ Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. })
+ ));
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 8192,
+ n_batch = 512,
+ n_ubatch = 512,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 8192,
+ n_batch = 512,
+ n_ubatch = 512,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ // Load the llamas.jpg fixture from disk and tokenize it together with a short question.
+ let fixtures = test_model::fixtures_dir();
+ let image_path = fixtures.join("llamas.jpg");
+ let image_path_str = image_path
+ .to_str()
+ .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+ let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+ let input_text = MtmdInputText {
+ text: "What is in this image? <__media__>".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+ let n_positions = chunks.total_positions();
+ let required_n_ctx = u32::try_from(n_positions + 256)?; // tokenized positions plus a fixed 256-position margin
+ assert!(
+ fixture.context_params.n_ctx >= required_n_ctx,
+ "fixture n_ctx ({}) below required ({}); update the attribute literal",
+ fixture.context_params.n_ctx,
+ required_n_ctx,
+ );
+ // Evaluate the chunks in a fresh llama context, using the context's own n_batch.
+ let llama_ctx = LlamaContext::from_model(
+ fixture.model,
+ fixture.backend,
+ (*fixture.context_params).into_llama_context_params(),
+ )?;
+ let n_batch = i32::try_from(llama_ctx.n_batch())?;
+ let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false);
+
+ assert!(result.is_ok());
+
+ Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 8192,
+ n_batch = 512,
+ n_ubatch = 512,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 8192,
+ n_batch = 512,
+ n_ubatch = 512,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)];
+ // Every listed (width, height) pair must evaluate successfully; a failure names the offending size.
+ for (width, height) in test_dimensions {
+ let result = eval_synthetic_bitmap(fixture, width, height);
+ assert!(
+ result.is_ok(),
+ "dimension {width}x{height} should succeed: {result:?}"
+ );
+ }
+
+ Ok(())
+ }
+
+ // Feeds degenerate and very large image sizes through `eval_synthetic_bitmap`.
+ // A failing size is only logged to stderr; the test panics solely when not a
+ // single size evaluates successfully.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 8192,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 8192,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn eval_chunks_with_extreme_dimensions_does_not_crash(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     const EXTREME_DIMENSIONS: [(u32, u32); 6] = [
+         (1, 1),
+         (7, 13),
+         (3, 1000),
+         (1000, 3),
+         (1920, 1080),
+         (4096, 4096),
+     ];
+
+     // Every size is attempted; successes are counted instead of short-circuiting.
+     let mut successful_evaluations = 0_usize;
+     for (width, height) in EXTREME_DIMENSIONS {
+         if let Err(error) = eval_synthetic_bitmap(fixture, width, height) {
+             eprintln!(" {width}x{height} failed: {error}");
+         } else {
+             successful_evaluations += 1;
+         }
+     }
+
+     assert!(
+         successful_evaluations > 0,
+         "at least one extreme dimension should reach eval_chunks"
+     );
+
+     Ok(())
+ }
+}
+
+mod mtmd_tokenization {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // Tokenizes a prompt holding one media marker together with a synthetic bitmap
+ // (64 * 64 * 3 bytes, all set to 128) and checks that chunks and tokens come back.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 128,
+     n_ubatch = 64,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 128,
+     n_ubatch = 64,
+     mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let uniform_pixels = vec![128u8; 64 * 64 * 3];
+     let bitmap = MtmdBitmap::from_image_data(64, 64, &uniform_pixels)?;
+
+     let prompt = MtmdInputText {
+         text: String::from("Describe this image: <__media__>"),
+         add_special: true,
+         parse_special: true,
+     };
+     // `chunks` keeps its name: the bare assertions below print it on failure.
+     let chunks = mtmd_ctx.tokenize(prompt, &[&bitmap])?;
+
+     assert!(!chunks.is_empty());
+     assert!(chunks.total_tokens() > 0);
+     Ok(())
+ }
+
+ // Passes one bitmap alongside a prompt that contains no media marker and expects
+ // tokenization to report an error.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 128,
+     n_ubatch = 64,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 128,
+     n_ubatch = 64,
+     mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let uniform_pixels = vec![128u8; 64 * 64 * 3];
+     let bitmap = MtmdBitmap::from_image_data(64, 64, &uniform_pixels)?;
+
+     let marker_free_prompt = MtmdInputText {
+         text: String::from("No media markers here"),
+         add_special: true,
+         parse_special: true,
+     };
+
+     // `result` keeps its name: the bare assertion prints it on failure.
+     let result = mtmd_ctx.tokenize(marker_free_prompt, &[&bitmap]);
+     assert!(result.is_err());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+ let mtmd_ctx = fixture
+ .mtmd_context
+ .expect("mmproj_file declared in attribute");
+ let input_text = MtmdInputText {
+ text: "text\0null".to_string(),
+ add_special: true,
+ parse_special: true,
+ };
+ let result = mtmd_ctx.tokenize(input_text, &[]);
+ assert!(result.is_err());
+ Ok(())
+ }
+}
+
+mod multimodal {
+ use anyhow::{Context, Result};
+ use llama_cpp_bindings::SampledTokenClassifier;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel};
+ use llama_cpp_bindings::mtmd::{
+ MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText,
+ };
+ use llama_cpp_bindings::sampled_token::SampledToken;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_sys::llama_pos;
+ use llama_cpp_bindings_tests::test_model;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Token totals of a tokenized multimodal prompt, split by chunk type.
+ struct ChunkTokenBreakdown {
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Text`.
+ text: u64,
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Image`.
+ image: u64,
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Audio`.
+ audio: u64,
+ }
+
+ /// Sums `n_tokens` of every chunk in `chunks`, grouped by chunk type.
+ ///
+ /// Returns an error when a chunk index below `chunks.len()` cannot be fetched, when a
+ /// token count does not convert to `u64`, or when `chunk_type()` fails.
+ fn count_chunk_tokens_by_type(chunks: &MtmdInputChunks) -> Result<ChunkTokenBreakdown> {
+     let mut breakdown = ChunkTokenBreakdown {
+         text: 0,
+         image: 0,
+         audio: 0,
+     };
+     for index in 0..chunks.len() {
+         let chunk = chunks
+             .get(index)
+             .with_context(|| format!("chunk index {index} is missing"))?;
+         let n_tokens = u64::try_from(chunk.n_tokens())?;
+         match chunk.chunk_type()? {
+             MtmdInputChunkType::Text => breakdown.text += n_tokens,
+             MtmdInputChunkType::Image => breakdown.image += n_tokens,
+             MtmdInputChunkType::Audio => breakdown.audio += n_tokens,
+         }
+     }
+
+     Ok(breakdown)
+ }
+
+ /// Builds a single-turn user prompt consisting of the default media marker followed
+ /// by `question`, rendered through the model's own chat template.
+ ///
+ /// Returns an error when the model's chat template cannot be read, when the chat
+ /// message cannot be constructed, or when applying the template fails.
+ fn build_user_prompt_with_image_marker(model: &LlamaModel, question: &str) -> Result<String> {
+     let marker = llama_cpp_bindings::mtmd::mtmd_default_marker();
+     let user_content = format!("{marker}{question}");
+     let chat_template = model.chat_template(None)?;
+     let messages = [LlamaChatMessage::new("user".to_string(), user_content)?];
+
+     Ok(model.apply_chat_template(&chat_template, &messages, true)?)
+ }
+
+ /// What `drive_sampling_loop` observed while generating.
+ struct SamplingTotals {
+ /// Concatenation of the `raw_piece` of every outcome, in emission order.
+ generated: String,
+ /// Number of outcomes classified as `SampledToken::Content`.
+ observed_content: u64,
+ /// Number of outcomes classified as `SampledToken::Reasoning`.
+ observed_reasoning: u64,
+ }
+
+ /// Samples up to `max_tokens` tokens with a greedy sampler, decoding each sampled
+ /// token back into `ctx` at consecutive positions starting at `starting_position`.
+ ///
+ /// Every outcome produced by the classifier (including those returned by the final
+ /// `flush`) contributes its `raw_piece` to `generated`; `Content` and `Reasoning`
+ /// outcomes are counted, `ToolCall` and `Undeterminable` ones are not. The loop
+ /// stops early when the model reports the sampled token as end-of-generation.
+ ///
+ /// Returns an error when the batch cannot be created, or when sampling, adding to
+ /// the batch, or decoding fails.
+ fn drive_sampling_loop(
+     classifier: &mut SampledTokenClassifier,
+     model: &LlamaModel,
+     ctx: &mut LlamaContext,
+     starting_position: llama_pos,
+     max_tokens: usize,
+ ) -> Result<SamplingTotals> {
+     let mut sampler = LlamaSampler::greedy();
+     let mut totals = SamplingTotals {
+         generated: String::new(),
+         observed_content: 0,
+         observed_reasoning: 0,
+     };
+     let mut batch = LlamaBatch::new(512, 1)?;
+
+     // Zipping with `0..max_tokens` caps the iteration count while the open-ended
+     // range supplies the position for each decoded token.
+     for (current_position, _) in (starting_position..).zip(0..max_tokens) {
+         let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?;
+         for outcome in &outcomes {
+             totals.generated.push_str(&outcome.raw_piece);
+             match outcome.sampled_token {
+                 SampledToken::Content(_) => totals.observed_content += 1,
+                 SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
+                 SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
+             }
+         }
+
+         // The raw token is wrapped as `Content` only so it can be passed to
+         // `is_eog_token` and `batch.add`.
+         let raw_as_sampled = SampledToken::Content(raw_token);
+         if model.is_eog_token(&raw_as_sampled) {
+             break;
+         }
+
+         batch.clear();
+         batch.add(&raw_as_sampled, current_position, &[0], true)?;
+
+         ctx.decode(&mut batch)
+             .with_context(|| "failed to decode generated token")?;
+     }
+
+     // Outcomes still buffered inside the classifier are tallied the same way.
+     for outcome in classifier.flush() {
+         totals.generated.push_str(&outcome.raw_piece);
+         match outcome.sampled_token {
+             SampledToken::Content(_) => totals.observed_content += 1,
+             SampledToken::Reasoning(_) => totals.observed_reasoning += 1,
+             SampledToken::ToolCall(_) | SampledToken::Undeterminable(_) => {}
+         }
+     }
+
+     Ok(totals)
+ }
+
+ // End-to-end vision check: loads `llamas.jpg`, tokenizes a chat-templated prompt that
+ // carries the image marker, evaluates the chunks through the classifier, samples a
+ // reply, and verifies that the classifier's usage counters agree with the totals
+ // counted in this test.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let llama_params = (*fixture.context_params).into_llama_context_params();
+     let mut ctx = LlamaContext::from_model(model, fixture.backend, llama_params)
+         .context("unable to create llama context")?;
+
+     assert!(
+         mtmd_ctx.support_vision(),
+         "model should support vision input"
+     );
+
+     // Load the fixture image.
+     let llamas_jpg = test_model::fixtures_dir().join("llamas.jpg");
+     let llamas_jpg_str = llamas_jpg
+         .to_str()
+         .context("image path is not valid UTF-8")?;
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_jpg_str)
+         .context("failed to load image from file")?;
+
+     // Tokenize the templated prompt together with the image.
+     let input_text = MtmdInputText {
+         text: build_user_prompt_with_image_marker(
+             model,
+             "What animals do you see in this image?",
+         )?,
+         add_special: false,
+         parse_special: true,
+     };
+     let chunks = mtmd_ctx
+         .tokenize(input_text, &[&bitmap])
+         .context("failed to tokenize multimodal input")?;
+
+     assert!(
+         !chunks.is_empty(),
+         "tokenization should produce at least one chunk"
+     );
+
+     let ChunkTokenBreakdown {
+         text: text_tokens,
+         image: image_tokens,
+         audio: audio_tokens,
+     } = count_chunk_tokens_by_type(&chunks)?;
+
+     eprintln!(
+         "tokenized into {} chunks, text {} image {} audio {}",
+         chunks.len(),
+         text_tokens,
+         image_tokens,
+         audio_tokens
+     );
+
+     assert!(
+         image_tokens > 0,
+         "vision input must produce at least one image chunk"
+     );
+
+     // Evaluate the prompt; the classifier records input usage as it goes.
+     let mut classifier = model.sampled_token_classifier();
+     let n_past = classifier
+         .eval_multimodal_chunks(&chunks, mtmd_ctx, &ctx, 0, 0, 512, true)
+         .context("failed to evaluate chunks")?;
+
+     eprintln!("evaluated chunks, n_past = {n_past}");
+
+     {
+         // Scoped so the usage view is gone before the classifier is borrowed mutably.
+         let usage_after_eval = classifier.usage();
+         assert_eq!(usage_after_eval.prompt_tokens, text_tokens);
+         assert_eq!(usage_after_eval.input_image_tokens, image_tokens);
+         assert_eq!(usage_after_eval.input_audio_tokens, audio_tokens);
+     }
+
+     // Generate a reply and compare the final counters with what was observed.
+     let SamplingTotals {
+         generated,
+         observed_content,
+         observed_reasoning,
+     } = drive_sampling_loop(&mut classifier, model, &mut ctx, n_past, 512)?;
+
+     eprintln!("generated text: {}", generated);
+
+     assert!(
+         !generated.is_empty(),
+         "model should generate at least one token from image input"
+     );
+
+     let final_usage = classifier.into_usage();
+     assert_eq!(final_usage.prompt_tokens, text_tokens);
+     assert_eq!(final_usage.input_image_tokens, image_tokens);
+     assert_eq!(final_usage.input_audio_tokens, audio_tokens);
+     assert_eq!(final_usage.content_tokens, observed_content);
+     assert_eq!(final_usage.reasoning_tokens, observed_reasoning);
+     assert_eq!(
+         final_usage.completion_tokens(),
+         observed_content + observed_reasoning
+     );
+
+     Ok(())
+ }
+}
+
+mod eval_multimodal_chunks_records_exact_token_counts {
+ use anyhow::Result;
+ use llama_cpp_bindings::TokenUsage;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+ use llama_cpp_bindings::mtmd::MtmdInputChunks;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Question text placed directly after the media marker when this module builds its prompt.
+ const PROMPT_QUESTION: &str = "What animals do you see in this image?";
+
+ /// Token totals counted directly from tokenized chunks, split by chunk type; the
+ /// tests in this module compare them with the classifier's usage counters.
+ struct ExpectedChunkTotals {
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Text`.
+ text: u64,
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Image`.
+ image: u64,
+ /// Sum of `n_tokens` over chunks whose type is `MtmdInputChunkType::Audio`.
+ audio: u64,
+ }
+
+ /// Adds up `n_tokens` of every chunk in `chunks`, grouped by chunk type, using
+ /// saturating addition.
+ ///
+ /// Returns an error when a chunk index below `chunks.len()` cannot be fetched, when a
+ /// token count does not convert to `u64`, or when `chunk_type()` fails.
+ fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result<ExpectedChunkTotals> {
+     let mut totals = ExpectedChunkTotals {
+         text: 0,
+         image: 0,
+         audio: 0,
+     };
+     for index in 0..chunks.len() {
+         let chunk = chunks
+             .get(index)
+             .ok_or_else(|| anyhow::anyhow!("chunk index {index} should exist"))?;
+         let n_tokens = u64::try_from(chunk.n_tokens())?;
+         match chunk.chunk_type()? {
+             MtmdInputChunkType::Text => {
+                 totals.text = totals.text.saturating_add(n_tokens);
+             }
+             MtmdInputChunkType::Image => {
+                 totals.image = totals.image.saturating_add(n_tokens);
+             }
+             MtmdInputChunkType::Audio => {
+                 totals.audio = totals.audio.saturating_add(n_tokens);
+             }
+         }
+     }
+     Ok(totals)
+ }
+
+ /// Tokenizes `llamas.jpg` plus `PROMPT_QUESTION`, evaluates the resulting chunks
+ /// through a fresh classifier, and returns the classifier's final usage next to the
+ /// per-type totals counted directly from the chunks.
+ fn build_multimodal_chunks_and_eval_into_usage(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<(TokenUsage, ExpectedChunkTotals)> {
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let llamas_jpg = fixtures_dir().join("llamas.jpg");
+     let Some(llamas_jpg_str) = llamas_jpg.to_str() else {
+         anyhow::bail!("image path is not valid UTF-8");
+     };
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_jpg_str)?;
+
+     let marker = mtmd_default_marker();
+     let input_text = MtmdInputText {
+         text: format!("{marker}{PROMPT_QUESTION}"),
+         add_special: false,
+         parse_special: true,
+     };
+
+     let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+     let expected = sum_chunk_token_counts_by_type(&chunks)?;
+
+     let context = LlamaContext::from_model(
+         fixture.model,
+         fixture.backend,
+         (*fixture.context_params).into_llama_context_params(),
+     )?;
+
+     let mut classifier = fixture.model.sampled_token_classifier();
+     classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+     Ok((classifier.into_usage(), expected))
+ }
+
+ // After evaluation, `prompt_tokens` must equal the summed `n_tokens` of the text chunks.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+     anyhow::ensure!(
+         usage.prompt_tokens == expected.text,
+         "prompt_tokens must equal sum of text-chunk n_tokens; expected {}, got {}",
+         expected.text,
+         usage.prompt_tokens
+     );
+
+     Ok(())
+ }
+
+ // After evaluation, `input_image_tokens` must equal the summed `n_tokens` of the
+ // image chunks.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+     anyhow::ensure!(
+         usage.input_image_tokens == expected.image,
+         "input_image_tokens must equal sum of image-chunk n_tokens; expected {}, got {}",
+         expected.image,
+         usage.input_image_tokens
+     );
+
+     Ok(())
+ }
+
+ // With an image-only prompt, neither the chunk totals nor the classifier usage may
+ // report any audio tokens. The fixture invariant is checked first.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+     anyhow::ensure!(
+         expected.audio == 0,
+         "fixture invariant: image-only multimodal input should produce zero audio chunk tokens, got {}",
+         expected.audio
+     );
+     anyhow::ensure!(
+         usage.input_audio_tokens == 0,
+         "input_audio_tokens must be zero when no audio chunks are evaluated; got {}",
+         usage.input_audio_tokens
+     );
+
+     Ok(())
+ }
+
+ // Evaluating the prompt alone must leave `completion_tokens()` at zero, since no
+ // token has been sampled yet.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn completion_tokens_are_zero_after_eval_before_generation(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?;
+
+     anyhow::ensure!(
+         usage.completion_tokens() == 0,
+         "completion_tokens must be zero immediately after eval (no generation has occurred); got {}",
+         usage.completion_tokens()
+     );
+
+     Ok(())
+ }
+}
+
+mod ingest_prompt_chunk {
+ use anyhow::Result;
+ use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputChunkType;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // Ingesting a text chunk must add its `n_tokens` to `prompt_tokens` and leave the
+ // image and audio counters at zero.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 2048,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let plain_text = MtmdInputText {
+         text: "hello world".to_owned(),
+         add_special: false,
+         parse_special: false,
+     };
+     let chunks = mtmd_ctx.tokenize(plain_text, &[])?;
+
+     // First chunk (in index order) that reports itself as text.
+     let first_text_chunk = (0..chunks.len()).find_map(|index| {
+         chunks
+             .get(index)
+             .filter(|chunk| matches!(chunk.chunk_type(), Ok(MtmdInputChunkType::Text)))
+     });
+     let Some(text_chunk) = first_text_chunk else {
+         anyhow::bail!("text-only tokenization should produce at least one text chunk");
+     };
+
+     let n_tokens = u64::try_from(text_chunk.n_tokens())?;
+
+     let mut classifier = model.sampled_token_classifier();
+     ingest_prompt_chunk(&mut classifier, &text_chunk)?;
+
+     let usage = classifier.usage();
+     anyhow::ensure!(
+         usage.prompt_tokens == n_tokens,
+         "text chunk must record n_tokens as prompt_tokens; expected {n_tokens}, got {}",
+         usage.prompt_tokens
+     );
+     anyhow::ensure!(
+         usage.input_image_tokens == 0,
+         "text chunk must not bump input_image_tokens; got {}",
+         usage.input_image_tokens
+     );
+     anyhow::ensure!(
+         usage.input_audio_tokens == 0,
+         "text chunk must not bump input_audio_tokens; got {}",
+         usage.input_audio_tokens
+     );
+
+     Ok(())
+ }
+
+ // Ingesting an image chunk must add its `n_tokens` to `input_image_tokens` and leave
+ // the prompt and audio counters at zero.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 2048,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let llamas_jpg = fixtures_dir().join("llamas.jpg");
+     let Some(llamas_jpg_str) = llamas_jpg.to_str() else {
+         anyhow::bail!("image path is not valid UTF-8");
+     };
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_jpg_str)?;
+
+     // The prompt is nothing but the media marker.
+     let marker = mtmd_default_marker();
+     let marker_only = MtmdInputText {
+         text: marker.to_owned(),
+         add_special: false,
+         parse_special: true,
+     };
+     let chunks = mtmd_ctx.tokenize(marker_only, &[&bitmap])?;
+
+     // First chunk (in index order) that reports itself as an image.
+     let first_image_chunk = (0..chunks.len()).find_map(|index| {
+         chunks
+             .get(index)
+             .filter(|chunk| matches!(chunk.chunk_type(), Ok(MtmdInputChunkType::Image)))
+     });
+     let Some(image_chunk) = first_image_chunk else {
+         anyhow::bail!("multimodal tokenization should produce an image chunk");
+     };
+
+     let n_tokens = u64::try_from(image_chunk.n_tokens())?;
+     anyhow::ensure!(n_tokens != 0, "image chunk should report at least one token");
+
+     let mut classifier = model.sampled_token_classifier();
+     ingest_prompt_chunk(&mut classifier, &image_chunk)?;
+
+     let usage = classifier.usage();
+     anyhow::ensure!(
+         usage.input_image_tokens == n_tokens,
+         "image chunk must record n_tokens as input_image_tokens; expected {n_tokens}, got {}",
+         usage.input_image_tokens
+     );
+     anyhow::ensure!(
+         usage.prompt_tokens == 0,
+         "image chunk must not bump prompt_tokens; got {}",
+         usage.prompt_tokens
+     );
+     anyhow::ensure!(
+         usage.input_audio_tokens == 0,
+         "image chunk must not bump input_audio_tokens; got {}",
+         usage.input_audio_tokens
+     );
+
+     Ok(())
+ }
+
+ // Replays a text-only prompt that ends by opening a `<think>` block through
+ // `ingest_prompt_chunk` and requires the classifier to end up in the `Reasoning`
+ // section.
+ //
+ // NOTE(review): the `<think>` tag in the prompt and in the failure message was
+ // restored; the text as found had an empty span in both places (an empty pair of
+ // backticks in the message, a bare `\n\n` at the end of the prompt), which looks
+ // like tag stripping. Confirm against the upstream test.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 512,
+     n_batch = 2048,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn text_chunk_drives_marker_state_machine_to_reasoning(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let input_text = MtmdInputText {
+         text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n<think>\n".to_owned(),
+         add_special: false,
+         parse_special: true,
+     };
+     let chunks = mtmd_ctx.tokenize(input_text, &[])?;
+
+     let mut classifier = model.sampled_token_classifier();
+
+     // Feed every chunk, in order, into the classifier.
+     for index in 0..chunks.len() {
+         let chunk = chunks
+             .get(index)
+             .ok_or_else(|| anyhow::anyhow!("chunk index {index} must exist"))?;
+         ingest_prompt_chunk(&mut classifier, &chunk)?;
+     }
+
+     if classifier.current_section() != llama_cpp_bindings::SampledTokenSection::Reasoning {
+         anyhow::bail!(
+             "text chunk replay must transition the classifier section to Reasoning when the \
+              prompt opens a `<think>` block; got {:?}",
+             classifier.current_section()
+         );
+     }
+
+     Ok(())
+ }
+}
+
+mod gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Value passed as `max_generated_tokens` to `ClassifySampleLoop` in this module's test.
+ const MAX_GENERATED_TOKENS: i32 = 200;
+
+ // Gemma 4, image + thinking: evaluates a prompt that ends by opening a
+ // `<|channel>thought` block, samples with a seeded sampler chain, and requires that
+ // both the sample loop and the classifier usage report at least one reasoning token.
+ //
+ // NOTE(review): the prompt has no explicit turn-delimiter tokens around `user` and
+ // `model`; they may have been lost from this text. Confirm against the upstream test.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 8192,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let llama_params = (*fixture.context_params).into_llama_context_params();
+     let mut context = LlamaContext::from_model(model, fixture.backend, llama_params)?;
+
+     let llamas_jpg = fixtures_dir().join("llamas.jpg");
+     let Some(llamas_jpg_str) = llamas_jpg.to_str() else {
+         anyhow::bail!("image path is not valid UTF-8");
+     };
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_jpg_str)?;
+
+     let marker = mtmd_default_marker();
+     let input_text = MtmdInputText {
+         text: format!(
+             "user\n{marker}What animals do you see in this image?\nmodel\n<|channel>thought\n"
+         ),
+         add_special: false,
+         parse_special: true,
+     };
+     let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+     let mut classifier = model.sampled_token_classifier();
+     let n_past =
+         classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+     // Fixed seed in `dist` keeps the stochastic chain repeatable.
+     let mut sampler = LlamaSampler::chain_simple([
+         LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+         LlamaSampler::top_k(40),
+         LlamaSampler::top_p(0.9, 1),
+         LlamaSampler::min_p(0.05, 1),
+         LlamaSampler::temp(0.7),
+         LlamaSampler::dist(0x00C0_FFEE),
+     ]);
+     let mut batch = LlamaBatch::new(2048, 1)?;
+
+     let sample_loop = ClassifySampleLoop {
+         model,
+         classifier: &mut classifier,
+         sampler: &mut sampler,
+         context: &mut context,
+         batch: &mut batch,
+         initial_position: n_past,
+         max_generated_tokens: MAX_GENERATED_TOKENS,
+     };
+     let outcome = sample_loop.run()?;
+
+     let usage = classifier.usage();
+
+     anyhow::ensure!(
+         outcome.observed_reasoning != 0,
+         "Gemma 4 multimodal + thinking: classifier must emit at least one Reasoning token \
+          when the prompt opens a `<|channel>thought` block; outcome={outcome:?}"
+     );
+     anyhow::ensure!(
+         usage.reasoning_tokens != 0,
+         "Gemma 4 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+     );
+
+     Ok(())
+ }
+}
+
+mod mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Value passed as `max_generated_tokens` to `ClassifySampleLoop` in this module's test.
+ const MAX_GENERATED_TOKENS: i32 = 768;
+
+ // Ministral 3, image + thinking: the system prompt instructs the model to draft its
+ // thoughts inside `[THINK]...[/THINK]`; after greedy sampling, both the sample loop
+ // and the classifier usage must report at least one reasoning token.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let llama_params = (*fixture.context_params).into_llama_context_params();
+     let mut context = LlamaContext::from_model(model, fixture.backend, llama_params)?;
+
+     let llamas_jpg = fixtures_dir().join("llamas.jpg");
+     let Some(llamas_jpg_str) = llamas_jpg.to_str() else {
+         anyhow::bail!("image path is not valid UTF-8");
+     };
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, llamas_jpg_str)?;
+
+     let marker = mtmd_default_marker();
+     let input_text = MtmdInputText {
+         text: format!(
+             "[SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+              First draft your thinking process (inner monologue) until you arrive at a response. \
+              Format your response using Markdown, and use LaTeX for any mathematical equations. \
+              Write both your thoughts and the response in the same language as the input.\n\n\
+              Your thinking process must follow the template below:\
+              [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+              Be as casual and as long as you want until you are confident to generate the response \
+              to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+              [INST]{marker}What animals do you see in this image?[/INST]"
+         ),
+         add_special: true,
+         parse_special: true,
+     };
+     let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+     let mut classifier = model.sampled_token_classifier();
+     let n_past =
+         classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+     let mut sampler = LlamaSampler::greedy();
+     let mut batch = LlamaBatch::new(2048, 1)?;
+
+     let sample_loop = ClassifySampleLoop {
+         model,
+         classifier: &mut classifier,
+         sampler: &mut sampler,
+         context: &mut context,
+         batch: &mut batch,
+         initial_position: n_past,
+         max_generated_tokens: MAX_GENERATED_TOKENS,
+     };
+     let outcome = sample_loop.run()?;
+
+     let usage = classifier.usage();
+
+     anyhow::ensure!(
+         outcome.observed_reasoning != 0,
+         "Mistral 3 multimodal + thinking: classifier must emit at least one Reasoning token \
+          when the model opens a `[THINK]` block; outcome={outcome:?}"
+     );
+     anyhow::ensure!(
+         usage.reasoning_tokens != 0,
+         "Mistral 3 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+     );
+
+     Ok(())
+ }
+}
+
+mod qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Value passed as `max_generated_tokens` to `ClassifySampleLoop` in this module's test.
+ const MAX_GENERATED_TOKENS: i32 = 200;
+
+ // Qwen 3.5 (also run against Qwen 3.6), image + thinking: evaluates a prompt that
+ // ends by opening a `<think>` block, samples with a seeded sampler chain, and
+ // requires that both the sample loop and the classifier usage report at least one
+ // reasoning token.
+ //
+ // NOTE(review): the `<think>` tag in the prompt and in the failure message was
+ // restored; the text as found had an empty span in both places (an empty pair of
+ // backticks in the message, a bare `\n\n` at the end of the prompt), which looks
+ // like tag stripping. Confirm against the upstream test.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
+ )]
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 4096,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let backend = fixture.backend;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let mut context = LlamaContext::from_model(
+         model,
+         backend,
+         (*fixture.context_params).into_llama_context_params(),
+     )?;
+
+     let image_path = fixtures_dir().join("llamas.jpg");
+     let image_path_str = image_path
+         .to_str()
+         .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+     let marker = mtmd_default_marker();
+     let prompt = format!(
+         "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+     );
+
+     let input_text = MtmdInputText {
+         text: prompt,
+         add_special: false,
+         parse_special: true,
+     };
+
+     let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+     let mut classifier = model.sampled_token_classifier();
+     let n_past =
+         classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+     // Fixed seed in `dist` keeps the stochastic chain repeatable.
+     let mut sampler = LlamaSampler::chain_simple([
+         LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+         LlamaSampler::top_k(40),
+         LlamaSampler::top_p(0.9, 1),
+         LlamaSampler::min_p(0.05, 1),
+         LlamaSampler::temp(0.7),
+         LlamaSampler::dist(0x00C0_FFEE),
+     ]);
+
+     let mut batch = LlamaBatch::new(2048, 1)?;
+     let outcome = ClassifySampleLoop {
+         model,
+         classifier: &mut classifier,
+         sampler: &mut sampler,
+         context: &mut context,
+         batch: &mut batch,
+         initial_position: n_past,
+         max_generated_tokens: MAX_GENERATED_TOKENS,
+     }
+     .run()?;
+
+     let usage = classifier.usage();
+
+     if outcome.observed_reasoning == 0 {
+         anyhow::bail!(
+             "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
+              when the prompt opens a `<think>` block; outcome={outcome:?}"
+         );
+     }
+     if usage.reasoning_tokens == 0 {
+         anyhow::bail!(
+             "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+         );
+     }
+
+     Ok(())
+ }
+}
+
+mod qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt {
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::mtmd::MtmdBitmap;
+ use llama_cpp_bindings::mtmd::MtmdInputText;
+ use llama_cpp_bindings::mtmd::mtmd_default_marker;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_bindings_tests::test_model::fixtures_dir;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ /// Value passed as `max_generated_tokens` to `ClassifySampleLoop` in this module's test.
+ const MAX_GENERATED_TOKENS: i32 = 200;
+
+ // Qwen 3.6, image + thinking: evaluates a prompt that ends by opening a `<think>`
+ // block, samples with a seeded sampler chain, and requires that both the sample loop
+ // and the classifier usage report at least one reasoning token.
+ //
+ // NOTE(review): the `<think>` tag in the prompt was restored; the text as found
+ // ended in a bare `\n\n` after `assistant`, which looks like tag stripping. Confirm
+ // against the upstream test.
+ #[llama_test(
+     model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+     n_gpu_layers = 999,
+     use_mmap = true,
+     use_mlock = false,
+     n_ctx = 8192,
+     n_batch = 512,
+     n_ubatch = 512,
+     mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
+ )]
+ fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let backend = fixture.backend;
+     let mtmd_ctx = fixture
+         .mtmd_context
+         .expect("mmproj_file declared in attribute");
+
+     let mut context = LlamaContext::from_model(
+         model,
+         backend,
+         (*fixture.context_params).into_llama_context_params(),
+     )?;
+
+     let image_path = fixtures_dir().join("llamas.jpg");
+     let image_path_str = image_path
+         .to_str()
+         .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
+     let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
+
+     let marker = mtmd_default_marker();
+     let prompt = format!(
+         "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
+     );
+
+     let input_text = MtmdInputText {
+         text: prompt,
+         add_special: false,
+         parse_special: true,
+     };
+
+     let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
+
+     let mut classifier = model.sampled_token_classifier();
+     let n_past =
+         classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
+
+     // Fixed seed in `dist` keeps the stochastic chain repeatable.
+     let mut sampler = LlamaSampler::chain_simple([
+         LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+         LlamaSampler::top_k(40),
+         LlamaSampler::top_p(0.9, 1),
+         LlamaSampler::min_p(0.05, 1),
+         LlamaSampler::temp(0.7),
+         LlamaSampler::dist(0x00C0_FFEE),
+     ]);
+
+     let mut batch = LlamaBatch::new(2048, 1)?;
+     let outcome = ClassifySampleLoop {
+         model,
+         classifier: &mut classifier,
+         sampler: &mut sampler,
+         context: &mut context,
+         batch: &mut batch,
+         initial_position: n_past,
+         max_generated_tokens: MAX_GENERATED_TOKENS,
+     }
+     .run()?;
+
+     let usage = classifier.usage();
+
+     if outcome.observed_reasoning == 0 {
+         anyhow::bail!(
+             "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
+         );
+     }
+     if usage.reasoning_tokens == 0 {
+         anyhow::bail!(
+             "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
+         );
+     }
+
+     Ok(())
+ }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/parse_chat_message.rs b/llama-cpp-bindings-tests/tests/parse_chat_message.rs
deleted file mode 100644
index d23fe1c2..00000000
--- a/llama-cpp-bindings-tests/tests/parse_chat_message.rs
+++ /dev/null
@@ -1,368 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message("[]", "hello world", false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for plain content; got Unrecognized");
- };
- assert!(parsed.tool_calls.is_empty());
- assert!(!parsed.is_empty());
- assert!(parsed.content.contains("hello world"));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> {
- let input = "step one, step two\n\nactual response";
- let outcome = fixture.model.parse_chat_message("[]", input, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for reasoning section; got Unrecognized");
- };
- assert!(
- parsed.reasoning_content.contains("step") || parsed.content.contains("step"),
- "neither content nor reasoning contains 'step'; content={:?} reasoning={:?}",
- parsed.content,
- parsed.reasoning_content
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture.model.parse_chat_message("[]", "", false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for empty input; got Unrecognized");
- };
- assert!(parsed.tool_calls.is_empty());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_malformed_tools_json_returns_tools_json_invalid_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let result = fixture
- .model
- .parse_chat_message("not_a_json[}", "hello", false);
-
- assert!(matches!(
- result,
- Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
- _
- ))
- ));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_non_array_tools_json_returns_tools_json_not_array_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let result = fixture
- .model
- .parse_chat_message("{\"foo\": 1}", "hello", false);
-
- assert!(matches!(
- result,
- Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray)
- ));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_with_tools_null_byte_returns_tools_json_invalid_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let result = fixture
- .model
- .parse_chat_message("[]\0extra", "hello", false);
-
- assert!(matches!(
- result,
- Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonInvalid(
- _
- ))
- ));
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn parses_with_input_null_byte_returns_tools_serialization_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let result = fixture
- .model
- .parse_chat_message("[]", "hello\0world", false);
-
- assert!(matches!(
- result,
- Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_))
- ));
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs
deleted file mode 100644
index 260dd0f6..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let chat_template = model.chat_template(None)?;
- let messages = vec![LlamaChatMessage::new(
- "user".to_owned(),
- "Hello! How are you?".to_owned(),
- )?];
- let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
- let mut classifier = model.sampled_token_classifier();
- let tokens = model.str_to_token(&prompt, AddBos::Always)?;
- let prompt_token_count = u64::try_from(tokens.len())?;
-
- let mut batch = LlamaBatch::new(512, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: 1024,
- }
- .run()?;
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(outcome.observed_tool_call, 0);
-
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
- };
- assert!(!parsed.content.is_empty());
-
- let usage = classifier.into_usage();
- assert_eq!(usage.prompt_tokens, prompt_token_count);
- assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
- assert_eq!(usage.undeterminable_tokens, 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index df0a9b80..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const QWEN35_THINKING_DISABLED_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-
-
-
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(!outcome.generated_raw.is_empty());
- assert_eq!(outcome.observed_reasoning, 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.reasoning_tokens, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs
deleted file mode 100644
index f9c98932..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const QWEN35_THINKING_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["", ""];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(usage.reasoning_tokens > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning,
- );
-
- if parsed.reasoning_content.is_empty() {
- eprintln!(
- "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
- skipping strict parser-equality assertions"
- );
- } else {
- assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
- assert_eq!(outcome.content_stream, parsed.content);
- }
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.reasoning_stream.contains(forbidden));
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index 414fde9a..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,111 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"),
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 4096,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
- let prompt = format!(
- "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n\n"
- );
-
- let input_text = MtmdInputText {
- text: prompt,
- add_special: false,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- let mut classifier = model.sampled_token_classifier();
- let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position: n_past,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- if outcome.observed_reasoning == 0 {
- anyhow::bail!(
- "Qwen 3.5 multimodal + thinking: classifier must emit at least one Reasoning token \
- when the prompt opens a `` block; outcome={outcome:?}"
- );
- }
- if usage.reasoning_tokens == 0 {
- anyhow::bail!(
- "Qwen 3.5 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs
deleted file mode 100644
index f517a4e7..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-use serde_json::Value;
-use serde_json::json;
-
-const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "negotiate_with_cat",
- "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
- "parameters": {
- "type": "object",
- "properties": {
- "topic": {
- "type": "string",
- "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
- },
- "bribe": {
- "type": "string",
- "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
- "description": "What you are offering in exchange"
- },
- "desperation_level": {
- "type": "integer",
- "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
- "minimum": 1,
- "maximum": 10
- }
- },
- "required": ["topic"],
- "additionalProperties": false
- }
- }
- }
-]"#;
-
-const NEGOTIATE_WITH_CAT_INPUT: &str = "\n\
-\n\
-\n\
-tuna\n\
-\n\
-\n\
-8\n\
-\n\
-\n\
-get off the keyboard\n\
-\n\
-\n\
-";
-
-fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
- match arguments {
- ToolCallArguments::ValidJson(value) => Ok(value),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson arguments, got InvalidJson: {raw}")
- }
- }
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture.model.parse_chat_message(
- NEGOTIATE_WITH_CAT_TOOLS_JSON,
- NEGOTIATE_WITH_CAT_INPUT,
- false,
- )?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
- got Unrecognized"
- );
- };
-
- assert_eq!(parsed.tool_calls.len(), 1);
- assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
- assert_eq!(parsed.tool_calls[0].id, "call_0");
- assert_eq!(
- arguments_as_json(&parsed.tool_calls[0].arguments)?,
- &json!({
- "bribe": "tuna",
- "desperation_level": 8,
- "topic": "get off the keyboard",
- }),
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs
deleted file mode 100644
index 2fe2b89c..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::ToolCallArguments;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const QWEN_XML_PAYLOAD: &str = "\n\
-\n\
-\n\
-Paris\n\
-\n\
-\n\
-";
-
-const PARTIAL_QWEN_XML_PAYLOAD: &str = "\n\n\n\
-\n\
-\n\
-Paris\n\
-\n\
-\n\
-\n\
-\n\
-\n\
-\n\
-Berlin\n\
-\n\
-\n\
-";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
- };
- assert_eq!(parsed.tool_calls.len(), 1);
- assert_eq!(parsed.tool_calls[0].name, "get_weather");
- let location = match &parsed.tool_calls[0].arguments {
- ToolCallArguments::ValidJson(value) => value
- .get("location")
- .and_then(|v| v.as_str())
- .map(str::to_owned),
- ToolCallArguments::InvalidJson(raw) => {
- bail!("expected ValidJson, got InvalidJson: {raw}");
- }
- };
- assert_eq!(location.as_deref(), Some("Paris"));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
- };
- assert!(parsed.tool_calls.is_empty() || parsed.tool_calls.len() == 1);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
- );
- };
- assert!(
- !parsed.tool_calls.is_empty(),
- "expected at least one tool call; got {:?}",
- parsed.tool_calls
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
deleted file mode 100644
index 96b76cf5..00000000
--- a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const TOOLS_JSON: &str = r#"[
- {
- "type": "function",
- "function": {
- "name": "get_weather",
- "description": "Get the current weather for a location",
- "parameters": {
- "type": "object",
- "properties": {
- "location": {"type": "string", "description": "The city name"}
- },
- "required": ["location"]
- }
- }
- }
-]"#;
-
-const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let outcome = fixture
- .model
- .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
-
- let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
- bail!(
- "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
- tool_calls); got Unrecognized"
- );
- };
- assert!(
- parsed.tool_calls.is_empty(),
- "expected no tool calls; got {:?}",
- parsed.tool_calls
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs
deleted file mode 100644
index 233cef95..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs
+++ /dev/null
@@ -1,87 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let chat_template = model.chat_template(None)?;
- let messages = vec![LlamaChatMessage::new(
- "user".to_owned(),
- "Hello! How are you?".to_owned(),
- )?];
- let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
- let mut classifier = model.sampled_token_classifier();
- let tokens = model.str_to_token(&prompt, AddBos::Always)?;
- let prompt_token_count = u64::try_from(tokens.len())?;
-
- let mut batch = LlamaBatch::new(512, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: 1024,
- }
- .run()?;
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(outcome.observed_tool_call, 0);
-
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
- };
- assert!(!parsed.content.is_empty());
-
- let usage = classifier.into_usage();
- assert_eq!(usage.prompt_tokens, prompt_token_count);
- assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning);
- assert_eq!(usage.undeterminable_tokens, 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
deleted file mode 100644
index 2b57fa17..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-const QWEN36_THINKING_DISABLED_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-
-</think>
-
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- assert!(!outcome.generated_raw.is_empty());
- assert_eq!(outcome.observed_reasoning, 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.reasoning_tokens, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert!(outcome.observed_content > 0);
- assert_eq!(usage.completion_tokens(), outcome.observed_content);
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs
deleted file mode 100644
index c9c16a64..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs
+++ /dev/null
@@ -1,108 +0,0 @@
-use anyhow::Result;
-use anyhow::bail;
-use llama_cpp_bindings::ChatMessageParseOutcome;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 1500;
-
-const QWEN36_THINKING_PROMPT: &str = "\
-<|im_start|>user
-What is 2 + 2?<|im_end|>
-<|im_start|>assistant
-<think>
-";
-
-const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
-
- let mut classifier = model.sampled_token_classifier();
- let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
- let prompt_token_count = u64::try_from(prompt_tokens.len())?;
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
- let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
- let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
- bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
- };
-
- assert!(!outcome.generated_raw.is_empty());
- assert!(outcome.observed_reasoning > 0);
- assert!(usage.reasoning_tokens > 0);
- assert_eq!(outcome.observed_undeterminable, 0);
- assert_eq!(usage.undeterminable_tokens, 0);
- assert_eq!(
- usage.completion_tokens(),
- outcome.observed_content + outcome.observed_reasoning,
- );
-
- if parsed.reasoning_content.is_empty() {
- eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS");
- } else {
- assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
- assert_eq!(outcome.content_stream, parsed.content);
- }
-
- for forbidden in FORBIDDEN_MARKERS {
- assert!(!outcome.reasoning_stream.contains(forbidden));
- assert!(!outcome.content_stream.contains(forbidden));
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
deleted file mode 100644
index cf43adfd..00000000
--- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::mtmd::MtmdBitmap;
-use llama_cpp_bindings::mtmd::MtmdInputText;
-use llama_cpp_bindings::mtmd::mtmd_default_marker;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_bindings_tests::test_model::fixtures_dir;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-const MAX_GENERATED_TOKENS: i32 = 200;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 8192,
- n_batch = 512,
- n_ubatch = 512,
- mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"),
-)]
-fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mtmd_ctx = fixture
- .mtmd_context
- .expect("mmproj_file declared in attribute");
-
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let image_path = fixtures_dir().join("llamas.jpg");
- let image_path_str = image_path
- .to_str()
- .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?;
- let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?;
-
- let marker = mtmd_default_marker();
-    let prompt = format!(
-        "<|im_start|>user\n{marker}What animals do you see in this image?<|im_end|>\n<|im_start|>assistant\n<think>\n"
-    );
-
- let input_text = MtmdInputText {
- text: prompt,
- add_special: false,
- parse_special: true,
- };
-
- let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?;
-
- let mut classifier = model.sampled_token_classifier();
- let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?;
-
- let mut sampler = LlamaSampler::chain_simple([
- LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
- LlamaSampler::top_k(40),
- LlamaSampler::top_p(0.9, 1),
- LlamaSampler::min_p(0.05, 1),
- LlamaSampler::temp(0.7),
- LlamaSampler::dist(0x00C0_FFEE),
- ]);
-
- let mut batch = LlamaBatch::new(2048, 1)?;
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position: n_past,
- max_generated_tokens: MAX_GENERATED_TOKENS,
- }
- .run()?;
-
- let usage = classifier.usage();
-
- if outcome.observed_reasoning == 0 {
- anyhow::bail!(
- "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}"
- );
- }
- if usage.reasoning_tokens == 0 {
- anyhow::bail!(
- "Qwen 3.6 multimodal + thinking: usage.reasoning_tokens must be non-zero; usage={usage:?}"
- );
- }
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
new file mode 100644
index 00000000..a5aac3d4
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/reasoning_markers_and_tool_calls.rs
@@ -0,0 +1,2484 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // Prompt whose assistant turn already contains an opened *and* closed
+    // `<think>` block, so generation begins after the reasoning section.
+    // The role tokens use the full-width bar U+FF5C (not ASCII `|`). The bars
+    // are spelled as `\u{ff5c}` escapes and every newline as `\n`, so the value
+    // cannot be changed by re-indenting the file or by character normalization.
+    const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str =
+        "<\u{ff5c}User\u{ff5c}>What is 2 + 2?<\u{ff5c}Assistant\u{ff5c}><think>\n\n</think>\n\n";
+
+    // Reasoning delimiters that must never appear in the content stream.
+    // Entries must be non-empty: `str::contains("")` is true for every string,
+    // which would make the leak check below fail unconditionally.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    // Feeds the thinking-disabled prompt through the sampled-token classifier,
+    // generates up to `MAX_GENERATED_TOKENS` tokens, and asserts that every
+    // generated token was classified as Content (no Reasoning, no
+    // Undeterminable) and that no reasoning marker leaked into the content.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // `AddBos::Never`: the prompt is tokenized exactly as written.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens =
+            model.str_to_token(DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        // The prompt goes into the batch through the classifier so that the
+        // classifier sees the prompt tokens as well as the generated ones.
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // After a successful decode every fed prompt token must be committed.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Same sampler chain and constant `dist` argument as the sibling
+        // classifier tests, keeping runs comparable.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "DeepSeek-R1-8B: must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "DeepSeek-R1-8B thinking-disabled: classifier must not emit any Reasoning token \
+             when the prompt closes the think block before generation begins; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "DeepSeek-R1-8B thinking-disabled: prompt-token replay must move section to Content \
+             before generation, so no Undeterminable tokens may be emitted; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "DeepSeek-R1-8B thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "DeepSeek-R1-8B thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "DeepSeek-R1-8B thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "DeepSeek-R1-8B thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        // The user-visible stream must not contain raw reasoning delimiters.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "DeepSeek-R1-8B thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    // DeepSeek-R1-Distill-Llama-8B uses `<think>...</think>` reasoning markers
+    // and full-width-bar role tokens `<｜User｜>` / `<｜Assistant｜>` (U+FF5C,
+    // not ASCII `|`). The chat template's `add_generation_prompt` ALWAYS appends
+    // `<｜Assistant｜><think>\n` — DeepSeek-R1 is a pure reasoner with no
+    // thinking-disabled mode — so the model resumes generation already inside
+    // the reasoning block.
+    //
+    // The bars are written as `\u{ff5c}` escapes and the newline as `\n` so the
+    // literal cannot be changed by re-indentation or character normalization.
+    const DEEPSEEK_R1_8B_THINKING_PROMPT: &str =
+        "<\u{ff5c}User\u{ff5c}>What is 2 + 2?<\u{ff5c}Assistant\u{ff5c}><think>\n";
+
+    // Reasoning delimiters that must never appear in either classified stream.
+    // Entries must be non-empty: `str::contains("")` is true for every string,
+    // which would make the leak checks below fail unconditionally.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    // Feeds a prompt that leaves a `<think>` block open, generates up to
+    // `MAX_GENERATED_TOKENS` tokens, and asserts that the classifier reports
+    // Reasoning tokens, reports no Undeterminable tokens, agrees with the
+    // parser's reasoning/content split, and never leaks a marker.
+    #[expect(
+        clippy::too_many_lines,
+        reason = "test asserts many distinct properties of DeepSeek-R1-8B reasoning output; shortening messages or splitting the body would reduce diagnostic signal at failure time"
+    )]
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // `AddBos::Never`: the prompt is tokenized exactly as written.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        // The prompt goes into the batch through the classifier so that the
+        // classifier sees the prompt tokens as well as the generated ones.
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // After a successful decode every fed prompt token must be committed.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Same sampler chain and constant `dist` argument as the sibling
+        // classifier tests, keeping runs comparable.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        // Parse the raw generation independently so the per-token streams can
+        // be cross-checked against the parser's view of the same text.
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!(
+                "DeepSeek-R1-8B chat template must be recognised by the parser; got Unrecognized"
+            );
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "DeepSeek-R1-8B: must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "DeepSeek-R1-8B: classifier must emit at least one Reasoning token when the prompt \
+             opens a <think> block; outcome={outcome:?}",
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "DeepSeek-R1-8B: usage.reasoning_tokens must be non-zero when the prompt opens a \
+             <think> block; usage was {usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "DeepSeek-R1-8B: prompt-token replay must move section to Reasoning before generation, \
+             so no Undeterminable tokens may be emitted; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "DeepSeek-R1-8B: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "DeepSeek-R1-8B: completion tokens must equal observed Content + Reasoning"
+        );
+
+        // An empty parser-side reasoning_content is treated as "the block was
+        // never closed within the token budget"; the strict equality checks are
+        // skipped in that case and only the marker-leak checks below apply.
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "DeepSeek-R1-8B didn't close its reasoning block within {MAX_GENERATED_TOKENS} \
+                 tokens — skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(
+                outcome.reasoning_stream, parsed.reasoning_content,
+                "DeepSeek-R1-8B: per-token reasoning stream must equal parser-side reasoning_content \
+                 (any difference means a marker leaked into the user-visible stream)",
+            );
+            assert_eq!(
+                outcome.content_stream, parsed.content,
+                "DeepSeek-R1-8B: per-token content stream must equal parser-side content \
+                 (any difference means a marker leaked into the user-visible stream)",
+            );
+        }
+
+        // Neither classified stream may contain a raw reasoning delimiter.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "DeepSeek-R1-8B: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "DeepSeek-R1-8B: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_gemma_paired_quote {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue passed to the parser: a single `get_weather` function
+    // taking one required string argument, `location`.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    // Assistant output in the Gemma "paired quote" tool-call syntax, where the
+    // string value is delimited by `<|"|>` on both sides.
+    // NOTE(review): nothing follows the closing `}` here — confirm the payload
+    // is complete and no trailing tool-call marker is expected.
+    const GEMMA_PAIRED_QUOTE_PAYLOAD: &str =
+        "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
+
+    // Parses a Gemma-style tool call with the DeepSeek-R1 distill model loaded
+    // and expects exactly one `get_weather` call whose JSON arguments carry
+    // `location == "Paris"`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let parse_result = model.parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "duck-type pass must recognise Gemma paired-quote on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+
+        // Exactly one call is expected; check the count before indexing.
+        assert_eq!(
+            message.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            message.tool_calls
+        );
+        let call = &message.tool_calls[0];
+        assert_eq!(call.name, "get_weather");
+
+        // The arguments must have been parsed into valid JSON.
+        let arguments = match &call.arguments {
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+            ToolCallArguments::ValidJson(value) => value,
+        };
+        let location = arguments
+            .get("location")
+            .and_then(|field| field.as_str())
+            .map(str::to_owned);
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_glm_key_value_tags {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue passed to the parser: a single `get_weather` function
+    // taking one required string argument, `location`.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    // Assistant output in the GLM key/value tag syntax: the function name
+    // follows `<tool_call>`, and each argument is an `<arg_key>` / `<arg_value>`
+    // pair. The trailing `\` continuations join the pieces without newlines.
+    // Without the tags the payload degenerates to `get_weatherlocationParis`,
+    // which contains no tool call for the parser to find.
+    const GLM_KEY_VALUE_PAYLOAD: &str = "<tool_call>get_weather\
+        <arg_key>location</arg_key>\
+        <arg_value>Paris</arg_value>\
+        </tool_call>";
+
+    // Parses a GLM-style tool call with the DeepSeek-R1 distill model loaded
+    // and expects exactly one `get_weather` call whose JSON arguments carry
+    // `location == "Paris"`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise GLM key-value tags on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+        // Exactly one call is expected; check the count before indexing.
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        // The arguments must have been parsed into valid JSON.
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_mistral_bracketed_json {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue passed to the parser: a single `get_weather` function
+    // taking one required string argument, `location`.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    // Assistant output in the Mistral bracketed syntax: `[TOOL_CALLS]`, the
+    // function name, `[ARGS]`, then the arguments as a JSON object.
+    const MISTRAL_BRACKETED_JSON_PAYLOAD: &str =
+        r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
+
+    // Parses a Mistral-style tool call with the DeepSeek-R1 distill model
+    // loaded and expects exactly one `get_weather` call whose JSON arguments
+    // carry `location == "Paris"`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let parse_result =
+            model.parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "duck-type pass must recognise Mistral bracketed-JSON on a model with no registered \
+                 template; got Unrecognized"
+            );
+        };
+
+        // Exactly one call is expected; check the count before indexing.
+        assert_eq!(
+            message.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            message.tool_calls
+        );
+        let call = &message.tool_calls[0];
+        assert_eq!(call.name, "get_weather");
+
+        // The arguments must have been parsed into valid JSON.
+        let arguments = match &call.arguments {
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+            ToolCallArguments::ValidJson(value) => value,
+        };
+        let location = arguments
+            .get("location")
+            .and_then(|field| field.as_str())
+            .map(str::to_owned);
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_duck_types_qwen_xml {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue passed to the parser: a single `get_weather` function
+    // taking one required string argument, `location`.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    // Assistant output in the Qwen XML tool-call syntax: a `<tool_call>`
+    // wrapper around `<function=NAME>`, with one `<parameter=KEY>` element per
+    // argument. Each piece ends in an explicit `\n`; the trailing `\` only
+    // continues the Rust literal. Without the tags the payload is just
+    // newlines around `Paris`, which contains no tool call to recognise.
+    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
+        <function=get_weather>\n\
+        <parameter=location>\n\
+        Paris\n\
+        </parameter>\n\
+        </function>\n\
+        </tool_call>";
+
+    // Parses a Qwen-XML-style tool call with the DeepSeek-R1 distill model
+    // loaded and expects exactly one `get_weather` call whose JSON arguments
+    // carry `location == "Paris"`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "duck-type pass must recognise Qwen XML on a model with no registered template; \
+                 got Unrecognized"
+            );
+        };
+        // Exactly one call is expected; check the count before indexing.
+        assert_eq!(
+            parsed.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            parsed.tool_calls
+        );
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        // The arguments must have been parsed into valid JSON.
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue passed to the parser: a single `get_weather` function
+    // taking one required string argument, `location`.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    // Assistant output that contains no tool-call syntax at all.
+    const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
+
+    // With tools offered but a plain-text reply, the parser must still return
+    // `Recognized`, and the recognised message must carry no tool calls.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let parse_result = model.parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "plain content with tools requested must produce Recognized (with empty tool_calls); \
+                 got Unrecognized"
+            );
+        };
+
+        assert!(
+            message.tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            message.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Assistant output that contains no tool-call syntax at all.
+    const PLAIN_CONTENT: &str = "Hello there.";
+
+    // With an empty tools array (`"[]"`) and a plain-text reply, the parser
+    // must return `Recognized`, and the message must carry no tool calls.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let parse_result = model.parse_chat_message("[]", PLAIN_CONTENT, false)?;
+
+        // Anything other than `Recognized` is a test failure.
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!("plain content with empty tools array must produce Recognized; got Unrecognized");
+        };
+
+        assert!(
+            message.tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            message.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+mod gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    // Gemma 4, thinking disabled: tokens sampled after a prompt whose thought channel is
+    // already closed must all be classified as Content.
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // NOTE(review): this prompt looks like it has lost some markup (turn delimiters and
+    // the thought-channel close marker); it is kept byte-for-byte here, so confirm it
+    // against the Gemma 4 chat template.
+    const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\
+        user\nReply with the single word: four. Do not explain.\n\
+        model\n<|channel>thought\n\n";
+
+    // Channel markers that must never leak into the classified content stream.
+    // Every entry has to be non-empty: `str::contains("")` is true for any input, so an
+    // empty marker would make the leak assertion fail unconditionally.
+    // NOTE(review): `<channel|>` is assumed to be the Gemma 4 channel-close marker; confirm.
+    const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Gemma 4 must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "Gemma 4 thinking-disabled: classifier must not emit any Reasoning token \
+             when the prompt closes the thought channel before generation begins; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Gemma 4 thinking-disabled: prompt-token replay must move section to Content \
+             before generation, so no Undeterminable tokens may be emitted; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "Gemma 4 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Gemma 4 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "Gemma 4 thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "Gemma 4 thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        // The content stream must not contain any raw channel marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Gemma 4 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod gemma4_classifier_emits_reasoning {
+    // Gemma 4, thinking enabled: the classifier must report Reasoning tokens for the
+    // generated thought block, and neither stream may contain raw channel markers.
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens; the reasoning block is expected to close within it.
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    // NOTE(review): this prompt looks like it has lost its turn-delimiter markup; it is
+    // kept byte-for-byte here, so confirm it against the Gemma 4 chat template.
+    const GEMMA4_THINKING_PROMPT: &str = "\
+        user\nReply with the single word: four. Do not explain.\n\
+        model\n<|channel>thought\n";
+
+    // Channel markers that must never leak into the reasoning or content streams.
+    // Every entry has to be non-empty: `str::contains("")` is true for any input, so an
+    // empty marker would make the leak assertions fail unconditionally.
+    // NOTE(review): `<channel|>` is assumed to be the Gemma 4 channel-close marker; confirm.
+    const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", "<channel|>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn gemma4_classifier_emits_reasoning_for_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        // Re-parse the raw generation so the parser's view of the reasoning block can be
+        // checked alongside the classifier's counters.
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Gemma 4 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Gemma 4 must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "Gemma 4 classifier must emit at least one Reasoning token when the model \
+             emits a `<|channel>thought` block; outcome={outcome:?}",
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "Gemma 4 usage.reasoning_tokens must be non-zero when the model emits a \
+             reasoning block; usage was {usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Gemma 4: classifier must not emit Undeterminable when the model emits a \
+             detected `<|channel>thought` marker; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Gemma 4: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "Gemma 4: completion tokens must equal observed Content + Reasoning"
+        );
+        assert!(
+            !parsed.reasoning_content.is_empty(),
+            "Gemma 4 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
+             increase the budget or pick a more direct prompt. generated={:?}",
+            outcome.generated_raw,
+        );
+
+        // Neither classified stream may contain a raw channel marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "Gemma 4: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Gemma 4: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod gemma4_parses_tool_call_payload {
+    // A paired-quote tool-call payload fed to the parser on the Gemma 4 fixture must
+    // produce exactly one `get_weather` call whose `location` argument is "Paris".
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue offered to the parser: a single `get_weather` function.
+    const TOOLS_JSON: &str = r#"[
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather for a location",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {"type": "string", "description": "The city name"}
+ },
+ "required": ["location"]
+ }
+ }
+ }
+ ]"#;
+
+    // Tool-call text whose string argument is wrapped in `<|"|>` quote tokens.
+    const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str =
+        "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let parse_result =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"
+            );
+        };
+        assert_eq!(
+            message.tool_calls.len(),
+            1,
+            "expected one tool call; got {:?}",
+            message.tool_calls
+        );
+
+        // The length check above guarantees index 0 exists.
+        let call = &message.tool_calls[0];
+        assert_eq!(call.name, "get_weather");
+
+        // Arguments must have parsed as JSON; pull out the `location` string if present.
+        let location = match &call.arguments {
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+            ToolCallArguments::ValidJson(arguments) => arguments
+                .get("location")
+                .and_then(|entry| entry.as_str()),
+        };
+        assert_eq!(location, Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod gemma4_template_override_returns_full_markers {
+    // Checks that the Gemma 4 fixture exposes a chat template carrying the tool-call
+    // fingerprint and that `tool_call_markers()` reports the paired-quote marker set.
+
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::ToolCallArgsShape;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+        let template = model
+            .chat_template(None)
+            .expect("Gemma 4 chat template must be present");
+        let template_str = template.to_str().expect("template must be valid UTF-8");
+        assert!(
+            template_str.contains("<|tool_call>call:"),
+            "Gemma 4 chat template must contain '<|tool_call>call:' fingerprint; \
+             template starts with: {:?}",
+            // Take the preview by characters: slicing at a fixed byte offset panics when
+            // the offset falls inside a multi-byte UTF-8 sequence, which would replace
+            // this diagnostic with an unrelated slicing panic.
+            template_str.chars().take(200).collect::<String>(),
+        );
+
+        let markers = model
+            .tool_call_markers()
+            .expect("Gemma 4 must produce ToolCallMarkers via override registry");
+
+        assert_eq!(markers.open, "<|tool_call>call:");
+        assert_eq!(markers.close, "}");
+
+        // The argument shape must be the paired-quote variant; anything else is a failure.
+        let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else {
+            panic!("expected PairedQuote variant, got {:?}", markers.args_shape);
+        };
+        assert_eq!(shape.name_args_separator, "{");
+        assert_eq!(shape.value_quote.open, "<|\"|>");
+        assert_eq!(shape.value_quote.close, "<|\"|>");
+
+        Ok(())
+    }
+}
+
+mod glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    // GLM-4.7, thinking disabled: tokens sampled after the thinking-disabled prompt must
+    // all be classified as Content.
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // NOTE(review): the blank lines after `<|assistant|>` look like a stripped
+    // reasoning-block pair; the prompt is kept byte-for-byte here, so confirm it against
+    // the GLM-4.7 chat template.
+    const GLM47_THINKING_DISABLED_PROMPT: &str = "\
+ <|user|>
+ What is 2 + 2?
+ <|assistant|>
+
+
+ ";
+
+    // Reasoning markers that must never leak into the classified content stream.
+    // Every entry has to be non-empty: `str::contains("")` is true for any input, so an
+    // empty marker would make the leak assertion fail unconditionally.
+    // NOTE(review): `<think>` / `</think>` are assumed to be the GLM-4.7 reasoning
+    // markers; confirm.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Stochastic sampler chain; the constant given to `dist` is presumably the RNG
+        // seed that keeps runs repeatable.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "GLM-4.7 must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "GLM-4.7 thinking-disabled: classifier must not emit any Reasoning token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "GLM-4.7 thinking-disabled: classifier must not emit any Undeterminable token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "GLM-4.7 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "GLM-4.7 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "GLM-4.7 thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "GLM-4.7 thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        // The content stream must not contain any raw reasoning marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "GLM-4.7 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod glm47_classifier_emits_reasoning {
+    // GLM-4.7, thinking enabled: the classifier must report Reasoning tokens, and its
+    // streams must agree with the chat-message parser once the reasoning block closes.
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    // NOTE(review): the blank line after `<|assistant|>` looks like a stripped
+    // reasoning-open marker; the prompt is kept byte-for-byte here, so confirm it against
+    // the GLM-4.7 chat template.
+    const GLM47_THINKING_PROMPT: &str = "\
+ <|user|>
+ What is 2 + 2?
+ <|assistant|>
+
+ ";
+
+    // Reasoning markers that must never leak into the reasoning or content streams.
+    // Every entry has to be non-empty: `str::contains("")` is true for any input, so an
+    // empty marker would make the leak assertions fail unconditionally.
+    // NOTE(review): `<think>` / `</think>` are assumed to be the GLM-4.7 reasoning
+    // markers; confirm.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Stochastic sampler chain; the constant given to `dist` is presumably the RNG
+        // seed that keeps runs repeatable.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        // Re-parse the raw generation so the classifier's streams can be compared with
+        // the parser's reasoning/content split.
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "GLM-4.7 must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "GLM-4.7 classifier must emit at least one Reasoning token; outcome={outcome:?}"
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "GLM-4.7 usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "GLM-4.7: classifier must not emit Undeterminable tokens; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "GLM-4.7: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "GLM-4.7: completion tokens must equal observed Content + Reasoning"
+        );
+
+        // The strict stream comparison only applies when the parser saw a reasoning
+        // block; otherwise it is skipped with a notice on stderr.
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "GLM-4.7 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+                 skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(
+                outcome.reasoning_stream, parsed.reasoning_content,
+                "GLM-4.7: reasoning_stream must equal the parser's reasoning_content"
+            );
+            assert_eq!(
+                outcome.content_stream, parsed.content,
+                "GLM-4.7: content_stream must equal the parser's content"
+            );
+        }
+
+        // Neither classified stream may contain a raw reasoning marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "GLM-4.7: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "GLM-4.7: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod glm47_parses_tool_call_payload {
+    // A key/value tool-call payload fed to the parser on the GLM-4.7 fixture must
+    // produce exactly one `get_weather` call whose `location` argument is "Paris".
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue offered to the parser: a single `get_weather` function.
+    const TOOLS_JSON: &str = r#"[
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather for a location",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {"type": "string", "description": "The city name"}
+ },
+ "required": ["location"]
+ }
+ }
+ }
+ ]"#;
+
+    // NOTE(review): as written this literal evaluates to "get_weatherlocationParis"; the
+    // tool-call / key / value tags appear to be missing. Kept byte-for-byte here —
+    // confirm against the GLM-4.7 tool-call format.
+    const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\
+        location\
+        Paris\
+        ";
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let parse_result =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized"
+            );
+        };
+        assert_eq!(message.tool_calls.len(), 1);
+
+        // The length check above guarantees index 0 exists.
+        let call = &message.tool_calls[0];
+        assert_eq!(call.name, "get_weather");
+
+        // Arguments must have parsed as JSON; pull out the `location` string if present.
+        let location = match &call.arguments {
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+            ToolCallArguments::ValidJson(arguments) => arguments
+                .get("location")
+                .and_then(|entry| entry.as_str()),
+        };
+        assert_eq!(location, Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod glm47_template_override_returns_full_markers {
+    // Checks that the GLM-4.7 fixture exposes a chat template and that
+    // `tool_call_markers()` reports the key/value XML-tag marker set.
+    //
+    // NOTE(review): every expected marker below is an empty string, and
+    // `contains("")` holds for any template, so these checks currently pin nothing
+    // useful. The literals look like they lost their tag text; they are kept
+    // byte-for-byte here — confirm the intended values.
+
+    #![expect(
+        clippy::unnecessary_wraps,
+        reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+    )]
+
+    use anyhow::Result;
+    use llama_cpp_bindings::ToolCallArgsShape;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let model = fixture.model;
+
+        // The template must exist and be valid UTF-8 before it can be fingerprinted.
+        let chat_template = model
+            .chat_template(None)
+            .expect("GLM-4.7 chat template must be present");
+        let template_text = chat_template
+            .to_str()
+            .expect("template must be valid UTF-8");
+        assert!(template_text.contains(""));
+
+        let markers = model
+            .tool_call_markers()
+            .expect("GLM-4.7 must produce ToolCallMarkers via override registry");
+
+        assert_eq!(markers.open, "");
+        assert_eq!(markers.close, "");
+
+        // The argument shape must be the key/value XML-tag variant.
+        let ToolCallArgsShape::KeyValueXmlTags(tags) = markers.args_shape else {
+            panic!(
+                "expected KeyValueXmlTags variant, got {:?}",
+                markers.args_shape
+            );
+        };
+        assert_eq!(tags.key_open, "");
+        assert_eq!(tags.key_close, "");
+        assert_eq!(tags.value_open, "");
+        assert_eq!(tags.value_close, "");
+
+        Ok(())
+    }
+}
+
+mod mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    // Mistral 3, thinking disabled: the prompt ends with an empty `[THINK][/THINK]`
+    // pair, so every sampled token must be classified as Content.
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // Instruction followed by an already-closed reasoning block.
+    const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\
+        [INST]Reply with the single word: four. Do not explain.[/INST][THINK][/THINK]";
+
+    // Reasoning markers that must never leak into the classified content stream.
+    const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens =
+            model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Mistral 3 must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "Mistral 3 thinking-disabled: classifier must not emit any Reasoning token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Mistral 3 thinking-disabled: classifier must not emit any Undeterminable token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "Mistral 3 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Mistral 3 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "Mistral 3 thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "Mistral 3 thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        // The content stream must not contain any raw reasoning marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Mistral 3 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod mistral3_classifier_emits_reasoning {
+    // Mistral 3, thinking enabled: the classifier must report Reasoning tokens and its
+    // reasoning/content streams must match what the chat-message parser extracts.
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 768;
+
+    // System prompt that asks for a `[THINK]...[/THINK]` draft before the answer,
+    // followed by the user instruction.
+    const MISTRAL3_THINKING_PROMPT: &str = "\
+        [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\
+        First draft your thinking process (inner monologue) until you arrive at a response. \
+        Format your response using Markdown, and use LaTeX for any mathematical equations. \
+        Write both your thoughts and the response in the same language as the input.\n\n\
+        Your thinking process must follow the template below:\
+        [THINK]Your thoughts or/and draft, like working through an exercise on scratch paper. \
+        Be as casual and as long as you want until you are confident to generate the response \
+        to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\
+        [INST]Reply with the single word: four. Do not explain.[/INST]";
+
+    // Reasoning markers that must never leak into the reasoning or content streams.
+    const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn mistral3_classifier_emits_reasoning_for_thinking_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        // Re-parse the raw generation so the classifier's streams can be compared with
+        // the parser's reasoning/content split.
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Mistral 3 must generate at least one token"
+        );
+        assert!(
+            outcome.observed_reasoning > 0,
+            "Mistral 3 classifier must emit at least one Reasoning token; outcome={outcome:?}"
+        );
+        assert!(
+            usage.reasoning_tokens > 0,
+            "Mistral 3 usage.reasoning_tokens must be non-zero; usage={usage:?}"
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Mistral 3: classifier must not emit Undeterminable tokens; outcome={outcome:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Mistral 3: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+            "Mistral 3: completion tokens must equal observed Content + Reasoning"
+        );
+        assert!(
+            !parsed.reasoning_content.is_empty(),
+            "Mistral 3 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.reasoning_stream, parsed.reasoning_content,
+            "Mistral 3: reasoning_stream must equal the parser's reasoning_content"
+        );
+        assert_eq!(
+            outcome.content_stream, parsed.content,
+            "Mistral 3: content_stream must equal the parser's content"
+        );
+
+        // Neither classified stream may contain a raw reasoning marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "Mistral 3: reasoning_stream leaked marker {forbidden:?}; \
+                 reasoning_stream={:?}",
+                outcome.reasoning_stream
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Mistral 3: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+mod mistral3_parses_tool_call_payload {
+    // A `[TOOL_CALLS]name[ARGS]{json}` payload fed to the parser on the Mistral 3
+    // fixture must produce exactly one `get_weather` call with `location` = "Paris".
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Tool catalogue offered to the parser: a single `get_weather` function.
+    const TOOLS_JSON: &str = r#"[
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather for a location",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {"type": "string", "description": "The city name"}
+ },
+ "required": ["location"]
+ }
+ }
+ }
+ ]"#;
+
+    // Tool-call text in bracketed form with a JSON argument object.
+    const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str =
+        r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let parse_result =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!(
+                "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized"
+            );
+        };
+        assert_eq!(message.tool_calls.len(), 1);
+
+        // The length check above guarantees index 0 exists.
+        let call = &message.tool_calls[0];
+        assert_eq!(call.name, "get_weather");
+
+        // Arguments must have parsed as JSON; pull out the `location` string if present.
+        let location = match &call.arguments {
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+            ToolCallArguments::ValidJson(arguments) => arguments
+                .get("location")
+                .and_then(|entry| entry.as_str()),
+        };
+        assert_eq!(location, Some("Paris"));
+
+        Ok(())
+    }
+}
+
+mod qwen35_chat_inference_emits_reasoning_when_template_auto_opens {
+    // Qwen3.5 end-to-end chat run: a prompt rendered through the model's own chat
+    // template must lead to both Reasoning and Content tokens, with usage counters that
+    // agree with what the sample loop observed.
+
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        // Render a single user turn through the model's built-in chat template.
+        let template = model.chat_template(None)?;
+        let user_turn =
+            LlamaChatMessage::new("user".to_owned(), "Hello! How are you?".to_owned())?;
+        let conversation = vec![user_turn];
+        let rendered_prompt = model.apply_chat_template(&template, &conversation, true)?;
+
+        // Tokenize the rendered prompt and queue it through the classifier.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(&rendered_prompt, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let sample_loop = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        };
+        let outcome = sample_loop.run()?;
+
+        // Both sections must appear; nothing may be unclassifiable or a tool call.
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(outcome.observed_tool_call, 0);
+
+        // The parser must recognise the raw generation and find non-empty content.
+        let parse_result = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(message) = parse_result else {
+            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+        };
+        assert!(!message.content.is_empty());
+
+        // Final usage counters must line up with the prompt size and observed tokens.
+        let final_usage = classifier.into_usage();
+        assert_eq!(final_usage.prompt_tokens, prompt_token_count);
+        assert_eq!(final_usage.reasoning_tokens, outcome.observed_reasoning);
+        assert_eq!(final_usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+}
+
+mod qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    // Qwen3.5, thinking disabled: tokens sampled after the thinking-disabled prompt must
+    // all be classified as Content.
+
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // Upper bound on sampled tokens handed to `ClassifySampleLoop`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    // NOTE(review): the blank lines after `<|im_start|>assistant` look like a stripped
+    // empty reasoning block; the prompt is kept byte-for-byte here, so confirm it against
+    // the Qwen3.5 chat template.
+    const QWEN35_THINKING_DISABLED_PROMPT: &str = "\
+ <|im_start|>user
+ What is 2 + 2?<|im_end|>
+ <|im_start|>assistant
+
+
+
+
+ ";
+
+    // Reasoning markers that must never leak into the classified content stream.
+    // Every entry has to be non-empty: `str::contains("")` is true for any input, so an
+    // empty marker would make the leak assertion fail unconditionally.
+    // NOTE(review): `<think>` / `</think>` are assumed to be the Qwen3.5 reasoning
+    // markers; confirm.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        // Tokenize the prompt and queue it through the classifier so the prompt tokens
+        // are tracked before any generation happens.
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Every queued prompt token must be promoted once the prompt has been decoded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Stochastic sampler chain; the constant given to `dist` is presumably the RNG
+        // seed that keeps runs repeatable.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "Qwen3.5 must generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_reasoning, 0,
+            "Qwen3.5 thinking-disabled: classifier must not emit any Reasoning token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            outcome.observed_undeterminable, 0,
+            "Qwen3.5 thinking-disabled: classifier must not emit any Undeterminable token; \
+             generated={:?}",
+            outcome.generated_raw
+        );
+        assert_eq!(
+            usage.reasoning_tokens, 0,
+            "Qwen3.5 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, 0,
+            "Qwen3.5 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}"
+        );
+        assert!(
+            outcome.observed_content > 0,
+            "Qwen3.5 thinking-disabled: classifier must emit at least one Content token"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content,
+            "Qwen3.5 thinking-disabled: completion tokens must equal observed Content tokens"
+        );
+
+        // The content stream must not contain any raw reasoning marker.
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "Qwen3.5 thinking-disabled: content_stream leaked marker {forbidden:?}; \
+                 content_stream={:?}",
+                outcome.content_stream
+            );
+        }
+
+        Ok(())
+    }
+}
+
+/// Trial: when the prompt leaves a `<think>` block open, Qwen3.5 output must contain
+/// reasoning tokens, the classifier's usage must add up, and (when the parser extracted
+/// reasoning) the classified streams must equal the parser's view of the same raw text.
+mod qwen35_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Generation budget passed to `ClassifySampleLoop::max_generated_tokens`.
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    /// ChatML prompt whose assistant turn ends right after an opening `<think>` marker,
+    /// so generation starts inside the reasoning block.
+    ///
+    /// Line breaks are explicit `\n` escapes followed by a line continuation so that
+    /// source indentation can never leak into the prompt text.
+    const QWEN35_THINKING_PROMPT: &str = "<|im_start|>user\n\
+        What is 2 + 2?<|im_end|>\n\
+        <|im_start|>assistant\n\
+        <think>\n";
+
+    /// Reasoning delimiters that must not show up in either classified stream.
+    ///
+    /// These must be non-empty: `str::contains("")` is always `true`, so an empty marker
+    /// would make the "forbidden marker" assertions fail unconditionally.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    // Decodes the prompt, samples with a seeded sampler chain, then cross-checks the
+    // classifier against `parse_chat_message` on the raw generated text.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Prompt tokens are only committed after the decode above has succeeded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Fixed seed in `dist` keeps the sampled sequence reproducible between runs.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+        );
+
+        // Stream-vs-parser equality is only checked when the parser extracted reasoning;
+        // otherwise the trial logs and falls through to the marker checks below.
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \
+                 skipping strict parser-equality assertions"
+            );
+        } else {
+            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+            assert_eq!(outcome.content_stream, parsed.content);
+        }
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "reasoning stream must not contain the marker {forbidden:?}"
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "content stream must not contain the marker {forbidden:?}"
+            );
+        }
+
+        Ok(())
+    }
+}
+
+/// Trial: a Qwen XML tool call against a schema with `enum`, integer bounds and
+/// `additionalProperties: false` must parse into one tool call with typed JSON arguments.
+mod qwen35_parses_constrained_schema_payload {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+    use serde_json::Value;
+    use serde_json::json;
+
+    /// Tool list (JSON) declaring the single `negotiate_with_cat` function.
+    const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "negotiate_with_cat",
+                "description": "Attempt to negotiate with a cat. Outcomes are not guaranteed and may include the silent treatment.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "topic": {
+                            "type": "string",
+                            "description": "What you are trying to negotiate, e.g. 'get off the keyboard' or 'stop knocking things off the table'"
+                        },
+                        "bribe": {
+                            "type": "string",
+                            "enum": ["tuna", "salmon", "treats", "ear_scritches", "cardboard_box", "none"],
+                            "description": "What you are offering in exchange"
+                        },
+                        "desperation_level": {
+                            "type": "integer",
+                            "description": "How desperate you are, on a scale from 1 (mildly annoyed human) to 10 (it is 3am)",
+                            "minimum": 1,
+                            "maximum": 10
+                        }
+                    },
+                    "required": ["topic"],
+                    "additionalProperties": false
+                }
+            }
+        }
+    ]"#;
+
+    /// Model-side payload in the Qwen XML tool-call format: one `<tool_call>` wrapping a
+    /// `<function=…>` element with one `<parameter=…>` element per argument.
+    const NEGOTIATE_WITH_CAT_INPUT: &str = "<tool_call>\n\
+        <function=negotiate_with_cat>\n\
+        <parameter=bribe>\n\
+        tuna\n\
+        </parameter>\n\
+        <parameter=desperation_level>\n\
+        8\n\
+        </parameter>\n\
+        <parameter=topic>\n\
+        get off the keyboard\n\
+        </parameter>\n\
+        </function>\n\
+        </tool_call>";
+
+    /// Returns the parsed JSON arguments, or an error carrying the raw text when the
+    /// parser classified them as `InvalidJson`.
+    fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> {
+        match arguments {
+            ToolCallArguments::ValidJson(value) => Ok(value),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson arguments, got InvalidJson: {raw}")
+            }
+        }
+    }
+
+    // Parses the payload (non-partial) and pins name, id and the typed argument object;
+    // `desperation_level` is expected as a JSON integer, not the string "8".
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture.model.parse_chat_message(
+            NEGOTIATE_WITH_CAT_TOOLS_JSON,
+            NEGOTIATE_WITH_CAT_INPUT,
+            false,
+        )?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "Qwen 3.5's tool-call payload must be parsed by the wrapper-side duck-type pass; \
+                 got Unrecognized"
+            );
+        };
+
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat");
+        assert_eq!(parsed.tool_calls[0].id, "call_0");
+        assert_eq!(
+            arguments_as_json(&parsed.tool_calls[0].arguments)?,
+            &json!({
+                "bribe": "tuna",
+                "desperation_level": 8,
+                "topic": "get off the keyboard",
+            }),
+        );
+
+        Ok(())
+    }
+}
+
+/// Trials for `parse_chat_message` on Qwen XML tool calls with a Qwen3.5 model: one
+/// complete call, a payload truncated mid-tag (partial mode), and two consecutive calls.
+mod qwen35_parses_tool_call_payload {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::ToolCallArguments;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Tool list (JSON) declaring the single `get_weather` function.
+    const TOOLS_JSON: &str = r#"[
+        {
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Get the current weather for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {"type": "string", "description": "The city name"}
+                    },
+                    "required": ["location"]
+                }
+            }
+        }
+    ]"#;
+
+    /// One complete tool call in the Qwen XML format.
+    const QWEN_XML_PAYLOAD: &str = "<tool_call>\n\
+        <function=get_weather>\n\
+        <parameter=location>\n\
+        Paris\n\
+        </parameter>\n\
+        </function>\n\
+        </tool_call>";
+
+    /// A tool call cut off in the middle of the `<parameter=…>` opening tag, as it would
+    /// look part-way through streaming.
+    const PARTIAL_QWEN_XML_PAYLOAD: &str = "<tool_call>\n<function=get_weather>\n<parameter=location";
+
+    /// Two complete, back-to-back tool calls in the Qwen XML format.
+    const TWO_QWEN_XML_PAYLOADS: &str = "<tool_call>\n\
+        <function=get_weather>\n\
+        <parameter=location>\n\
+        Paris\n\
+        </parameter>\n\
+        </function>\n\
+        </tool_call>\n\
+        <tool_call>\n\
+        <function=get_weather>\n\
+        <parameter=location>\n\
+        Berlin\n\
+        </parameter>\n\
+        </function>\n\
+        </tool_call>";
+
+    // A single complete call must yield exactly one `get_weather` call whose JSON
+    // arguments carry `location == "Paris"`.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized");
+        };
+        assert_eq!(parsed.tool_calls.len(), 1);
+        assert_eq!(parsed.tool_calls[0].name, "get_weather");
+        let location = match &parsed.tool_calls[0].arguments {
+            ToolCallArguments::ValidJson(value) => value
+                .get("location")
+                .and_then(|v| v.as_str())
+                .map(str::to_owned),
+            ToolCallArguments::InvalidJson(raw) => {
+                bail!("expected ValidJson, got InvalidJson: {raw}");
+            }
+        };
+        assert_eq!(location.as_deref(), Some("Paris"));
+
+        Ok(())
+    }
+
+    // Partial mode (`true` as the last argument): the truncated payload must still be
+    // recognised, surfacing either no tool call yet or the single in-progress one.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_partial_tool_call_returns_pending_state(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let outcome =
+            fixture
+                .model
+                .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized");
+        };
+        assert!(
+            parsed.tool_calls.len() <= 1,
+            "expected at most one tool call from a partial payload; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+
+    // Two back-to-back calls: the trial only requires that at least one call is surfaced.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+        let outcome = fixture
+            .model
+            .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?;
+
+        let ChatMessageParseOutcome::Recognized(parsed) = outcome else {
+            bail!(
+                "expected Recognized for two Qwen XML payloads on a Qwen-3.5 model; got Unrecognized"
+            );
+        };
+        assert!(
+            !parsed.tool_calls.is_empty(),
+            "expected at least one tool call; got {:?}",
+            parsed.tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+/// Trial: plain assistant text, parsed while a tool list is supplied, must still be
+/// recognised by the Qwen3.5 parser and must yield zero tool calls.
+mod qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Tool list (JSON) declaring the single `get_weather` function.
+    const TOOLS_JSON: &str = r#"[
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get the current weather for a location",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {"type": "string", "description": "The city name"}
+ },
+ "required": ["location"]
+ }
+ }
+ }
+ ]"#;
+
+    /// Assistant output that contains no tool-call markup at all.
+    const PLAIN_CONTENT: &str = "Sorry, I cannot help with that.";
+
+    // Non-partial parse of PLAIN_CONTENT with tools requested: Recognized, no tool calls.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 512,
+        n_batch = 128,
+        n_ubatch = 64,
+    )]
+    fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let parse_outcome = model.parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?;
+
+        let ChatMessageParseOutcome::Recognized(message) = parse_outcome else {
+            bail!(
+                "Qwen 3.5 with tools requested + plain content must produce Recognized (with empty \
+                 tool_calls); got Unrecognized"
+            );
+        };
+
+        let tool_calls = &message.tool_calls;
+        assert!(
+            tool_calls.is_empty(),
+            "expected no tool calls; got {:?}",
+            tool_calls
+        );
+
+        Ok(())
+    }
+}
+
+/// Trial: a prompt rendered through the model's own chat template, sampled greedily,
+/// must produce both reasoning and content tokens, no tool-call or undeterminable
+/// tokens, and a raw output that the chat-message parser recognises.
+mod qwen36_chat_inference_emits_reasoning_when_template_auto_opens {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::model::LlamaChatMessage;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    // End-to-end: template -> tokenize -> decode -> classify-while-sampling -> parse.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 2048,
+        n_batch = 512,
+        n_ubatch = 128,
+    )]
+    fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let context_params = (*fixture.context_params).into_llama_context_params();
+        let mut context = LlamaContext::from_model(model, backend, context_params)?;
+
+        // Render a single user turn through the model's built-in chat template.
+        let chat_template = model.chat_template(None)?;
+        let user_turn =
+            LlamaChatMessage::new(String::from("user"), String::from("Hello! How are you?"))?;
+        let conversation = vec![user_turn];
+        let rendered_prompt = model.apply_chat_template(&chat_template, &conversation, true)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(&rendered_prompt, AddBos::Always)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        // Prompt tokens are only committed after the decode above has succeeded.
+        let committed_prompt_tokens = classifier.commit_prompt_tokens();
+        assert_eq!(committed_prompt_tokens, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy();
+        let initial_position = batch.n_tokens();
+        let sample_loop = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        };
+        let generation = sample_loop.run()?;
+
+        assert!(!generation.generated_raw.is_empty());
+        assert!(generation.observed_reasoning > 0);
+        assert!(generation.observed_content > 0);
+        assert_eq!(generation.observed_undeterminable, 0);
+        assert_eq!(generation.observed_tool_call, 0);
+
+        // "[]" is an empty tool list: this parse is about reasoning/content only.
+        let parse_outcome = model.parse_chat_message("[]", &generation.generated_raw, false)?;
+        let ChatMessageParseOutcome::Recognized(message) = parse_outcome else {
+            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+        };
+        assert!(!message.content.is_empty());
+
+        // `into_usage` consumes the classifier, so it runs last.
+        let usage = classifier.into_usage();
+        assert_eq!(usage.prompt_tokens, prompt_token_count);
+        assert_eq!(usage.reasoning_tokens, generation.observed_reasoning);
+        assert_eq!(usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+}
+
+/// Trial: when the prompt already carries an opened-and-closed, empty reasoning block,
+/// Qwen3.6 output must be classified purely as content (no reasoning, no undeterminable
+/// tokens) and the reasoning delimiters must never leak into the content stream.
+mod qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt {
+    use anyhow::Result;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Generation budget passed to `ClassifySampleLoop::max_generated_tokens`.
+    const MAX_GENERATED_TOKENS: i32 = 200;
+
+    /// ChatML prompt whose assistant turn starts with an empty `<think>…</think>` block,
+    /// i.e. the "thinking disabled" shape.
+    ///
+    /// Line breaks are written as explicit `\n` escapes followed by a line continuation so
+    /// that source indentation can never leak into the prompt text.
+    const QWEN36_THINKING_DISABLED_PROMPT: &str = "<|im_start|>user\n\
+        What is 2 + 2?<|im_end|>\n\
+        <|im_start|>assistant\n\
+        <think>\n\
+        \n\
+        </think>\n\
+        \n";
+
+    /// Reasoning delimiters that must not show up in the classified content stream.
+    ///
+    /// These must be non-empty: `str::contains("")` is always `true`, so an empty marker
+    /// would make the "forbidden marker" assertion fail unconditionally.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    // Decodes the prompt, samples up to MAX_GENERATED_TOKENS tokens with a seeded sampler
+    // chain and checks the classifier's per-section accounting.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Prompt tokens are only committed after the decode above has succeeded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Fixed seed in `dist` keeps the sampled sequence reproducible between runs.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert!(outcome.observed_content > 0);
+        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "content stream must not contain the reasoning marker {forbidden:?}"
+            );
+        }
+
+        Ok(())
+    }
+}
+
+/// Trial: when the prompt leaves a `<think>` block open, Qwen3.6 output must contain
+/// reasoning tokens, the classifier's usage must add up, and (when the parser extracted
+/// reasoning) the classified streams must equal the parser's view of the same raw text.
+mod qwen36_classifier_emits_reasoning {
+    use anyhow::Result;
+    use anyhow::bail;
+    use llama_cpp_bindings::ChatMessageParseOutcome;
+    use llama_cpp_bindings::context::LlamaContext;
+    use llama_cpp_bindings::llama_batch::LlamaBatch;
+    use llama_cpp_bindings::model::AddBos;
+    use llama_cpp_bindings::sampling::LlamaSampler;
+    use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+    use llama_cpp_test_harness::LlamaFixture;
+    use llama_cpp_test_harness::llama_test;
+
+    /// Generation budget passed to `ClassifySampleLoop::max_generated_tokens`.
+    const MAX_GENERATED_TOKENS: i32 = 1500;
+
+    /// ChatML prompt whose assistant turn ends right after an opening `<think>` marker,
+    /// so generation starts inside the reasoning block.
+    ///
+    /// Line breaks are explicit `\n` escapes followed by a line continuation so that
+    /// source indentation can never leak into the prompt text.
+    const QWEN36_THINKING_PROMPT: &str = "<|im_start|>user\n\
+        What is 2 + 2?<|im_end|>\n\
+        <|im_start|>assistant\n\
+        <think>\n";
+
+    /// Reasoning delimiters that must not show up in either classified stream.
+    ///
+    /// These must be non-empty: `str::contains("")` is always `true`, so an empty marker
+    /// would make the "forbidden marker" assertions fail unconditionally. That matters
+    /// doubly here because the empty-reasoning fallback relies on these checks alone.
+    const FORBIDDEN_MARKERS: &[&str] = &["<think>", "</think>"];
+
+    // Decodes the prompt, samples with a seeded sampler chain, then cross-checks the
+    // classifier against a partial-mode `parse_chat_message` on the raw generated text.
+    #[llama_test(
+        model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+        n_gpu_layers = 999,
+        use_mmap = true,
+        use_mlock = false,
+        n_ctx = 8192,
+        n_batch = 2048,
+        n_ubatch = 512,
+    )]
+    fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> {
+        let model = fixture.model;
+        let backend = fixture.backend;
+
+        let mut classifier = model.sampled_token_classifier();
+        let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?;
+        let prompt_token_count = u64::try_from(prompt_tokens.len())?;
+
+        let mut batch = LlamaBatch::new(2048, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?;
+
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        context.decode(&mut batch)?;
+
+        // Prompt tokens are only committed after the decode above has succeeded.
+        let promoted = classifier.commit_prompt_tokens();
+        assert_eq!(promoted, prompt_token_count);
+
+        // Fixed seed in `dist` keeps the sampled sequence reproducible between runs.
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::penalties(64, 1.1, 0.0, 0.0),
+            LlamaSampler::top_k(40),
+            LlamaSampler::top_p(0.9, 1),
+            LlamaSampler::min_p(0.05, 1),
+            LlamaSampler::temp(0.7),
+            LlamaSampler::dist(0x00C0_FFEE),
+        ]);
+        let initial_position = batch.n_tokens();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: MAX_GENERATED_TOKENS,
+        }
+        .run()?;
+
+        let usage = classifier.usage();
+        // Unlike the Qwen3.5 twin of this trial, the parse runs in partial mode (`true`).
+        let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, true)?;
+        let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else {
+            bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized");
+        };
+
+        assert!(!outcome.generated_raw.is_empty());
+        assert!(outcome.observed_reasoning > 0);
+        assert!(usage.reasoning_tokens > 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+        assert_eq!(
+            usage.completion_tokens(),
+            outcome.observed_content + outcome.observed_reasoning,
+        );
+
+        // Stream-vs-parser equality is only checked when the parser extracted reasoning;
+        // otherwise the trial logs and falls through to the marker checks below.
+        if parsed.reasoning_content.is_empty() {
+            eprintln!(
+                "Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"
+            );
+        } else {
+            assert_eq!(outcome.reasoning_stream, parsed.reasoning_content);
+            assert_eq!(outcome.content_stream, parsed.content);
+        }
+
+        for forbidden in FORBIDDEN_MARKERS {
+            assert!(
+                !outcome.reasoning_stream.contains(forbidden),
+                "reasoning stream must not contain the marker {forbidden:?}"
+            );
+            assert!(
+                !outcome.content_stream.contains(forbidden),
+                "content stream must not contain the marker {forbidden:?}"
+            );
+        }
+
+        Ok(())
+    }
+}
+
+// The test targets in Cargo.toml are declared with `harness = false`, so this binary has
+// to supply its own entry point; presumably this macro expands to the `main` that runs the
+// `#[llama_test]` trials above. NOTE(review): confirm against llama_cpp_test_harness.
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/reranker.rs b/llama-cpp-bindings-tests/tests/reranker.rs
deleted file mode 100644
index d08de7eb..00000000
--- a/llama-cpp-bindings-tests/tests/reranker.rs
+++ /dev/null
@@ -1,158 +0,0 @@
-use std::time::Duration;
-
-use anyhow::{Context, Result, bail};
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-fn normalize(input: &[f32]) -> Vec {
- let magnitude = input
- .iter()
- .fold(0.0, |accumulator, &value| value.mul_add(value, accumulator))
- .sqrt();
-
- input.iter().map(|&value| value / magnitude).collect()
-}
-
-fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 {
- vec_a
- .iter()
- .zip(vec_b.iter())
- .map(|(left, right)| left * right)
- .sum::()
-}
-
-#[llama_test(
- model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
- n_seq_max = 2,
- n_threads_batch = 8,
- embeddings = true,
-)]
-fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
-
- let query = "What is machine learning?";
- let documents = [
- "Machine learning is a subset of artificial intelligence.",
- "The weather today is sunny and warm.",
- ];
-
- let document_count = documents.len();
- assert_eq!(
- u32::try_from(document_count)?,
- fixture.context_params.n_seq_max,
- "attribute n_seq_max must match the document count this trial expects",
- );
-
- let mut ctx = LlamaContext::from_model(
- model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )
- .with_context(|| "unable to create context")?;
-
- let prompt_lines: Vec = documents
- .iter()
- .map(|document| format!("{query}{document}"))
- .collect();
-
- let tokens_lines_list = prompt_lines
- .iter()
- .map(|line| model.str_to_token(line, AddBos::Always))
- .collect::, _>>()
- .with_context(|| "failed to tokenize prompts")?;
-
- let n_ctx = usize::try_from(ctx.n_ctx())?;
-
- if tokens_lines_list.iter().any(|tokens| n_ctx < tokens.len()) {
- bail!("one of the provided prompts exceeds the size of the context window");
- }
-
- let mut classifier = model.sampled_token_classifier();
- let mut batch = LlamaBatch::new(2048, i32::try_from(document_count)?)?;
- let t_main_start = ggml_time_us();
-
- for (sequence_index, tokens) in tokens_lines_list.iter().enumerate() {
- classifier.feed_prompt_sequence_to_batch(
- &mut batch,
- tokens,
- i32::try_from(sequence_index)?,
- false,
- )?;
- }
-
- let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
- let total_token_count = u64::try_from(total_tokens)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), total_token_count);
- assert_eq!(classifier.usage().prompt_tokens, 0);
-
- ctx.clear_kv_cache();
- ctx.decode(&mut batch)
- .with_context(|| "llama_decode() failed")?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, total_token_count);
-
- let mut embeddings = Vec::with_capacity(document_count);
-
- for sequence_index in 0..document_count {
- let raw_embedding = ctx
- .embeddings_seq_ith(i32::try_from(sequence_index)?)
- .with_context(|| "failed to get sequence embeddings")?;
- embeddings.push(normalize(raw_embedding));
- }
-
- let t_main_end = ggml_time_us();
- let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
-
- #[expect(
- clippy::cast_precision_loss,
- reason = "logged throughput tolerates f32 precision"
- )]
- let tokens_per_second = total_tokens as f32 / duration.as_secs_f32();
-
- eprintln!(
- "created embeddings for {total_tokens} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
- duration.as_secs_f32(),
- );
-
- assert_eq!(
- embeddings.len(),
- document_count,
- "should produce one embedding per document"
- );
-
- for (index, embedding) in embeddings.iter().enumerate() {
- assert!(
- !embedding.is_empty(),
- "embedding {index} should not be empty"
- );
- }
-
- let similarity = cosine_similarity(&embeddings[0], &embeddings[1]);
- eprintln!("cosine similarity between document embeddings: {similarity:.4}");
-
- assert!(
- similarity.is_finite(),
- "cosine similarity should be a finite number"
- );
-
- let usage = classifier.into_usage();
- assert_eq!(usage.prompt_tokens, total_token_count);
- assert_eq!(usage.completion_tokens(), 0);
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs b/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs
deleted file mode 100644
index 4127fc58..00000000
--- a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs
+++ /dev/null
@@ -1,513 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::SampledToken;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
-use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
-use llama_cpp_bindings::streaming_markers::StreamingMarkers;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn classifier_starts_in_pending_section_for_default_fixture(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let classifier = fixture.model.sampled_token_classifier();
-
- assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
- let first = fixture.model.sampled_token_classifier();
- let second = fixture.model.sampled_token_classifier();
-
- assert_eq!(first.current_section(), second.current_section());
- assert_eq!(first.usage(), second.usage());
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
- let outcomes = classifier.ingest(model.token_bos());
-
- assert_eq!(outcomes.len(), 1);
- let outcome = &outcomes[0];
- assert!(matches!(
- outcome.sampled_token,
- SampledToken::Undeterminable(_)
- ));
- assert_eq!(outcome.visible_piece, outcome.raw_piece);
- assert_eq!(classifier.usage().undeterminable_tokens, 1);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn ingest_with_no_markers_decodes_each_token_independently(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
-
- let _ = classifier.ingest(model.token_bos());
- let _ = classifier.ingest(model.token_eos());
-
- assert_eq!(classifier.usage().undeterminable_tokens, 2);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
- let usage_before = *classifier.usage();
-
- classifier.ingest_prompt_token(model.token_bos());
- classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]);
-
- assert_eq!(*classifier.usage(), usage_before);
- assert_eq!(classifier.current_section(), SampledTokenSection::Pending);
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
- let mut batch = LlamaBatch::new(8, 1)?;
-
- classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
- classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), 2);
- assert_eq!(batch.n_tokens(), 2);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
- let mut batch = LlamaBatch::new(8, 1)?;
-
- let tokens = vec![model.token_bos(), model.token_eos(), model.token_nl()];
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), 3);
- assert_eq!(batch.n_tokens(), 3);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
- let mut batch = LlamaBatch::new(8, 1)?;
-
- classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
- classifier.feed_prompt_to_batch(&mut batch, model.token_eos(), 1, &[0], false)?;
-
- let promoted = classifier.commit_prompt_tokens();
-
- assert_eq!(promoted, 2);
- assert_eq!(classifier.pending_prompt_tokens(), 0);
- assert_eq!(classifier.usage().prompt_tokens, 2);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let model = fixture.model;
- let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
- let mut batch = LlamaBatch::new(8, 1)?;
-
- classifier.feed_prompt_to_batch(&mut batch, model.token_bos(), 0, &[0], false)?;
-
- let discarded = classifier.discard_pending_prompt_tokens();
-
- assert_eq!(discarded, 1);
- assert_eq!(classifier.pending_prompt_tokens(), 0);
- assert_eq!(classifier.usage().prompt_tokens, 0);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 128,
- n_ubatch = 64,
-)]
-fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?;
- let _ = left;
- let _ = right;
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampling.rs b/llama-cpp-bindings-tests/tests/sampling.rs
deleted file mode 100644
index d03e965e..00000000
--- a/llama-cpp-bindings-tests/tests/sampling.rs
+++ /dev/null
@@ -1,429 +0,0 @@
-#![expect(
- clippy::unnecessary_wraps,
- reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
-)]
-
-use anyhow::Result;
-use llama_cpp_bindings::GrammarError;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings::token::LlamaToken;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> {
- let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
- let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let breakers: Vec<&[u8]> = vec![b"hello\0world"];
- let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers);
-
- assert!(result.is_err());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
-
- assert!(sampler.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let trigger_words: Vec<&[u8]> = vec![b"function"];
- let sampler = LlamaSampler::grammar_lazy(
- fixture.model,
- "root ::= \"hello\"",
- "root",
- trigger_words,
- &[],
- );
-
- assert!(sampler.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let patterns = vec!["\\{.*".to_owned()];
- let sampler = LlamaSampler::grammar_lazy_patterns(
- fixture.model,
- "root ::= \"hello\"",
- "root",
- &patterns,
- &[],
- );
-
- assert!(sampler.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
- let trigger_words: Vec<&[u8]> = vec![b"function"];
- let result = LlamaSampler::grammar_lazy(
- fixture.model,
- "expr ::= \"hello\"",
- "root",
- trigger_words,
- &[],
- );
-
- assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"];
- let result = LlamaSampler::grammar_lazy(
- fixture.model,
- "root ::= \"hello\"",
- "root",
- trigger_words,
- &[],
- );
-
- assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_))));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_root_not_found_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let patterns = vec!["\\{.*".to_owned()];
- let result = LlamaSampler::grammar_lazy_patterns(
- fixture.model,
- "expr ::= \"hello\"",
- "root",
- &patterns,
- &[],
- );
-
- assert!(matches!(result, Err(GrammarError::RootNotFound)));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let patterns = vec!["hel\0lo".to_owned()];
- let result = LlamaSampler::grammar_lazy_patterns(
- fixture.model,
- "root ::= \"hello\"",
- "root",
- &patterns,
- &[],
- );
-
- assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_))));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let patterns = vec!["[".to_owned()];
- let result = LlamaSampler::grammar_lazy_patterns(
- fixture.model,
- "root ::= \"hello\"",
- "root",
- &patterns,
- &[],
- );
-
- assert!(matches!(
- result,
- Err(GrammarError::InvalidTriggerPattern { .. }),
- ));
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> {
- let result = LlamaSampler::logit_bias(0, &[]);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn dry_sampler_with_root_not_found_grammar_does_not_apply(
- fixture: &LlamaFixture<'_>,
-) -> Result<()> {
- let breakers: Vec<&[u8]> = vec![b"\n"];
- let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
- let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()];
-
- sampler.accept_many(&tokens)?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
- let tokens = [fixture.model.token_bos(), fixture.model.token_eos()];
-
- let _consumed = sampler.with_tokens(tokens.iter().copied())?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-
- sampler.accept(fixture.model.token_bos())?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
-
- sampler.try_accept(LlamaToken::new(0))?;
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
-
- let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?;
- let sampler = LlamaSampler::greedy();
- sampler.apply(&mut data_array);
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 2048,
- n_ubatch = 512,
-)]
-fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> {
- let mut context = LlamaContext::from_model(
- fixture.model,
- fixture.backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
- let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
- let mut batch = LlamaBatch::new(512, 1)?;
- batch.add_sequence(&tokens, 0, false)?;
- context.decode(&mut batch)?;
- let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
- let result = sampler.sample(&context, batch.n_tokens() - 1);
-
- assert!(result.is_ok());
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
new file mode 100644
index 00000000..dc9395aa
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/sampling_and_constrained_decoding.rs
@@ -0,0 +1,2518 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_sampling {
+ use anyhow::Result;
+ use llama_cpp_bindings::SampledToken;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::json_schema_to_grammar;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 256,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+    fn sample_returns_result_and_succeeds_with_valid_index(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // Decodes the prompt "Hello" and checks that sampling at the last batch index returns Ok.
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let tokens = model.str_to_token("Hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+
+        batch.add_sequence(&tokens, 0, false)?;
+
+        context.decode(&mut batch)?; // sampling below reads from this decoded context
+
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+
+        let result = sampler.sample(&context, batch.n_tokens() - 1); // n_tokens() - 1: index of the last batch entry
+
+        assert!(result.is_ok()); // only success is checked; the sampled token value is not inspected
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { // Samples one token under a yes/no grammar and checks it starts with 'y' or 'n'.
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
+        let tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+
+        batch.add_sequence(&tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        let mut sampler = LlamaSampler::chain_simple([ // grammar first, then temperature, then greedy
+            LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        let mut classifier = model.sampled_token_classifier();
+        let (raw_token, mut outcomes) =
+            classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
+        outcomes.extend(classifier.flush()); // append whatever flush() yields so the length check below covers it
+
+        assert_eq!(
+            outcomes.len(),
+            1,
+            "expected one finalised outcome after flush"
+        );
+        let outcome = &outcomes[0]; // safe: the length was asserted to be exactly 1 above
+
+        let raw_as_sampled = SampledToken::Content(raw_token); // wrapped so it can be passed to is_eog_token
+        assert!(
+            !model.is_eog_token(&raw_as_sampled),
+            "Grammar sampler should not allow EOS as first token"
+        );
+
+        let piece = &outcome.raw_piece;
+        let first_char = piece // first character of the piece, lowercased for a case-insensitive check
+            .chars()
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))?
+            .to_lowercase()
+            .next()
+            .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?;
+
+        assert!(
+            first_char == 'y' || first_char == 'n',
+            "Grammar should constrain first token to start with y/n, got: '{piece}'"
+        );
+        assert_eq!(
+            classifier.usage().completion_tokens(),
+            1,
+            "exactly one completion token sampled"
+        );
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn json_schema_grammar_sampler_constrains_output_to_json(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // Samples one token under a schema-derived grammar and checks the piece starts with '{'.
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
+        let tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+
+        batch.add_sequence(&tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        let grammar_str = json_schema_to_grammar( // schema: object with one required string property "answer"
+            r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#,
+        )?;
+
+        let mut sampler = LlamaSampler::chain_simple([ // grammar first, then temperature, then greedy
+            LlamaSampler::grammar(model, &grammar_str, "root")?,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        let mut classifier = model.sampled_token_classifier();
+        let (raw_token, mut outcomes) =
+            classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?;
+        outcomes.extend(classifier.flush()); // append whatever flush() yields so the length check below covers it
+
+        assert_eq!(
+            outcomes.len(),
+            1,
+            "expected one finalised outcome after flush"
+        );
+        let outcome = &outcomes[0]; // safe: the length was asserted to be exactly 1 above
+
+        let raw_as_sampled = SampledToken::Content(raw_token); // wrapped so it can be passed to is_eog_token
+        assert!(
+            !model.is_eog_token(&raw_as_sampled),
+            "Grammar sampler should not allow EOS as first token"
+        );
+
+        let piece = &outcome.raw_piece;
+
+        assert!(
+            piece.starts_with('{'),
+            "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'"
+        );
+        assert_eq!(
+            classifier.usage().completion_tokens(),
+            1,
+            "exactly one completion token sampled"
+        );
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn sample_with_grammar_produces_constrained_output_in_loop(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // Runs ClassifySampleLoop under a "yes" | "no" grammar and checks output text and usage counters.
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
+        let tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; // prompt goes into the batch via the classifier here
+
+        context.decode(&mut batch)?;
+        classifier.commit_prompt_tokens(); // called only once decode has succeeded; return value unused in this test
+
+        let mut sampler = LlamaSampler::chain_simple([
+            LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?,
+            LlamaSampler::temp(0.8),
+            LlamaSampler::greedy(),
+        ]);
+
+        let initial_position = batch.n_tokens(); // captured before the loop takes a mutable borrow of the batch
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 10, // budget; the eog_seen assertion below requires stopping before it runs out
+        }
+        .run()?;
+
+        let lowercase = outcome.generated_raw.to_lowercase();
+        assert!(
+            lowercase == "yes" || lowercase == "no",
+            "Grammar loop should produce 'yes' or 'no', got: '{}'",
+            outcome.generated_raw
+        );
+        assert!(
+            outcome.eog_seen,
+            "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}"
+        );
+        assert_eq!(outcome.observed_reasoning, 0);
+        assert_eq!(outcome.observed_undeterminable, 0);
+        assert_eq!(outcome.observed_tool_call, 0);
+        assert!(outcome.observed_content > 0);
+
+        let usage = classifier.into_usage(); // consumes the classifier; no sampling happens after this point
+        assert_eq!(usage.completion_tokens(), outcome.observed_content);
+        assert_eq!(usage.reasoning_tokens, 0);
+        assert_eq!(usage.undeterminable_tokens, 0);
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { // Hand-rolled loop of up to 5 unconstrained samples. NOTE(review): name says "multiple", assertion only requires one.
+        let model = fixture.model;
+        let mut context = LlamaContext::from_model(
+            model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let prompt =
+            "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n";
+        let tokens = model.str_to_token(prompt, AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+
+        batch.add_sequence(&tokens, 0, false)?;
+
+        context.decode(&mut batch)?;
+
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+
+        let mut classifier = model.sampled_token_classifier();
+        let mut sampled_count: u64 = 0; // counts only non-EOG samples
+
+        for (position, _) in (batch.n_tokens()..).zip(0..5) { // zip(0..5) caps the loop at 5 iterations; position counts up from n_tokens()
+            let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; // NOTE(review): -1 presumably means "last logits" — confirm against the sampler API
+            let raw_as_sampled = SampledToken::Content(raw_token);
+
+            if model.is_eog_token(&raw_as_sampled) {
+                break; // stop before counting or re-decoding an end-of-generation token
+            }
+
+            sampled_count += 1;
+
+            batch.clear(); // the batch is reused: it holds just the newly sampled token for the next decode
+            batch.add(&raw_as_sampled, position, &[0], true)?;
+
+            context.decode(&mut batch)?;
+        }
+
+        let _ = classifier.flush(); // flushed outcomes are discarded; only the usage counters are checked below
+
+        assert!(
+            sampled_count > 0,
+            "Should produce at least one token without grammar"
+        );
+        let usage = classifier.into_usage();
+        assert!(
+            usage.completion_tokens() >= sampled_count, // >= rather than ==: sampled_count excludes a sample that hit EOG
+            "completion_tokens ({}) must include the {sampled_count} non-EOG samples",
+            usage.completion_tokens()
+        );
+
+        Ok(())
+    }
+}
+
+mod sampling {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::GrammarError;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings::token::LlamaToken;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { // DRY sampler construction with plain sequence breakers must succeed.
+        let breakers: Vec<&[u8]> = vec![b"\n", b"\t"];
+        let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers);
+        assert!(result.is_ok(), "dry() should succeed for plain sequence breakers"); // the Result used to be bound to `_sampler` unchecked, so this test could never fail
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn dry_sampler_with_null_byte_in_seq_breakers_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // A sequence breaker with an interior NUL byte must make dry() return Err.
+        let breakers: Vec<&[u8]> = vec![b"hello\0world"]; // NUL sits in the middle of the breaker
+        let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers);
+
+        assert!(result.is_err()); // any Err passes; the specific error variant is not checked
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { // A grammar that defines the requested "root" rule must build successfully.
+        let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root");
+
+        assert!(sampler.is_ok()); // construction only; the sampler is never applied
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // grammar_lazy with a valid grammar and one trigger word must build successfully.
+        let trigger_words: Vec<&[u8]> = vec![b"function"];
+        let sampler = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            trigger_words,
+            &[], // last argument left empty — NOTE(review): presumably trigger tokens; confirm against the API
+        );
+
+        assert!(sampler.is_ok()); // construction only; the sampler is never applied
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // grammar_lazy_patterns with a valid grammar and one trigger pattern must build successfully.
+        let patterns = vec!["\\{.*".to_owned()]; // the pattern text is \{.* (the backslash is escaped in the Rust literal)
+        let sampler = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            &patterns,
+            &[], // last argument left empty — NOTE(review): presumably trigger tokens; confirm against the API
+        );
+
+        assert!(sampler.is_ok()); // construction only; the sampler is never applied
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { // Requesting root "root" from a grammar that only defines "expr" must yield RootNotFound.
+        let trigger_words: Vec<&[u8]> = vec![b"function"];
+        let result = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "expr ::= \"hello\"", // defines only the rule `expr`
+            "root", // ...but `root` is requested as the start rule
+            trigger_words,
+            &[],
+        );
+
+        assert!(matches!(result, Err(GrammarError::RootNotFound)));
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_with_null_byte_in_trigger_word_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // A trigger word with an interior NUL byte must yield TriggerWordNullBytes.
+        let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; // NUL sits in the middle of the word
+        let result = LlamaSampler::grammar_lazy(
+            fixture.model,
+            "root ::= \"hello\"", // the grammar itself is valid, so the trigger word is the only bad input
+            "root",
+            trigger_words,
+            &[],
+        );
+
+        assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_))));
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_patterns_with_root_not_found_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // Requesting root "root" from a grammar that only defines "expr" must yield RootNotFound.
+        let patterns = vec!["\\{.*".to_owned()];
+        let result = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "expr ::= \"hello\"", // defines only the rule `expr`
+            "root", // ...but `root` is requested as the start rule
+            &patterns,
+            &[],
+        );
+
+        assert!(matches!(result, Err(GrammarError::RootNotFound)));
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // A trigger pattern with an interior NUL byte must be rejected.
+        let patterns = vec!["hel\0lo".to_owned()]; // NUL sits in the middle of the pattern
+        let result = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"",
+            "root",
+            &patterns,
+            &[],
+        );
+
+        assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); // note: the expected variant is GrammarNullBytes even though the NUL is in the pattern
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // A syntactically invalid trigger regex must yield InvalidTriggerPattern.
+        let patterns = vec!["[".to_owned()]; // unterminated character class
+        let result = LlamaSampler::grammar_lazy_patterns(
+            fixture.model,
+            "root ::= \"hello\"", // the grammar itself is valid, so the pattern is the only bad input
+            "root",
+            &patterns,
+            &[],
+        );
+
+        assert!(matches!(
+            result,
+            Err(GrammarError::InvalidTriggerPattern { .. }), // the variant's fields are not inspected
+        ));
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { // LlamaSampler::llguidance must build a sampler from a "regex" kind and the pattern yes|no.
+        let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no");
+
+        assert!(result.is_ok()); // construction only; the sampler is never applied
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { // logit_bias must accept an empty bias slice; the fixture is not used.
+        let result = LlamaSampler::logit_bias(0, &[]); // first argument is 0 — NOTE(review): presumably the vocabulary size; confirm
+
+        assert!(result.is_ok());
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn dry_sampler_with_root_not_found_grammar_does_not_apply(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // NOTE(review): the name mentions a grammar, but the body only constructs a DRY sampler — confirm intent.
+        let breakers: Vec<&[u8]> = vec![b"\n"];
+        let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); // NOTE(review): the returned value is never checked, so the test passes as long as this call returns
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { // accept_many must take a borrowed token collection and return Ok.
+        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+        let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()]; // the model's BOS and EOS tokens serve as the input
+
+        sampler.accept_many(&tokens)?; // passed by reference; an Err fails the test through `?`
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn with_tokens_returns_self_after_accepting_each_token(
+        fixture: &LlamaFixture<'_>,
+    ) -> Result<()> { // with_tokens must accept an iterator of owned tokens and return Ok.
+        let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); // not `mut`: with_tokens takes the sampler by value
+        let tokens = [fixture.model.token_bos(), fixture.model.token_eos()];
+
+        let _consumed = sampler.with_tokens(tokens.iter().copied())?; // the returned value is not inspected further
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { // accept must return Ok when fed the model's BOS token.
+        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+
+        sampler.accept(fixture.model.token_bos())?; // an Err fails the test through `?`
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { // try_accept must return Ok for token id 0; the fixture is not used.
+        let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]);
+
+        sampler.try_accept(LlamaToken::new(0))?; // the token is built from the raw id 0 rather than taken from the model
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { // Decodes "Hi", then applies a greedy sampler to the token data array of the last batch index.
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+
+        let mut data_array = context.token_data_array_ith(batch.n_tokens() - 1)?; // n_tokens() - 1: index of the last batch entry
+        let sampler = LlamaSampler::greedy();
+        sampler.apply(&mut data_array); // no assertion follows: the test only checks that apply() returns
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 2048,
+ n_ubatch = 512,
+ )]
+    fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { // Decodes "Hello" and checks that sampling at the last batch index returns Ok.
+        let mut context = LlamaContext::from_model(
+            fixture.model,
+            fixture.backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+        let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?;
+        let mut batch = LlamaBatch::new(512, 1)?;
+        batch.add_sequence(&tokens, 0, false)?;
+        context.decode(&mut batch)?;
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]);
+        let result = sampler.sample(&context, batch.n_tokens() - 1); // n_tokens() - 1: index of the last batch entry
+
+        assert!(result.is_ok()); // only success is checked; the sampled token value is not inspected
+
+        Ok(())
+    }
+}
+
+mod text_generation {
+ use std::io::Write;
+ use std::time::Duration;
+
+ use anyhow::Context as _;
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::ggml_time_us;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::model::LlamaChatMessage;
+ use llama_cpp_bindings::sampled_token::SampledToken;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { // Raw-prompt generation: checks prompt-token accounting around decode, logs throughput, then checks per-category usage.
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mut ctx = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )
+        .with_context(|| "unable to create context")?;
+
+        let prompt = "Hello my name is";
+        let max_generated_tokens: i32 = 64;
+
+        let mut classifier = model.sampled_token_classifier();
+        let tokens_list = model
+            .str_to_token(prompt, AddBos::Always)
+            .with_context(|| format!("failed to tokenize {prompt}"))?;
+        let prompt_token_count = u64::try_from(tokens_list.len())?; // u64 so it compares directly with the classifier counters below
+
+        let mut decoder = encoding_rs::UTF_8.new_decoder(); // a single decoder is reused for every prompt piece
+
+        for token in &tokens_list { // echo the tokenized prompt to stderr, piece by piece
+            eprint!(
+                "{}",
+                model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?
+            );
+        }
+        std::io::stderr().flush()?;
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); // fed tokens are pending...
+        assert_eq!(classifier.usage().prompt_tokens, 0); // ...and not yet counted in usage before commit
+
+        ctx.decode(&mut batch)
+            .with_context(|| "llama_decode() failed")?;
+
+        let promoted = classifier.commit_prompt_tokens(); // commit happens only after decode succeeded
+        assert_eq!(promoted, prompt_token_count);
+        assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
+
+        let mut sampler =
+            LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
+        let initial_position = batch.n_tokens(); // captured before the loop takes a mutable borrow of the batch
+        let t_main_start = ggml_time_us();
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut ctx,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens,
+        }
+        .run()?;
+        let t_main_end = ggml_time_us();
+        let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?); // try_from returns Err instead of wrapping if the delta does not fit in u64
+        let total_observed = // observed_tool_call is not part of this sum
+            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+
+        #[expect(
+            clippy::cast_precision_loss,
+            reason = "logged throughput tolerates f32 precision"
+        )]
+        let tokens_per_second = total_observed as f32 / duration.as_secs_f32(); // used only in the log line below
+
+        eprintln!(
+            "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
+            duration.as_secs_f32(),
+        );
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "model should generate at least one token"
+        );
+        assert_eq!(
+            outcome.observed_tool_call, 0,
+            "raw prompt without tool-call markers must not produce ToolCall tokens; \
+             outcome={outcome:?}"
+        );
+        assert!(
+            total_observed > 0,
+            "model must produce at least one classified token; outcome={outcome:?}"
+        );
+
+        let usage = classifier.into_usage(); // consumes the classifier; the checks below compare its totals with the loop's tallies
+        assert_eq!(
+            usage.prompt_tokens, prompt_token_count,
+            "prompt_tokens must equal the tokenizer's prompt length"
+        );
+        assert_eq!(
+            usage.content_tokens, outcome.observed_content,
+            "content_tokens must equal observed Content variants"
+        );
+        assert_eq!(
+            usage.reasoning_tokens, outcome.observed_reasoning,
+            "reasoning_tokens must equal observed Reasoning variants"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, outcome.observed_undeterminable,
+            "undeterminable_tokens must equal observed Undeterminable variants"
+        );
+        assert_eq!(
+            usage.tool_call_tokens, outcome.observed_tool_call,
+            "tool_call_tokens must equal observed ToolCall variants"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            total_observed,
+            "completion_tokens must equal Content + Reasoning + Undeterminable"
+        );
+
+        Ok(())
+    }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+    fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { // Chat-template generation: renders one user message, runs ClassifySampleLoop, then checks per-category usage.
+        let model = fixture.model;
+        let backend = fixture.backend;
+        let mut context = LlamaContext::from_model(
+            model,
+            backend,
+            (*fixture.context_params).into_llama_context_params(),
+        )?;
+
+        let chat_template = model.chat_template(None)?; // None: no template name is given
+        let messages = vec![LlamaChatMessage::new(
+            "user".to_string(),
+            "Hello! How are you?".to_string(),
+        )?];
+        let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
+
+        let mut classifier = model.sampled_token_classifier();
+        let tokens = model.str_to_token(&prompt, AddBos::Always)?; // NOTE(review): BOS is always added on top of the rendered template — confirm the template does not already emit one
+        let prompt_token_count = u64::try_from(tokens.len())?; // u64 so it compares directly with the classifier counters below
+
+        let mut batch = LlamaBatch::new(512, 1)?;
+        classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
+
+        assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count); // fed tokens are pending...
+        assert_eq!(classifier.usage().prompt_tokens, 0); // ...and not yet counted in usage before commit
+
+        context.decode(&mut batch)?;
+
+        let promoted = classifier.commit_prompt_tokens(); // commit happens only after decode succeeded
+        assert_eq!(promoted, prompt_token_count);
+
+        let mut sampler = LlamaSampler::greedy(); // a bare greedy sampler, not wrapped in a chain
+        let initial_position = batch.n_tokens(); // captured before the loop takes a mutable borrow of the batch
+        let outcome = ClassifySampleLoop {
+            model,
+            classifier: &mut classifier,
+            sampler: &mut sampler,
+            context: &mut context,
+            batch: &mut batch,
+            initial_position,
+            max_generated_tokens: 1024,
+        }
+        .run()?;
+
+        println!(); // NOTE(review): writes a bare newline to stdout, whereas the sibling raw-prompt test logs to stderr
+
+        assert!(
+            !outcome.generated_raw.is_empty(),
+            "model should generate at least one token"
+        );
+        let total_observed = // observed_tool_call is not part of this sum
+            outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
+        assert!(
+            total_observed > 0,
+            "model must produce at least one classified token; outcome={outcome:?}"
+        );
+        assert_eq!(
+            outcome.observed_tool_call, 0,
+            "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
+        );
+
+        let usage = classifier.into_usage(); // consumes the classifier; the checks below compare its totals with the loop's tallies
+
+        assert_eq!(
+            usage.prompt_tokens, prompt_token_count,
+            "prompt_tokens must equal the tokenizer's prompt length"
+        );
+        assert_eq!(
+            usage.content_tokens, outcome.observed_content,
+            "content_tokens must equal observed Content variants"
+        );
+        assert_eq!(
+            usage.reasoning_tokens, outcome.observed_reasoning,
+            "reasoning_tokens must equal observed Reasoning variants"
+        );
+        assert_eq!(
+            usage.undeterminable_tokens, outcome.observed_undeterminable,
+            "undeterminable_tokens must equal observed Undeterminable variants"
+        );
+        assert_eq!(
+            usage.completion_tokens(),
+            total_observed,
+            "completion_tokens must equal Content + Reasoning + Undeterminable"
+        );
+        assert_eq!(
+            usage.tool_call_tokens, outcome.observed_tool_call,
+            "tool_call_tokens must equal observed ToolCall variants"
+        );
+
+        Ok(())
+    }
+}
+
+mod constrained_decoding {
+ use std::io::Write;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::sampled_token::SampledToken;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // End-to-end check of JSON-schema-constrained generation: decode a short
+ // extraction prompt, sample through an llguidance + greedy chain, then parse
+ // the collected text and verify both required keys are present.
+ //
+ // Errors from context creation, tokenization, decoding, sampling, stdout
+ // flushing, or JSON parsing are propagated to the harness via `?`.
+ fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let backend = fixture.backend;
+
+     let prompt = "The weather in Paris is sunny and 22 degrees. Extract as JSON:\n";
+
+     let mut ctx = LlamaContext::from_model(
+         model,
+         backend,
+         (*fixture.context_params).into_llama_context_params(),
+     )?;
+
+     let tokens_list = model.str_to_token(prompt, AddBos::Always)?;
+
+     let mut batch = LlamaBatch::new(512, 1)?;
+     let last_index = i32::try_from(tokens_list.len())? - 1;
+
+     // The final argument is true only for the last prompt token (presumably the
+     // "produce logits" switch -- confirm against LlamaBatch::add).
+     for (index, token) in (0_i32..).zip(&tokens_list) {
+         batch.add(
+             &SampledToken::Content(*token),
+             index,
+             &[0],
+             index == last_index,
+         )?;
+     }
+
+     ctx.decode(&mut batch)?;
+
+     let schema = r#"{
+         "type": "object",
+         "properties": {
+             "city": { "type": "string" },
+             "temperature": { "type": "number" }
+         },
+         "required": ["city", "temperature"]
+     }"#;
+
+     let llg_sampler = LlamaSampler::llguidance(model, "json", schema)?;
+     let mut sampler = LlamaSampler::chain_simple([llg_sampler, LlamaSampler::greedy()]);
+
+     let mut n_cur = batch.n_tokens();
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+     let mut generated = String::new();
+
+     // Generate until an end-of-generation token or until position 128 is passed.
+     while n_cur <= 128 {
+         let token = SampledToken::Content(sampler.sample(&ctx, batch.n_tokens() - 1)?);
+
+         if model.is_eog_token(&token) {
+             break;
+         }
+
+         let output_string = model.token_to_piece(&token, &mut decoder, true, None)?;
+         generated.push_str(&output_string);
+         print!("{output_string}");
+         std::io::stdout().flush()?;
+
+         // Feed the sampled token back as a single-token batch for the next step.
+         batch.clear();
+         batch.add(&token, n_cur, &[0], true)?;
+         n_cur += 1;
+         ctx.decode(&mut batch)?;
+     }
+
+     println!();
+
+     // Parse only the first JSON value in the stream so trailing text after the
+     // object does not fail the test. The outer `?` covers "no value at all",
+     // the inner `?` covers a JSON syntax error.
+     let parsed = serde_json::Deserializer::from_str(&generated)
+         .into_iter::<serde_json::Value>()
+         .next()
+         .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??;
+
+     assert!(
+         parsed.get("city").is_some(),
+         "constrained output is missing \"city\": {generated}"
+     );
+     assert!(
+         parsed.get("temperature").is_some(),
+         "constrained output is missing \"temperature\": {generated}"
+     );
+
+     Ok(())
+ }
+}
+
+mod llguidance {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use std::ffi::CStr;
+ use std::sync::Arc;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::LlamaContext;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::llguidance_sampler::create_llg_sampler;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_bindings::sampling::LlamaSampler;
+ use llama_cpp_bindings::token::LlamaToken;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ // JSON schema for an object whose only declared (and required) property is the
+ // string `answer`.
+ const JSON_SCHEMA: &str =
+ r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#;
+ // Regex alternation of the literals `yes` and `no`.
+ const REGEX_GRAMMAR: &str = r"yes|no";
+ // Lark grammar whose `start` rule is either the literal "yes" or the literal "no".
+ const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Building an llguidance sampler from JSON_SCHEMA must succeed and expose a
+ // non-null raw sampler pointer.
+ fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let json_sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?;
+     let raw_handle = json_sampler.sampler;
+
+     assert!(!raw_handle.is_null());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Building an llguidance sampler from REGEX_GRAMMAR must succeed and expose a
+ // non-null raw sampler pointer.
+ fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let regex_sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+     let raw_handle = regex_sampler.sampler;
+
+     assert!(!raw_handle.is_null());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Building an llguidance sampler from LARK_GRAMMAR must succeed and expose a
+ // non-null raw sampler pointer.
+ fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let lark_sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?;
+     let raw_handle = lark_sampler.sampler;
+
+     assert!(!raw_handle.is_null());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // A grammar kind string that is not recognised must produce Err instead of a sampler.
+ fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let creation = create_llg_sampler(fixture.model, "not_a_real_kind", "anything");
+     let was_rejected = creation.is_err();
+
+     assert!(was_rejected);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Text that is not valid JSON must be rejected when used as a "json" grammar.
+ fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let creation = create_llg_sampler(fixture.model, "json", "{this is not valid json");
+     let was_rejected = creation.is_err();
+
+     assert!(was_rejected);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // An unterminated character class must be rejected when used as a "regex" grammar.
+ fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let creation = create_llg_sampler(fixture.model, "regex", "[invalid");
+     let was_rejected = creation.is_err();
+
+     assert!(was_rejected);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // The sampler name read back through the C API must be exactly "llguidance".
+ fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let llg = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+     // SAFETY (assumed): `llg.sampler` comes from a successful create_llg_sampler
+     // call and `llg` is still alive here -- confirm this meets the C API contract.
+     let raw_name = unsafe { llama_cpp_bindings_sys::llama_sampler_name(llg.sampler) };
+     assert!(!raw_name.is_null());
+
+     // SAFETY (assumed): non-null was checked just above; presumably the C side
+     // returns a NUL-terminated string that stays valid for this borrow -- verify.
+     let c_name = unsafe { CStr::from_ptr(raw_name) };
+     let name = c_name.to_str()?;
+
+     assert_eq!(name, "llguidance");
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Cloning the raw sampler through the C API must return a non-null pointer,
+ // which this test then frees itself.
+ fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let original = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+     // SAFETY (assumed): `original.sampler` comes from a successful
+     // create_llg_sampler call and `original` is still alive -- confirm.
+     let duplicate = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(original.sampler) };
+
+     assert!(!duplicate.is_null());
+
+     // SAFETY (assumed): `duplicate` was checked non-null and is freed exactly once
+     // here; presumably the caller owns a cloned sampler -- verify.
+     unsafe { llama_cpp_bindings_sys::llama_sampler_free(duplicate) };
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Decodes a short prompt, then draws one token through an llguidance + greedy
+ // chain and feeds it back via `accept`; every step must return Ok.
+ fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let context_params = (*fixture.context_params).into_llama_context_params();
+     let mut ctx = LlamaContext::from_model(model, fixture.backend, context_params)?;
+
+     let prompt_tokens = model.str_to_token("Answer yes or no:", AddBos::Always)?;
+     let mut batch = LlamaBatch::new(512, 1)?;
+     batch.add_sequence(&prompt_tokens, 0, false)?;
+     ctx.decode(&mut batch)?;
+
+     let constrained = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+     let mut chain = LlamaSampler::chain_simple([constrained, LlamaSampler::greedy()]);
+
+     // Sample from the last row of the decoded batch.
+     let last_row = batch.n_tokens() - 1;
+     let picked = chain.sample(&ctx, last_row)?;
+     chain.accept(picked)?;
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Passing a very large token id (i32::MAX - 1) to `accept` must not panic; the
+ // returned value is intentionally ignored.
+ fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let mut llg = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+     let oversized = LlamaToken(i32::MAX - 1);
+     let _ = llg.accept(oversized);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Two consecutive `approximate_tok_env` calls must hand back Arcs that point at
+ // the same allocation.
+ fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let earlier = model.approximate_tok_env();
+     let later = model.approximate_tok_env();
+     let shares_allocation = Arc::ptr_eq(&earlier, &later);
+
+     assert!(shares_allocation);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Creating two samplers from the same model and grammar, both alive at once,
+ // must give each a non-null raw pointer.
+ fn approximate_tok_env_drives_consistent_grammar_constraint(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let sampler_a = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+     let sampler_b = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+
+     assert!(!sampler_a.sampler.is_null());
+     assert!(!sampler_b.sampler.is_null());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Sampling once through an llguidance + greedy chain after decoding a prompt
+ // must not panic; the sample result itself is ignored.
+ fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let context_params = (*fixture.context_params).into_llama_context_params();
+     let mut ctx = LlamaContext::from_model(model, fixture.backend, context_params)?;
+
+     let prompt_tokens = model.str_to_token("Answer:", AddBos::Always)?;
+     let mut batch = LlamaBatch::new(512, 1)?;
+     batch.add_sequence(&prompt_tokens, 0, false)?;
+     ctx.decode(&mut batch)?;
+
+     let constrained = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?;
+     let mut chain = LlamaSampler::chain_simple([constrained, LlamaSampler::greedy()]);
+
+     // Sample from the last row of the decoded batch; Ok and Err are both fine.
+     let last_row = batch.n_tokens() - 1;
+     let _ = chain.sample(&ctx, last_row);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 512,
+ n_ubatch = 128,
+ )]
+ // Smoke test for `reset`: accept a very large token id (i32::MAX - 1), reset
+ // the sampler, then accept token 0. The test passes as long as none of these
+ // calls panics; sampler creation errors are propagated via `?`.
+ fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?;
+
+     let huge_token = LlamaToken(i32::MAX - 1);
+     let _ = sampler.accept(huge_token);
+
+     sampler.reset();
+
+     // After reset, `accept` may return Ok or Err; only a panic is a failure.
+     // The previous `is_ok() || is_err()` assertion could never fail, so the
+     // result is discarded explicitly instead of being asserted on.
+     let _ = sampler.accept(LlamaToken(0));
+
+     Ok(())
+ }
+}
+
+mod sampled_token_classifier_markers {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::SampledToken;
+ use llama_cpp_bindings::llama_batch::LlamaBatch;
+ use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier;
+ use llama_cpp_bindings::sampled_token_section::SampledTokenSection;
+ use llama_cpp_bindings::streaming_markers::StreamingMarkers;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // A classifier freshly obtained from the model must report the Pending section.
+ fn classifier_starts_in_pending_section_for_default_fixture(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let fresh = fixture.model.sampled_token_classifier();
+     let section = fresh.current_section();
+
+     assert_eq!(section, SampledTokenSection::Pending);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Two classifiers built back to back from the same model must agree on both
+ // their current section and their usage counters.
+ fn classifier_construction_is_idempotent_across_calls(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let earlier = model.sampled_token_classifier();
+     let later = model.sampled_token_classifier();
+
+     assert_eq!(earlier.current_section(), later.current_section());
+     assert_eq!(earlier.usage(), later.usage());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // With default (empty) streaming markers, ingesting one token must yield exactly
+ // one Undeterminable outcome whose visible and raw pieces match, and must bump
+ // the undeterminable counter to 1.
+ fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+
+     let emitted = classifier.ingest(model.token_bos());
+     assert_eq!(emitted.len(), 1);
+
+     let only = &emitted[0];
+     let is_undeterminable = matches!(only.sampled_token, SampledToken::Undeterminable(_));
+     assert!(is_undeterminable);
+     assert_eq!(only.visible_piece, only.raw_piece);
+
+     let usage = classifier.usage();
+     assert_eq!(usage.undeterminable_tokens, 1);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // With default streaming markers, each ingested token must be counted on its
+ // own: two ingests give an undeterminable count of 2.
+ fn ingest_with_no_markers_decodes_each_token_independently(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+
+     // The per-token outcomes are not inspected here, only the running counter.
+     for token in [model.token_bos(), model.token_eos()] {
+         let _ = classifier.ingest(token);
+     }
+
+     assert_eq!(classifier.usage().undeterminable_tokens, 2);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // With default streaming markers, ingesting prompt tokens (singly or as a
+ // slice) must leave the usage counters and the Pending section unchanged.
+ fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+     let baseline = *classifier.usage();
+
+     classifier.ingest_prompt_token(model.token_bos());
+     classifier.ingest_prompt_tokens(&[model.token_eos(), model.token_nl()]);
+
+     let section_after = classifier.current_section();
+     assert_eq!(*classifier.usage(), baseline);
+     assert_eq!(section_after, SampledTokenSection::Pending);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Feeding two prompt tokens one at a time must leave two pending prompt tokens
+ // on the classifier and two tokens in the batch.
+ fn feed_prompt_to_batch_increments_pending_prompt_tokens(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+     let mut batch = LlamaBatch::new(8, 1)?;
+
+     let bos = model.token_bos();
+     let eos = model.token_eos();
+     classifier.feed_prompt_to_batch(&mut batch, bos, 0, &[0], false)?;
+     classifier.feed_prompt_to_batch(&mut batch, eos, 1, &[0], false)?;
+
+     let pending = classifier.pending_prompt_tokens();
+     let staged = batch.n_tokens();
+     assert_eq!(pending, 2);
+     assert_eq!(staged, 2);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Feeding a three-token sequence in one call must leave three pending prompt
+ // tokens on the classifier and three tokens in the batch.
+ fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+     let mut batch = LlamaBatch::new(8, 1)?;
+
+     let sequence = vec![model.token_bos(), model.token_eos(), model.token_nl()];
+     classifier.feed_prompt_sequence_to_batch(&mut batch, &sequence, 0, false)?;
+
+     let pending = classifier.pending_prompt_tokens();
+     let staged = batch.n_tokens();
+     assert_eq!(pending, 3);
+     assert_eq!(staged, 3);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Stages two prompt tokens, commits them, and checks that `commit_prompt_tokens` returns
+ // the staged count, resets the pending counter, and records the count in usage.
+ fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+     let mut batch = LlamaBatch::new(8, 1)?;
+
+     let staged = [(model.token_bos(), 0), (model.token_eos(), 1)];
+     for (token, position) in staged {
+         classifier.feed_prompt_to_batch(&mut batch, token, position, &[0], false)?;
+     }
+
+     let committed = classifier.commit_prompt_tokens();
+     assert_eq!(committed, 2);
+
+     let still_pending = classifier.pending_prompt_tokens();
+     assert_eq!(still_pending, 0);
+
+     let recorded_prompt_tokens = classifier.usage().prompt_tokens;
+     assert_eq!(recorded_prompt_tokens, 2);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Stages one prompt token, discards it, and checks that `discard_pending_prompt_tokens`
+ // returns the staged count, resets the pending counter, and leaves usage at zero.
+ fn discard_pending_prompt_tokens_clears_count_without_recording_usage(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default());
+     let mut batch = LlamaBatch::new(8, 1)?;
+
+     let bos = model.token_bos();
+     classifier.feed_prompt_to_batch(&mut batch, bos, 0, &[0], false)?;
+
+     let dropped = classifier.discard_pending_prompt_tokens();
+     assert_eq!(dropped, 1);
+
+     let still_pending = classifier.pending_prompt_tokens();
+     assert_eq!(still_pending, 0);
+
+     let recorded_prompt_tokens = classifier.usage().prompt_tokens;
+     assert_eq!(recorded_prompt_tokens, 0);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Smoke test: only checks that `diagnose_tool_call_synthetic_renders` succeeds and yields
+ // a two-element tuple; the contents of the two renders are not inspected.
+ fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let (_left, _right) = fixture.model.diagnose_tool_call_synthetic_renders()?;
+
+     Ok(())
+ }
+}
+
+llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/text_generation.rs b/llama-cpp-bindings-tests/tests/text_generation.rs
deleted file mode 100644
index 57fd54d7..00000000
--- a/llama-cpp-bindings-tests/tests/text_generation.rs
+++ /dev/null
@@ -1,298 +0,0 @@
-use std::io::Write;
-use std::time::Duration;
-
-use anyhow::Context as _;
-use anyhow::Result;
-use llama_cpp_bindings::context::LlamaContext;
-use llama_cpp_bindings::ggml_time_us;
-use llama_cpp_bindings::llama_batch::LlamaBatch;
-use llama_cpp_bindings::model::AddBos;
-use llama_cpp_bindings::model::LlamaChatMessage;
-use llama_cpp_bindings::sampled_token::SampledToken;
-use llama_cpp_bindings::sampling::LlamaSampler;
-use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop;
-use llama_cpp_test_harness::LlamaFixture;
-use llama_cpp_test_harness::llama_test;
-use llama_cpp_test_harness::llama_tests_main;
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 512,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mut ctx = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )
- .with_context(|| "unable to create context")?;
-
- let prompt = "Hello my name is";
- let max_generated_tokens: i32 = 64;
-
- let mut classifier = model.sampled_token_classifier();
- let tokens_list = model
- .str_to_token(prompt, AddBos::Always)
- .with_context(|| format!("failed to tokenize {prompt}"))?;
- let prompt_token_count = u64::try_from(tokens_list.len())?;
-
- let mut decoder = encoding_rs::UTF_8.new_decoder();
-
- for token in &tokens_list {
- eprint!(
- "{}",
- model.token_to_piece(&SampledToken::Content(*token), &mut decoder, true, None)?
- );
- }
- std::io::stderr().flush()?;
-
- let mut batch = LlamaBatch::new(512, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens_list, 0, false)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
- assert_eq!(classifier.usage().prompt_tokens, 0);
-
- ctx.decode(&mut batch)
- .with_context(|| "llama_decode() failed")?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
- assert_eq!(classifier.usage().prompt_tokens, prompt_token_count);
-
- let mut sampler =
- LlamaSampler::chain_simple([LlamaSampler::dist(1234), LlamaSampler::greedy()]);
- let initial_position = batch.n_tokens();
- let t_main_start = ggml_time_us();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut ctx,
- batch: &mut batch,
- initial_position,
- max_generated_tokens,
- }
- .run()?;
- let t_main_end = ggml_time_us();
- let duration = Duration::from_micros(u64::try_from(t_main_end - t_main_start)?);
- let total_observed =
- outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
-
- #[expect(
- clippy::cast_precision_loss,
- reason = "logged throughput tolerates f32 precision"
- )]
- let tokens_per_second = total_observed as f32 / duration.as_secs_f32();
-
- eprintln!(
- "\ndecoded {total_observed} tokens in {:.2} s, speed {tokens_per_second:.2} t/s",
- duration.as_secs_f32(),
- );
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "model should generate at least one token"
- );
- assert_eq!(
- outcome.observed_tool_call, 0,
- "raw prompt without tool-call markers must not produce ToolCall tokens; \
- outcome={outcome:?}"
- );
- assert!(
- total_observed > 0,
- "model must produce at least one classified token; outcome={outcome:?}"
- );
-
- let usage = classifier.into_usage();
- assert_eq!(
- usage.prompt_tokens, prompt_token_count,
- "prompt_tokens must equal the tokenizer's prompt length"
- );
- assert_eq!(
- usage.content_tokens, outcome.observed_content,
- "content_tokens must equal observed Content variants"
- );
- assert_eq!(
- usage.reasoning_tokens, outcome.observed_reasoning,
- "reasoning_tokens must equal observed Reasoning variants"
- );
- assert_eq!(
- usage.undeterminable_tokens, outcome.observed_undeterminable,
- "undeterminable_tokens must equal observed Undeterminable variants"
- );
- assert_eq!(
- usage.tool_call_tokens, outcome.observed_tool_call,
- "tool_call_tokens must equal observed ToolCall variants"
- );
- assert_eq!(
- usage.completion_tokens(),
- total_observed,
- "completion_tokens must equal Content + Reasoning + Undeterminable"
- );
-
- Ok(())
-}
-
-#[llama_test(
- model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-#[llama_test(
- model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
- n_gpu_layers = 999,
- use_mmap = true,
- use_mlock = false,
- n_ctx = 2048,
- n_batch = 512,
- n_ubatch = 128,
-)]
-fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> {
- let model = fixture.model;
- let backend = fixture.backend;
- let mut context = LlamaContext::from_model(
- model,
- backend,
- (*fixture.context_params).into_llama_context_params(),
- )?;
-
- let chat_template = model.chat_template(None)?;
- let messages = vec![LlamaChatMessage::new(
- "user".to_string(),
- "Hello! How are you?".to_string(),
- )?];
- let prompt = model.apply_chat_template(&chat_template, &messages, true)?;
-
- let mut classifier = model.sampled_token_classifier();
- let tokens = model.str_to_token(&prompt, AddBos::Always)?;
- let prompt_token_count = u64::try_from(tokens.len())?;
-
- let mut batch = LlamaBatch::new(512, 1)?;
- classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?;
-
- assert_eq!(classifier.pending_prompt_tokens(), prompt_token_count);
- assert_eq!(classifier.usage().prompt_tokens, 0);
-
- context.decode(&mut batch)?;
-
- let promoted = classifier.commit_prompt_tokens();
- assert_eq!(promoted, prompt_token_count);
-
- let mut sampler = LlamaSampler::greedy();
- let initial_position = batch.n_tokens();
- let outcome = ClassifySampleLoop {
- model,
- classifier: &mut classifier,
- sampler: &mut sampler,
- context: &mut context,
- batch: &mut batch,
- initial_position,
- max_generated_tokens: 1024,
- }
- .run()?;
-
- println!();
-
- assert!(
- !outcome.generated_raw.is_empty(),
- "model should generate at least one token"
- );
- let total_observed =
- outcome.observed_content + outcome.observed_reasoning + outcome.observed_undeterminable;
- assert!(
- total_observed > 0,
- "model must produce at least one classified token; outcome={outcome:?}"
- );
- assert_eq!(
- outcome.observed_tool_call, 0,
- "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}"
- );
-
- let usage = classifier.into_usage();
-
- assert_eq!(
- usage.prompt_tokens, prompt_token_count,
- "prompt_tokens must equal the tokenizer's prompt length"
- );
- assert_eq!(
- usage.content_tokens, outcome.observed_content,
- "content_tokens must equal observed Content variants"
- );
- assert_eq!(
- usage.reasoning_tokens, outcome.observed_reasoning,
- "reasoning_tokens must equal observed Reasoning variants"
- );
- assert_eq!(
- usage.undeterminable_tokens, outcome.observed_undeterminable,
- "undeterminable_tokens must equal observed Undeterminable variants"
- );
- assert_eq!(
- usage.completion_tokens(),
- total_observed,
- "completion_tokens must equal Content + Reasoning + Undeterminable"
- );
- assert_eq!(
- usage.tool_call_tokens, outcome.observed_tool_call,
- "tool_call_tokens must equal observed ToolCall variants"
- );
-
- Ok(())
-}
-
-llama_tests_main!();
diff --git a/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
new file mode 100644
index 00000000..7b26c7ee
--- /dev/null
+++ b/llama-cpp-bindings-tests/tests/vocabulary_and_metadata.rs
@@ -0,0 +1,1978 @@
+use llama_cpp_test_harness::llama_tests_main;
+
+mod model_properties {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `n_vocab`, `n_embd`, `n_params` and `n_ctx_train` all report values
+ // greater than zero for the loaded fixture model.
+ //
+ // Errors: propagates the failure of `n_ctx_train`.
+ fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+
+     // Each message carries the observed value so a bad GGUF is diagnosable from the log,
+     // matching the message style of the other tests in this module.
+     let n_vocab = model.n_vocab();
+     assert!(n_vocab > 0, "n_vocab must be positive; got {n_vocab}");
+
+     let n_embd = model.n_embd();
+     assert!(n_embd > 0, "n_embd must be positive; got {n_embd}");
+
+     let n_params = model.n_params();
+     assert!(n_params > 0, "n_params must be positive; got {n_params}");
+
+     let n_ctx_train = model.n_ctx_train()?;
+     assert!(n_ctx_train > 0, "n_ctx_train must be positive; got {n_ctx_train}");
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `n_layer` succeeds for the fixture model and reports a value above zero.
+ fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let n_layer = fixture.model.n_layer()?;
+     // Report the observed value on failure instead of only the stringified expression.
+     assert!(n_layer > 0, "n_layer must be positive; got {n_layer}");
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `n_head` succeeds for the fixture model and reports a value above zero.
+ fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let n_head = fixture.model.n_head()?;
+     // Report the observed value on failure instead of only the stringified expression.
+     assert!(n_head > 0, "n_head must be positive; got {n_head}");
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `n_head_kv` succeeds for the fixture model and reports a value above zero.
+ fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let n_head_kv = fixture.model.n_head_kv()?;
+     // Report the observed value on failure instead of only the stringified expression.
+     assert!(n_head_kv > 0, "n_head_kv must be positive; got {n_head_kv}");
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `size` reports a value above zero for the fixture model.
+ fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let size = fixture.model.size();
+     // Report the observed value on failure instead of only the stringified expression.
+     assert!(size > 0, "model size must be non-zero; got {size}");
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that the fixture model does not report itself as recurrent. Registered for all
+ // four default fixture models via the `llama_test` attributes above.
+ fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> {
+     // Explicit message, mirroring the sibling `is_hybrid` tests in this module.
+     assert!(
+         !fixture.model.is_recurrent(),
+         "default fixture models are expected to be non-recurrent; got is_recurrent=true"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Negative `is_hybrid` check; the attributes above register it only for the DeepSeek and
+ // GLM fixtures.
+ fn is_hybrid_returns_false_for_non_hybrid_default_models(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let reported_hybrid = fixture.model.is_hybrid();
+     assert!(
+         !reported_hybrid,
+         "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true"
+     );
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Positive `is_hybrid` check; the attributes above register it only for the two Qwen
+ // fixtures.
+ fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let reported_hybrid = fixture.model.is_hybrid();
+     assert!(
+         reported_hybrid,
+         "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false"
+     );
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `rope_type` yields `Some` of one of the four listed `RopeType` variants.
+ // The attributes above register it only for the DeepSeek and GLM fixtures.
+ fn rope_type_returns_a_known_variant_for_rope_carrying_default_models(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use llama_cpp_bindings::model::rope_type::RopeType;
+
+     let rope = fixture.model.rope_type();
+     let is_known_variant = matches!(
+         rope,
+         Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision)
+     );
+     assert!(
+         is_known_variant,
+         "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}"
+     );
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `rope_type` yields `None`; the attributes above register it only for the
+ // two Qwen fixtures.
+ fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let rope = fixture.model.rope_type();
+     let rope_absent = rope.is_none();
+     assert!(
+         rope_absent,
+         "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}"
+     );
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `vocab_type` succeeds and yields either `VocabType::BPE` or `VocabType::SPM`.
+ fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     use llama_cpp_bindings::model::vocab_type::VocabType;
+
+     let vocab = fixture.model.vocab_type()?;
+     let is_known_variant = matches!(vocab, VocabType::BPE | VocabType::SPM);
+     assert!(
+         is_known_variant,
+         "vocab_type must be a known variant; got {vocab:?}"
+     );
+
+     Ok(())
+ }
+}
+
+mod model_metadata_kv {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `meta_count` reports a value above zero for the fixture model.
+ fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let count = fixture.model.meta_count();
+     // Report the observed value on failure instead of only the stringified expression.
+     assert!(count > 0, "meta_count must be positive; got {count}");
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `meta_key_by_index(0)` succeeds and returns a non-empty key.
+ fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let key = fixture.model.meta_key_by_index(0)?;
+     assert!(
+         !key.is_empty(),
+         "meta_key_by_index(0) must return a non-empty key"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `meta_val_str_by_index(0)` succeeds and returns a non-empty value.
+ fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let value = fixture.model.meta_val_str_by_index(0)?;
+     assert!(
+         !value.is_empty(),
+         "meta_val_str_by_index(0) must return a non-empty value"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `meta_key_by_index` returns an error for index 999_999, which the test
+ // treats as out of range for every fixture model.
+ fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let result = fixture.model.meta_key_by_index(999_999);
+     // On failure, show the unexpected `Ok` payload rather than a bare "assertion failed".
+     assert!(
+         result.is_err(),
+         "meta_key_by_index(999_999) must fail for an out-of-range index; got {result:?}"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Checks that `meta_val_str_by_index` returns an error for index 999_999, which the test
+ // treats as out of range for every fixture model.
+ fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let result = fixture.model.meta_val_str_by_index(999_999);
+     // On failure, show the unexpected `Ok` payload rather than a bare "assertion failed".
+     assert!(
+         result.is_err(),
+         "meta_val_str_by_index(999_999) must fail for an out-of-range index; got {result:?}"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Reads the key at metadata index 0, looks it up again through `meta_val_str`, and checks
+ // that the returned value is non-empty.
+ fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let first_key = model.meta_key_by_index(0)?;
+     let value = model.meta_val_str(&first_key)?;
+     // Name the key in the message so a failure identifies which entry was empty.
+     assert!(
+         !value.is_empty(),
+         "meta_val_str({first_key:?}) must return a non-empty value"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Reads the key and the string value of every metadata entry
+ // (indices `0..meta_count()`) and requires each lookup to succeed.
+ //
+ // NOTE(review): the name implies that some value is long enough to force a
+ // buffer resize inside the binding; nothing here checks value lengths, so
+ // confirm that against the models' metadata.
+ //
+ // The assertion messages name the offending index and include the returned
+ // error so a broken entry can be identified straight from the test log.
+ fn meta_val_str_with_long_value_triggers_buffer_resize(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+
+     for index in 0..model.meta_count() {
+         let key = model.meta_key_by_index(index);
+         let value = model.meta_val_str_by_index(index);
+
+         assert!(
+             key.is_ok(),
+             "meta_key_by_index({index}) failed: {key:?}"
+         );
+         assert!(
+             value.is_ok(),
+             "meta_val_str_by_index({index}) failed: {value:?}"
+         );
+     }
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Passes a key containing an embedded NUL byte and expects `meta_val_str` to
+ // report an error instead of returning a value.
+ fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let lookup = model.meta_val_str("key\0with_null");
+
+     assert!(lookup.is_err());
+     Ok(())
+ }
+}
+
+mod model_params {
+ #![expect(
+ clippy::similar_names,
+ reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity"
+ )]
+
+ use std::ffi::CString;
+ use std::pin::pin;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::context::params::LlamaContextParams;
+ use llama_cpp_bindings::max_devices;
+ use llama_cpp_bindings::model::params::LlamaModelParams;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Runs `fit_params` against the fixture's model file using default model and
+ // context params plus a zeroed margin vector of `max_devices()` entries, then
+ // checks that the fitted context size is positive.
+ fn fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> {
+     // The path has to be valid UTF-8 before it can be turned into a C string.
+     let Some(model_path_str) = fixture.model_path.to_str() else {
+         anyhow::bail!("model path is not valid UTF-8");
+     };
+     let model_path_cstr = CString::new(model_path_str)?;
+
+     let mut params = pin!(LlamaModelParams::default());
+     let mut context_params = LlamaContextParams::default();
+     let mut margins = vec![0usize; max_devices()];
+
+     let fitted = params
+         .as_mut()
+         .fit_params(
+             &model_path_cstr,
+             &mut context_params,
+             &mut margins,
+             512,
+             llama_cpp_bindings_sys::GGML_LOG_LEVEL_NONE,
+         )
+         .map_err(|fit_error| anyhow::anyhow!("fit_params failed: {fit_error:?}"))?;
+
+     assert!(fitted.n_ctx > 0);
+
+     Ok(())
+ }
+}
+
+mod model_special_tokens {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_bindings::SampledToken;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // BOS and EOS must be distinct tokens, and EOS wrapped as `Content` must be
+ // classified as end-of-generation.
+ fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let (bos, eos) = (model.token_bos(), model.token_eos());
+
+     assert_ne!(bos, eos);
+
+     let sampled_eos = SampledToken::Content(eos);
+     assert!(model.is_eog_token(&sampled_eos));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The newline token id reported by the model must be non-negative.
+ fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let newline = model.token_nl();
+
+     assert!(newline.0 >= 0);
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // EOS wrapped in the `Reasoning` variant must be recognised as
+ // end-of-generation.
+ fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let sampled = SampledToken::Reasoning(fixture.model.token_eos());
+
+     assert!(fixture.model.is_eog_token(&sampled));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // EOS wrapped in the `ToolCall` variant must be recognised as
+ // end-of-generation.
+ fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let sampled = SampledToken::ToolCall(fixture.model.token_eos());
+
+     assert!(fixture.model.is_eog_token(&sampled));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // EOS wrapped in the `Undeterminable` variant must be recognised as
+ // end-of-generation.
+ fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let sampled = SampledToken::Undeterminable(fixture.model.token_eos());
+
+     assert!(fixture.model.is_eog_token(&sampled));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The decoder-start token must be either the -1 sentinel or an index inside
+ // `0..n_vocab`, and a second call must return the same token.
+ fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let token = fixture.model.decode_start_token();
+     let n_vocab = fixture.model.n_vocab();
+
+     let is_unset = token.0 == -1;
+     let is_vocab_index = (0..n_vocab).contains(&token.0);
+     assert!(
+         is_unset || is_vocab_index,
+         "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}"
+     );
+
+     let repeated = fixture.model.decode_start_token();
+     assert_eq!(
+         token, repeated,
+         "decode_start_token must be deterministic across calls"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The SEP token must be either the -1 sentinel or an index inside
+ // `0..n_vocab`, and a second call must return the same token.
+ fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let token = fixture.model.token_sep();
+     let n_vocab = fixture.model.n_vocab();
+
+     let is_unset = token.0 == -1;
+     let is_vocab_index = (0..n_vocab).contains(&token.0);
+     assert!(
+         is_unset || is_vocab_index,
+         "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}"
+     );
+
+     let repeated = fixture.model.token_sep();
+     assert_eq!(
+         token, repeated,
+         "token_sep must be deterministic across calls"
+     );
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Fetches the attributes of the BOS token and checks that their Debug
+ // rendering is non-empty.
+ // NOTE(review): this only proves that `token_attr` succeeds for BOS; no
+ // specific attribute bits are asserted.
+ fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let bos = fixture.model.token_bos();
+     let attrs = fixture.model.token_attr(bos)?;
+
+     let rendered = format!("{:?}", *attrs);
+     assert!(
+         !rendered.is_empty(),
+         "token_attr(bos) must produce Debug output"
+     );
+     Ok(())
+ }
+}
+
+mod model_str_to_token {
+ use anyhow::Result;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tokenizes "hello world" without BOS and decodes the first token back into
+ // text; both the token list and the decoded piece must be non-empty.
+ // NOTE(review): despite the name, the decoded piece is not compared with the
+ // original input.
+ fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+
+     let tokens = model.str_to_token("hello world", AddBos::Never)?;
+     assert!(!tokens.is_empty());
+
+     let leading = llama_cpp_bindings::SampledToken::Content(tokens[0]);
+     let piece = model.token_to_piece(&leading, &mut decoder, false, None)?;
+     assert!(!piece.is_empty());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tokenizes twelve space-separated single letters with BOS and requires more
+ // than 8 tokens back.
+ // NOTE(review): the failure message ties the threshold to a buffer "regrow";
+ // presumably 8 is the binding's initial estimate for this input — confirm.
+ fn str_to_token_grows_buffer_when_initial_estimation_too_small(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("a b c d e f g h i j k l", AddBos::Always)?;
+
+     let regrew = tokens.len() > 8;
+     assert!(regrew, "expected regrow; got {} tokens", tokens.len());
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Tokenizing "hello" with `AddBos::Never` must not yield more tokens than
+ // tokenizing it with `AddBos::Always`.
+ fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let with_bos = fixture.model.str_to_token("hello", AddBos::Always)?.len();
+     let without_bos = fixture.model.str_to_token("hello", AddBos::Never)?.len();
+
+     assert!(without_bos <= with_bos);
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Builds the text "0 1 2 ... 1999 " and requires the tokenizer to return more
+ // tokens than half of that text's byte length.
+ fn str_to_token_with_many_tokens_triggers_buffer_resize(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     use std::fmt::Write;
+
+     let mut many_numbers = String::new();
+     for number in 0..2000 {
+         // Writing into a `String` cannot fail, so the result is discarded.
+         let _ = write!(many_numbers, "{number} ");
+     }
+
+     let model = fixture.model;
+     let tokens = model.str_to_token(&many_numbers, AddBos::Always)?;
+
+     assert!(many_numbers.len() / 2 < tokens.len());
+
+     Ok(())
+ }
+}
+
+mod model_token_to_piece {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use std::num::NonZeroU16;
+
+ use anyhow::Result;
+ use llama_cpp_bindings::SampledToken;
+ use llama_cpp_bindings::model::AddBos;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Converts the first token of "hello" to raw bytes, passing 32 as the size
+ // argument, and requires a non-empty result.
+ fn token_to_piece_bytes_returns_bytes_for_known_token(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let first_token = fixture.model.str_to_token("hello", AddBos::Never)?[0];
+     let piece_bytes = fixture
+         .model
+         .token_to_piece_bytes(first_token, 32, false, None)?;
+
+     assert!(!piece_bytes.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Decodes the first 200 entries of the model's token iterator through
+ // `token_to_piece` with one shared UTF-8 decoder and requires every call to
+ // succeed.
+ //
+ // NOTE(review): the name implies one of these tokens needs a larger buffer
+ // than the binding's initial guess; nothing here checks piece lengths —
+ // confirm against the vocabularies in use.
+ //
+ // The assertion message names the failing token and includes the returned
+ // error so the culprit is visible in the test log. Errors are deliberately
+ // asserted rather than propagated with `?`.
+ fn token_to_piece_handles_large_token_requiring_buffer_resize(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+
+     for (token, _) in model.tokens(true).take(200) {
+         let result =
+             model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None);
+         assert!(
+             result.is_ok(),
+             "token_to_piece failed for token {token}: {result:?}"
+         );
+     }
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Requests the bytes of the first "hello" token with a size argument of 1 and
+ // expects an error whose text mentions "Insufficient Buffer Space".
+ fn token_to_piece_bytes_insufficient_buffer_returns_error(
+     fixture: &LlamaFixture<'_>,
+ ) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("hello", AddBos::Never)?;
+
+     // Panics (failing the trial) if the call unexpectedly succeeds.
+     let error = model
+         .token_to_piece_bytes(tokens[0], 1, false, None)
+         .unwrap_err();
+     let message = error.to_string();
+
+     assert!(message.contains("Insufficient Buffer Space"));
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // Decodes the first "hello" token while passing `NonZeroU16::new(1)` as the
+ // final (lstrip) argument and requires the call to succeed.
+ fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("hello", AddBos::Never)?;
+
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+     let sampled = SampledToken::Content(tokens[0]);
+     let lstrip = NonZeroU16::new(1);
+     let outcome = model.token_to_piece(&sampled, &mut decoder, false, lstrip);
+
+     assert!(outcome.is_ok());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The first token of "hi", wrapped in the `Reasoning` variant, must decode to
+ // a non-empty piece.
+ fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+     let sampled = SampledToken::Reasoning(tokens[0]);
+     let piece = model.token_to_piece(&sampled, &mut decoder, true, None)?;
+
+     assert!(!piece.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The first token of "hi", wrapped in the `ToolCall` variant, must decode to
+ // a non-empty piece.
+ fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+     let sampled = SampledToken::ToolCall(tokens[0]);
+     let piece = model.token_to_piece(&sampled, &mut decoder, true, None)?;
+
+     assert!(!piece.is_empty());
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The first token of "hi", wrapped in the `Undeterminable` variant, must
+ // decode to a non-empty piece.
+ fn token_to_piece_decodes_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let model = fixture.model;
+     let tokens = model.str_to_token("hi", AddBos::Never)?;
+
+     let mut decoder = encoding_rs::UTF_8.new_decoder();
+     let sampled = SampledToken::Undeterminable(tokens[0]);
+     let piece = model.token_to_piece(&sampled, &mut decoder, true, None)?;
+
+     assert!(!piece.is_empty());
+     Ok(())
+ }
+}
+
+mod model_tokens_iterator {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The first 100 entries of the token iterator must all carry non-negative
+ // ids, and the iterator must actually yield 100 of them.
+ fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let visited = fixture
+         .model
+         .tokens(false)
+         .take(100)
+         .inspect(|(token, _piece_result)| assert!(token.0 >= 0))
+         .count();
+
+     assert_eq!(visited, 100);
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 512,
+ n_batch = 128,
+ n_ubatch = 64,
+ )]
+ // The number of entries produced by the token iterator must equal the
+ // vocabulary size reported by `n_vocab`.
+ fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let reported = fixture.model.n_vocab();
+     let iterated = fixture.model.tokens(false).count();
+
+     // Fails the trial (via `?`) if `n_vocab` does not fit in `usize`.
+     let expected = usize::try_from(reported)?;
+     assert_eq!(iterated, expected);
+     Ok(())
+ }
+}
+
+mod model_helpers {
+ #![expect(
+ clippy::unnecessary_wraps,
+ reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature"
+ )]
+
+ use anyhow::Result;
+ use llama_cpp_test_harness::LlamaFixture;
+ use llama_cpp_test_harness::llama_test;
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128
+ )]
+ // The Debug rendering of the fixture's model must mention both "LlamaModel"
+ // and "model".
+ fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> {
+     let rendered = format!("{:?}", fixture.model);
+
+     for needle in ["LlamaModel", "model"] {
+         assert!(rendered.contains(needle));
+     }
+
+     Ok(())
+ }
+
+ #[llama_test(
+ model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"),
+ n_gpu_layers = 999,
+ use_mmap = true,
+ use_mlock = false,
+ n_ctx = 2048,
+ n_batch = 512,
+ n_ubatch = 128
+ )]
+ // Two consecutive `approximate_tok_env` calls must hand back `Arc`s pointing
+ // at the same allocation.
+ fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> {
+     use std::sync::Arc;
+
+     let model = fixture.model;
+     let initial = model.approximate_tok_env();
+     let repeated = model.approximate_tok_env();
+
+     assert!(Arc::ptr_eq(&initial, &repeated));
+
+     Ok(())
+ }
+}
+
+// NOTE(review): the test targets in Cargo.toml are declared with
+// `harness = false`, so this macro presumably expands to the binary's `main`
+// that runs the `#[llama_test]` trials above — confirm in llama-cpp-test-harness.
+llama_tests_main!();
diff --git a/llama-cpp-test-harness/Cargo.toml b/llama-cpp-test-harness/Cargo.toml
index 041ea779..477362da 100644
--- a/llama-cpp-test-harness/Cargo.toml
+++ b/llama-cpp-test-harness/Cargo.toml
@@ -13,6 +13,7 @@ inventory = { workspace = true }
libtest-mimic = { workspace = true }
llama-cpp-bindings = { workspace = true }
llama-cpp-test-harness-macros = { workspace = true }
+thiserror = { workspace = true }
[features]
cuda = ["llama-cpp-bindings/cuda"]
diff --git a/llama-cpp-test-harness/src/deterministic_arguments.rs b/llama-cpp-test-harness/src/deterministic_arguments.rs
deleted file mode 100644
index 353053dd..00000000
--- a/llama-cpp-test-harness/src/deterministic_arguments.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use libtest_mimic::Arguments;
-
-const fn build_deterministic_arguments(mut arguments: Arguments) -> Arguments {
- arguments.test_threads = Some(1);
- arguments
-}
-
-#[must_use]
-pub fn deterministic_arguments_from_cli() -> Arguments {
- build_deterministic_arguments(Arguments::from_args())
-}
-
-#[cfg(test)]
-mod tests {
- use libtest_mimic::Arguments;
-
- use super::build_deterministic_arguments;
-
- #[test]
- fn build_deterministic_arguments_forces_test_threads_to_one() {
- let input = Arguments {
- test_threads: Some(8),
- ..Arguments::default()
- };
- let output = build_deterministic_arguments(input);
-
- assert_eq!(output.test_threads, Some(1));
- }
-
- #[test]
- fn build_deterministic_arguments_overrides_unset_test_threads() {
- let input = Arguments::default();
- let output = build_deterministic_arguments(input);
-
- assert_eq!(output.test_threads, Some(1));
- }
-
- #[test]
- fn build_deterministic_arguments_preserves_other_settings() {
- let input = Arguments {
- list: true,
- filter: Some("foo".to_owned()),
- ..Arguments::default()
- };
- let output = build_deterministic_arguments(input);
-
- assert!(output.list);
- assert_eq!(output.filter.as_deref(), Some("foo"));
- }
-}
diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs
index 52f6dd4c..927c87a8 100644
--- a/llama-cpp-test-harness/src/execution_plan.rs
+++ b/llama-cpp-test-harness/src/execution_plan.rs
@@ -16,10 +16,10 @@
use std::collections::BTreeMap;
use std::sync::Arc;
+use libtest_mimic::Arguments;
use libtest_mimic::Conclusion;
use llama_cpp_bindings::llama_backend::LlamaBackend;
-use crate::deterministic_arguments::deterministic_arguments_from_cli;
use crate::execution_phase::ExecutionPhase;
use crate::llama_test_registration::LlamaTestRegistration;
@@ -65,13 +65,12 @@ impl ExecutionPlan {
}
+    /// Runs every phase in order and collects one [`Conclusion`] per phase.
+    ///
+    /// Each phase's header is printed before the phase runs. `arguments` is supplied by the
+    /// caller and forwarded unchanged to every phase, so the CLI is parsed once rather than
+    /// inside this method.
     #[must_use]
-    pub fn run(&self, backend: &Arc<LlamaBackend>) -> Vec<Conclusion> {
-        let arguments = deterministic_arguments_from_cli();
+    pub fn run(&self, backend: &Arc<LlamaBackend>, arguments: &Arguments) -> Vec<Conclusion> {
         let total = self.phases.len();
         let mut conclusions = Vec::with_capacity(total);
         for (index, phase) in self.phases.iter().enumerate() {
             phase.print_header(index, total);
-            conclusions.push(phase.run(backend, &arguments));
+            conclusions.push(phase.run(backend, arguments));
         }
         conclusions
     }
diff --git a/llama-cpp-test-harness/src/harness_arguments_error.rs b/llama-cpp-test-harness/src/harness_arguments_error.rs
new file mode 100644
index 00000000..53db2279
--- /dev/null
+++ b/llama-cpp-test-harness/src/harness_arguments_error.rs
@@ -0,0 +1,9 @@
+use thiserror::Error;
+
+/// Errors reported when the test binary's command-line arguments are incompatible with the
+/// harness.
+#[derive(Debug, Error)]
+pub enum HarnessArgumentsError {
+    /// `--test-threads` was given a value other than `1`; `requested` carries that value so
+    /// the message can echo it back.
+    #[error(
+        "the test harness requires --test-threads=1 (or unset); got --test-threads={requested}"
+    )]
+    ConflictingTestThreads { requested: usize },
+}
diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs
index fb0c1230..8f112b9f 100644
--- a/llama-cpp-test-harness/src/lib.rs
+++ b/llama-cpp-test-harness/src/lib.rs
@@ -8,10 +8,10 @@
//! See the workspace README and `tests/` directory for usage examples.
pub mod context_params;
-pub mod deterministic_arguments;
pub mod download_model;
pub mod execution_phase;
pub mod execution_plan;
+pub mod harness_arguments_error;
pub mod llama_fixture;
pub mod llama_test_fn;
pub mod llama_test_registration;
@@ -21,6 +21,7 @@ pub mod mmproj_source;
pub mod model_load_params;
pub mod model_source;
pub mod no_op;
+pub mod parse_harness_arguments;
pub mod phase_state;
pub mod run;
pub mod run_to_conclusions;
diff --git a/llama-cpp-test-harness/src/parse_harness_arguments.rs b/llama-cpp-test-harness/src/parse_harness_arguments.rs
new file mode 100644
index 00000000..b4b3ce72
--- /dev/null
+++ b/llama-cpp-test-harness/src/parse_harness_arguments.rs
@@ -0,0 +1,82 @@
+use libtest_mimic::Arguments;
+
+use crate::harness_arguments_error::HarnessArgumentsError;
+
+/// Enforces the harness's single-thread requirement on already-parsed arguments.
+///
+/// An unset `test_threads` is normalized to `Some(1)`; an explicit `Some(1)` passes through.
+/// No other field of `arguments` is touched.
+///
+/// # Errors
+///
+/// Returns [`HarnessArgumentsError::ConflictingTestThreads`], carrying the requested count,
+/// when `test_threads` is set to anything other than `1`.
+fn validate(mut arguments: Arguments) -> Result<Arguments, HarnessArgumentsError> {
+    match arguments.test_threads {
+        None | Some(1) => {
+            // Pin the value explicitly so an unset flag cannot fall back to a default.
+            arguments.test_threads = Some(1);
+            Ok(arguments)
+        }
+        Some(requested) => Err(HarnessArgumentsError::ConflictingTestThreads { requested }),
+    }
+}
+
+/// Parses the test-binary CLI into [`libtest_mimic::Arguments`], enforcing the harness's
+/// single-thread requirement.
+///
+/// `--test-threads` left unset is treated as `1`; `--test-threads=1` is accepted unchanged.
+///
+/// # Errors
+///
+/// Returns [`HarnessArgumentsError::ConflictingTestThreads`] when `--test-threads` is set to
+/// any value other than `1`. The harness orchestrates phase batching itself and cannot share
+/// that responsibility with `libtest_mimic`'s thread pool.
+pub fn parse_harness_arguments() -> Result<Arguments, HarnessArgumentsError> {
+    validate(Arguments::from_args())
+}
+
+#[cfg(test)]
+mod tests {
+    use libtest_mimic::Arguments;
+
+    use crate::harness_arguments_error::HarnessArgumentsError;
+
+    use super::validate;
+
+    // An unset thread count is accepted and pinned to one.
+    #[test]
+    fn validate_accepts_unset_test_threads_and_defaults_to_one() {
+        let validated = validate(Arguments::default()).expect("unset must be accepted");
+
+        assert_eq!(validated.test_threads, Some(1));
+    }
+
+    // An explicit single thread is accepted as-is.
+    #[test]
+    fn validate_accepts_explicit_single_thread() {
+        let single_threaded = Arguments {
+            test_threads: Some(1),
+            ..Arguments::default()
+        };
+        let validated = validate(single_threaded).expect("--test-threads=1 must be accepted");
+
+        assert_eq!(validated.test_threads, Some(1));
+    }
+
+    // Any other explicit thread count is rejected, and the error echoes the requested value.
+    #[test]
+    fn validate_rejects_non_one_test_threads() {
+        let eight_threads = Arguments {
+            test_threads: Some(8),
+            ..Arguments::default()
+        };
+        let rejection = validate(eight_threads).expect_err("--test-threads=8 must be rejected");
+
+        assert!(matches!(
+            rejection,
+            HarnessArgumentsError::ConflictingTestThreads { requested: 8 }
+        ));
+    }
+
+    // Fields unrelated to the thread count survive validation untouched.
+    #[test]
+    fn validate_preserves_other_settings() {
+        let listing_with_filter = Arguments {
+            list: true,
+            filter: Some("foo".to_owned()),
+            ..Arguments::default()
+        };
+        let validated = validate(listing_with_filter).expect("default test_threads must pass");
+
+        assert!(validated.list);
+        assert_eq!(validated.filter.as_deref(), Some("foo"));
+    }
+}
diff --git a/llama-cpp-test-harness/src/run.rs b/llama-cpp-test-harness/src/run.rs
index 6d13b1b4..376cbbae 100644
--- a/llama-cpp-test-harness/src/run.rs
+++ b/llama-cpp-test-harness/src/run.rs
@@ -5,6 +5,7 @@ use libtest_mimic::Conclusion;
use llama_cpp_bindings::llama_backend::LlamaBackend;
use crate::execution_plan::ExecutionPlan;
+use crate::parse_harness_arguments::parse_harness_arguments;
fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode {
if conclusions.iter().any(Conclusion::has_failed) {
@@ -16,6 +17,13 @@ fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode {
#[must_use]
pub fn run() -> ExitCode {
+ let arguments = match parse_harness_arguments() {
+ Ok(arguments) => arguments,
+ Err(error) => {
+ eprintln!("llama-cpp-test-harness: {error}");
+ return ExitCode::from(2);
+ }
+ };
let mut backend = match LlamaBackend::init() {
Ok(backend) => backend,
Err(error) => {
@@ -28,7 +36,7 @@ pub fn run() -> ExitCode {
backend.void_logs();
}
let backend = Arc::new(backend);
- aggregate_exit_code(&plan.run(&backend))
+ aggregate_exit_code(&plan.run(&backend, &arguments))
}
#[cfg(test)]
diff --git a/llama-cpp-test-harness/src/run_to_conclusions.rs b/llama-cpp-test-harness/src/run_to_conclusions.rs
index 8de67e11..67c90003 100644
--- a/llama-cpp-test-harness/src/run_to_conclusions.rs
+++ b/llama-cpp-test-harness/src/run_to_conclusions.rs
@@ -4,6 +4,7 @@ use libtest_mimic::Conclusion;
use llama_cpp_bindings::llama_backend::LlamaBackend;
use crate::execution_plan::ExecutionPlan;
+use crate::parse_harness_arguments::parse_harness_arguments;
/// Runs every registered test against its declared model and returns one [`Conclusion`] per phase.
///
@@ -13,10 +14,15 @@ use crate::execution_plan::ExecutionPlan;
///
/// # Panics
///
-/// Panics if [`LlamaBackend::init`] fails. The harness is meaningless without a backend; a
-/// crash is the loudest possible failure signal.
+/// Panics if [`LlamaBackend::init`] fails or if the CLI arguments conflict with the harness's
+/// single-thread requirement. The harness is meaningless without a backend or with conflicting
+/// thread-count flags; a crash is the loudest possible failure signal.
#[must_use]
pub fn run_to_conclusions() -> Vec {
+ let arguments = match parse_harness_arguments() {
+ Ok(arguments) => arguments,
+ Err(error) => panic!("llama-cpp-test-harness: {error}"),
+ };
let mut backend = match LlamaBackend::init() {
Ok(backend) => backend,
Err(error) => panic!("llama-cpp-test-harness: backend init failed: {error}"),
@@ -26,7 +32,7 @@ pub fn run_to_conclusions() -> Vec {
backend.void_logs();
}
let backend = Arc::new(backend);
- plan.run(&backend)
+ plan.run(&backend, &arguments)
}
#[cfg(test)]