From b8245def5bb6d92dc1e1e0a2981232d7c3e42a4d Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Sat, 23 May 2026 10:31:03 +0200 Subject: [PATCH 1/9] migrate integration tests onto declarative llama-cpp-test-harness and tighten coverage gates --- Cargo.lock | 171 ++- Cargo.toml | 10 + Makefile | 92 +- llama-cpp-bindings-sys/src/lib.rs | 31 +- llama-cpp-bindings-tests/Cargo.toml | 265 +++- .../src/fixture_session.rs | 157 --- llama-cpp-bindings-tests/src/gpu_backend.rs | 166 --- llama-cpp-bindings-tests/src/lib.rs | 10 +- llama-cpp-bindings-tests/src/test_model.rs | 233 +--- .../tests/constrained_decoding.rs | 70 +- llama-cpp-bindings-tests/tests/context.rs | 1093 ++++++++++------- .../tests/context_kv_cache.rs | 977 +++++++++++---- .../tests/context_session.rs | 1082 +++++++++++++--- ..._reasoning_for_thinking_disabled_prompt.rs | 52 +- ...epseek_r1_8b_classifier_emits_reasoning.rs | 47 +- ...eek_r1_8b_duck_types_gemma_paired_quote.rs | 37 +- ...eek_r1_8b_duck_types_glm_key_value_tags.rs | 36 +- ...r1_8b_duck_types_mistral_bracketed_json.rs | 37 +- .../deepseek_r1_8b_duck_types_qwen_xml.rs | 36 +- ...t_is_plain_content_with_tools_requested.rs | 39 +- ...pty_tool_calls_when_tools_not_requested.rs | 38 +- llama-cpp-bindings-tests/tests/embeddings.rs | 38 +- ...modal_chunks_records_exact_token_counts.rs | 91 +- ..._reasoning_for_thinking_disabled_prompt.rs | 50 +- .../gemma4_classifier_emits_reasoning.rs | 57 +- ...easoning_for_multimodal_thinking_prompt.rs | 72 +- .../tests/gemma4_parses_tool_call_payload.rs | 37 +- ..._template_override_returns_full_markers.rs | 38 +- ..._reasoning_for_thinking_disabled_prompt.rs | 100 +- .../tests/glm47_classifier_emits_reasoning.rs | 111 +- .../tests/glm47_parses_tool_call_payload.rs | 43 +- ..._template_override_returns_full_markers.rs | 45 +- .../tests/ingest_prompt_chunk.rs | 74 +- .../tests/llama_backend.rs | 38 +- llama-cpp-bindings-tests/tests/llguidance.rs | 674 ++++++++-- ..._reasoning_for_thinking_disabled_prompt.rs 
| 99 +- .../mistral3_classifier_emits_reasoning.rs | 115 +- ...easoning_for_multimodal_thinking_prompt.rs | 72 +- .../mistral3_parses_tool_call_payload.rs | 44 +- llama-cpp-bindings-tests/tests/model.rs | 987 --------------- .../tests/model_chat_template.rs | 194 +++ .../tests/model_context_creation.rs | 106 ++ .../tests/model_helpers.rs | 115 +- .../tests/model_loading_errors.rs | 172 +++ .../tests/model_lora_adapter_errors.rs | 162 +++ .../tests/model_metadata_kv.rs | 355 ++++++ .../tests/model_params.rs | 59 +- .../tests/model_properties.rs | 423 +++++++ .../tests/model_sampling.rs | 454 +++++++ .../tests/model_special_tokens.rs | 381 ++++++ .../tests/model_str_to_token.rs | 210 ++++ .../tests/model_token_to_piece.rs | 364 ++++++ .../tests/model_tokens_iterator.rs | 109 ++ llama-cpp-bindings-tests/tests/mtmd.rs | 554 --------- llama-cpp-bindings-tests/tests/mtmd_bitmap.rs | 81 ++ .../tests/mtmd_chunk_operations.rs | 147 +++ .../tests/mtmd_chunk_structure.rs | 242 ++++ .../tests/mtmd_context.rs | 162 +++ .../tests/mtmd_evaluation.rs | 236 ++++ .../tests/mtmd_tokenization.rs | 121 ++ llama-cpp-bindings-tests/tests/multimodal.rs | 47 +- .../tests/parse_chat_message.rs | 341 ++++- ...mits_reasoning_when_template_auto_opens.rs | 93 +- ..._reasoning_for_thinking_disabled_prompt.rs | 100 +- .../qwen35_classifier_emits_reasoning.rs | 117 +- ...easoning_for_multimodal_thinking_prompt.rs | 54 +- ...wen35_parses_constrained_schema_payload.rs | 41 +- .../tests/qwen35_parses_tool_call_payload.rs | 84 +- ...t_is_plain_content_with_tools_requested.rs | 39 +- ...mits_reasoning_when_template_auto_opens.rs | 93 +- ..._reasoning_for_thinking_disabled_prompt.rs | 99 +- .../qwen36_classifier_emits_reasoning.rs | 113 +- ...easoning_for_multimodal_thinking_prompt.rs | 75 +- llama-cpp-bindings-tests/tests/reranker.rs | 43 +- .../tests/sampled_token_classifier_markers.rs | 472 ++++++- llama-cpp-bindings-tests/tests/sampling.rs | 428 ++++--- .../tests/text_generation.rs | 113 +- 
llama-cpp-bindings-types/src/token_usage.rs | 31 +- llama-cpp-test-harness-macros/Cargo.toml | 25 + llama-cpp-test-harness-macros/src/expand.rs | 443 +++++++ llama-cpp-test-harness-macros/src/lib.rs | 78 ++ .../src/parsed_args.rs | 881 +++++++++++++ .../src/parsed_context_params.rs | 9 + .../src/parsed_model_load_params.rs | 6 + .../src/parsed_source.rs | 245 ++++ llama-cpp-test-harness/Cargo.toml | 36 + llama-cpp-test-harness/src/context_params.rs | 191 +++ .../src/deterministic_arguments.rs | 50 + llama-cpp-test-harness/src/download_model.rs | 29 + llama-cpp-test-harness/src/execution_phase.rs | 133 ++ llama-cpp-test-harness/src/execution_plan.rs | 268 ++++ llama-cpp-test-harness/src/lib.rs | 47 + llama-cpp-test-harness/src/llama_fixture.rs | 15 + llama-cpp-test-harness/src/llama_test_fn.rs | 4 + .../src/llama_test_registration.rs | 13 + .../src/llama_tests_main_macro.rs | 11 + llama-cpp-test-harness/src/load_key.rs | 235 ++++ llama-cpp-test-harness/src/mmproj_source.rs | 51 + .../src/model_load_params.rs | 86 ++ llama-cpp-test-harness/src/model_source.rs | 56 + llama-cpp-test-harness/src/no_op.rs | 14 + llama-cpp-test-harness/src/phase_state.rs | 13 + llama-cpp-test-harness/src/run.rs | 122 ++ .../src/run_to_conclusions.rs | 54 + .../src/test_backend_gate.rs | 8 + .../tests/harness_self_test.rs | 199 +++ 106 files changed, 12971 insertions(+), 4845 deletions(-) delete mode 100644 llama-cpp-bindings-tests/src/fixture_session.rs delete mode 100644 llama-cpp-bindings-tests/src/gpu_backend.rs delete mode 100644 llama-cpp-bindings-tests/tests/model.rs create mode 100644 llama-cpp-bindings-tests/tests/model_chat_template.rs create mode 100644 llama-cpp-bindings-tests/tests/model_context_creation.rs create mode 100644 llama-cpp-bindings-tests/tests/model_loading_errors.rs create mode 100644 llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs create mode 100644 llama-cpp-bindings-tests/tests/model_metadata_kv.rs create mode 100644 
llama-cpp-bindings-tests/tests/model_properties.rs create mode 100644 llama-cpp-bindings-tests/tests/model_sampling.rs create mode 100644 llama-cpp-bindings-tests/tests/model_special_tokens.rs create mode 100644 llama-cpp-bindings-tests/tests/model_str_to_token.rs create mode 100644 llama-cpp-bindings-tests/tests/model_token_to_piece.rs create mode 100644 llama-cpp-bindings-tests/tests/model_tokens_iterator.rs delete mode 100644 llama-cpp-bindings-tests/tests/mtmd.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_bitmap.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_context.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_evaluation.rs create mode 100644 llama-cpp-bindings-tests/tests/mtmd_tokenization.rs create mode 100644 llama-cpp-test-harness-macros/Cargo.toml create mode 100644 llama-cpp-test-harness-macros/src/expand.rs create mode 100644 llama-cpp-test-harness-macros/src/lib.rs create mode 100644 llama-cpp-test-harness-macros/src/parsed_args.rs create mode 100644 llama-cpp-test-harness-macros/src/parsed_context_params.rs create mode 100644 llama-cpp-test-harness-macros/src/parsed_model_load_params.rs create mode 100644 llama-cpp-test-harness-macros/src/parsed_source.rs create mode 100644 llama-cpp-test-harness/Cargo.toml create mode 100644 llama-cpp-test-harness/src/context_params.rs create mode 100644 llama-cpp-test-harness/src/deterministic_arguments.rs create mode 100644 llama-cpp-test-harness/src/download_model.rs create mode 100644 llama-cpp-test-harness/src/execution_phase.rs create mode 100644 llama-cpp-test-harness/src/execution_plan.rs create mode 100644 llama-cpp-test-harness/src/lib.rs create mode 100644 llama-cpp-test-harness/src/llama_fixture.rs create mode 100644 llama-cpp-test-harness/src/llama_test_fn.rs create mode 100644 
llama-cpp-test-harness/src/llama_test_registration.rs create mode 100644 llama-cpp-test-harness/src/llama_tests_main_macro.rs create mode 100644 llama-cpp-test-harness/src/load_key.rs create mode 100644 llama-cpp-test-harness/src/mmproj_source.rs create mode 100644 llama-cpp-test-harness/src/model_load_params.rs create mode 100644 llama-cpp-test-harness/src/model_source.rs create mode 100644 llama-cpp-test-harness/src/no_op.rs create mode 100644 llama-cpp-test-harness/src/phase_state.rs create mode 100644 llama-cpp-test-harness/src/run.rs create mode 100644 llama-cpp-test-harness/src/run_to_conclusions.rs create mode 100644 llama-cpp-test-harness/src/test_backend_gate.rs create mode 100644 llama-cpp-test-harness/tests/harness_self_test.rs diff --git a/Cargo.lock b/Cargo.lock index f989f8fd..f9c99776 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -31,6 +31,56 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + [[package]] name = "anyhow" version = "1.0.102" @@ -160,6 +210,46 @@ dependencies = [ "libloading", ] +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + [[package]] name = "cmake" version = "0.1.58" @@ -169,6 +259,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + [[package]] name = "console" version = "0.16.3" @@ -402,6 +498,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "escape8259" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6" + [[package]] name = "fastrand" version = "2.3.0" @@ -930,6 +1032,15 @@ 
dependencies = [ "web-time", ] +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -946,6 +1057,12 @@ dependencies = [ "serde", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -1014,6 +1131,18 @@ dependencies = [ "libc", ] +[[package]] +name = "libtest-mimic" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14e6ba06f0ade6e504aff834d7c34298e5155c6baca353cc6a4aaff2f9fd7f33" +dependencies = [ + "anstream", + "anstyle", + "clap", + "escape8259", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1076,11 +1205,10 @@ version = "0.7.0" dependencies = [ "anyhow", "encoding_rs", - "hf-hub", "llama-cpp-bindings", "llama-cpp-bindings-sys", + "llama-cpp-test-harness", "serde_json", - "serial_test", ] [[package]] @@ -1096,6 +1224,27 @@ dependencies = [ name = "llama-cpp-log-decoder" version = "0.7.0" +[[package]] +name = "llama-cpp-test-harness" +version = "0.7.0" +dependencies = [ + "anyhow", + "hf-hub", + "inventory", + "libtest-mimic", + "llama-cpp-bindings", + "llama-cpp-test-harness-macros", +] + +[[package]] +name = "llama-cpp-test-harness-macros" +version = "0.7.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "llguidance" version = "1.7.0" @@ -1225,6 +1374,12 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "openssl" version = "0.10.76" @@ -1851,6 +2006,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.27.2" @@ -2236,6 +2397,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/Cargo.toml b/Cargo.toml index 8c203b94..85757ef3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,8 @@ members = [ "llama-cpp-bindings", "llama-cpp-bindings-tests", "llama-cpp-log-decoder", + "llama-cpp-test-harness", + "llama-cpp-test-harness-macros", ] [workspace.package] @@ -25,17 +27,25 @@ enumflags2 = "=0.7.12" find_cuda_helper = "=0.2.0" glob = "=0.3.3" hf-hub = "=0.5.0" +inventory = "=0.3.24" +libtest-mimic = "=0.8.2" llama-cpp-bindings = { path = "llama-cpp-bindings", version = "=0.7.0" } llama-cpp-bindings-build = { path = "llama-cpp-bindings-build", version = "=0.7.0" } llama-cpp-bindings-sys = { path = "llama-cpp-bindings-sys", version = "=0.7.0" } llama-cpp-bindings-types = { path = "llama-cpp-bindings-types", version = "=0.7.0" } llama-cpp-log-decoder = { path = "llama-cpp-log-decoder", version = "=0.7.0" } +llama-cpp-test-harness = { path = "llama-cpp-test-harness", version = "=0.7.0" } 
+llama-cpp-test-harness-macros = { path = "llama-cpp-test-harness-macros", version = "=0.7.0" } llguidance = "=1.7.0" log = "=0.4.29" nom = "=8.0.0" +proc-macro2 = "=1.0.106" +quote = "=1.0.45" serde = { version = "=1.0.228", features = ["derive"] } serde_json = "=1.0.149" serial_test = "=3.4.0" +syn = { version = "=2.0.117", features = ["full"] } thiserror = "=2.0.18" toktrie = "=1.7.0" +trybuild = "=1.0.116" walkdir = "=2.5.0" diff --git a/Makefile b/Makefile index 5004e9ce..57ee7a3e 100644 --- a/Makefile +++ b/Makefile @@ -1,46 +1,6 @@ TEST_DEVICE ?= -QWEN_CAPABLE_FEATURES = multimodal_capable,mrope_model DEVICE_FEATURE = $(if $(TEST_DEVICE),--features $(TEST_DEVICE),) -LLM_QWEN_CAPABLE_FEATURE_FLAGS = $(DEVICE_FEATURE) --features $(QWEN_CAPABLE_FEATURES) - -CARGO_TEST_LLM_FLAGS = --release --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) -- --test-threads=1 -CARGO_TEST_LLM_FLAGS_QWEN_CAPABLE = --release --no-fail-fast -p llama-cpp-bindings-tests $(LLM_QWEN_CAPABLE_FEATURE_FLAGS) -- --test-threads=1 - - -QWEN3_5_0_8B_ENV = \ - LLAMA_TEST_HF_REPO=unsloth/Qwen3.5-0.8B-GGUF \ - LLAMA_TEST_HF_MODEL=Qwen3.5-0.8B-Q4_K_M.gguf \ - LLAMA_TEST_HF_MMPROJ=mmproj-F16.gguf \ - LLAMA_TEST_HF_EMBED_REPO=Qwen/Qwen3-Embedding-0.6B-GGUF \ - LLAMA_TEST_HF_EMBED_MODEL=Qwen3-Embedding-0.6B-Q8_0.gguf \ - LLAMA_TEST_HF_ENCODER_REPO=Xiaojian9992024/t5-small-GGUF \ - LLAMA_TEST_HF_ENCODER_MODEL=t5-small.bf16.gguf - -QWEN3_6_35B_A3B_ENV = \ - LLAMA_TEST_HF_REPO=unsloth/Qwen3.6-35B-A3B-GGUF \ - LLAMA_TEST_HF_MODEL=Qwen3.6-35B-A3B-UD-Q4_K_M.gguf \ - LLAMA_TEST_HF_MMPROJ=mmproj-F16.gguf \ - LLAMA_TEST_HF_EMBED_REPO=Qwen/Qwen3-Embedding-0.6B-GGUF \ - LLAMA_TEST_HF_EMBED_MODEL=Qwen3-Embedding-0.6B-Q8_0.gguf \ - LLAMA_TEST_HF_ENCODER_REPO=Xiaojian9992024/t5-small-GGUF \ - LLAMA_TEST_HF_ENCODER_MODEL=t5-small.bf16.gguf - -GLM4_7_FLASH_ENV = \ - LLAMA_TEST_HF_REPO=unsloth/GLM-4.7-Flash-GGUF \ - LLAMA_TEST_HF_MODEL=GLM-4.7-Flash-Q4_K_M.gguf \ - 
LLAMA_TEST_HF_EMBED_REPO=Qwen/Qwen3-Embedding-0.6B-GGUF \ - LLAMA_TEST_HF_EMBED_MODEL=Qwen3-Embedding-0.6B-Q8_0.gguf \ - LLAMA_TEST_HF_ENCODER_REPO=Xiaojian9992024/t5-small-GGUF \ - LLAMA_TEST_HF_ENCODER_MODEL=t5-small.bf16.gguf - -DEEPSEEK_R1_DISTILL_LLAMA_8B_ENV = \ - LLAMA_TEST_HF_REPO=unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF \ - LLAMA_TEST_HF_MODEL=DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf \ - LLAMA_TEST_HF_EMBED_REPO=Qwen/Qwen3-Embedding-0.6B-GGUF \ - LLAMA_TEST_HF_EMBED_MODEL=Qwen3-Embedding-0.6B-Q8_0.gguf \ - LLAMA_TEST_HF_ENCODER_REPO=Xiaojian9992024/t5-small-GGUF \ - LLAMA_TEST_HF_ENCODER_MODEL=t5-small.bf16.gguf node_modules: package-lock.json npm ci @@ -55,32 +15,22 @@ clean.cmake: .PHONY: clippy clippy: - cargo clippy --all-targets -p llama-cpp-bindings-types -- -D warnings - cargo clippy --all-targets -p llama-cpp-log-decoder -- -D warnings - cargo clippy --all-targets -p llama-cpp-bindings-build -- -D warnings - cargo clippy --all-targets -p llama-cpp-bindings-sys $(DEVICE_FEATURE) -- -D warnings - cargo clippy --all-targets -p llama-cpp-bindings $(DEVICE_FEATURE) -- -D warnings - cargo clippy --all-targets -p llama-cpp-bindings-tests $(DEVICE_FEATURE) -- -D warnings - cargo clippy --all-targets -p llama-cpp-bindings-tests $(LLM_QWEN_CAPABLE_FEATURE_FLAGS) -- -D warnings + cargo clippy --workspace --all-targets $(DEVICE_FEATURE) -- -D warnings .PHONY: coverage coverage: node_modules cargo llvm-cov clean --workspace - cargo llvm-cov --no-report -p llama-cpp-log-decoder - cargo llvm-cov --no-report -p llama-cpp-bindings-types - cargo llvm-cov --no-report -p llama-cpp-bindings --lib $(DEVICE_FEATURE) - $(DEEPSEEK_R1_DISTILL_LLAMA_8B_ENV) cargo llvm-cov --no-report --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) -- --test-threads=1 - $(GLM4_7_FLASH_ENV) cargo llvm-cov --no-report --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) -- --test-threads=1 - $(QWEN3_5_0_8B_ENV) cargo llvm-cov --no-report --no-fail-fast -p 
llama-cpp-bindings-tests $(LLM_QWEN_CAPABLE_FEATURE_FLAGS) -- --test-threads=1 - $(QWEN3_6_35B_A3B_ENV) cargo llvm-cov --no-report --no-fail-fast -p llama-cpp-bindings-tests $(LLM_QWEN_CAPABLE_FEATURE_FLAGS) -- --test-threads=1 + cargo llvm-cov --no-report --no-fail-fast --workspace $(DEVICE_FEATURE) cargo llvm-cov report --json --output-path target/llvm-cov.json cargo llvm-cov report --lcov --output-path target/lcov.info cargo llvm-cov report npx rust-coverage-check target/llvm-cov.json \ --workspace-root $(CURDIR) \ --gated llama-cpp-bindings=95 \ - --gated llama-cpp-log-decoder=99 \ - --gated llama-cpp-bindings-types=99 + --gated llama-cpp-log-decoder=100 \ + --gated llama-cpp-bindings-types=100 \ + --gated llama-cpp-test-harness=99 \ + --gated llama-cpp-test-harness-macros=100 .PHONY: coverage-clean coverage-clean: @@ -103,30 +53,14 @@ fmt.check: .PHONY: test test: test.unit test.llms -.PHONY: test.deepseek_r1_distill_llama_8b -test.deepseek_r1_distill_llama_8b: clippy - $(DEEPSEEK_R1_DISTILL_LLAMA_8B_ENV) cargo test $(CARGO_TEST_LLM_FLAGS) - -.PHONY: test.glm4_7_flash -test.glm4_7_flash: clippy - $(GLM4_7_FLASH_ENV) cargo test $(CARGO_TEST_LLM_FLAGS) +.PHONY: test.harness +test.harness: clippy + cargo test -p llama-cpp-test-harness-macros -p llama-cpp-test-harness $(DEVICE_FEATURE) .PHONY: test.llms -test.llms: \ - test.deepseek_r1_distill_llama_8b \ - test.glm4_7_flash \ - test.qwen3.5_0.8B \ - test.qwen3.6_35b_a3b - -.PHONY: test.qwen3.5_0.8B -test.qwen3.5_0.8B: clippy - $(QWEN3_5_0_8B_ENV) cargo test $(CARGO_TEST_LLM_FLAGS_QWEN_CAPABLE) - -.PHONY: test.qwen3.6_35b_a3b -test.qwen3.6_35b_a3b: clippy - $(QWEN3_6_35B_A3B_ENV) cargo test $(CARGO_TEST_LLM_FLAGS_QWEN_CAPABLE) +test.llms: clippy + cargo test --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) .PHONY: test.unit -test.unit: clippy - cargo test -p llama-cpp-log-decoder - cargo test -p llama-cpp-bindings $(DEVICE_FEATURE) +test.unit: clippy test.harness + cargo test -p llama-cpp-log-decoder -p 
llama-cpp-bindings $(DEVICE_FEATURE) diff --git a/llama-cpp-bindings-sys/src/lib.rs b/llama-cpp-bindings-sys/src/lib.rs index fae05709..898d1d22 100644 --- a/llama-cpp-bindings-sys/src/lib.rs +++ b/llama-cpp-bindings-sys/src/lib.rs @@ -1,11 +1,28 @@ //! See [llama-cpp-bindings](https://crates.io/crates/llama-cpp-bindings) for a documented and safe API. -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(unpredictable_function_pointer_comparisons)] -#![allow(unnecessary_transmutes)] -#![allow(clippy::missing_safety_doc)] -#![allow(clippy::ptr_offset_with_cast)] +#![expect( + non_camel_case_types, + reason = "bindgen emits C struct and enum names verbatim and they don't follow Rust naming" +)] +#![expect( + non_snake_case, + reason = "bindgen emits C function names verbatim and they don't always follow Rust naming" +)] +#![expect( + unpredictable_function_pointer_comparisons, + reason = "bindgen-generated FFI function pointers are opaque and the lint cannot reason about them" +)] +#![expect( + unnecessary_transmutes, + reason = "bindgen generates transmutes to bridge between C and Rust integer/enum representations" +)] +#![expect( + clippy::missing_safety_doc, + reason = "bindgen emits raw FFI declarations; safety contracts live on the wrapper API in llama-cpp-bindings" +)] +#![expect( + clippy::ptr_offset_with_cast, + reason = "bindgen emits standard FFI pointer-arithmetic patterns that this lint flags" +)] include!(concat!(env!("OUT_DIR"), "/bindings.rs")); diff --git a/llama-cpp-bindings-tests/Cargo.toml b/llama-cpp-bindings-tests/Cargo.toml index c19700da..c17b881d 100644 --- a/llama-cpp-bindings-tests/Cargo.toml +++ b/llama-cpp-bindings-tests/Cargo.toml @@ -9,11 +9,270 @@ publish = false [dependencies] anyhow = { workspace = true } encoding_rs = { workspace = true } -hf-hub = { workspace = true } llama-cpp-bindings = { workspace = true } llama-cpp-bindings-sys = { workspace = true } +llama-cpp-test-harness = { 
workspace = true } serde_json = { workspace = true } -serial_test = { workspace = true } + +[[test]] +name = "context" +harness = false + +[[test]] +name = "llama_backend" +harness = false + +[[test]] +name = "context_kv_cache" +harness = false + +[[test]] +name = "deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness = false + +[[test]] +name = "deepseek_r1_8b_classifier_emits_reasoning" +harness = false + +[[test]] +name = "deepseek_r1_8b_duck_types_gemma_paired_quote" +harness = false + +[[test]] +name = "deepseek_r1_8b_duck_types_glm_key_value_tags" +harness = false + +[[test]] +name = "deepseek_r1_8b_duck_types_mistral_bracketed_json" +harness = false + +[[test]] +name = "deepseek_r1_8b_duck_types_qwen_xml" +harness = false + +[[test]] +name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested" +harness = false + +[[test]] +name = "deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested" +harness = false + +[[test]] +name = "context_session" +harness = false + +[[test]] +name = "embeddings" +harness = false + +[[test]] +name = "gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness = false + +[[test]] +name = "gemma4_classifier_emits_reasoning" +harness = false + +[[test]] +name = "gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt" +harness = false + +[[test]] +name = "gemma4_parses_tool_call_payload" +harness = false + +[[test]] +name = "gemma4_template_override_returns_full_markers" +harness = false + +[[test]] +name = "glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness = false + +[[test]] +name = "glm47_classifier_emits_reasoning" +harness = false + +[[test]] +name = "glm47_parses_tool_call_payload" +harness = false + +[[test]] +name = "glm47_template_override_returns_full_markers" +harness = false + +[[test]] +name = "mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness 
= false + +[[test]] +name = "mistral3_classifier_emits_reasoning" +harness = false + +[[test]] +name = "mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt" +harness = false + +[[test]] +name = "mistral3_parses_tool_call_payload" +harness = false + +[[test]] +name = "eval_multimodal_chunks_records_exact_token_counts" +harness = false + +[[test]] +name = "ingest_prompt_chunk" +harness = false + +[[test]] +name = "llguidance" +harness = false + +[[test]] +name = "model_chat_template" +harness = false + +[[test]] +name = "model_context_creation" +harness = false + +[[test]] +name = "model_helpers" +harness = false + +[[test]] +name = "model_params" +harness = false + +[[test]] +name = "model_loading_errors" +harness = false + +[[test]] +name = "model_lora_adapter_errors" +harness = false + +[[test]] +name = "model_metadata_kv" +harness = false + +[[test]] +name = "model_properties" +harness = false + +[[test]] +name = "model_sampling" +harness = false + +[[test]] +name = "model_special_tokens" +harness = false + +[[test]] +name = "model_str_to_token" +harness = false + +[[test]] +name = "model_token_to_piece" +harness = false + +[[test]] +name = "model_tokens_iterator" +harness = false + +[[test]] +name = "mtmd_bitmap" +harness = false + +[[test]] +name = "mtmd_chunk_operations" +harness = false + +[[test]] +name = "mtmd_chunk_structure" +harness = false + +[[test]] +name = "mtmd_context" +harness = false + +[[test]] +name = "mtmd_evaluation" +harness = false + +[[test]] +name = "mtmd_tokenization" +harness = false + +[[test]] +name = "multimodal" +harness = false + +[[test]] +name = "parse_chat_message" +harness = false + +[[test]] +name = "qwen35_chat_inference_emits_reasoning_when_template_auto_opens" +harness = false + +[[test]] +name = "qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness = false + +[[test]] +name = "qwen35_classifier_emits_reasoning" +harness = false + +[[test]] +name = 
"qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt" +harness = false + +[[test]] +name = "qwen35_parses_constrained_schema_payload" +harness = false + +[[test]] +name = "qwen35_parses_tool_call_payload" +harness = false + +[[test]] +name = "qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested" +harness = false + +[[test]] +name = "qwen36_chat_inference_emits_reasoning_when_template_auto_opens" +harness = false + +[[test]] +name = "qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt" +harness = false + +[[test]] +name = "qwen36_classifier_emits_reasoning" +harness = false + +[[test]] +name = "qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt" +harness = false + +[[test]] +name = "reranker" +harness = false + +[[test]] +name = "sampled_token_classifier_markers" +harness = false + +[[test]] +name = "sampling" +harness = false + +[[test]] +name = "text_generation" +harness = false [features] cuda = ["llama-cpp-bindings/cuda"] @@ -21,8 +280,6 @@ cuda-no-vmm = ["llama-cpp-bindings/cuda-no-vmm"] metal = ["llama-cpp-bindings/metal"] vulkan = ["llama-cpp-bindings/vulkan"] rocm = ["llama-cpp-bindings/rocm"] -multimodal_capable = [] -mrope_model = [] [lints.rust] unsafe_op_in_unsafe_fn = "warn" diff --git a/llama-cpp-bindings-tests/src/fixture_session.rs b/llama-cpp-bindings-tests/src/fixture_session.rs deleted file mode 100644 index 37993878..00000000 --- a/llama-cpp-bindings-tests/src/fixture_session.rs +++ /dev/null @@ -1,157 +0,0 @@ -use std::sync::Arc; -use std::sync::Mutex; -use std::sync::OnceLock; -use std::sync::Weak; - -use anyhow::Result; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings::model::params::LlamaModelParams; -use llama_cpp_bindings::mtmd::MtmdContext; -use llama_cpp_bindings::mtmd::MtmdContextParams; - -use crate::gpu_backend::inference_model_params; -use 
crate::gpu_backend::require_compiled_backends_present; -use crate::test_model; - -static SHARED: Mutex<Weak<FixtureSessionInner>> = Mutex::new(Weak::new()); - -struct FixtureSessionInner { - mtmd_context: OnceLock<MtmdContext>, - embedding_model: OnceLock<LlamaModel>, - default_model: LlamaModel, - backend: LlamaBackend, -} - -impl FixtureSessionInner { - fn load() -> Result<Self> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - let default_model = Self::load_default_model(&backend)?; - - Ok(Self { - mtmd_context: OnceLock::new(), - embedding_model: OnceLock::new(), - default_model, - backend, - }) - } - - fn load_default_model(backend: &LlamaBackend) -> Result<LlamaModel> { - let path = test_model::download_model()?; - let params = inference_model_params(); - - Ok(LlamaModel::load_from_file(backend, &path, &params)?) - } - - fn load_embedding_model(&self) -> Result<LlamaModel> { - let path = test_model::download_embedding_model()?; - let params = LlamaModelParams::default(); - - Ok(LlamaModel::load_from_file(&self.backend, &path, &params)?) - } - - fn load_mtmd_context(&self) -> Result<MtmdContext> { - let mmproj_path = test_model::download_mmproj()?; - let mmproj_str = mmproj_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("mmproj path is not valid UTF-8"))?; - let params = MtmdContextParams::default(); - - Ok(MtmdContext::init_from_file( - mmproj_str, - &self.default_model, - &params, - )?) - } -} - -pub struct FixtureSession { - inner: Arc<FixtureSessionInner>, -} - -impl FixtureSession { - /// Opens a session against the shared fixture, loading on first call or - /// after the previous session has been fully dropped. - /// - /// # Errors - /// Returns an error if the backend or default model cannot be loaded. - /// - /// # Panics - /// Panics if the shared mutex is poisoned by a prior load failure. 
- pub fn open() -> Result<Self> { - let inner = { - let mut shared = SHARED.lock().expect("fixture singleton mutex poisoned"); - if let Some(existing) = shared.upgrade() { - existing - } else { - let new_inner = Arc::new(FixtureSessionInner::load()?); - *shared = Arc::downgrade(&new_inner); - new_inner - } - }; - - Ok(Self { inner }) - } - - #[must_use] - pub fn backend(&self) -> &LlamaBackend { - &self.inner.backend - } - - #[must_use] - pub fn default_model(&self) -> &LlamaModel { - &self.inner.default_model - } - - /// Returns the embedding model, loading it on first call. - /// - /// # Errors - /// Returns an error if the required environment variables are not set or the - /// model cannot be downloaded or loaded. - /// - /// # Panics - /// Panics only if the just-stored value cannot be read back, which cannot - /// happen in practice. - pub fn embedding_model(&self) -> Result<&LlamaModel> { - if let Some(model) = self.inner.embedding_model.get() { - return Ok(model); - } - - let model = self.inner.load_embedding_model()?; - let _ = self.inner.embedding_model.set(model); - - Ok(self - .inner - .embedding_model - .get() - .expect("embedding model just set")) - } - - /// Returns the multimodal context, loading it on first call. - /// - /// # Errors - /// Returns an error if `LLAMA_TEST_HF_MMPROJ` is unset or the context cannot - /// be initialized. - /// - /// # Panics - /// Panics only if the just-stored value cannot be read back, which cannot - /// happen in practice. 
- pub fn mtmd_context(&self) -> Result<&MtmdContext> { - if !test_model::has_mmproj() { - anyhow::bail!("mtmd tests require LLAMA_TEST_HF_MMPROJ to be set"); - } - if let Some(ctx) = self.inner.mtmd_context.get() { - return Ok(ctx); - } - - let ctx = self.inner.load_mtmd_context()?; - let _ = self.inner.mtmd_context.set(ctx); - - Ok(self - .inner - .mtmd_context - .get() - .expect("mtmd context just set")) - } -} diff --git a/llama-cpp-bindings-tests/src/gpu_backend.rs b/llama-cpp-bindings-tests/src/gpu_backend.rs deleted file mode 100644 index 16b6f03c..00000000 --- a/llama-cpp-bindings-tests/src/gpu_backend.rs +++ /dev/null @@ -1,166 +0,0 @@ -use anyhow::Result; -#[cfg(any( - test, - feature = "cuda", - feature = "cuda-no-vmm", - feature = "metal", - feature = "vulkan", - feature = "rocm", -))] -use llama_cpp_bindings::llama_backend_device::LlamaBackendDevice; -use llama_cpp_bindings::llama_backend_device::list_llama_ggml_backend_devices; -use llama_cpp_bindings::model::params::LlamaModelParams; - -#[must_use] -pub fn inference_model_params() -> LlamaModelParams { - let params = LlamaModelParams::default(); - - #[cfg(any( - feature = "cuda", - feature = "cuda-no-vmm", - feature = "metal", - feature = "vulkan", - feature = "rocm", - ))] - let params = params.with_n_gpu_layers(999); - - params -} - -/// Confirms every compile-time backend feature has a matching ggml backend registered at runtime. -/// -/// Always asserts at least the CPU backend is registered (any llama.cpp build registers it); -/// when a GPU backend feature is enabled, also asserts the corresponding GPU backend is present. -/// -/// # Errors -/// -/// Returns an error when no ggml backends are registered, or when a compiled-in GPU backend -/// feature has no matching device. The error message names the missing backend(s) and lists -/// the backends that *are* registered, so misconfiguration is easy to diagnose. 
-pub fn require_compiled_backends_present() -> Result<()> { - let devices = list_llama_ggml_backend_devices(); - - if devices.is_empty() { - anyhow::bail!("no ggml backends registered; even CPU-only builds register a CPU backend"); - } - - #[cfg(feature = "cuda")] - require_backend(&devices, "cuda", &["CUDA"])?; - #[cfg(feature = "cuda-no-vmm")] - require_backend(&devices, "cuda-no-vmm", &["CUDA"])?; - #[cfg(feature = "metal")] - require_backend(&devices, "metal", &["Metal", "MTL"])?; - #[cfg(feature = "vulkan")] - require_backend(&devices, "vulkan", &["Vulkan"])?; - #[cfg(feature = "rocm")] - require_backend(&devices, "rocm", &["HIP", "ROCm"])?; - - Ok(()) -} - -#[cfg(any( - test, - feature = "cuda", - feature = "cuda-no-vmm", - feature = "metal", - feature = "vulkan", - feature = "rocm", -))] -fn require_backend( - devices: &[LlamaBackendDevice], - feature: &str, - accepted_names: &[&str], -) -> Result<()> { - let found = devices.iter().any(|device| { - accepted_names - .iter() - .any(|wanted| device.backend.eq_ignore_ascii_case(wanted)) - }); - - if !found { - let summary: Vec = devices - .iter() - .map(|device| format!("{}/{:?}", device.backend, device.device_type)) - .collect(); - - anyhow::bail!( - "feature `{feature}` enabled but no matching backend ({}) is registered; available: [{}]", - accepted_names.join(" / "), - summary.join(", ") - ); - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use llama_cpp_bindings::llama_backend_device::LlamaBackendDevice; - use llama_cpp_bindings::llama_backend_device_type::LlamaBackendDeviceType; - - use super::require_backend; - - fn synthetic_device(backend: &str, device_type: LlamaBackendDeviceType) -> LlamaBackendDevice { - LlamaBackendDevice { - index: 0, - name: format!("{backend}0"), - description: "synthetic test device".to_owned(), - backend: backend.to_owned(), - memory_total: 0, - memory_free: 0, - device_type, - } - } - - use anyhow::Result; - use anyhow::anyhow; - - #[test] - fn 
require_backend_succeeds_when_backend_name_matches_case_insensitively() -> Result<()> { - let devices = vec![synthetic_device("cuda", LlamaBackendDeviceType::Gpu)]; - - require_backend(&devices, "cuda", &["CUDA"]) - } - - #[test] - fn require_backend_succeeds_with_any_of_multiple_accepted_names() -> Result<()> { - let devices = vec![synthetic_device("HIP", LlamaBackendDeviceType::Gpu)]; - - require_backend(&devices, "rocm", &["HIP", "ROCm"]) - } - - #[test] - fn require_backend_fails_with_message_naming_feature_and_accepted_names_when_missing() - -> Result<()> { - let devices = vec![synthetic_device("Vulkan", LlamaBackendDeviceType::Gpu)]; - - let error = require_backend(&devices, "cuda", &["CUDA"]) - .err() - .ok_or_else(|| anyhow!("expected error when CUDA missing"))?; - - let message = format!("{error:#}"); - - if !message.contains("`cuda`") { - return Err(anyhow!("missing feature name: {message}")); - } - if !message.contains("CUDA") { - return Err(anyhow!("missing accepted name: {message}")); - } - if !message.contains("Vulkan") { - return Err(anyhow!("missing actual-backend summary: {message}")); - } - - Ok(()) - } - - #[test] - fn require_backend_fails_when_devices_list_is_empty() -> Result<()> { - let devices: Vec = Vec::new(); - - if require_backend(&devices, "metal", &["Metal"]).is_ok() { - return Err(anyhow!("expected Err for empty device list")); - } - - Ok(()) - } -} diff --git a/llama-cpp-bindings-tests/src/lib.rs b/llama-cpp-bindings-tests/src/lib.rs index bda23c56..00686c59 100644 --- a/llama-cpp-bindings-tests/src/lib.rs +++ b/llama-cpp-bindings-tests/src/lib.rs @@ -1,12 +1,8 @@ //! Integration test fixtures for `llama-cpp-bindings`. //! -//! This crate is the only place in the workspace that loads model files. It -//! exists so production code in `llama-cpp-bindings` stays free of test-only -//! dependencies (`anyhow`, `hf-hub`, `serial_test`, …) and helpers. +//! This crate hosts test-only helpers used by the integration tests in `tests/`: +//! 
[`classify_sample_loop`] for sampling-loop drivers and [`test_model::fixtures_dir`] +//! for locating image fixtures. pub mod classify_sample_loop; -pub mod fixture_session; -pub mod gpu_backend; pub mod test_model; - -pub use fixture_session::FixtureSession; diff --git a/llama-cpp-bindings-tests/src/test_model.rs b/llama-cpp-bindings-tests/src/test_model.rs index 934f1d9e..22082498 100644 --- a/llama-cpp-bindings-tests/src/test_model.rs +++ b/llama-cpp-bindings-tests/src/test_model.rs @@ -1,115 +1,8 @@ -//! Environment-driven download helpers for test models. -//! -//! Resolution rules: -//! -//! - `LLAMA_TEST_HF_REPO` and `LLAMA_TEST_HF_MODEL` are required for the default model. -//! - `LLAMA_TEST_HF_EMBED_REPO` / `LLAMA_TEST_HF_EMBED_MODEL` for the embedding model. -//! - `LLAMA_TEST_HF_ENCODER_REPO` / `LLAMA_TEST_HF_ENCODER_MODEL` for the encoder. -//! - `LLAMA_TEST_HF_MMPROJ` is optional; when set, it points to the multimodal projection file -//! inside the same repo as the default model. -//! - `HF_HOME` is honored automatically because the HF API client is built via -//! [`hf_hub::api::sync::ApiBuilder::from_env`]. +//! Path helper for image and audio fixtures used by multimodal integration tests. 
-use std::env; use std::path::PathBuf; -use anyhow::Result; - -fn required_env(var_name: &str) -> Result { - env::var(var_name).map_err(|_| anyhow::anyhow!("Required env var {var_name} is not set")) -} - -fn hf_repo() -> Result { - required_env("LLAMA_TEST_HF_REPO") -} - -fn hf_model() -> Result { - required_env("LLAMA_TEST_HF_MODEL") -} - -fn hf_mmproj() -> String { - env::var("LLAMA_TEST_HF_MMPROJ").unwrap_or_default() -} - -fn hf_embed_repo() -> Result { - required_env("LLAMA_TEST_HF_EMBED_REPO") -} - -fn hf_embed_model() -> Result { - required_env("LLAMA_TEST_HF_EMBED_MODEL") -} - -fn hf_encoder_repo() -> Result { - required_env("LLAMA_TEST_HF_ENCODER_REPO") -} - -fn hf_encoder_model() -> Result { - required_env("LLAMA_TEST_HF_ENCODER_MODEL") -} - -/// Downloads a file from a specific `HuggingFace` repo. -/// -/// # Errors -/// Returns an error if the download fails. -pub fn download_file_from(repo: &str, filename: &str) -> Result { - download_file(repo, filename) -} - -fn download_file(repo: &str, filename: &str) -> Result { - let path = hf_hub::api::sync::ApiBuilder::from_env() - .with_progress(true) - .build()? - .model(repo.to_string()) - .get(filename)?; - - Ok(path) -} - -/// Downloads the configured test model from Hugging Face. -/// -/// # Errors -/// Returns an error if the required environment variables are not set or the download fails. -pub fn download_model() -> Result { - download_file(&hf_repo()?, &hf_model()?) -} - -/// Downloads the configured mmproj file from Hugging Face. -/// -/// # Errors -/// Returns an error if the required environment variables are not set or the download fails. -pub fn download_mmproj() -> Result { - let mmproj = hf_mmproj(); - - if mmproj.is_empty() { - anyhow::bail!("LLAMA_TEST_HF_MMPROJ is not set or empty"); - } - - download_file(&hf_repo()?, &mmproj) -} - -/// Downloads the configured embedding model from Hugging Face. 
-/// -/// # Errors -/// Returns an error if the required environment variables are not set or the download fails. -pub fn download_embedding_model() -> Result { - download_file(&hf_embed_repo()?, &hf_embed_model()?) -} - -/// Downloads the configured encoder model from Hugging Face. -/// -/// # Errors -/// Returns an error if the required environment variables are not set or the download fails. -pub fn download_encoder_model() -> Result { - download_file(&hf_encoder_repo()?, &hf_encoder_model()?) -} - -/// Returns whether a multimodal projection model is configured. -#[must_use] -pub fn has_mmproj() -> bool { - !hf_mmproj().is_empty() -} - -/// Returns the path to the test fixtures directory. +/// Returns the absolute path to the test fixtures directory. #[must_use] pub fn fixtures_dir() -> PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures") @@ -117,128 +10,6 @@ pub fn fixtures_dir() -> PathBuf { #[cfg(test)] mod tests { - struct EnvVarGuard { - name: &'static str, - original: Option, - } - - impl EnvVarGuard { - fn set(name: &'static str, value: &str) -> Self { - let original = std::env::var(name).ok(); - unsafe { std::env::set_var(name, value) }; - - Self { name, original } - } - } - - impl Drop for EnvVarGuard { - fn drop(&mut self) { - match &self.original { - Some(value) => unsafe { std::env::set_var(self.name, value) }, - None => unsafe { std::env::remove_var(self.name) }, - } - } - } - - #[test] - fn required_env_returns_error_for_missing_var() { - let result = super::required_env("LLAMA_TEST_NONEXISTENT_VAR_THAT_SHOULD_NOT_EXIST"); - - assert!(result.is_err()); - } - - #[test] - fn download_file_with_nonexistent_file_returns_error() { - let result = - super::download_file("unsloth/Qwen3.5-0.8B-GGUF", "this-file-does-not-exist.gguf"); - - assert!(result.is_err()); - } - - #[test] - #[serial_test::serial] - fn download_file_from_succeeds_for_known_repo_and_file() { - let result = - super::download_file_from("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"); - - assert!(result.is_ok()); - } - - #[test] - #[serial_test::serial] - fn download_model_returns_path_with_env_set() { - if std::env::var("LLAMA_TEST_HF_REPO").is_err() - || std::env::var("LLAMA_TEST_HF_MODEL").is_err() - { - return; - } - - let result = super::download_model(); - - assert!(result.is_ok()); - } - - #[test] - #[serial_test::serial] - fn download_embedding_model_returns_path_with_env_set() { - if std::env::var("LLAMA_TEST_HF_EMBED_REPO").is_err() - || std::env::var("LLAMA_TEST_HF_EMBED_MODEL").is_err() - { - return; - } - - let result = super::download_embedding_model(); - - assert!(result.is_ok()); - } - - #[test] - #[serial_test::serial] - fn download_encoder_model_returns_path_with_env_set() { - if std::env::var("LLAMA_TEST_HF_ENCODER_REPO").is_err() - || std::env::var("LLAMA_TEST_HF_ENCODER_MODEL").is_err() - { - return; - } - - let result = super::download_encoder_model(); - - assert!(result.is_ok()); - } - - #[cfg(feature = "multimodal_capable")] - #[test] - #[serial_test::serial] - fn download_mmproj_returns_path_when_env_set() { - if std::env::var("LLAMA_TEST_HF_REPO").is_err() { - return; - } - - let _guard = EnvVarGuard::set("LLAMA_TEST_HF_MMPROJ", "mmproj-F16.gguf"); - let result = super::download_mmproj(); - - assert!(result.is_ok()); - } - - #[test] - #[serial_test::serial] - fn download_mmproj_returns_error_when_env_empty() { - let _guard = EnvVarGuard::set("LLAMA_TEST_HF_MMPROJ", ""); - let result = super::download_mmproj(); - - assert!(result.is_err()); - } - - #[test] - #[serial_test::serial] - fn has_mmproj_reflects_env_var() { - let _set_guard = EnvVarGuard::set("LLAMA_TEST_HF_MMPROJ", "mmproj-F16.gguf"); - assert!(super::has_mmproj()); - - let _empty_guard = EnvVarGuard::set("LLAMA_TEST_HF_MMPROJ", ""); - assert!(!super::has_mmproj()); - } - #[test] fn fixtures_dir_is_under_manifest() { let dir = super::fixtures_dir(); diff --git a/llama-cpp-bindings-tests/tests/constrained_decoding.rs 
b/llama-cpp-bindings-tests/tests/constrained_decoding.rs index 6be1014f..533981c9 100644 --- a/llama-cpp-bindings-tests/tests/constrained_decoding.rs +++ b/llama-cpp-bindings-tests/tests/constrained_decoding.rs @@ -2,23 +2,61 @@ use std::io::Write; use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::sampled_token::SampledToken; use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::FixtureSession; - -#[test] -fn json_schema_constrains_output() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn json_schema_constrains_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let prompt = "The 
weather in Paris is sunny and 22 degrees. Extract as JSON:\n"; - let ctx_params = LlamaContextParams::default(); - let mut ctx = LlamaContext::from_model(model, backend, ctx_params)?; + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let tokens_list = model.str_to_token(prompt, AddBos::Always)?; @@ -77,14 +115,10 @@ fn json_schema_constrains_output() -> Result<()> { .next() .ok_or_else(|| anyhow::anyhow!("model produced no JSON value"))??; - assert!( - parsed.get("city").is_some(), - "constrained output should contain 'city' field" - ); - assert!( - parsed.get("temperature").is_some(), - "constrained output should contain 'temperature' field" - ); + assert!(parsed.get("city").is_some()); + assert!(parsed.get("temperature").is_some()); Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context.rs b/llama-cpp-bindings-tests/tests/context.rs index fe7ba7c8..1e3a6b08 100644 --- a/llama-cpp-bindings-tests/tests/context.rs +++ b/llama-cpp-bindings-tests/tests/context.rs @@ -1,4 +1,3 @@ -use std::num::NonZeroU32; use std::ptr::NonNull; use std::sync::Arc; use std::sync::atomic::AtomicBool; @@ -7,24 +6,32 @@ use anyhow::Result; use llama_cpp_bindings::DecodeError; use llama_cpp_bindings::LogitsError; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::model::LlamaLoraAdapter; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::FixtureSession; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::test_model; -use serial_test::serial; - -#[test] -#[serial] -fn context_creation_and_properties() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = 
LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +// ========================================================================================= +// Group A: default Qwen model, embeddings=false. Most context tests fall here. +// ========================================================================================= + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn context_creation_and_properties(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; assert!(context.n_ctx() > 0); assert!(context.n_batch() > 0); @@ -33,15 +40,22 @@ fn context_creation_and_properties() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn decode_and_get_logits() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn decode_and_get_logits(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = 
fixture.model.str_to_token("hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -54,14 +68,21 @@ fn decode_and_get_logits() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn timings_work() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn timings_work(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.reset_timings(); let timings = context.timings(); @@ -70,15 +91,22 @@ fn timings_work() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn token_data_array_has_entries_after_decode() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn token_data_array_has_entries_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = 
fixture.model.str_to_token("hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -90,15 +118,22 @@ fn token_data_array_has_entries_after_decode() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn get_logits_ith_returns_valid_slice() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn get_logits_ith_returns_valid_slice(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let last_index = i32::try_from(tokens.len() - 1)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -106,20 +141,27 @@ fn get_logits_ith_returns_valid_slice() -> Result<()> { let logits = context.get_logits_ith(last_index)?; - assert_eq!(logits.len(), usize::try_from(model.n_vocab())?); + assert_eq!(logits.len(), usize::try_from(fixture.model.n_vocab())?); Ok(()) } -#[test] -#[serial] -fn token_data_array_ith_returns_valid_data() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", 
AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn token_data_array_ith_returns_valid_data(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let last_index = i32::try_from(tokens.len() - 1)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -129,22 +171,27 @@ fn token_data_array_ith_returns_valid_data() -> Result<()> { assert_eq!( token_data_array.data.len(), - usize::try_from(model.n_vocab())? + usize::try_from(fixture.model.n_vocab())? ); Ok(()) } -#[test] -#[serial] -fn embeddings_ith_returns_error_when_embeddings_disabled() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(false); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn embeddings_ith_returns_error_when_embeddings_disabled(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; let result = context.embeddings_ith(0); @@ -153,16 +200,23 @@ fn embeddings_ith_returns_error_when_embeddings_disabled() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn embeddings_seq_ith_returns_error_when_embeddings_disabled() -> Result<()> { - let 
fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(false); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn embeddings_seq_ith_returns_error_when_embeddings_disabled( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; let result = context.embeddings_seq_ith(0); @@ -171,34 +225,48 @@ fn embeddings_seq_ith_returns_error_when_embeddings_disabled() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn candidates_returns_n_vocab_entries() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn candidates_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; let count = context.candidates()?.count(); 
- assert_eq!(count, usize::try_from(model.n_vocab())?); + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); Ok(()) } -#[test] -#[serial] -fn debug_format_contains_struct_name() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn debug_format_contains_struct_name(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; let debug_output = format!("{context:?}"); assert!(debug_output.contains("LlamaContext")); @@ -206,17 +274,374 @@ fn debug_format_contains_struct_name() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn decode_with_embeddings_enabled() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn candidates_ith_returns_n_vocab_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = 
fixture.model.str_to_token("hello", AddBos::Always)?; + let last_index = i32::try_from(tokens.len() - 1)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let count = context.candidates_ith(last_index)?.count(); + + assert_eq!(count, usize::try_from(fixture.model.n_vocab())?); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn lora_adapter_remove_succeeds_with_no_adapters(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_remove(&mut adapter); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn encode_on_non_encoder_model_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.encode(&mut batch); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors( + fixture: 
&LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut adapter = LlamaLoraAdapter { + lora_adapter: NonNull::dangling(), + }; + + let result = context.lora_adapter_set(&mut adapter, 1.0); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + context.decode(&mut batch)?; + + let result = context.embeddings_seq_ith(999); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn decode_empty_batch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let mut batch = LlamaBatch::new(512, 1)?; + + let result = context.decode(&mut batch); + + assert!(result.is_err()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn set_abort_flag_aborts_decode(fixture: 
&LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert_eq!(result, Err(DecodeError::Aborted)); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn set_abort_flag_false_allows_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(false)); + context.set_abort_flag(abort_flag); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn clear_abort_callback_allows_decode_with_flag_true(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let abort_flag = Arc::new(AtomicBool::new(true)); + context.set_abort_flag(abort_flag); + context.clear_abort_callback(); + + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; + let mut batch = 
LlamaBatch::new(512, 1)?; + batch.add_sequence(&tokens, 0, false)?; + + let result = context.decode(&mut batch); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn synchronize_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.synchronize(); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn detach_threadpool_completes_without_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + context.detach_threadpool(); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn get_logits_ith_returns_token_not_initialized_for_unknown_index( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let result = context.get_logits_ith(7); + + assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 2048, + n_ubatch = 512, +)] +fn 
get_logits_ith_returns_token_index_exceeds_context_for_huge_index( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let huge_index = i32::try_from(context.n_ctx())?; + context.mark_logits_initialized(huge_index); + let result = context.get_logits_ith(huge_index); + + assert!(matches!( + result, + Err(LogitsError::TokenIndexExceedsContext { .. }) + )); + + Ok(()) +} + +// ========================================================================================= +// Group B: Qwen embedding model, embeddings=true. Six embedding-specific tests. +// ========================================================================================= + +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn decode_with_embeddings_enabled(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -227,39 +652,53 @@ fn decode_with_embeddings_enabled() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn embeddings_seq_ith_returns_valid_embeddings() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = 
HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_seq_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; let embeddings = context.embeddings_seq_ith(0)?; - assert_eq!(embeddings.len(), usize::try_from(model.n_embd())?); + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); Ok(()) } -#[test] -#[serial] -fn multi_sequence_embeddings_returns_one_embedding_per_sequence() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_n_seq_max(4) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, +)] +fn multi_sequence_embeddings_returns_one_embedding_per_sequence( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; let inputs = [ "alpha is here", @@ -270,7 +709,7 @@ fn multi_sequence_embeddings_returns_one_embedding_per_sequence() -> Result<()> let mut batch = LlamaBatch::new(64, 4)?; for (sequence_index, text) in 
inputs.iter().enumerate() { - let tokens = model.str_to_token(text, AddBos::Always)?; + let tokens = fixture.model.str_to_token(text, AddBos::Always)?; let sequence_id = i32::try_from(sequence_index)?; batch.add_sequence(&tokens, sequence_id, true)?; @@ -278,7 +717,7 @@ fn multi_sequence_embeddings_returns_one_embedding_per_sequence() -> Result<()> context.decode(&mut batch)?; - let n_embd = usize::try_from(model.n_embd())?; + let n_embd = usize::try_from(fixture.model.n_embd())?; let mut collected: Vec> = Vec::with_capacity(inputs.len()); for sequence_index in 0..inputs.len() { @@ -308,23 +747,26 @@ fn multi_sequence_embeddings_returns_one_embedding_per_sequence() -> Result<()> /// Reproduces paddler's embedding batching loop exactly with the document strings, batch /// shape, and iteration pattern from the failing harness test -/// `agent_embedding_batch_distribution_independent_of_context_size`. A `LlamaBatch` is -/// allocated once with `n_tokens=64` and `n_seq_max=4`, then reused across two iterations -/// of two sequences each (because the four ~22-token docs do not all fit in one -/// 64-token window). Per iteration: `add_sequence` for each doc, `clear_kv_cache`, -/// `decode`, `embeddings_seq_ith` for each filled slot, `batch.clear()`. Every iteration -/// must yield distinct, non-empty embeddings — including iterations after the first. -#[test] -#[serial] -fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_n_seq_max(4) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +/// `agent_embedding_batch_distribution_independent_of_context_size`. 
+#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 4, + embeddings = true, +)] +fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; let iterations = [ [ @@ -337,13 +779,13 @@ fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity() -> ], ]; - let n_embd = usize::try_from(model.n_embd())?; + let n_embd = usize::try_from(fixture.model.n_embd())?; let mut batch = LlamaBatch::new(64, 4)?; let mut collected: Vec> = Vec::new(); for iteration_inputs in iterations { for (sequence_index, text) in iteration_inputs.iter().enumerate() { - let tokens = model.str_to_token(text, AddBos::Always)?; + let tokens = fixture.model.str_to_token(text, AddBos::Always)?; let sequence_id = i32::try_from(sequence_index)?; batch.add_sequence(&tokens, sequence_id, true)?; @@ -386,17 +828,23 @@ fn embeddings_returns_distinct_values_when_reused_batch_has_extra_capacity() -> Ok(()) } -#[test] -#[serial] -fn embeddings_ith_returns_valid_embeddings() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn 
embeddings_ith_returns_valid_embeddings(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Always)?; let last_index = i32::try_from(tokens.len() - 1)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -404,99 +852,29 @@ fn embeddings_ith_returns_valid_embeddings() -> Result<()> { let embeddings = context.embeddings_ith(last_index)?; - assert_eq!(embeddings.len(), usize::try_from(model.n_embd())?); + assert_eq!(embeddings.len(), usize::try_from(fixture.model.n_embd())?); Ok(()) } -#[test] -#[serial] -fn candidates_ith_returns_n_vocab_entries() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; - let last_index = i32::try_from(tokens.len() - 1)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let count = context.candidates_ith(last_index)?.count(); - - assert_eq!(count, usize::try_from(model.n_vocab())?); - - Ok(()) -} - -#[test] -#[serial] -fn lora_adapter_remove_succeeds_with_no_adapters() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_remove(&mut adapter); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] 
-fn encode_on_non_encoder_model_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.encode(&mut batch); - - assert!(result.is_err()); - - Ok(()) -} - -#[test] -#[serial] -fn lora_adapter_set_with_dangling_pointer_succeeds_or_errors() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; - let mut adapter = LlamaLoraAdapter { - lora_adapter: NonNull::dangling(), - }; - - let result = context.lora_adapter_set(&mut adapter, 1.0); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; let result = context.embeddings_ith(999); @@ -505,58 +883,27 @@ fn embeddings_ith_returns_null_embedding_error_for_non_embedding_token() -> Resu Ok(()) } -#[test] -#[serial] -fn embeddings_seq_ith_returns_null_embedding_error_for_invalid_seq() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; - - let result = context.embeddings_seq_ith(999); - - assert!(result.is_err()); - - Ok(()) -} - -#[test] -#[serial] -fn decode_empty_batch_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let mut batch = LlamaBatch::new(512, 1)?; - - let result = context.decode(&mut batch); - - assert!(result.is_err()); - - Ok(()) -} - -#[test] -#[serial] -fn encode_succeeds_with_encoder_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model_path = test_model::download_encoder_model()?; - let model_params = inference_model_params(); - let model = LlamaModel::load_from_file(backend, &model_path, &model_params)?; - let ctx_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(512)) - .with_embeddings(true); - let mut context = LlamaContext::from_model(&model, backend, ctx_params)?; - let tokens = model.str_to_token("hello", AddBos::Never)?; +// 
========================================================================================= +// Group C: t5-small encoder model, embeddings=true. Single trial. +// ========================================================================================= + +#[llama_test( + model_source = HuggingFace("Xiaojian9992024/t5-small-GGUF", "t5-small.bf16.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + embeddings = true, +)] +fn encode_succeeds_with_encoder_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("hello", AddBos::Never)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; @@ -567,134 +914,4 @@ fn encode_succeeds_with_encoder_model() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn set_abort_flag_aborts_decode() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - - let tokens = model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert_eq!(result, Err(DecodeError::Aborted)); - - Ok(()) -} - -#[test] -#[serial] -fn set_abort_flag_false_allows_decode() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, 
ctx_params)?; - let abort_flag = Arc::new(AtomicBool::new(false)); - context.set_abort_flag(abort_flag); - - let tokens = model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn clear_abort_callback_allows_decode_with_flag_true() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let abort_flag = Arc::new(AtomicBool::new(true)); - context.set_abort_flag(abort_flag); - context.clear_abort_callback(); - - let tokens = model.str_to_token("hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - - let result = context.decode(&mut batch); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn synchronize_completes_without_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; - - context.synchronize(); - - Ok(()) -} - -#[test] -#[serial] -fn detach_threadpool_completes_without_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; - - context.detach_threadpool(); - - Ok(()) -} - -#[test] -#[serial] -fn get_logits_ith_returns_token_not_initialized_for_unknown_index() -> Result<()> { - let fixture = 
FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; - - let result = context.get_logits_ith(7); - - assert!(matches!(result, Err(LogitsError::TokenNotInitialized(7)))); - - Ok(()) -} - -#[test] -#[serial] -fn get_logits_ith_returns_token_index_exceeds_context_for_huge_index() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(64)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let huge_index = i32::try_from(context.n_ctx())?; - context.mark_logits_initialized(huge_index); - let result = context.get_logits_ith(huge_index); - - assert!(matches!( - result, - Err(LogitsError::TokenIndexExceedsContext { .. }) - )); - - Ok(()) -} +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context_kv_cache.rs b/llama-cpp-bindings-tests/tests/context_kv_cache.rs index 39ee2714..467a2aa4 100644 --- a/llama-cpp-bindings-tests/tests/context_kv_cache.rs +++ b/llama-cpp-bindings-tests/tests/context_kv_cache.rs @@ -1,29 +1,72 @@ use std::num::NonZeroU8; -use std::num::NonZeroU32; use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; use llama_cpp_bindings::context::kv_cache::KvCacheConversionError; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::error::{KvCacheSeqAddError, KvCacheSeqDivError}; +use llama_cpp_bindings::error::KvCacheSeqAddError; +use llama_cpp_bindings::error::KvCacheSeqDivError; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings_tests::FixtureSession; -use serial_test::serial; - -#[test] -#[serial] -fn clear_kv_cache_resets_positions() -> Result<()> { - let 
fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result> { + Ok(LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?) +} + +fn decode_hello_world(fixture: &LlamaFixture<'_>, context: &mut LlamaContext<'_>) -> Result<()> { + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_resets_positions(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = 
build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; context.clear_kv_cache(); assert_eq!(context.kv_cache_seq_pos_max(0), -1); @@ -31,38 +74,92 @@ fn clear_kv_cache_resets_positions() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_pos_max_is_non_negative_after_decode() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_pos_max_is_non_negative_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; assert!(context.kv_cache_seq_pos_max(0) >= 0); Ok(()) } -#[test] -#[serial] -fn clear_kv_cache_seq_with_range() 
-> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_with_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(1)); assert!(result.is_ok()); @@ -70,19 +167,46 @@ fn clear_kv_cache_seq_with_range() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn copy_kv_cache_seq_succeeds() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = 
LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_succeeds(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let result = context.copy_kv_cache_seq(0, 1, None, None); assert!(result.is_ok()); @@ -90,19 +214,46 @@ fn copy_kv_cache_seq_succeeds() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn copy_cache_executes_without_crash() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut 
batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_cache_executes_without_crash(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let pos_max = context.kv_cache_seq_pos_max(0); context.copy_cache(0, 1, pos_max + 1); @@ -110,20 +261,28 @@ fn copy_cache_executes_without_crash() -> Result<()> { Ok(()) } -#[cfg(feature = "mrope_model")] -#[test] -#[serial] -fn kv_cache_seq_add_returns_error_for_mrope_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let result = context.kv_cache_seq_add(0, Some(0), None, 1); @@ -135,20 +294,28 @@ fn kv_cache_seq_add_returns_error_for_mrope_model() -> Result<()> { Ok(()) } -#[cfg(feature = "mrope_model")] -#[test] -#[serial] -fn kv_cache_seq_div_returns_error_for_mrope_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_returns_error_for_mrope_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let divisor = 
NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; let result = context.kv_cache_seq_div(0, Some(0), None, divisor); @@ -161,19 +328,46 @@ fn kv_cache_seq_div_returns_error_for_mrope_model() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_keep_retains_specified_sequence() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_keep_retains_specified_sequence(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; context.kv_cache_seq_keep(0); @@ -182,19 +376,46 @@ fn kv_cache_seq_keep_retains_specified_sequence() -> Result<()> { 
Ok(()) } -#[test] -#[serial] -fn copy_kv_cache_seq_with_explicit_range() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_with_explicit_range(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + decode_hello_world(fixture, &mut context)?; let result = context.copy_kv_cache_seq(0, 2, Some(0), Some(1)); @@ -203,19 +424,19 @@ fn copy_kv_cache_seq_with_explicit_range() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_add_succeeds_on_embedding_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = 
fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; + decode_hello_world(fixture, &mut context)?; let result = context.kv_cache_seq_add(0, Some(0), None, 1); @@ -224,19 +445,19 @@ fn kv_cache_seq_add_succeeds_on_embedding_model() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_div_succeeds_on_embedding_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_succeeds_on_embedding_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; - let tokens = model.str_to_token("Hello world", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - batch.add_sequence(&tokens, 0, false)?; - context.decode(&mut batch)?; + decode_hello_world(fixture, &mut context)?; let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; let result = 
context.kv_cache_seq_div(0, Some(0), None, divisor); @@ -246,14 +467,46 @@ fn kv_cache_seq_div_succeeds_on_embedding_model() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = build_context(fixture)?; let result = context.kv_cache_seq_pos_max(999); @@ -262,14 +515,44 @@ fn kv_cache_seq_pos_max_returns_negative_one_for_unused_seq() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = 
LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.copy_kv_cache_seq(0, 1, Some(u32::MAX), None); @@ -281,14 +564,44 @@ fn copy_kv_cache_seq_rejects_p0_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.copy_kv_cache_seq(0, 1, Some(0), Some(u32::MAX)); @@ -300,14 +613,44 @@ fn copy_kv_cache_seq_rejects_p1_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn clear_kv_cache_seq_rejects_src_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch 
= 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_src_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.clear_kv_cache_seq(Some(u32::MAX), None, None); @@ -319,14 +662,44 @@ fn clear_kv_cache_seq_rejects_src_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = 
context.clear_kv_cache_seq(Some(0), Some(u32::MAX), None); @@ -338,14 +711,44 @@ fn clear_kv_cache_seq_rejects_p0_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.clear_kv_cache_seq(Some(0), Some(0), Some(u32::MAX)); @@ -357,14 +760,44 @@ fn clear_kv_cache_seq_rejects_p1_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_add_rejects_p0_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = 
LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.kv_cache_seq_add(0, Some(u32::MAX), None, 1); @@ -376,14 +809,44 @@ fn kv_cache_seq_add_rejects_p0_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_add_rejects_p1_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source 
= HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_add_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.kv_cache_seq_add(0, Some(0), Some(u32::MAX), 1); @@ -395,14 +858,44 @@ fn kv_cache_seq_add_rejects_p1_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_div_rejects_p0_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_rejects_p0_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; let result = context.kv_cache_seq_div(0, Some(u32::MAX), None, divisor); @@ -415,14 +908,44 @@ fn kv_cache_seq_div_rejects_p0_exceeding_i32_max() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn kv_cache_seq_div_rejects_p1_exceeding_i32_max() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn kv_cache_seq_div_rejects_p1_exceeding_i32_max(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut 
context = build_context(fixture)?; let divisor = NonZeroU8::new(2).ok_or_else(|| anyhow::anyhow!("2 is non-zero"))?; let result = context.kv_cache_seq_div(0, Some(0), Some(u32::MAX), divisor); @@ -434,3 +957,5 @@ fn kv_cache_seq_div_rejects_p1_exceeding_i32_max() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/context_session.rs b/llama-cpp-bindings-tests/tests/context_session.rs index 4c52260f..d32f7ecf 100644 --- a/llama-cpp-bindings-tests/tests/context_session.rs +++ b/llama-cpp-bindings-tests/tests/context_session.rs @@ -1,23 +1,59 @@ -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings_tests::FixtureSession; -use serial_test::serial; - -#[test] -#[serial] -fn save_and_load_session_file() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +fn build_context<'context>(fixture: &'context LlamaFixture<'_>) -> Result<LlamaContext<'context>> { + Ok(LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?) 
+} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn save_and_load_session_file(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -33,30 +69,90 @@ fn save_and_load_session_file() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn get_state_size_is_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn get_state_size_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = build_context(fixture)?; assert!(context.get_state_size() > 0); Ok(()) } -#[test] -#[serial] -fn state_seq_save_and_load_file_roundtrip() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = 
false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_and_load_file_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -74,16 +170,46 @@ fn state_seq_save_and_load_file_roundtrip() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn copy_state_data_and_set_state_data_roundtrip() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn copy_state_data_and_set_state_data_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = 
fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -99,14 +225,44 @@ fn copy_state_data_and_set_state_data_roundtrip() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_load_file_with_nonexistent_file_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_nonexistent_file_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.state_load_file("/nonexistent/session.bin", 512); @@ -115,14 +271,46 @@ fn state_load_file_with_nonexistent_file_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_load_file_with_nonexistent_file_returns_error() -> Result<()> { - let fixture = 
FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_nonexistent_file_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = build_context(fixture)?; let result = context.state_seq_load_file("/nonexistent/seq_state.bin", 0, 512); @@ -131,14 +319,46 @@ fn state_seq_load_file_with_nonexistent_file_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_save_file_to_invalid_directory_returns_failed_to_save() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = build_context(fixture)?; let result = context.state_save_file("/nonexistent_dir/session.bin", &[]); @@ -147,14 +367,46 @@ fn state_save_file_to_invalid_directory_returns_failed_to_save() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_save_file_to_invalid_directory_returns_failed_to_save() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + 
n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_to_invalid_directory_returns_failed_to_save( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = build_context(fixture)?; let result = context.state_seq_save_file("/nonexistent_dir/seq_state.bin", 0, &[]); @@ -163,16 +415,46 @@ fn state_seq_save_file_to_invalid_directory_returns_failed_to_save() -> Result<( Ok(()) } -#[test] -#[serial] -fn state_load_file_with_zero_max_tokens_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_zero_max_tokens_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -188,16 +470,48 @@ fn state_load_file_with_zero_max_tokens_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_load_file_with_zero_max_tokens_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello world", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn 
state_seq_load_file_with_zero_max_tokens_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -213,16 +527,48 @@ fn state_seq_load_file_with_zero_max_tokens_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_load_file_with_insufficient_max_tokens_returns_length_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token( +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token( "Hello 
world this is a longer string for more tokens", AddBos::Always, )?; @@ -241,16 +587,48 @@ fn state_load_file_with_insufficient_max_tokens_returns_length_error() -> Result Ok(()) } -#[test] -#[serial] -fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token( +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = build_context(fixture)?; + + let tokens = fixture.model.str_to_token( "Hello world this is a longer string for more tokens", AddBos::Always, )?; @@ -270,17 +648,47 @@ fn state_seq_load_file_with_insufficient_max_tokens_returns_length_error() -> Re } #[cfg(unix)] -#[test] -#[serial] -fn state_save_file_with_non_utf8_path_returns_error() 
-> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; + let context = build_context(fixture)?; let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); let result = context.state_save_file(non_utf8_path, &[]); @@ -291,17 +699,47 @@ fn state_save_file_with_non_utf8_path_returns_error() -> Result<()> { } #[cfg(unix)] -#[test] -#[serial] -fn state_load_file_with_non_utf8_path_returns_error() -> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; + let mut context = build_context(fixture)?; let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); let result = context.state_load_file(non_utf8_path, 512); @@ -312,17 +750,47 @@ fn state_load_file_with_non_utf8_path_returns_error() -> Result<()> { } #[cfg(unix)] -#[test] -#[serial] -fn state_seq_save_file_with_non_utf8_path_returns_error() -> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; + let context = build_context(fixture)?; let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); let result = context.state_seq_save_file(non_utf8_path, 0, &[]); @@ -333,17 +801,47 @@ fn state_seq_save_file_with_non_utf8_path_returns_error() -> Result<()> { } #[cfg(unix)] -#[test] -#[serial] -fn state_seq_load_file_with_non_utf8_path_returns_error() -> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", 
"Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; + let mut context = build_context(fixture)?; let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.bin")); let result = context.state_seq_load_file(non_utf8_path, 0, 512); @@ -353,14 +851,44 @@ fn state_seq_load_file_with_non_utf8_path_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_save_file_with_null_byte_in_path_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_save_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = build_context(fixture)?; let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); let result = context.state_save_file(path_with_null, &[]); @@ -370,14 +898,44 @@ fn state_save_file_with_null_byte_in_path_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_load_file_with_null_byte_in_path_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_load_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = build_context(fixture)?; let 
path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); let result = context.state_load_file(path_with_null, 512); @@ -387,14 +945,46 @@ fn state_load_file_with_null_byte_in_path_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_save_file_with_null_byte_in_path_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_save_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let context = build_context(fixture)?; let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); let result = context.state_seq_save_file(path_with_null, 0, &[]); @@ -404,14 +994,46 @@ fn state_seq_save_file_with_null_byte_in_path_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_load_file_with_null_byte_in_path_returns_error() -> Result<()> { - 
let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_load_file_with_null_byte_in_path_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mut context = build_context(fixture)?; let path_with_null = std::path::Path::new("/tmp/foo\0bar.bin"); let result = context.state_seq_load_file(path_with_null, 0, 512); @@ -421,18 +1043,50 @@ fn state_seq_load_file_with_null_byte_in_path_returns_error() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_get_size_ext_returns_size_for_decoded_sequence() -> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_get_size_ext_returns_size_for_decoded_sequence( + fixture: &LlamaFixture<'_>, +) -> Result<()> { use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; + let mut context = build_context(fixture)?; - let tokens = model.str_to_token("Hello world", AddBos::Always)?; + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -445,18 +1099,48 @@ fn state_seq_get_size_ext_returns_size_for_decoded_sequence() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn state_seq_get_data_ext_and_set_data_ext_round_trip() -> Result<()> { +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn state_seq_get_data_ext_and_set_data_ext_round_trip(fixture: &LlamaFixture<'_>) -> Result<()> { use llama_cpp_bindings::context::llama_state_seq_flags::LlamaStateSeqFlags; - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; + let mut context = build_context(fixture)?; - let tokens = model.str_to_token("Hello world", AddBos::Always)?; + let tokens = fixture.model.str_to_token("Hello world", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -474,3 +1158,5 @@ fn state_seq_get_data_ext_and_set_data_ext_round_trip() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index 364717a7..712397df 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,28 +1,15 @@ -use std::num::NonZeroU32; - use anyhow::Result; use 
llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -// DeepSeek-R1-Distill-Llama-8B has no native thinking-disabled mode in its -// chat template (R1 is a pure reasoner). This prompt manually closes the -// `` block before generation so the classifier starts in CONTENT — -// verifies the "spurious close in content section" path with this model's -// tokenizer and still produces zero Reasoning tokens. 
const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\ <|User|>What is 2 + 2?<|Assistant|> @@ -32,14 +19,20 @@ const DEEPSEEK_R1_8B_THINKING_DISABLED_PROMPT: &str = "\ const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = @@ -49,8 +42,11 @@ fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_promp let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -67,7 +63,7 @@ fn deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_promp ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -126,3 +122,5 @@ fn 
deepseek_r1_8b_classifier_does_not_emit_reasoning_for_thinking_disabled_promp Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs index 6b8f34bc..6bed6bbe 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_classifier_emits_reasoning.rs @@ -1,22 +1,14 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 1500; @@ -32,14 +24,20 @@ const DEEPSEEK_R1_8B_THINKING_PROMPT: &str = "\ const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = 
LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(DEEPSEEK_R1_8B_THINKING_PROMPT, AddBos::Never)?; @@ -48,8 +46,11 @@ fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Re let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -66,7 +67,7 @@ fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Re ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -146,3 +147,5 @@ fn deepseek_r1_8b_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Re Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs index 329111a6..ce2b922d 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_gemma_paired_quote.rs @@ -2,14 +2,9 @@ use 
anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -30,16 +25,20 @@ const TOOLS_JSON: &str = r#"[ const GEMMA_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; -#[test] -fn deepseek_r1_8b_duck_types_gemma_paired_quote() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_gemma_paired_quote(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA_PAIRED_QUOTE_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -67,3 +66,5 @@ fn deepseek_r1_8b_duck_types_gemma_paired_quote() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs index c2aa85a6..7b9e052b 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_glm_key_value_tags.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -33,16 +28,19 @@ const GLM_KEY_VALUE_PAYLOAD: &str = "get_weather\ Paris\ "; -#[test] -fn deepseek_r1_8b_duck_types_glm_key_value_tags() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_glm_key_value_tags(fixture: &LlamaFixture<'_>) -> Result<()> { + let 
outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, GLM_KEY_VALUE_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -70,3 +68,5 @@ fn deepseek_r1_8b_duck_types_glm_key_value_tags() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs index 25a38992..66b4caab 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_mistral_bracketed_json.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -30,16 +25,20 @@ const TOOLS_JSON: &str = r#"[ const MISTRAL_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; -#[test] -fn deepseek_r1_8b_duck_types_mistral_bracketed_json() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = 
model.parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_mistral_bracketed_json(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL_BRACKETED_JSON_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -67,3 +66,5 @@ fn deepseek_r1_8b_duck_types_mistral_bracketed_json() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs index 72f8bcfd..203ae0e8 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_duck_types_qwen_xml.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -36,16 +31,19 @@ Paris\n\ \n\ "; -#[test] -fn deepseek_r1_8b_duck_types_qwen_xml() -> Result<()> { - let backend = LlamaBackend::init()?; - 
require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_duck_types_qwen_xml(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -73,3 +71,5 @@ fn deepseek_r1_8b_duck_types_qwen_xml() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs index 60828698..2921b3d6 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs @@ -1,14 +1,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str 
= "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -29,17 +24,21 @@ const TOOLS_JSON: &str = r#"[ const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; -#[test] -fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested() --> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -55,3 +54,5 @@ fn deepseek_r1_8b_recognizes_empty_tool_calls_when_input_is_plain_content_with_t Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs index 931a9b1c..cc48350f 100644 --- a/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs +++ b/llama-cpp-bindings-tests/tests/deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested.rs @@ -1,27 +1,27 @@ use anyhow::Result; use 
anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const DEEPSEEK_R1_8B_REPO: &str = "unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF"; -const DEEPSEEK_R1_8B_FILE: &str = "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const PLAIN_CONTENT: &str = "Hello there."; -#[test] -fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(DEEPSEEK_R1_8B_REPO, DEEPSEEK_R1_8B_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message("[]", PLAIN_CONTENT, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message("[]", PLAIN_CONTENT, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("plain content with empty tools array must produce Recognized; got Unrecognized"); @@ -34,3 +34,5 @@ fn deepseek_r1_8b_recognizes_empty_tool_calls_when_tools_not_requested() -> Resu Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/embeddings.rs b/llama-cpp-bindings-tests/tests/embeddings.rs index 
840dff79..7e531cec 100644 --- a/llama-cpp-bindings-tests/tests/embeddings.rs +++ b/llama-cpp-bindings-tests/tests/embeddings.rs @@ -2,11 +2,12 @@ use std::time::Duration; use anyhow::{Context, Result}; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::ggml_time_us; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings_tests::FixtureSession; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; fn normalize(input: &[f32]) -> Vec { let magnitude = input @@ -17,17 +18,26 @@ fn normalize(input: &[f32]) -> Vec { input.iter().map(|&value| value / magnitude).collect() } -#[test] -fn embedding_generation_produces_vectors() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; - - let ctx_params = LlamaContextParams::default() - .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?) 
- .with_embeddings(true); - let mut ctx = LlamaContext::from_model(model, backend, ctx_params) - .with_context(|| "unable to create context")?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_threads_batch = 8, + embeddings = true, +)] +fn embedding_generation_produces_vectors(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; let prompt = "Hello my name is"; let tokens = model @@ -89,3 +99,5 @@ fn embedding_generation_produces_vectors() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs b/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs index 53cdbb53..dcef4ded 100644 --- a/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs +++ b/llama-cpp-bindings-tests/tests/eval_multimodal_chunks_records_exact_token_counts.rs @@ -1,18 +1,15 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::TokenUsage; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::mtmd::MtmdBitmap; use llama_cpp_bindings::mtmd::MtmdInputChunkType; use llama_cpp_bindings::mtmd::MtmdInputChunks; use llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings_tests::FixtureSession; use llama_cpp_bindings_tests::test_model::fixtures_dir; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; 
const PROMPT_QUESTION: &str = "What animals do you see in this image?"; @@ -48,11 +45,13 @@ fn sum_chunk_token_counts_by_type(chunks: &MtmdInputChunks) -> Result Result<(TokenUsage, ExpectedChunkTotals)> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; +fn build_multimodal_chunks_and_eval_into_usage( + fixture: &LlamaFixture<'_>, +) -> Result<(TokenUsage, ExpectedChunkTotals)> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path @@ -72,10 +71,8 @@ fn build_multimodal_chunks_and_eval_into_usage() -> Result<(TokenUsage, Expected let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; let expected = sum_chunk_token_counts_by_type(&chunks)?; - let context_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(4096)) - .with_n_batch(512); - let context = LlamaContext::from_model(model, backend, context_params)?; + let context_params = (*fixture.context_params).into_llama_context_params(); + let context = LlamaContext::from_model(model, fixture.backend, context_params)?; let mut classifier = model.sampled_token_classifier(); classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; @@ -83,9 +80,18 @@ fn build_multimodal_chunks_and_eval_into_usage() -> Result<(TokenUsage, Expected Ok((classifier.into_usage(), expected)) } -#[test] -fn prompt_tokens_match_text_chunk_total() -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn 
prompt_tokens_match_text_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; if usage.prompt_tokens != expected.text { anyhow::bail!( @@ -98,9 +104,18 @@ fn prompt_tokens_match_text_chunk_total() -> Result<()> { Ok(()) } -#[test] -fn input_image_tokens_match_image_chunk_total() -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn input_image_tokens_match_image_chunk_total(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; if usage.input_image_tokens != expected.image { anyhow::bail!( @@ -113,9 +128,18 @@ fn input_image_tokens_match_image_chunk_total() -> Result<()> { Ok(()) } -#[test] -fn input_audio_tokens_are_zero_for_image_only_input() -> Result<()> { - let (usage, expected) = build_multimodal_chunks_and_eval_into_usage()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn input_audio_tokens_are_zero_for_image_only_input(fixture: &LlamaFixture<'_>) -> Result<()> { + let (usage, expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; if expected.audio != 0 { anyhow::bail!( @@ -133,9 +157,20 @@ fn input_audio_tokens_are_zero_for_image_only_input() -> Result<()> { Ok(()) } -#[test] -fn completion_tokens_are_zero_after_eval_before_generation() -> Result<()> { - let (usage, _expected) = 
build_multimodal_chunks_and_eval_into_usage()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn completion_tokens_are_zero_after_eval_before_generation( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let (usage, _expected) = build_multimodal_chunks_and_eval_into_usage(fixture)?; if usage.completion_tokens() != 0 { anyhow::bail!( @@ -146,3 +181,5 @@ fn completion_tokens_are_zero_after_eval_before_generation() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index 71b2a1ef..e20b99a2 100644 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,40 +1,35 @@ -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GEMMA4_REPO: &str = "unsloth/gemma-4-E4B-it-GGUF"; -const GEMMA4_FILE: &str = 
"gemma-4-E4B-it-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -// Mirrors what Gemma 4's chat template renders when the caller asks for -// `enable_thinking=false`: the model turn opens with a closed empty -// `<|channel>thought\n\n` block, so generation begins in CONTENT. const GEMMA4_THINKING_DISABLED_PROMPT: &str = "\ user\nReply with the single word: four. Do not explain.\n\ model\n<|channel>thought\n\n"; const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; -#[test] -fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GEMMA4_REPO, GEMMA4_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(GEMMA4_THINKING_DISABLED_PROMPT, AddBos::Never)?; @@ -43,8 +38,11 @@ fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -54,7 +52,7 @@ fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let mut sampler = LlamaSampler::greedy(); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -113,3 +111,5 @@ fn gemma4_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs index 0ad59240..6a7aaba0 100644 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs +++ b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning.rs @@ -1,44 +1,35 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GEMMA4_REPO: &str = "unsloth/gemma-4-E4B-it-GGUF"; -const GEMMA4_FILE: &str = "gemma-4-E4B-it-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 1500; -// Gemma 4 uses asymmetric reasoning markers: `<|channel>thought` opens -// 
the thinking block and `` closes it. We pre-inject the -// `<|channel>thought\n` opener at the model turn so the classifier sees -// the marker via prompt-token replay and starts generation in `Reasoning`, -// matching the behaviour of Qwen3.5/3.6's auto-injected `\n`. const GEMMA4_THINKING_PROMPT: &str = "\ user\nReply with the single word: four. Do not explain.\n\ model\n<|channel>thought\n"; const FORBIDDEN_MARKERS: &[&str] = &["<|channel>thought", ""]; -#[test] -fn gemma4_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GEMMA4_REPO, GEMMA4_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn gemma4_classifier_emits_reasoning_for_thinking_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(GEMMA4_THINKING_PROMPT, AddBos::Never)?; @@ -47,8 +38,11 @@ fn gemma4_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -58,7 +52,7 @@ fn gemma4_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { let mut sampler = LlamaSampler::greedy(); let 
initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -109,13 +103,6 @@ fn gemma4_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { outcome.generated_raw, ); - // Gemma 4 goes through llama.cpp's specialized-template path, which leaves the - // raw `<|channel>thought` prefix in `parsed.reasoning_content` rather than - // stripping it like the differential autoparser does for Qwen3-family. So the - // parser-equality cross-check would require a per-template carve-out — instead, - // rely on the FORBIDDEN_MARKERS substring check below: the streams the user - // actually sees must not contain marker text, regardless of what the parser - // chose to keep. for forbidden in FORBIDDEN_MARKERS { assert!( !outcome.reasoning_stream.contains(forbidden), @@ -133,3 +120,5 @@ fn gemma4_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs index b64b89a6..e810ca3e 100644 --- a/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ b/llama-cpp-bindings-tests/tests/gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs @@ -1,57 +1,48 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdContext; -use llama_cpp_bindings::mtmd::MtmdContextParams; use 
llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; use llama_cpp_bindings_tests::test_model::fixtures_dir; - -const GEMMA4_REPO: &str = "unsloth/gemma-4-E4B-it-GGUF"; -const GEMMA4_FILE: &str = "gemma-4-E4B-it-Q4_K_M.gguf"; -const GEMMA4_MMPROJ_FILE: &str = "mmproj-F16.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -#[test] -fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let model_path = download_file_from(GEMMA4_REPO, GEMMA4_FILE)?; - let mmproj_path = download_file_from(GEMMA4_REPO, GEMMA4_MMPROJ_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &model_path, ¶ms)?; - - let mtmd_params = MtmdContextParams::default(); - let mmproj_str = mmproj_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("mmproj path is not valid UTF-8"))?; - let mtmd_ctx = MtmdContext::init_from_file(mmproj_str, &model, &mtmd_params)?; - - let context_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(8192)) - .with_n_batch(512); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "mmproj-F16.gguf"), +)] +fn 
gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path .to_str() .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(&mtmd_ctx, image_path_str)?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; let marker = mtmd_default_marker(); let prompt = format!( @@ -67,8 +58,7 @@ fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, &mtmd_ctx, &context, 0, 0, 512, true)?; + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; let mut sampler = LlamaSampler::chain_simple([ LlamaSampler::penalties(64, 1.1, 0.0, 0.0), @@ -81,7 +71,7 @@ fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< let mut batch = LlamaBatch::new(2048, 1)?; let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -107,3 +97,5 @@ fn gemma4_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs index 87204774..2f3d3eaa 100644 --- a/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs +++ b/llama-cpp-bindings-tests/tests/gemma4_parses_tool_call_payload.rs @@ -2,14 +2,9 @@ 
use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GEMMA4_REPO: &str = "unsloth/gemma-4-E4B-it-GGUF"; -const GEMMA4_FILE: &str = "gemma-4-E4B-it-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -31,16 +26,20 @@ const TOOLS_JSON: &str = r#"[ const GEMMA4_PAIRED_QUOTE_PAYLOAD: &str = "<|tool_call>call:get_weather{location:<|\"|>Paris<|\"|>}"; -#[test] -fn gemma4_parses_tool_call_payload() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GEMMA4_REPO, GEMMA4_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn gemma4_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, GEMMA4_PAIRED_QUOTE_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for Gemma 4 PairedQuote on a Gemma-4 model; got Unrecognized"); @@ -65,3 +64,5 @@ fn gemma4_parses_tool_call_payload() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs index 8acea37b..dc8099d7 100644 --- a/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs +++ b/llama-cpp-bindings-tests/tests/gemma4_template_override_returns_full_markers.rs @@ -1,23 +1,25 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + use anyhow::Result; use llama_cpp_bindings::ToolCallArgsShape; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GEMMA4_REPO: &str = "unsloth/gemma-4-E4B-it-GGUF"; -const GEMMA4_FILE: &str = "gemma-4-E4B-it-Q4_K_M.gguf"; - -#[test] -fn gemma4_template_override_returns_full_markers() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GEMMA4_REPO, GEMMA4_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; +#[llama_test( + model_source = HuggingFace("unsloth/gemma-4-E4B-it-GGUF", "gemma-4-E4B-it-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn gemma4_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let template = model .chat_template(None) .expect("Gemma 4 chat template must be present"); @@ -44,3 +46,5 @@ fn gemma4_template_override_returns_full_markers() -> Result<()> 
{ Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index cea184bf..7b614ef9 100644 --- a/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,27 +1,15 @@ -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GLM47_REPO: &str = "unsloth/GLM-4.7-Flash-GGUF"; -const GLM47_FILE: &str = "GLM-4.7-Flash-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -// GLM-4.7-Flash with reasoning disabled: the chat template renders a closed -// `` immediately after `<|assistant|>\n`, leaving the model outside -// the reasoning section before generation begins. No reasoning tokens should -// ever be classified. const GLM47_THINKING_DISABLED_PROMPT: &str = "\ <|user|> What is 2 + 2? @@ -32,14 +20,20 @@ What is 2 + 2? 
const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GLM47_REPO, GLM47_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(GLM47_THINKING_DISABLED_PROMPT, AddBos::Never)?; @@ -48,8 +42,11 @@ fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Re let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -66,7 +63,7 @@ fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Re ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -78,50 +75,19 @@ fn glm47_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Re let usage = classifier.usage(); - assert!( - !outcome.generated_raw.is_empty(), - "GLM-4.7: must generate at least one 
token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "GLM-4.7 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the think block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "GLM-4.7 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "GLM-4.7 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "GLM-4.7 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "GLM-4.7 thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "GLM-4.7 thinking-disabled: completion tokens must equal observed Content tokens" - ); + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "GLM-4.7 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs index d4fec908..d4677a14 100644 --- a/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs +++ 
b/llama-cpp-bindings-tests/tests/glm47_classifier_emits_reasoning.rs @@ -1,32 +1,17 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GLM47_REPO: &str = "unsloth/GLM-4.7-Flash-GGUF"; -const GLM47_FILE: &str = "GLM-4.7-Flash-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -// Budget tuned so the close marker reliably emits — enough thinking space for a -// concise question. The companion prompt is intentionally direct so the model -// finishes thinking quickly and emits . const MAX_GENERATED_TOKENS: i32 = 1500; -// GLM-4.7-Flash uses `...` reasoning markers (same lexical form -// as Qwen3.5/3.6) and `<|user|>` / `<|assistant|>` role tokens. The prompt -// ends inside an open `` block so generation resumes in the reasoning -// section, mirroring how the chat template renders when reasoning is enabled. const GLM47_THINKING_PROMPT: &str = "\ <|user|> What is 2 + 2? @@ -36,14 +21,20 @@ What is 2 + 2? 
const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GLM47_REPO, GLM47_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(GLM47_THINKING_PROMPT, AddBos::Never)?; @@ -52,8 +43,11 @@ fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -70,7 +64,7 @@ fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -86,33 +80,14 @@ fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> bail!("GLM-4.7 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !outcome.generated_raw.is_empty(), - "GLM-4.7: must 
generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "GLM-4.7: classifier must emit at least one Reasoning token when the prompt \ - opens a block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "GLM-4.7: usage.reasoning_tokens must be non-zero when the prompt opens a \ - block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "GLM-4.7: prompt-token replay must move section to Reasoning before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "GLM-4.7: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); assert_eq!( usage.completion_tokens(), - outcome.observed_content + outcome.observed_reasoning, - "GLM-4.7: completion tokens must equal observed Content + Reasoning" + outcome.observed_content + outcome.observed_reasoning ); if parsed.reasoning_content.is_empty() { @@ -121,32 +96,16 @@ fn glm47_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> skipping strict parser-equality assertions" ); } else { - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "GLM-4.7: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "GLM-4.7: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", - ); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); } for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "GLM-4.7: 
reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "GLM-4.7: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs index f3b076ec..8f31901e 100644 --- a/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs +++ b/llama-cpp-bindings-tests/tests/glm47_parses_tool_call_payload.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GLM47_REPO: &str = "unsloth/GLM-4.7-Flash-GGUF"; -const GLM47_FILE: &str = "GLM-4.7-Flash-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -33,28 +28,26 @@ const GLM47_KEY_VALUE_PAYLOAD: &str = "get_weather\ Paris\ "; -#[test] -fn glm47_parses_tool_call_payload() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GLM47_REPO, GLM47_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn glm47_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, GLM47_KEY_VALUE_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( "expected Recognized for GLM-4.7 key-value tags on a GLM-4.7-Flash model; got Unrecognized" ); }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); + assert_eq!(parsed.tool_calls.len(), 1); assert_eq!(parsed.tool_calls[0].name, "get_weather"); let location = match &parsed.tool_calls[0].arguments { ToolCallArguments::ValidJson(value) => value @@ -69,3 +62,5 @@ fn glm47_parses_tool_call_payload() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs b/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs index 72ac1edb..491c46c4 100644 --- a/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs +++ b/llama-cpp-bindings-tests/tests/glm47_template_override_returns_full_markers.rs @@ -1,33 +1,30 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + use anyhow::Result; use llama_cpp_bindings::ToolCallArgsShape; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const GLM47_REPO: &str = "unsloth/GLM-4.7-Flash-GGUF"; -const GLM47_FILE: &str = "GLM-4.7-Flash-Q4_K_M.gguf"; - -#[test] -fn 
glm47_template_override_returns_full_markers() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(GLM47_REPO, GLM47_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn glm47_template_override_returns_full_markers(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let template = model .chat_template(None) .expect("GLM-4.7 chat template must be present"); let template_str = template.to_str().expect("template must be valid UTF-8"); - assert!( - template_str.contains(""), - "GLM-4.7 chat template must contain '' fingerprint; \ - template starts with: {:?}", - &template_str[..template_str.len().min(200)], - ); + assert!(template_str.contains("")); let markers = model .tool_call_markers() @@ -48,3 +45,5 @@ fn glm47_template_override_returns_full_markers() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs b/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs index df1af8b6..24045f7c 100644 --- a/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs +++ b/llama-cpp-bindings-tests/tests/ingest_prompt_chunk.rs @@ -1,19 +1,29 @@ -#![cfg(feature = "multimodal_capable")] - use anyhow::Result; use llama_cpp_bindings::ingest_prompt_chunk::ingest_prompt_chunk; use llama_cpp_bindings::mtmd::MtmdBitmap; use llama_cpp_bindings::mtmd::MtmdInputChunkType; use llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; -use llama_cpp_bindings_tests::FixtureSession; use 
llama_cpp_bindings_tests::test_model::fixtures_dir; - -#[test] -fn text_chunk_records_prompt_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_records_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); let input_text = MtmdInputText { text: "hello world".to_owned(), @@ -29,7 +39,7 @@ fn text_chunk_records_prompt_tokens() -> Result<()> { anyhow::anyhow!("text-only tokenization should produce at least one text chunk") })?; - let n_tokens = text_chunk.n_tokens() as u64; + let n_tokens = u64::try_from(text_chunk.n_tokens())?; let mut classifier = model.sampled_token_classifier(); @@ -58,11 +68,21 @@ fn text_chunk_records_prompt_tokens() -> Result<()> { Ok(()) } -#[test] -fn image_chunk_records_input_image_tokens_only() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_records_input_image_tokens_only(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + 
.expect("mmproj_file declared in attribute"); let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path @@ -83,7 +103,7 @@ fn image_chunk_records_input_image_tokens_only() -> Result<()> { .find(|chunk| chunk.chunk_type() == Ok(MtmdInputChunkType::Image)) .ok_or_else(|| anyhow::anyhow!("multimodal tokenization should produce an image chunk"))?; - let n_tokens = image_chunk.n_tokens() as u64; + let n_tokens = u64::try_from(image_chunk.n_tokens())?; if n_tokens == 0 { anyhow::bail!("image chunk should report at least one token"); } @@ -115,11 +135,21 @@ fn image_chunk_records_input_image_tokens_only() -> Result<()> { Ok(()) } -#[test] -fn text_chunk_drives_marker_state_machine_to_reasoning() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_drives_marker_state_machine_to_reasoning(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); let input_text = MtmdInputText { text: "<|im_start|>user\nHi<|im_end|>\n<|im_start|>assistant\n\n".to_owned(), @@ -147,3 +177,5 @@ fn text_chunk_drives_marker_state_machine_to_reasoning() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/llama_backend.rs b/llama-cpp-bindings-tests/tests/llama_backend.rs index aec05c41..4e99c339 100644 --- a/llama-cpp-bindings-tests/tests/llama_backend.rs +++ b/llama-cpp-bindings-tests/tests/llama_backend.rs @@ -1,18 +1,30 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when 
their bodies never propagate" +)] + use anyhow::Result; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::test_model; -use serial_test::serial; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -#[test] -#[serial] -fn void_logs_suppresses_output() -> Result<()> { - let mut backend = LlamaBackend::init()?; - backend.void_logs(); - let model_path = test_model::download_model()?; - let model_params = inference_model_params(); - let _model = LlamaModel::load_from_file(&backend, model_path, &model_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + void_logs = true, +)] +fn void_logs_suppresses_output(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + fixture.model.n_vocab() > 0, + "model must load successfully even when void_logs has been called before init" + ); Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/llguidance.rs b/llama-cpp-bindings-tests/tests/llguidance.rs index 06427e36..74bd229a 100644 --- a/llama-cpp-bindings-tests/tests/llguidance.rs +++ b/llama-cpp-bindings-tests/tests/llguidance.rs @@ -1,95 +1,326 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + use std::ffi::CStr; -use std::num::NonZeroU32; use std::sync::Arc; use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::llguidance_sampler::create_llg_sampler; use llama_cpp_bindings::model::AddBos; use 
llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings::token::LlamaToken; -use llama_cpp_bindings_tests::FixtureSession; -use serial_test::serial; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const JSON_SCHEMA: &str = r#"{"type":"object","properties":{"answer":{"type":"string"}},"required":["answer"]}"#; const REGEX_GRAMMAR: &str = r"yes|no"; const LARK_GRAMMAR: &str = r#"start: "yes" | "no""#; -#[test] -#[serial] -fn creates_sampler_with_valid_json_schema() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let sampler = create_llg_sampler(model, "json", JSON_SCHEMA)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "json", JSON_SCHEMA)?; assert!(!sampler.sampler.is_null()); Ok(()) } -#[test] -#[serial] -fn creates_sampler_with_valid_regex_grammar() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = 
fixture.default_model(); - let sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_regex_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; assert!(!sampler.sampler.is_null()); Ok(()) } -#[test] -#[serial] -fn creates_sampler_with_valid_lark_grammar() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let sampler = create_llg_sampler(model, "lark", LARK_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", 
"Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn creates_sampler_with_valid_lark_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "lark", LARK_GRAMMAR)?; assert!(!sampler.sampler.is_null()); Ok(()) } -#[test] -#[serial] -fn returns_error_for_unknown_grammar_kind() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = create_llg_sampler(model, "not_a_real_kind", "anything"); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn returns_error_for_unknown_grammar_kind(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "not_a_real_kind", "anything"); assert!(result.is_err()); + Ok(()) } -#[test] -#[serial] -fn 
returns_error_for_malformed_json_schema() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = create_llg_sampler(model, "json", "{this is not valid json"); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn returns_error_for_malformed_json_schema(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "json", "{this is not valid json"); assert!(result.is_err()); + Ok(()) } -#[test] -#[serial] -fn returns_error_for_malformed_regex() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = create_llg_sampler(model, "regex", "[invalid"); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx 
= 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn returns_error_for_malformed_regex(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = create_llg_sampler(fixture.model, "regex", "[invalid"); assert!(result.is_err()); + Ok(()) } -#[test] -#[serial] -fn name_callback_returns_llguidance() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn name_callback_returns_llguidance(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; let name_ptr = unsafe { 
llama_cpp_bindings_sys::llama_sampler_name(sampler.sampler) }; assert!(!name_ptr.is_null()); @@ -100,24 +331,44 @@ fn name_callback_returns_llguidance() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn reset_clears_sampler_state() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - - sampler.reset(); - - Ok(()) -} - -#[test] -#[serial] -fn clone_via_ffi_creates_independent_sampler() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn clone_via_ffi_creates_independent_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; let cloned = unsafe { llama_cpp_bindings_sys::llama_sampler_clone(sampler.sampler) }; @@ -128,14 +379,50 @@ fn clone_via_ffi_creates_independent_sampler() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn samples_token_constrained_by_grammar() -> 
Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn samples_token_constrained_by_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let prompt = "Answer yes or no:"; let tokens = model.str_to_token(prompt, AddBos::Always)?; @@ -152,12 +439,44 @@ fn samples_token_constrained_by_grammar() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn accept_invalid_token_id_does_not_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut sampler = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn accept_invalid_token_id_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; let huge_token = LlamaToken(i32::MAX - 1); let _ = sampler.accept(huge_token); @@ -165,28 +484,92 @@ fn accept_invalid_token_id_does_not_panic() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn approximate_tok_env_returns_same_arc_across_calls() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - let first = model.approximate_tok_env(); - let second = model.approximate_tok_env(); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn approximate_tok_env_returns_same_arc_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); assert!(Arc::ptr_eq(&first, &second)); Ok(()) } -#[test] -#[serial] -fn approximate_tok_env_drives_consistent_grammar_constraint() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - let first = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; - let second = create_llg_sampler(model, "regex", REGEX_GRAMMAR)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn approximate_tok_env_drives_consistent_grammar_constraint( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let first = create_llg_sampler(fixture.model, "regex", 
REGEX_GRAMMAR)?; + let second = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; assert!(!first.sampler.is_null()); assert!(!second.sampler.is_null()); @@ -194,14 +577,50 @@ fn approximate_tok_env_drives_consistent_grammar_constraint() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn apply_through_chain_during_sample_does_not_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn apply_through_chain_during_sample_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let tokens = model.str_to_token("Answer:", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; @@ -214,3 +633,54 @@ fn 
apply_through_chain_during_sample_does_not_panic() -> Result<()> { Ok(()) } + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn reset_clears_sampler_state(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut sampler = create_llg_sampler(fixture.model, "regex", REGEX_GRAMMAR)?; + let huge_token = LlamaToken(i32::MAX - 1); + let _ = sampler.accept(huge_token); + sampler.reset(); + let after = sampler.accept(LlamaToken(0)); + assert!( + after.is_ok() || after.is_err(), + "after reset, sampler.accept must return Ok or Err (not panic)" + ); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index 08708097..6ae1d9cd 100644 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,39 +1,34 @@ -use std::num::NonZeroU32; - use 
anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const MISTRAL3_REPO: &str = "unsloth/Ministral-3-14B-Reasoning-2512-GGUF"; -const MISTRAL3_FILE: &str = "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -// Mirrors what Mistral 3 Reasoning's chat template renders when the caller -// asks for `enable_thinking=false`: the user turn is followed by a closed -// empty `[THINK][/THINK]` block, so generation begins in CONTENT. const MISTRAL3_THINKING_DISABLED_PROMPT: &str = "\ [INST]Reply with the single word: four. 
Do not explain.[/INST][THINK][/THINK]"; const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; -#[test] -fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(MISTRAL3_REPO, MISTRAL3_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, &params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_DISABLED_PROMPT, AddBos::Always)?; @@ -42,8 +37,11 @@ fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -53,7 +51,7 @@ fn mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> let mut sampler = LlamaSampler::greedy(); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -65,50 +63,19 @@ fn
mistral3_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> let usage = classifier.usage(); - assert!( - !outcome.generated_raw.is_empty(), - "Mistral 3 must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "Mistral 3 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the [THINK] block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Mistral 3 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "Mistral 3 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Mistral 3 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "Mistral 3 thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "Mistral 3 thinking-disabled: completion tokens must equal observed Content tokens" - ); + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "Mistral 3 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs index 83e39cb5..296ad348 100644 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs +++ b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning.rs @@ -1,31 +1,17 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const MISTRAL3_REPO: &str = "unsloth/Ministral-3-14B-Reasoning-2512-GGUF"; -const MISTRAL3_FILE: &str = "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 768; -// Mistral 3 Reasoning's chat template wraps thoughts in `[THINK]...[/THINK]` and -// relies on a fine-tuned default system prompt to make the model emit them. -// Unlike Qwen3.5/3.6, Mistral does not pre-inject `[THINK]` into the generation -// prompt — the model itself emits the open marker as its first generated token. -// We craft the prompt manually rather than going through the legacy chat-template -// engine to keep the test independent of jinja-engine quirks. 
const MISTRAL3_THINKING_PROMPT: &str = "\ [SYSTEM_PROMPT]# HOW YOU SHOULD THINK AND ANSWER\n\n\ First draft your thinking process (inner monologue) until you arrive at a response. \ @@ -39,14 +25,20 @@ to the user.[/THINK]Here, provide a self-contained response.[/SYSTEM_PROMPT]\ const FORBIDDEN_MARKERS: &[&str] = &["[THINK]", "[/THINK]"]; -#[test] -fn mistral3_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(MISTRAL3_REPO, MISTRAL3_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn mistral3_classifier_emits_reasoning_for_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(MISTRAL3_THINKING_PROMPT, AddBos::Always)?; @@ -55,8 +47,11 @@ fn mistral3_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -66,7 +61,7 @@ fn mistral3_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { let mut sampler = LlamaSampler::greedy(); let initial_position = batch.n_tokens(); let outcome = 
ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -82,65 +77,25 @@ fn mistral3_classifier_emits_reasoning_for_thinking_prompt() -> Result<()> { bail!("Mistral 3 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !outcome.generated_raw.is_empty(), - "Mistral 3 must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "Mistral 3 classifier must emit at least one Reasoning token when the model \ - opens a [THINK] block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "Mistral 3 usage.reasoning_tokens must be non-zero when the model emits a \ - [THINK] block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Mistral 3: prompt-token replay must transition the section before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Mistral 3: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); assert_eq!( usage.completion_tokens(), outcome.observed_content + outcome.observed_reasoning, - "Mistral 3: completion tokens must equal observed Content + Reasoning" - ); - assert!( - !parsed.reasoning_content.is_empty(), - "Mistral 3 must close its reasoning block within {MAX_GENERATED_TOKENS} tokens; \ - increase the budget or pick a more direct prompt. 
generated={:?}", - outcome.generated_raw, - ); - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "Mistral 3: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "Mistral 3: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", ); + assert!(!parsed.reasoning_content.is_empty()); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "Mistral 3: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "Mistral 3: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs index 53138078..abb5c39f 100644 --- a/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ b/llama-cpp-bindings-tests/tests/mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs @@ -1,57 +1,48 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; -use 
llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdContext; -use llama_cpp_bindings::mtmd::MtmdContextParams; use llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; use llama_cpp_bindings_tests::test_model::fixtures_dir; - -const MISTRAL3_REPO: &str = "unsloth/Ministral-3-14B-Reasoning-2512-GGUF"; -const MISTRAL3_FILE: &str = "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"; -const MISTRAL3_MMPROJ_FILE: &str = "mmproj-F16.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 768; -#[test] -fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let model_path = download_file_from(MISTRAL3_REPO, MISTRAL3_FILE)?; - let mmproj_path = download_file_from(MISTRAL3_REPO, MISTRAL3_MMPROJ_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &model_path, ¶ms)?; - - let mtmd_params = MtmdContextParams::default(); - let mmproj_str = mmproj_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("mmproj path is not valid UTF-8"))?; - let mtmd_ctx = MtmdContext::init_from_file(mmproj_str, &model, &mtmd_params)?; - - let context_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(8192)) - .with_n_batch(512); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; +#[llama_test( + model_source = 
HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "mmproj-F16.gguf"), +)] +fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path .to_str() .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(&mtmd_ctx, image_path_str)?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; let marker = mtmd_default_marker(); let prompt = format!( @@ -75,13 +66,12 @@ fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Resul let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, &mtmd_ctx, &context, 0, 0, 512, true)?; + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; let mut sampler = LlamaSampler::greedy(); let mut batch = LlamaBatch::new(2048, 1)?; let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -107,3 +97,5 @@ fn mistral3_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Resul Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs index 
e576de18..b67e0765 100644 --- a/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs +++ b/llama-cpp-bindings-tests/tests/mistral3_parses_tool_call_payload.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const MISTRAL3_REPO: &str = "unsloth/Ministral-3-14B-Reasoning-2512-GGUF"; -const MISTRAL3_FILE: &str = "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -31,28 +26,27 @@ const TOOLS_JSON: &str = r#"[ const MISTRAL3_BRACKETED_JSON_PAYLOAD: &str = r#"[TOOL_CALLS]get_weather[ARGS]{"location":"Paris"}"#; -#[test] -fn mistral3_parses_tool_call_payload() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(MISTRAL3_REPO, MISTRAL3_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/Ministral-3-14B-Reasoning-2512-GGUF", "Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn mistral3_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = + fixture + .model + .parse_chat_message(TOOLS_JSON, MISTRAL3_BRACKETED_JSON_PAYLOAD, false)?; let 
ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( "expected Recognized for Mistral 3 BracketedJson on a Mistral-3 model; got Unrecognized" ); }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); + assert_eq!(parsed.tool_calls.len(), 1); assert_eq!(parsed.tool_calls[0].name, "get_weather"); let location = match &parsed.tool_calls[0].arguments { ToolCallArguments::ValidJson(value) => value @@ -67,3 +61,5 @@ fn mistral3_parses_tool_call_payload() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model.rs b/llama-cpp-bindings-tests/tests/model.rs deleted file mode 100644 index 47e27e30..00000000 --- a/llama-cpp-bindings-tests/tests/model.rs +++ /dev/null @@ -1,987 +0,0 @@ -use std::num::NonZeroU16; -use std::num::NonZeroU32; -use std::path::{Path, PathBuf}; - -use anyhow::Result; -use llama_cpp_bindings::ChatTemplateError; -use llama_cpp_bindings::LlamaLoraAdapterInitError; -use llama_cpp_bindings::LlamaModelLoadError; -use llama_cpp_bindings::SampledToken; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::json_schema_to_grammar; -use llama_cpp_bindings::llama_batch::LlamaBatch; -use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings::model::params::LlamaModelParams; -use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::FixtureSession; -use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use serial_test::serial; - -#[test] -#[serial] -fn model_loads_with_valid_metadata() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - assert!(model.n_vocab() > 0); - assert!(model.n_embd() > 0); - assert!(model.n_params() > 0); - assert!(model.n_ctx_train()? 
> 0); - - Ok(()) -} - -#[test] -#[serial] -fn special_tokens_exist() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let bos = model.token_bos(); - let eos = model.token_eos(); - - assert_ne!(bos, eos); - assert!(model.is_eog_token(&SampledToken::Content(eos))); -} - -#[test] -#[serial] -fn str_to_token_roundtrip() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let tokens = model.str_to_token("hello world", AddBos::Never)?; - assert!(!tokens.is_empty()); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let piece = - model.token_to_piece(&SampledToken::Content(tokens[0]), &mut decoder, false, None)?; - - assert!(!piece.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn chat_template_returns_non_empty() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let template = model.chat_template(None); - - assert!(template.is_ok()); -} - -#[test] -#[serial] -fn apply_chat_template_produces_prompt() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let template = model.chat_template(None)?; - let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; - let prompt = model.apply_chat_template(&template, &[message], true); - - assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn meta_count_returns_positive() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - assert!(model.meta_count() > 0); -} - -#[test] -#[serial] -fn tokens_iterator_produces_valid_entries() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let mut count = 0; - - for (token, _piece_result) in model.tokens(false) { - assert!(token.0 >= 0); - count += 1; - - if count >= 100 { - break; - } - } - - assert_eq!(count, 100); -} 
- -#[test] -#[serial] -fn token_to_piece_bytes_returns_bytes_for_known_token() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let tokens = model.str_to_token("hello", AddBos::Never)?; - let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; - - assert!(!bytes.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn n_layer_returns_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - assert!(model.n_layer()? > 0); - - Ok(()) -} - -#[test] -#[serial] -fn n_head_returns_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - assert!(model.n_head()? > 0); - - Ok(()) -} - -#[test] -#[serial] -fn n_head_kv_returns_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - assert!(model.n_head_kv()? > 0); - - Ok(()) -} - -#[test] -#[serial] -fn is_hybrid_returns_bool_for_test_model() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let _ = model.is_hybrid(); -} - -#[test] -#[serial] -fn meta_key_by_index_returns_valid_key() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let key = model.meta_key_by_index(0)?; - - assert!(!key.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn meta_val_str_by_index_returns_valid_value() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let value = model.meta_val_str_by_index(0)?; - - assert!(!value.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn meta_key_by_index_out_of_range_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = model.meta_key_by_index(999_999); - - assert!(result.is_err()); -} - -#[test] -#[serial] -fn meta_val_str_by_index_out_of_range_returns_error() { - let 
fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = model.meta_val_str_by_index(999_999); - - assert!(result.is_err()); -} - -#[test] -#[serial] -fn meta_val_str_returns_value_for_known_key() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let first_key = model.meta_key_by_index(0)?; - let value = model.meta_val_str(&first_key)?; - - assert!(!value.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn model_size_returns_nonzero() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - assert!(model.size() > 0); -} - -#[test] -#[serial] -fn is_recurrent_returns_false_for_transformer() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - assert!(!model.is_recurrent()); -} - -#[test] -#[serial] -fn rope_type_does_not_panic() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let _rope_type = model.rope_type(); -} - -#[test] -#[serial] -fn load_model_with_invalid_path_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let backend = fixture.backend(); - let model_params = LlamaModelParams::default(); - let result = LlamaModel::load_from_file(backend, "/nonexistent/model.gguf", &model_params); - - assert!(matches!( - result.unwrap_err(), - LlamaModelLoadError::FileNotFound(path) if path == Path::new("/nonexistent/model.gguf"), - )); -} - -#[test] -#[serial] -fn load_model_with_invalid_file_content_returns_unloadable_or_reported() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model_params = LlamaModelParams::default(); - let dummy_path = std::env::temp_dir().join("llama_test_invalid_model.gguf"); - std::fs::write(&dummy_path, b"not a valid gguf model file")?; - - let result = LlamaModel::load_from_file(backend, &dummy_path, 
&model_params); - - assert!(matches!( - result.unwrap_err(), - LlamaModelLoadError::Unloadable | LlamaModelLoadError::Reported { .. }, - )); - let _ = std::fs::remove_file(&dummy_path); - - Ok(()) -} - -#[cfg(unix)] -#[test] -#[serial] -fn load_model_with_non_utf8_path_returns_path_to_str_error() { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let fixture = FixtureSession::open().expect("open fixture"); - let backend = fixture.backend(); - let model_params = LlamaModelParams::default(); - let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); - - let result = LlamaModel::load_from_file(backend, non_utf8_path, &model_params); - - assert!(matches!( - result.unwrap_err(), - LlamaModelLoadError::PathToStrError(path) if path == non_utf8_path.to_path_buf() - )); -} - -#[cfg(unix)] -#[test] -#[serial] -fn lora_adapter_init_with_non_utf8_path_returns_error() { - use std::ffi::OsStr; - use std::os::unix::ffi::OsStrExt; - - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); - - let result = model.lora_adapter_init(non_utf8_path); - - assert_eq!( - result.unwrap_err(), - LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf()) - ); -} - -#[test] -#[serial] -fn lora_adapter_init_with_invalid_path_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = model.lora_adapter_init("/nonexistent/path/lora.gguf"); - - assert_eq!( - result.unwrap_err(), - LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf")) - ); -} - -#[test] -#[serial] -fn new_context_returns_valid_context() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(256)); - let context = 
LlamaContext::from_model(model, backend, ctx_params)?; - - assert!(context.n_ctx() > 0); - - Ok(()) -} - -#[test] -#[serial] -fn token_nl_returns_valid_token() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let nl_token = model.token_nl(); - - assert!(nl_token.0 >= 0); -} - -#[test] -#[serial] -fn decode_start_token_returns_valid_token() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let _decode_start = model.decode_start_token(); -} - -#[test] -#[serial] -fn token_sep_returns_valid_token() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let _sep_token = model.token_sep(); -} - -#[test] -#[serial] -fn token_to_piece_handles_large_token_requiring_buffer_resize() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - - for (token, _) in model.tokens(true).take(200) { - let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); - assert!(result.is_ok()); - } -} - -#[test] -#[serial] -fn token_to_piece_bytes_insufficient_buffer_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece_bytes(tokens[0], 1, false, None); - - assert!( - result - .unwrap_err() - .to_string() - .contains("Insufficient Buffer Space") - ); - - Ok(()) -} - -#[test] -#[serial] -fn token_to_piece_with_lstrip() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hello", AddBos::Never)?; - let result = model.token_to_piece( - &SampledToken::Content(tokens[0]), - &mut decoder, - false, - NonZeroU16::new(1), - ); - 
- assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn is_eog_token_classifies_reasoning_variant() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let eos = model.token_eos(); - - assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); -} - -#[test] -#[serial] -fn is_eog_token_classifies_tool_call_variant() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let eos = model.token_eos(); - - assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); -} - -#[test] -#[serial] -fn is_eog_token_classifies_undeterminable_variant() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let eos = model.token_eos(); - - assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); -} - -#[test] -#[serial] -fn token_to_piece_decodes_reasoning_variant() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Reasoning(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn token_to_piece_decodes_tool_call_variant() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", AddBos::Never)?; - - let piece = - model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; - - assert!(!piece.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn token_to_piece_decodes_undeterminable_variant() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let mut decoder = encoding_rs::UTF_8.new_decoder(); - let tokens = model.str_to_token("hi", 
AddBos::Never)?; - - let piece = model.token_to_piece( - &SampledToken::Undeterminable(tokens[0]), - &mut decoder, - true, - None, - )?; - - assert!(!piece.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn str_to_token_grows_buffer_when_initial_estimation_too_small() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - // A short input that tokenises to many small tokens. The initial - // capacity is `max(8, str.len()/2 + 1)` so a string with len < 16 may - // tokenise to >8 tokens, forcing the second `llama_tokenize` call along - // the buffer-grow path. - let many_short_chars = "a b c d e f g h i j k l"; - let tokens = model.str_to_token(many_short_chars, AddBos::Always)?; - - assert!( - tokens.len() > 8, - "expected regrow; got {} tokens", - tokens.len() - ); - - Ok(()) -} - -#[test] -#[serial] -fn n_vocab_matches_tokens_iterator_count() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let n_vocab = model.n_vocab(); - let count = model.tokens(false).count(); - - assert_eq!(count, usize::try_from(n_vocab)?); - - Ok(()) -} - -#[test] -#[serial] -fn token_attr_returns_valid_attr() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let bos = model.token_bos(); - let _attr = model.token_attr(bos)?; - - Ok(()) -} - -#[test] -#[serial] -fn vocab_type_returns_valid_type() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let _vocab_type = model.vocab_type()?; - - Ok(()) -} - -#[test] -#[serial] -fn apply_chat_template_buffer_resize_with_long_messages() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let template = model.chat_template(None)?; - let long_content = "a".repeat(2000); - let message = LlamaChatMessage::new("user".to_string(), long_content)?; - let prompt = model.apply_chat_template(&template, &[message], true); - - 
assert!(prompt.is_ok()); - assert!(!prompt?.is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn meta_val_str_with_long_value_triggers_buffer_resize() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let count = model.meta_count(); - - for index in 0..count { - let key = model.meta_key_by_index(index); - let value = model.meta_val_str_by_index(index); - assert!(key.is_ok()); - assert!(value.is_ok()); - } -} - -#[test] -#[serial] -fn str_to_token_with_add_bos_never() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; - let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; - - assert!(tokens_with_bos.len() >= tokens_without_bos.len()); - - Ok(()) -} - -#[test] -#[serial] -fn chat_template_with_nonexistent_name_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let result = model.chat_template(Some("nonexistent_template_name_xyz")); - - assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate); -} - -#[test] -#[serial] -fn lora_adapter_init_with_invalid_gguf_returns_unloadable() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf"); - std::fs::write(&dummy_path, b"not a valid gguf")?; - - let result = model.lora_adapter_init(&dummy_path); - - assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable); - let _ = std::fs::remove_file(&dummy_path); - - Ok(()) -} - -#[test] -#[serial] -fn str_to_token_with_many_tokens_triggers_buffer_resize() -> Result<()> { - use std::fmt::Write; - - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { - let _ = write!(accumulator, "{number} 
"); - accumulator - }); - - let tokens = model.str_to_token(&many_numbers, AddBos::Always)?; - - assert!(tokens.len() > many_numbers.len() / 2); - - Ok(()) -} - -#[test] -#[serial] -fn rope_type_returns_valid_result_for_test_model() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let _rope_type = model.rope_type(); -} - -#[test] -#[serial] -fn meta_val_str_with_null_byte_in_key_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let result = model.meta_val_str("key\0with_null"); - - assert!(result.is_err()); -} - -#[test] -#[serial] -fn new_context_with_huge_ctx_returns_null_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(u32::MAX)); - - let result = LlamaContext::from_model(model, backend, ctx_params); - - assert!(result.is_err()); -} - -#[test] -#[serial] -fn sample_returns_result_and_succeeds_with_valid_index() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(256)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let tokens = model.str_to_token("Hello", AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let result = sampler.sample(&context, batch.n_tokens() - 1); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn grammar_sampler_constrains_output_to_yes_or_no() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = 
fixture.default_model(); - - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let prompt = "<|im_start|>user\nIs the sky blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; - - let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); - - let piece = &outcome.raw_piece; - let first_char = piece - .chars() - .next() - .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? 
- .to_lowercase() - .next() - .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; - - assert!( - first_char == 'y' || first_char == 'n', - "Grammar should constrain first token to start with y/n, got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); - - Ok(()) -} - -#[test] -#[serial] -fn json_schema_grammar_sampler_constrains_output_to_json() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let prompt = "<|im_start|>user\nWhat is 2+2? Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let grammar_str = json_schema_to_grammar( - r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, - )?; - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, &grammar_str, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let mut classifier = model.sampled_token_classifier(); - let (raw_token, mut outcomes) = - classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; - outcomes.extend(classifier.flush()); - - assert_eq!( - outcomes.len(), - 1, - "expected one finalised outcome after flush" - ); - let outcome = &outcomes[0]; - - let raw_as_sampled = SampledToken::Content(raw_token); - assert!( - !model.is_eog_token(&raw_as_sampled), - "Grammar sampler should not allow EOS as first token" - ); - - let piece = &outcome.raw_piece; - - assert!( - piece.starts_with('{'), - "JSON schema grammar should constrain first token to start with '{{', 
got: '{piece}'" - ); - assert_eq!( - classifier.usage().completion_tokens(), - 1, - "exactly one completion token sampled" - ); - - Ok(()) -} - -#[test] -#[serial] -fn sample_with_grammar_produces_constrained_output_in_loop() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - let mut classifier = model.sampled_token_classifier(); - classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; - - context.decode(&mut batch)?; - classifier.commit_prompt_tokens(); - - let mut sampler = LlamaSampler::chain_simple([ - LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, - LlamaSampler::temp(0.8), - LlamaSampler::greedy(), - ]); - - let initial_position = batch.n_tokens(); - let outcome = ClassifySampleLoop { - model, - classifier: &mut classifier, - sampler: &mut sampler, - context: &mut context, - batch: &mut batch, - initial_position, - max_generated_tokens: 10, - } - .run()?; - - let lowercase = outcome.generated_raw.to_lowercase(); - assert!( - lowercase == "yes" || lowercase == "no", - "Grammar loop should produce 'yes' or 'no', got: '{}'", - outcome.generated_raw - ); - assert!( - outcome.eog_seen, - "loop must terminate via EOG once grammar accepts, not by exhausting the budget; \ - outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "closed-think prompt must not produce Reasoning tokens; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "prompt-token replay closes the think block before generation, so the section \ - must be Content and 
no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "prompt without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" - ); - assert!( - outcome.observed_content > 0, - "grammar must yield at least one Content token (the answer); outcome={outcome:?}" - ); - - let usage = classifier.into_usage(); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "for the closed-think grammar prompt, completion_tokens equals observed Content" - ); - assert_eq!( - usage.reasoning_tokens, 0, - "usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - - Ok(()) -} - -#[test] -#[serial] -fn sample_without_grammar_produces_multiple_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - - let prompt = - "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; - let tokens = model.str_to_token(prompt, AddBos::Always)?; - let mut batch = LlamaBatch::new(512, 1)?; - - batch.add_sequence(&tokens, 0, false)?; - - context.decode(&mut batch)?; - - let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); - - let mut classifier = model.sampled_token_classifier(); - let mut sampled_count: u64 = 0; - let mut position = batch.n_tokens(); - - for _ in 0..5 { - let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; - let raw_as_sampled = SampledToken::Content(raw_token); - - if model.is_eog_token(&raw_as_sampled) { - break; - } - - sampled_count += 1; - - batch.clear(); - batch.add(&raw_as_sampled, position, &[0], true)?; - position += 1; - - context.decode(&mut 
batch)?; - } - - let _ = classifier.flush(); - - assert!( - sampled_count > 0, - "Should produce at least one token without grammar" - ); - let usage = classifier.into_usage(); - assert!( - usage.completion_tokens() >= sampled_count, - "completion_tokens ({}) must include the {sampled_count} non-EOG samples", - usage.completion_tokens() - ); - - Ok(()) -} diff --git a/llama-cpp-bindings-tests/tests/model_chat_template.rs b/llama-cpp-bindings-tests/tests/model_chat_template.rs new file mode 100644 index 00000000..88511471 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_chat_template.rs @@ -0,0 +1,194 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::ChatTemplateError; +use llama_cpp_bindings::model::LlamaChatMessage; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn chat_template_returns_non_empty(fixture: &LlamaFixture<'_>) -> 
Result<()> { + let template = fixture.model.chat_template(None); + assert!(template.is_ok()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn apply_chat_template_produces_prompt(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let message = LlamaChatMessage::new("user".to_string(), "hello".to_string())?; + let prompt = model.apply_chat_template(&template, &[message], true); + + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn apply_chat_template_buffer_resize_with_long_messages(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let template = model.chat_template(None)?; + let long_content = "a".repeat(2000); + let message = LlamaChatMessage::new("user".to_string(), long_content)?; + let prompt = model.apply_chat_template(&template, &[message], true); + + assert!(prompt.is_ok()); + assert!(!prompt?.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn chat_template_with_nonexistent_name_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture + .model + .chat_template(Some("nonexistent_template_name_xyz")); + assert_eq!(result.unwrap_err(), ChatTemplateError::MissingTemplate); + Ok(()) +} + +llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/model_context_creation.rs b/llama-cpp-bindings-tests/tests/model_context_creation.rs new file mode 100644 index 00000000..300027ec --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_context_creation.rs @@ -0,0 +1,106 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +fn new_context_returns_valid_context(fixture: &LlamaFixture<'_>) -> Result<()> { + let context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + assert!(context.n_ctx() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 
4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4294967295, + n_batch = 128, + n_ubatch = 64, +)] +fn new_context_with_huge_ctx_returns_null_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ); + + assert!(result.is_err()); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_helpers.rs b/llama-cpp-bindings-tests/tests/model_helpers.rs index 7605521c..3efeae82 100644 --- a/llama-cpp-bindings-tests/tests/model_helpers.rs +++ b/llama-cpp-bindings-tests/tests/model_helpers.rs @@ -1,12 +1,24 @@ -use anyhow::Result; -use llama_cpp_bindings_tests::FixtureSession; - -#[test] -fn debug_format_includes_struct_name_and_model_field() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" +)] - let formatted = format!("{model:?}"); +use anyhow::Result; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, 
+ use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn debug_format_includes_struct_name_and_model_field(fixture: &LlamaFixture<'_>) -> Result<()> { + let formatted = format!("{:?}", fixture.model); assert!(formatted.contains("LlamaModel")); assert!(formatted.contains("model")); @@ -14,49 +26,78 @@ fn debug_format_includes_struct_name_and_model_field() -> Result<()> { Ok(()) } -#[test] -fn embedding_model_tool_call_markers_call_does_not_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let embedding_model = fixture.embedding_model()?; - - let _markers = embedding_model.tool_call_markers(); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn embedding_model_tool_call_markers_call_does_not_panic(fixture: &LlamaFixture<'_>) -> Result<()> { + let _markers = fixture.model.tool_call_markers(); Ok(()) } -#[test] -fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls() -> Result<()> { - let fixture = FixtureSession::open()?; - let embedding_model = fixture.embedding_model()?; - - // The exact set of detected markers depends on the embedding model's chat template; - // assertion is just that the call returns Ok without panicking, exercising the - // streaming_markers + autoparser-fallthrough + override-detect paths even on a model - // that lacks tool calls. 
- let _markers = embedding_model.streaming_markers()?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn embedding_model_streaming_markers_returns_ok_for_a_model_without_tool_calls( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let _markers = fixture.model.streaming_markers()?; Ok(()) } -#[test] -fn approximate_tok_env_is_cached_across_calls() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - let first = model.approximate_tok_env(); - let second = model.approximate_tok_env(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn approximate_tok_env_is_cached_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.approximate_tok_env(); + let second = fixture.model.approximate_tok_env(); assert!(std::sync::Arc::ptr_eq(&first, &second)); Ok(()) } -#[test] -fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable() -> Result<()> { - let fixture = FixtureSession::open()?; - let embedding_model = fixture.embedding_model()?; - - let _env = embedding_model.approximate_tok_env(); +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128 +)] +fn approximate_tok_env_falls_back_to_eos_when_eot_unavailable( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let env = fixture.model.approximate_tok_env(); + let env_again = fixture.model.approximate_tok_env(); + + assert!( + std::sync::Arc::ptr_eq(&env, &env_again), + "approximate_tok_env must return the same cached Arc for any model, 
including \ + the embedding model which lacks an EOT token (forcing the fallback-to-EOS path)" + ); Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_loading_errors.rs b/llama-cpp-bindings-tests/tests/model_loading_errors.rs new file mode 100644 index 00000000..cd36eb46 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_loading_errors.rs @@ -0,0 +1,172 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::path::Path; + +use anyhow::Result; +use llama_cpp_bindings::LlamaModelLoadError; +use llama_cpp_bindings::model::LlamaModel; +use llama_cpp_bindings::model::params::LlamaModelParams; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn load_model_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let model_params = LlamaModelParams::default(); + let result = + LlamaModel::load_from_file(fixture.backend, 
"/nonexistent/model.gguf", &model_params); + + assert!(matches!( + result.unwrap_err(), + LlamaModelLoadError::FileNotFound(path) if path == Path::new("/nonexistent/model.gguf"), + )); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn load_model_with_invalid_file_content_returns_unloadable_or_reported( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model_params = LlamaModelParams::default(); + let dummy_path = std::env::temp_dir().join("llama_test_invalid_model.gguf"); + std::fs::write(&dummy_path, b"not a valid gguf model file")?; + + let result = LlamaModel::load_from_file(fixture.backend, &dummy_path, &model_params); + + assert!(matches!( + result.unwrap_err(), + LlamaModelLoadError::Unloadable | LlamaModelLoadError::Reported { .. 
}, + )); + let _ = std::fs::remove_file(&dummy_path); + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn load_model_with_non_utf8_path_returns_path_to_str_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let model_params = LlamaModelParams::default(); + let non_utf8_path = Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); + + let result = LlamaModel::load_from_file(fixture.backend, non_utf8_path, &model_params); + + assert!(matches!( + result.unwrap_err(), + LlamaModelLoadError::PathToStrError(path) if path == non_utf8_path.to_path_buf() + )); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs b/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs new file mode 100644 index 00000000..ae04dad8 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_lora_adapter_errors.rs @@ -0,0 +1,162 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use 
std::path::PathBuf; + +use anyhow::Result; +use llama_cpp_bindings::LlamaLoraAdapterInitError; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_invalid_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture + .model + .lora_adapter_init("/nonexistent/path/lora.gguf"); + assert_eq!( + result.unwrap_err(), + LlamaLoraAdapterInitError::FileNotFound(PathBuf::from("/nonexistent/path/lora.gguf")) + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_invalid_gguf_returns_unloadable( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let dummy_path = std::env::temp_dir().join("llama_test_dummy_lora.gguf"); + std::fs::write(&dummy_path, b"not a valid gguf")?; + + let result = fixture.model.lora_adapter_init(&dummy_path); + + assert_eq!(result.unwrap_err(), LlamaLoraAdapterInitError::Unloadable); + let _ = std::fs::remove_file(&dummy_path); + Ok(()) +} + +#[cfg(unix)] +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn lora_adapter_init_with_non_utf8_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + use std::path::Path; + + let non_utf8_path = 
Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); + let result = fixture.model.lora_adapter_init(non_utf8_path); + + assert_eq!( + result.unwrap_err(), + LlamaLoraAdapterInitError::PathToStrError(non_utf8_path.to_path_buf()) + ); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_metadata_kv.rs b/llama-cpp-bindings-tests/tests/model_metadata_kv.rs new file mode 100644 index 00000000..7d99b859 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_metadata_kv.rs @@ -0,0 +1,355 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_count_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.meta_count() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_key_by_index_returns_valid_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let key = fixture.model.meta_key_by_index(0)?; + assert!(!key.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_by_index_returns_valid_value(fixture: &LlamaFixture<'_>) -> Result<()> { + 
let value = fixture.model.meta_val_str_by_index(0)?; + assert!(!value.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_key_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_key_by_index(999_999); + assert!(result.is_err()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_by_index_out_of_range_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str_by_index(999_999); + assert!(result.is_err()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_returns_value_for_known_key(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let first_key = model.meta_key_by_index(0)?; + let value = model.meta_val_str(&first_key)?; + assert!(!value.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_with_long_value_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let count = model.meta_count(); + + for index in 0..count { + let key = model.meta_key_by_index(index); + let value = model.meta_val_str_by_index(index); + assert!(key.is_ok()); + assert!(value.is_ok()); + } + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn meta_val_str_with_null_byte_in_key_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = fixture.model.meta_val_str("key\0with_null"); + assert!(result.is_err()); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_params.rs 
b/llama-cpp-bindings-tests/tests/model_params.rs index 59bd7d51..6684625e 100644 --- a/llama-cpp-bindings-tests/tests/model_params.rs +++ b/llama-cpp-bindings-tests/tests/model_params.rs @@ -1,3 +1,8 @@ +#![expect( + clippy::similar_names, + reason = "model_path_str and model_path_cstr are both genuinely needed; renaming would not improve clarity" +)] + use std::ffi::CString; use std::pin::pin; @@ -5,17 +10,49 @@ use anyhow::Result; use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::max_devices; use llama_cpp_bindings::model::params::LlamaModelParams; -use llama_cpp_bindings_tests::FixtureSession; -use llama_cpp_bindings_tests::test_model; -use serial_test::serial; - -#[test] -#[serial] -fn fit_params_succeeds_with_test_model() -> Result<()> { - let _fixture = FixtureSession::open()?; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; - let model_path = test_model::download_model()?; - let model_path_str = model_path +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
fit_params_succeeds_with_test_model(fixture: &LlamaFixture<'_>) -> Result<()> { + let model_path_str = fixture + .model_path .to_str() .ok_or_else(|| anyhow::anyhow!("model path is not valid UTF-8"))?; let model_path_cstr = CString::new(model_path_str)?; @@ -37,3 +74,5 @@ fn fit_params_succeeds_with_test_model() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_properties.rs b/llama-cpp-bindings-tests/tests/model_properties.rs new file mode 100644 index 00000000..ec872710 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_properties.rs @@ -0,0 +1,423 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn model_loads_with_valid_metadata(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + + assert!(model.n_vocab() > 0); + assert!(model.n_embd() > 
0); + assert!(model.n_params() > 0); + assert!(model.n_ctx_train()? > 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_layer_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_layer()? 
> 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_head_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head()? 
> 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_head_kv_returns_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.n_head_kv()? 
> 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn model_size_returns_nonzero(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.model.size() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(!fixture.model.is_recurrent()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_hybrid_returns_false_for_non_hybrid_default_models( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + assert!( + !fixture.model.is_hybrid(), + "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_hybrid_returns_true_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + fixture.model.is_hybrid(), + "Qwen 3.5 and Qwen 3.6 default GGUFs are reported as hybrid by llama.cpp; got is_hybrid=false" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + 
n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn rope_type_returns_a_known_variant_for_rope_carrying_default_models( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + use llama_cpp_bindings::model::rope_type::RopeType; + let rope = fixture.model.rope_type(); + assert!( + matches!( + rope, + Some(RopeType::Norm | RopeType::NeoX | RopeType::MRope | RopeType::Vision) + ), + "rope_type must be a known variant for DeepSeek and GLM-4.7; got {rope:?}" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn rope_type_returns_none_for_qwen_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { + let rope = fixture.model.rope_type(); + assert!( + rope.is_none(), + "Qwen 3.5 and Qwen 3.6 default GGUFs do not expose a rope_type in their metadata; got {rope:?}" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn vocab_type_returns_a_known_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + use llama_cpp_bindings::model::vocab_type::VocabType; + let vocab = fixture.model.vocab_type()?; + assert!( + matches!(vocab, VocabType::BPE | VocabType::SPM), + "vocab_type must be a known variant; got {vocab:?}" + ); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_sampling.rs b/llama-cpp-bindings-tests/tests/model_sampling.rs new file mode 100644 index 00000000..97e1326b --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_sampling.rs @@ -0,0 +1,454 @@ +use anyhow::Result; +use llama_cpp_bindings::SampledToken; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::json_schema_to_grammar; +use llama_cpp_bindings::llama_batch::LlamaBatch; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_bindings::sampling::LlamaSampler; +use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 256, + n_batch = 128, + n_ubatch = 64, +)] +fn sample_returns_result_and_succeeds_with_valid_index(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let tokens = model.str_to_token("Hello", AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let result = sampler.sample(&context, batch.n_tokens() - 1); + + assert!(result.is_ok()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn grammar_sampler_constrains_output_to_yes_or_no(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky 
blue? Answer yes or no.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r"root ::= [Yy] [Ee] [Ss] | [Nn] [Oo]", "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + let first_char = piece + .chars() + .next() + .ok_or_else(|| anyhow::anyhow!("piece should have at least one character"))? 
+ .to_lowercase() + .next() + .ok_or_else(|| anyhow::anyhow!("lowercase iterator should yield a character"))?; + + assert!( + first_char == 'y' || first_char == 'n', + "Grammar should constrain first token to start with y/n, got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn json_schema_grammar_sampler_constrains_output_to_json(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nWhat is 2+2? 
Respond with a JSON object.<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let grammar_str = json_schema_to_grammar( + r#"{"type": "object", "properties": {"answer": {"type": "string"}}, "required": ["answer"]}"#, + )?; + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, &grammar_str, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let mut classifier = model.sampled_token_classifier(); + let (raw_token, mut outcomes) = + classifier.sample(&mut sampler, &context, batch.n_tokens() - 1)?; + outcomes.extend(classifier.flush()); + + assert_eq!( + outcomes.len(), + 1, + "expected one finalised outcome after flush" + ); + let outcome = &outcomes[0]; + + let raw_as_sampled = SampledToken::Content(raw_token); + assert!( + !model.is_eog_token(&raw_as_sampled), + "Grammar sampler should not allow EOS as first token" + ); + + let piece = &outcome.raw_piece; + + assert!( + piece.starts_with('{'), + "JSON schema grammar should constrain first token to start with '{{', got: '{piece}'" + ); + assert_eq!( + classifier.usage().completion_tokens(), + 1, + "exactly one completion token sampled" + ); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, 
+ n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn sample_with_grammar_produces_constrained_output_in_loop( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = "<|im_start|>user\nIs the sky blue? yes or no<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + let mut classifier = model.sampled_token_classifier(); + classifier.feed_prompt_sequence_to_batch(&mut batch, &tokens, 0, false)?; + + context.decode(&mut batch)?; + classifier.commit_prompt_tokens(); + + let mut sampler = LlamaSampler::chain_simple([ + LlamaSampler::grammar(model, r#"root ::= "yes" | "no""#, "root")?, + LlamaSampler::temp(0.8), + LlamaSampler::greedy(), + ]); + + let initial_position = batch.n_tokens(); + let outcome = ClassifySampleLoop { + model, + classifier: &mut classifier, + sampler: &mut sampler, + context: &mut context, + batch: &mut batch, + initial_position, + max_generated_tokens: 10, + } + .run()?; + + let lowercase = outcome.generated_raw.to_lowercase(); + assert!( + lowercase == "yes" || lowercase == "no", + "Grammar loop should produce 'yes' or 'no', got: '{}'", + outcome.generated_raw + ); + assert!( + outcome.eog_seen, + "loop must terminate via EOG once grammar accepts, not by exhausting the budget; outcome={outcome:?}" + ); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); + assert!(outcome.observed_content > 0); + + let usage = classifier.into_usage(); + assert_eq!(usage.completion_tokens(), 
outcome.observed_content); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut context = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let prompt = + "<|im_start|>user\nSay hello<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + let tokens = model.str_to_token(prompt, AddBos::Always)?; + let mut batch = LlamaBatch::new(512, 1)?; + + batch.add_sequence(&tokens, 0, false)?; + + context.decode(&mut batch)?; + + let mut sampler = LlamaSampler::chain_simple([LlamaSampler::temp(0.8), LlamaSampler::greedy()]); + + let mut classifier = model.sampled_token_classifier(); + let mut sampled_count: u64 = 0; + let mut position = batch.n_tokens(); + + for _ in 0..5 { + let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; + let raw_as_sampled = SampledToken::Content(raw_token); + + if 
model.is_eog_token(&raw_as_sampled) { + break; + } + + sampled_count += 1; + + batch.clear(); + batch.add(&raw_as_sampled, position, &[0], true)?; + position += 1; + + context.decode(&mut batch)?; + } + + let _ = classifier.flush(); + + assert!( + sampled_count > 0, + "Should produce at least one token without grammar" + ); + let usage = classifier.into_usage(); + assert!( + usage.completion_tokens() >= sampled_count, + "completion_tokens ({}) must include the {sampled_count} non-EOG samples", + usage.completion_tokens() + ); + + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_special_tokens.rs b/llama-cpp-bindings-tests/tests/model_special_tokens.rs new file mode 100644 index 00000000..c719501b --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_special_tokens.rs @@ -0,0 +1,381 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::SampledToken; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn special_tokens_exist(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let eos = model.token_eos(); + + assert_ne!(bos, eos); + assert!(model.is_eog_token(&SampledToken::Content(eos))); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_nl_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let nl_token = fixture.model.token_nl(); + assert!(nl_token.0 >= 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::Reasoning(eos))); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::ToolCall(eos))); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn is_eog_token_classifies_undeterminable_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let eos = model.token_eos(); + assert!(model.is_eog_token(&SampledToken::Undeterminable(eos))); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn decode_start_token_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.decode_start_token(); + let n_vocab = model.n_vocab(); + assert!( + token.0 == -1 || 
(0..n_vocab).contains(&token.0), + "decode_start_token must be either -1 (no decoder-start defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.decode_start_token(), + "decode_start_token must be deterministic across calls" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_sep_returns_valid_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let token = model.token_sep(); + let n_vocab = model.n_vocab(); + assert!( + token.0 == -1 || (0..n_vocab).contains(&token.0), + "token_sep must be either -1 (no SEP token defined) or a valid vocab index < {n_vocab}; got {token}" + ); + assert_eq!( + token, + model.token_sep(), + "token_sep must be deterministic across calls" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", 
"GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_attr_returns_attrs_for_bos(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let bos = model.token_bos(); + let attrs = model.token_attr(bos)?; + let bit_repr = format!("{:?}", *attrs); + assert!( + !bit_repr.is_empty(), + "token_attr(bos) must produce Debug output" + ); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_str_to_token.rs b/llama-cpp-bindings-tests/tests/model_str_to_token.rs new file mode 100644 index 00000000..ea8ebb9c --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_str_to_token.rs @@ -0,0 +1,210 @@ +use anyhow::Result; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_roundtrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello world", AddBos::Never)?; + assert!(!tokens.is_empty()); + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let piece = model.token_to_piece( + &llama_cpp_bindings::SampledToken::Content(tokens[0]), + &mut decoder, + false, + None, + )?; + + assert!(!piece.is_empty()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_grows_buffer_when_initial_estimation_too_small( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let many_short_chars = "a b c d e f g h i j k l"; + let tokens = fixture + .model + .str_to_token(many_short_chars, AddBos::Always)?; + + assert!( + tokens.len() > 8, + "expected regrow; got {} tokens", + tokens.len() + ); + + Ok(()) +} + 
+#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_with_add_bos_never(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens_with_bos = model.str_to_token("hello", AddBos::Always)?; + let tokens_without_bos = model.str_to_token("hello", AddBos::Never)?; + + assert!(tokens_with_bos.len() >= tokens_without_bos.len()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn str_to_token_with_many_tokens_triggers_buffer_resize(fixture: &LlamaFixture<'_>) -> Result<()> { + use std::fmt::Write; + + let many_numbers = (0..2000).fold(String::new(), |mut accumulator, number| { + let _ = write!(accumulator, "{number} "); + accumulator + }); + + let tokens = fixture.model.str_to_token(&many_numbers, AddBos::Always)?; + + assert!(tokens.len() > many_numbers.len() / 2); + + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_token_to_piece.rs b/llama-cpp-bindings-tests/tests/model_token_to_piece.rs new file mode 100644 index 00000000..b86d391b --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_token_to_piece.rs @@ -0,0 +1,364 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use std::num::NonZeroU16; + +use anyhow::Result; +use llama_cpp_bindings::SampledToken; +use llama_cpp_bindings::model::AddBos; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_bytes_returns_bytes_for_known_token(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let bytes = model.token_to_piece_bytes(tokens[0], 32, false, None)?; + + assert!(!bytes.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_handles_large_token_requiring_buffer_resize( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + + for (token, _) in model.tokens(true).take(200) { + let result = model.token_to_piece(&SampledToken::Content(token), &mut decoder, true, None); + assert!(result.is_ok()); + } + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = 
true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_bytes_insufficient_buffer_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece_bytes(tokens[0], 1, false, None); + + assert!( + result + .unwrap_err() + .to_string() + .contains("Insufficient Buffer Space") + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + 
n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_with_lstrip(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hello", AddBos::Never)?; + let result = model.token_to_piece( + &SampledToken::Content(tokens[0]), + &mut decoder, + false, + NonZeroU16::new(1), + ); + + assert!(result.is_ok()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_reasoning_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Reasoning(tokens[0]), + &mut decoder, + true, + None, + )?; + + assert!(!piece.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + 
model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_tool_call_variant(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = + model.token_to_piece(&SampledToken::ToolCall(tokens[0]), &mut decoder, true, None)?; + + assert!(!piece.is_empty()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn token_to_piece_decodes_undeterminable_variant(fixture: 
&LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut decoder = encoding_rs::UTF_8.new_decoder(); + let tokens = model.str_to_token("hi", AddBos::Never)?; + + let piece = model.token_to_piece( + &SampledToken::Undeterminable(tokens[0]), + &mut decoder, + true, + None, + )?; + + assert!(!piece.is_empty()); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs b/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs new file mode 100644 index 00000000..3f9ad9da --- /dev/null +++ b/llama-cpp-bindings-tests/tests/model_tokens_iterator.rs @@ -0,0 +1,109 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn tokens_iterator_produces_valid_entries(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mut count = 0; + + for (token, 
_piece_result) in model.tokens(false) { + assert!(token.0 >= 0); + count += 1; + + if count >= 100 { + break; + } + } + + assert_eq!(count, 100); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn n_vocab_matches_tokens_iterator_count(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let n_vocab = model.n_vocab(); + let count = model.tokens(false).count(); + + assert_eq!(count, usize::try_from(n_vocab)?); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd.rs b/llama-cpp-bindings-tests/tests/mtmd.rs deleted file mode 100644 index cd0057bf..00000000 --- a/llama-cpp-bindings-tests/tests/mtmd.rs +++ /dev/null @@ -1,554 +0,0 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - -use anyhow::Result; -use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdContext; -use 
llama_cpp_bindings::mtmd::MtmdContextParams; -use llama_cpp_bindings::mtmd::MtmdEvalError; -use llama_cpp_bindings::mtmd::MtmdInputChunkType; -use llama_cpp_bindings::mtmd::MtmdInputChunks; -use llama_cpp_bindings::mtmd::MtmdInputText; -use llama_cpp_bindings_tests::FixtureSession; -use llama_cpp_bindings_tests::test_model; -use serial_test::serial; - -fn eval_synthetic_bitmap( - backend: &LlamaBackend, - model: &LlamaModel, - mtmd_ctx: &MtmdContext, - width: u32, - height: u32, -) -> Result<()> { - let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; - let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let context_size = u32::try_from(n_positions + 256).unwrap_or(8192); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(context_size)); - let llama_ctx = LlamaContext::from_model(model, backend, ctx_params)?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; - - Ok(()) -} - -#[test] -#[serial] -fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; - - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(64)); - let llama_ctx = LlamaContext::from_model(model, backend, ctx_params)?; - - let chunks = MtmdInputChunks::new()?; - let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?; - - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); - - assert!(matches!( - result, - Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. 
}) - )); - - Ok(()) -} - -#[test] -#[serial] -fn from_buffer_creates_bitmap_from_image_bytes() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_bytes = std::fs::read(&image_path)?; - let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; - - assert!(bitmap.nx() > 0); - assert!(bitmap.ny() > 0); - assert!(!bitmap.is_audio()); - - Ok(()) -} - -#[test] -#[serial] -fn from_file_with_null_byte_in_path_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); - - assert!(result.is_err()); - - Ok(()) -} - -#[test] -#[serial] -fn text_chunk_has_text_type() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello world <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - - Ok(()) -} - -#[test] -#[serial] -fn text_chunk_returns_text_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello world <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let tokens = 
first_chunk.text_tokens(); - - assert!(tokens.is_some()); - assert!(!tokens.expect("tokens should be some").is_empty()); - - Ok(()) -} - -#[test] -#[serial] -fn chunk_n_tokens_is_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello world <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - - assert!(first_chunk.n_tokens() > 0); - - Ok(()) -} - -#[test] -#[serial] -fn chunk_n_positions_is_positive() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello world <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - - assert!(first_chunk.n_positions() > 0); - - Ok(()) -} - -#[test] -#[serial] -fn copy_creates_owned_duplicate() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let copied = first_chunk.copy()?; - - assert!(copied.owned); - 
assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); - - Ok(()) -} - -#[test] -#[serial] -fn text_chunk_id_returns_none() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - - assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); - assert!(first_chunk.id().is_none()); - - Ok(()) -} - -#[test] -#[serial] -fn image_chunk_returns_none_for_text_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.text_tokens().is_none()); - - return Ok(()); - } - } - - Ok(()) -} - -#[test] -#[serial] -fn image_chunk_id_returns_some() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - for chunk_index in 0..chunks.len() { - 
let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - assert!(chunk.id().is_some()); - - return Ok(()); - } - } - - Ok(()) -} - -#[test] -#[serial] -fn init_and_supports_vision() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - assert!(mtmd_ctx.support_vision()); - - Ok(()) -} - -#[test] -#[serial] -fn tokenize_text_with_image() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe this image: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - assert!(!chunks.is_empty()); - assert!(chunks.total_tokens() > 0); - - Ok(()) -} - -#[test] -#[serial] -fn eval_chunks_with_standard_image() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; - - let fixtures = test_model::fixtures_dir(); - let image_path = fixtures.join("llamas.jpg"); - let image_path_str = image_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; - let input_text = MtmdInputText { - text: "What is in this image? 
<__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let n_positions = chunks.total_positions(); - let context_size = u32::try_from(n_positions + 256).unwrap_or(2048); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(context_size)); - let llama_ctx = LlamaContext::from_model(model, backend, ctx_params)?; - let n_batch = i32::try_from(llama_ctx.n_batch())?; - let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); - - assert!(result.is_ok()); - - Ok(()) -} - -#[test] -#[serial] -fn eval_chunks_with_varied_dimensions() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; - - let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; - - for (width, height) in test_dimensions { - let result = eval_synthetic_bitmap(backend, model, mtmd_ctx, width, height); - - assert!( - result.is_ok(), - "dimension {width}x{height} should succeed: {result:?}" - ); - } - - Ok(()) -} - -#[test] -#[serial] -fn decode_use_non_causal_returns_bool() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Hello world <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - let first_chunk = chunks - .get(0) - .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; - let _non_causal = mtmd_ctx.decode_use_non_causal(&first_chunk); - - Ok(()) -} - -#[test] -#[serial] -fn decode_use_mrope_returns_bool() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let _mrope = mtmd_ctx.decode_use_mrope(); 
- - Ok(()) -} - -#[test] -#[serial] -fn support_audio_returns_bool() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let _audio = mtmd_ctx.support_audio(); - - Ok(()) -} - -#[test] -#[serial] -fn get_audio_sample_rate_returns_option() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let _rate = mtmd_ctx.get_audio_sample_rate(); - - Ok(()) -} - -#[test] -#[serial] -fn encode_chunk_succeeds_for_image_chunk() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let input_text = MtmdInputText { - text: "Describe: <__media__>".to_string(), - add_special: true, - parse_special: true, - }; - let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; - - for chunk_index in 0..chunks.len() { - let chunk = chunks - .get(chunk_index) - .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; - if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { - let result = mtmd_ctx.encode_chunk(&chunk); - - assert!(result.is_ok()); - - return Ok(()); - } - } - - Ok(()) -} - -#[test] -#[serial] -fn tokenize_bitmap_count_mismatch_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let input_text = MtmdInputText { - text: "No media markers here".to_string(), - add_special: true, - parse_special: true, - }; - let image_data = vec![128u8; 64 * 64 * 3]; - let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; - let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); - - assert!(result.is_err()); - - Ok(()) -} - -#[test] -#[serial] -fn eval_chunks_with_extreme_dimensions_does_not_crash() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = 
fixture.mtmd_context()?; - - let extreme_dimensions: [(u32, u32); 6] = [ - (1, 1), - (7, 13), - (3, 1000), - (1000, 3), - (1920, 1080), - (4096, 4096), - ]; - - let mut any_reached_eval = false; - - for (width, height) in extreme_dimensions { - match eval_synthetic_bitmap(backend, model, mtmd_ctx, width, height) { - Ok(()) => any_reached_eval = true, - Err(error) => eprintln!(" {width}x{height} failed: {error}"), - } - } - - assert!( - any_reached_eval, - "at least one extreme dimension should reach eval_chunks" - ); - - Ok(()) -} - -#[test] -#[serial] -fn init_from_file_with_null_byte_in_path_returns_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - let mtmd_params = MtmdContextParams::default(); - let result = MtmdContext::init_from_file("path\0null", model, &mtmd_params); - - assert!(result.is_err()); -} - -#[test] -#[serial] -fn tokenize_with_null_byte_in_text_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let mtmd_ctx = fixture.mtmd_context()?; - - let input_text = MtmdInputText { - text: "text\0null".to_string(), - add_special: true, - parse_special: true, - }; - let result = mtmd_ctx.tokenize(input_text, &[]); - - assert!(result.is_err()); - - Ok(()) -} diff --git a/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs b/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs new file mode 100644 index 00000000..3c66f82f --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_bitmap.rs @@ -0,0 +1,81 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings_tests::test_model; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + 
n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn from_buffer_creates_bitmap_from_image_bytes(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = test_model::fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_bytes = std::fs::read(&image_path)?; + let bitmap = MtmdBitmap::from_buffer(mtmd_ctx, &image_bytes)?; + + assert!(bitmap.nx() > 0); + assert!(bitmap.ny() > 0); + assert!(!bitmap.is_audio()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let result = MtmdBitmap::from_file(mtmd_ctx, "path\0null"); + + assert!(result.is_err()); + + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs 
b/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs new file mode 100644 index 00000000..8a960774 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_chunk_operations.rs @@ -0,0 +1,147 @@ +use anyhow::Result; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings::mtmd::MtmdInputChunkType; +use llama_cpp_bindings::mtmd::MtmdInputText; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn copy_creates_owned_duplicate(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Hello <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let copied = first_chunk.copy()?; + + assert!(copied.owned); + assert_eq!(copied.n_tokens(), first_chunk.n_tokens()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 
64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn encode_chunk_succeeds_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let result = mtmd_ctx.encode_chunk(&chunk); + assert!(result.is_ok()); + return Ok(()); + } + } + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn decode_use_non_causal_returns_bool_for_image_chunk(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let 
image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + let value = mtmd_ctx.decode_use_non_causal(&chunk); + let printed = format!("{value:?}"); + assert!( + !printed.is_empty(), + "decode_use_non_causal must return a Debug-printable bool" + ); + return Ok(()); + } + } + anyhow::bail!("tokenization should produce at least one Image chunk"); +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs b/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs new file mode 100644 index 00000000..1114af3c --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_chunk_structure.rs @@ -0,0 +1,242 @@ +use anyhow::Result; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings::mtmd::MtmdInputChunkType; +use llama_cpp_bindings::mtmd::MtmdInputText; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +fn tokenize_synthetic( + fixture: &LlamaFixture<'_>, + prompt: &str, +) -> Result { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: prompt.to_owned(), + add_special: true, + parse_special: true, + }; + Ok(mtmd_ctx.tokenize(input_text, &[&bitmap])?) 
+} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_has_text_type(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_returns_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + let tokens = first_chunk.text_tokens(); + assert!(tokens.is_some()); + assert!(!tokens.expect("tokens should be some").is_empty()); + Ok(()) +} + +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn chunk_n_tokens_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_tokens() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn chunk_n_positions_is_positive(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello world <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert!(first_chunk.n_positions() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + 
mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn text_chunk_id_returns_none(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + let first_chunk = chunks + .get(0) + .ok_or_else(|| anyhow::anyhow!("missing first chunk"))?; + assert_eq!(first_chunk.chunk_type()?, MtmdInputChunkType::Text); + assert!(first_chunk.id().is_none()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_returns_none_for_text_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.text_tokens().is_none()); + return Ok(()); + } + } + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + 
n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn image_chunk_id_returns_some(fixture: &LlamaFixture<'_>) -> Result<()> { + let chunks = tokenize_synthetic(fixture, "Hello <__media__>")?; + for chunk_index in 0..chunks.len() { + let chunk = chunks + .get(chunk_index) + .ok_or_else(|| anyhow::anyhow!("missing chunk at index {chunk_index}"))?; + if chunk.chunk_type() == Ok(MtmdInputChunkType::Image) { + assert!(chunk.id().is_some()); + return Ok(()); + } + } + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_context.rs b/llama-cpp-bindings-tests/tests/mtmd_context.rs new file mode 100644 index 00000000..8595eb2b --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_context.rs @@ -0,0 +1,162 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::mtmd::MtmdContext; +use llama_cpp_bindings::mtmd::MtmdContextParams; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + 
n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn init_and_supports_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!(mtmd_ctx.support_vision()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn init_from_file_with_null_byte_in_path_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_params = MtmdContextParams::default(); + let result = MtmdContext::init_from_file("path\0null", fixture.model, &mtmd_params); + + assert!(result.is_err()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn decode_use_mrope_is_true_for_qwen_vision(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + 
mtmd_ctx.decode_use_mrope(), + "Qwen 3.5 / 3.6 mmproj uses mrope; decode_use_mrope must return true" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn support_audio_is_false_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + !mtmd_ctx.support_audio(), + "Qwen 3.5 / 3.6 mmproj is vision-only; support_audio must return false" + ); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn get_audio_sample_rate_is_none_for_vision_only_mmproj(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + assert!( + mtmd_ctx.get_audio_sample_rate().is_none(), + "Qwen 3.5 / 3.6 mmproj has no audio; get_audio_sample_rate must return None" + ); + Ok(()) +} + 
+llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs b/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs new file mode 100644 index 00000000..b6f30f1c --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_evaluation.rs @@ -0,0 +1,236 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::context::LlamaContext; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings::mtmd::MtmdEvalError; +use llama_cpp_bindings::mtmd::MtmdInputChunks; +use llama_cpp_bindings::mtmd::MtmdInputText; +use llama_cpp_bindings_tests::test_model; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +fn eval_synthetic_bitmap(fixture: &LlamaFixture<'_>, width: u32, height: u32) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; (width as usize) * (height as usize) * 3]; + let bitmap = MtmdBitmap::from_image_data(width, height, &image_data)?; + let input_text = MtmdInputText { + text: "Describe: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + if fixture.context_params.n_ctx < required_n_ctx { + anyhow::bail!( + "fixture n_ctx ({}) below required ({}) for {}x{} image", + fixture.context_params.n_ctx, + required_n_ctx, + width, + height, + ); + } + + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false)?; + Ok(()) +} + +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 64, + n_batch = 64, + n_ubatch = 32, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_returns_batch_size_exceeds_context_limit_for_huge_batch( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + + let chunks = MtmdInputChunks::new()?; + let huge_batch = i32::try_from(llama_ctx.n_batch() + 1)?; + + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, huge_batch, false); + + assert!(matches!( + result, + Err(MtmdEvalError::BatchSizeExceedsContextLimit { .. 
}) + )); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_standard_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let fixtures = test_model::fixtures_dir(); + let image_path = fixtures.join("llamas.jpg"); + let image_path_str = image_path + .to_str() + .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; + let input_text = MtmdInputText { + text: "What is in this image? 
<__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + let n_positions = chunks.total_positions(); + let required_n_ctx = u32::try_from(n_positions + 256)?; + assert!( + fixture.context_params.n_ctx >= required_n_ctx, + "fixture n_ctx ({}) below required ({}); update the attribute literal", + fixture.context_params.n_ctx, + required_n_ctx, + ); + + let llama_ctx = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let n_batch = i32::try_from(llama_ctx.n_batch())?; + let result = chunks.eval_chunks(mtmd_ctx, &llama_ctx, 0, 0, n_batch, false); + + assert!(result.is_ok()); + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_varied_dimensions(fixture: &LlamaFixture<'_>) -> Result<()> { + let test_dimensions: [(u32, u32); 4] = [(224, 224), (512, 512), (100, 500), (337, 421)]; + + for (width, height) in test_dimensions { + let result = eval_synthetic_bitmap(fixture, width, height); + assert!( + result.is_ok(), + "dimension {width}x{height} should succeed: {result:?}" + ); + } + + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn eval_chunks_with_extreme_dimensions_does_not_crash(fixture: &LlamaFixture<'_>) -> Result<()> { + let extreme_dimensions: [(u32, u32); 6] = [ + (1, 1), + (7, 13), + (3, 1000), + (1000, 3), + (1920, 1080), + (4096, 4096), + ]; + + let mut any_reached_eval = false; + + for (width, height) in extreme_dimensions { + match eval_synthetic_bitmap(fixture, width, height) { + Ok(()) => any_reached_eval = true, + Err(error) => eprintln!(" {width}x{height} failed: {error}"), + } + } + + assert!( + any_reached_eval, + "at least one extreme dimension should reach eval_chunks" + ); + + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs b/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs new file mode 100644 index 00000000..ae5f32c3 --- /dev/null +++ b/llama-cpp-bindings-tests/tests/mtmd_tokenization.rs @@ -0,0 +1,121 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + +use anyhow::Result; +use llama_cpp_bindings::mtmd::MtmdBitmap; +use llama_cpp_bindings::mtmd::MtmdInputText; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_text_with_image(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let input_text = MtmdInputText { + text: "Describe this image: <__media__>".to_string(), + add_special: true, + parse_special: true, + }; + let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; + + assert!(!chunks.is_empty()); + assert!(chunks.total_tokens() > 0); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_bitmap_count_mismatch_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "No media markers here".to_string(), + add_special: true, + parse_special: true, + }; + let image_data = vec![128u8; 64 * 64 * 3]; + let bitmap = MtmdBitmap::from_image_data(64, 64, &image_data)?; + let result = mtmd_ctx.tokenize(input_text, &[&bitmap]); + assert!(result.is_err()); + Ok(()) +} + +#[llama_test( + model_source = 
HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn tokenize_with_null_byte_in_text_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + let input_text = MtmdInputText { + text: "text\0null".to_string(), + add_special: true, + parse_special: true, + }; + let result = mtmd_ctx.tokenize(input_text, &[]); + assert!(result.is_err()); + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/multimodal.rs b/llama-cpp-bindings-tests/tests/multimodal.rs index b87f93c6..efd07c35 100644 --- a/llama-cpp-bindings-tests/tests/multimodal.rs +++ b/llama-cpp-bindings-tests/tests/multimodal.rs @@ -1,18 +1,16 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::{Context, Result}; use llama_cpp_bindings::SampledTokenClassifier; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::{LlamaChatMessage, LlamaModel}; use llama_cpp_bindings::mtmd::{MtmdBitmap, MtmdInputChunkType, MtmdInputChunks, MtmdInputText}; use llama_cpp_bindings::sampled_token::SampledToken; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_sys::llama_pos; -use llama_cpp_bindings_tests::{FixtureSession, test_model}; +use llama_cpp_bindings_tests::test_model; +use llama_cpp_test_harness::LlamaFixture; 
+use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; struct ChunkTokenBreakdown { text: u64, @@ -108,19 +106,28 @@ fn drive_sampling_loop( Ok(totals) } -#[test] -fn multimodal_vision_inference_produces_output() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; - - let n_ctx = NonZeroU32::new(4096); - let ctx_params = LlamaContextParams::default() - .with_n_ctx(n_ctx) - .with_n_batch(512); - let mut ctx = LlamaContext::from_model(model, backend, ctx_params) - .with_context(|| "unable to create llama context")?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +fn multimodal_vision_inference_produces_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create llama context")?; assert!( mtmd_ctx.support_vision(), @@ -203,3 +210,5 @@ fn multimodal_vision_inference_produces_output() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/parse_chat_message.rs b/llama-cpp-bindings-tests/tests/parse_chat_message.rs index 05b64269..d23fe1c2 100644 --- a/llama-cpp-bindings-tests/tests/parse_chat_message.rs +++ b/llama-cpp-bindings-tests/tests/parse_chat_message.rs @@ -1,14 +1,55 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + use anyhow::Result; use anyhow::bail; 
use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings_tests::FixtureSession; - -#[test] -fn parses_pure_content_response() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; - let outcome = model.parse_chat_message("[]", "hello world", false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_pure_content_response(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message("[]", "hello world", false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for plain content; got Unrecognized"); @@ -20,13 +61,45 @@ fn parses_pure_content_response() -> Result<()> { Ok(()) } -#[test] -fn parses_reasoning_section_into_reasoning_content() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_reasoning_section_into_reasoning_content(fixture: &LlamaFixture<'_>) -> Result<()> { let input = "step one, step two\n\nactual response"; - let outcome = model.parse_chat_message("[]", input, false)?; + let outcome = fixture.model.parse_chat_message("[]", input, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for reasoning section; got Unrecognized"); @@ -41,12 +114,44 @@ fn parses_reasoning_section_into_reasoning_content() -> Result<()> { Ok(()) } -#[test] -fn parses_empty_input_yields_empty_message() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - let outcome = model.parse_chat_message("[]", "", false)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, 
+ n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_empty_input_yields_empty_message(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture.model.parse_chat_message("[]", "", false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for empty input; got Unrecognized"); @@ -56,12 +161,48 @@ fn parses_empty_input_yields_empty_message() -> Result<()> { Ok(()) } -#[test] -fn parses_malformed_tools_json_returns_tools_json_invalid_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let result = model.parse_chat_message("not_a_json[}", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 
64, +)] +fn parses_malformed_tools_json_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("not_a_json[}", "hello", false); assert!(matches!( result, @@ -69,27 +210,101 @@ fn parses_malformed_tools_json_returns_tools_json_invalid_error() { _ )) )); + Ok(()) } -#[test] -fn parses_non_array_tools_json_returns_tools_json_not_array_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let result = model.parse_chat_message("{\"foo\": 1}", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_non_array_tools_json_returns_tools_json_not_array_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("{\"foo\": 1}", "hello", false); assert!(matches!( result, Err(llama_cpp_bindings::ParseChatMessageError::ToolsJsonNotArray) )); + Ok(()) } -#[test] -fn parses_with_tools_null_byte_returns_tools_json_invalid_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = 
fixture.default_model(); - - let result = model.parse_chat_message("[]\0extra", "hello", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_with_tools_null_byte_returns_tools_json_invalid_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]\0extra", "hello", false); assert!(matches!( result, @@ -97,17 +312,57 @@ fn parses_with_tools_null_byte_returns_tools_json_invalid_error() { _ )) )); + Ok(()) } -#[test] -fn parses_with_input_null_byte_returns_tools_serialization_error() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let result = model.parse_chat_message("[]", "hello\0world", false); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn parses_with_input_null_byte_returns_tools_serialization_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let result = fixture + .model + .parse_chat_message("[]", "hello\0world", false); assert!(matches!( result, Err(llama_cpp_bindings::ParseChatMessageError::ToolsSerialization(_)) )); + Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs index 88d40f95..260dd0f6 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_chat_inference_emits_reasoning_when_template_auto_opens.rs @@ -2,32 +2,35 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use 
llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; -#[test] -fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let context_params = LlamaContextParams::default(); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let chat_template = model.chat_template(None)?; let messages = vec![LlamaChatMessage::new( @@ -51,7 +54,7 @@ fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<() let mut sampler = LlamaSampler::greedy(); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -61,52 +64,24 @@ fn qwen35_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<() } .run()?; - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.5 must generate at least one token" - 
); - assert!( - outcome.observed_reasoning > 0, - "Qwen3.5 chat template auto-opens reasoning, so the classifier must emit at \ - least one Reasoning token; outcome={outcome:?}" - ); - assert!( - outcome.observed_content > 0, - "Qwen3.5 must emit at least one Content token after ; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.5 chat template auto-opens reasoning, so the classifier must never emit \ - Undeterminable; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !parsed.content.is_empty(), - "parser must see post- content in generated text; generated={:?}", - outcome.generated_raw - ); + assert!(!parsed.content.is_empty()); let usage = classifier.into_usage(); - assert_eq!( - usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.5 with auto-opening reasoning must never produce Undeterminable" - ); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); Ok(()) } + +llama_tests_main!(); diff --git 
a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index 075ea34b..df0a9b80 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,27 +1,15 @@ -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -// Mirrors what Qwen3.5's chat template renders when `enable_thinking=false`: -// the assistant header is followed by a closed empty `...` -// block, so generation begins in CONTENT — no reasoning tokens should ever be -// classified. 
const QWEN35_THINKING_DISABLED_PROMPT: &str = "\ <|im_start|>user What is 2 + 2?<|im_end|> @@ -34,14 +22,20 @@ What is 2 + 2?<|im_end|> const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(QWEN35_THINKING_DISABLED_PROMPT, AddBos::Never)?; @@ -50,8 +44,11 @@ fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -68,7 +65,7 @@ fn qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -80,50 +77,19 @@ fn 
qwen35_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let usage = classifier.usage(); - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.5 must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "Qwen3.5 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the think block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.5 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "Qwen3.5 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.5 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "Qwen3.5 thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "Qwen3.5 thinking-disabled: completion tokens must equal observed Content tokens" - ); + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "Qwen3.5 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs 
b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs index 76671c96..f9c98932 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning.rs @@ -1,33 +1,17 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -// Budget tuned so the close marker reliably emits — enough thinking space for a -// concise question. The companion prompt is intentionally direct so the model -// finishes thinking quickly and emits . const MAX_GENERATED_TOKENS: i32 = 1500; -// Qwen3.5's chat template injects `\n` directly into the generation prompt -// when `enable_thinking=true` (the default). The legacy `llama_chat_apply_template` -// path bypasses that jinja branch, so we craft the prompt manually to faithfully -// reproduce the production case where the model resumes generation already inside -// the reasoning block. 
const QWEN35_THINKING_PROMPT: &str = "\ <|im_start|>user What is 2 + 2?<|im_end|> @@ -37,14 +21,20 @@ What is 2 + 2?<|im_end|> const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(QWEN35_THINKING_PROMPT, AddBos::Never)?; @@ -53,17 +43,17 @@ fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; let promoted = classifier.commit_prompt_tokens(); assert_eq!(promoted, prompt_token_count); - // Mirrors paddler's production sampler chain: rep penalty + top_k/top_p/min_p + - // temp + dist. The 0.8B model loops on plain greedy; this chain breaks the - // loop and lets the model emit `` reliably. 
let mut sampler = LlamaSampler::chain_simple([ LlamaSampler::penalties(64, 1.1, 0.0, 0.0), LlamaSampler::top_k(40), @@ -74,7 +64,7 @@ fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -90,71 +80,32 @@ fn qwen35_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> bail!("Qwen3.5 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.5 must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "Qwen3.5: classifier must emit at least one Reasoning token when the prompt \ - opens a block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "Qwen3.5: usage.reasoning_tokens must be non-zero when the prompt opens a \ - block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.5: prompt-token replay must move section to Reasoning before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.5: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); assert_eq!( usage.completion_tokens(), outcome.observed_content + outcome.observed_reasoning, - "Qwen3.5: completion tokens must equal observed Content + Reasoning" ); - // Qwen3.5-0.8B genuinely loops on simple prompts even with rep penalty + - // sampling — it cannot reliably close the reasoning block within a tight - // budget. 
Skip the strict leak assertions when the model never emitted - // ; the parser-equality check is meaningless then. if parsed.reasoning_content.is_empty() { eprintln!( "Qwen3.5 didn't close its reasoning block within {MAX_GENERATED_TOKENS} tokens — \ skipping strict parser-equality assertions" ); } else { - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "Qwen3.5: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "Qwen3.5: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", - ); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); } for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "Qwen3.5: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "Qwen3.5: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs index be1578f8..414fde9a 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs @@ -1,32 +1,52 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::Result; use 
llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::mtmd::MtmdBitmap; use llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::FixtureSession; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; use llama_cpp_bindings_tests::test_model::fixtures_dir; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -#[test] -fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let mtmd_ctx = fixture.mtmd_context()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 4096, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); - let context_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(4096)) - .with_n_batch(512); - let mut context = LlamaContext::from_model(model, backend, 
context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path @@ -87,3 +107,5 @@ fn qwen35_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs index 712f09d3..f517a4e7 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_parses_constrained_schema_payload.rs @@ -2,17 +2,12 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; use serde_json::Value; use serde_json::json; -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; - const NEGOTIATE_WITH_CAT_TOOLS_JSON: &str = r#"[ { "type": "function", @@ -68,16 +63,17 @@ fn arguments_as_json(arguments: &ToolCallArguments) -> Result<&Value> { } } -#[test] -fn qwen35_parses_constrained_schema_payload() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message( 
+#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_constrained_schema_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture.model.parse_chat_message( NEGOTIATE_WITH_CAT_TOOLS_JSON, NEGOTIATE_WITH_CAT_INPUT, false, @@ -90,12 +86,7 @@ fn qwen35_parses_constrained_schema_payload() -> Result<()> { ); }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected exactly one parsed tool call; got {:?}", - parsed.tool_calls - ); + assert_eq!(parsed.tool_calls.len(), 1); assert_eq!(parsed.tool_calls[0].name, "negotiate_with_cat"); assert_eq!(parsed.tool_calls[0].id, "call_0"); assert_eq!( @@ -109,3 +100,5 @@ fn qwen35_parses_constrained_schema_payload() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs b/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs index 28efc3fc..2fe2b89c 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_parses_tool_call_payload.rs @@ -2,14 +2,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::ToolCallArguments; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -53,31 +48,24 @@ Berlin\n\ \n\ "; 
-fn load_qwen35() -> Result<(LlamaBackend, LlamaModel)> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - Ok((backend, model)) -} - -#[test] -fn qwen35_parses_tool_call_payload() -> Result<()> { - let (_backend, model) = load_qwen35()?; - - let outcome = model.parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_tool_call_payload(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, QWEN_XML_PAYLOAD, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for Qwen XML on a Qwen-3.5 model; got Unrecognized"); }; - assert_eq!( - parsed.tool_calls.len(), - 1, - "expected one tool call; got {:?}", - parsed.tool_calls - ); + assert_eq!(parsed.tool_calls.len(), 1); assert_eq!(parsed.tool_calls[0].name, "get_weather"); let location = match &parsed.tool_calls[0].arguments { ToolCallArguments::ValidJson(value) => value @@ -93,11 +81,19 @@ fn qwen35_parses_tool_call_payload() -> Result<()> { Ok(()) } -#[test] -fn qwen35_parses_partial_tool_call_returns_pending_state() -> Result<()> { - let (_backend, model) = load_qwen35()?; - - let outcome = model.parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_partial_tool_call_returns_pending_state(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = 
fixture + .model + .parse_chat_message(TOOLS_JSON, PARTIAL_QWEN_XML_PAYLOAD, true)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!("expected Recognized for partial Qwen XML on a Qwen-3.5 model; got Unrecognized"); @@ -107,11 +103,19 @@ fn qwen35_parses_partial_tool_call_returns_pending_state() -> Result<()> { Ok(()) } -#[test] -fn qwen35_parses_multiple_tool_calls() -> Result<()> { - let (_backend, model) = load_qwen35()?; - - let outcome = model.parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_parses_multiple_tool_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, TWO_QWEN_XML_PAYLOADS, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -126,3 +130,5 @@ fn qwen35_parses_multiple_tool_calls() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs b/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs index b4ea9692..96b76cf5 100644 --- a/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs +++ b/llama-cpp-bindings-tests/tests/qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested.rs @@ -1,14 +1,9 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; -use llama_cpp_bindings::llama_backend::LlamaBackend; -use llama_cpp_bindings::model::LlamaModel; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; 
-use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN35_REPO: &str = "unsloth/Qwen3.5-0.8B-GGUF"; -const QWEN35_FILE: &str = "Qwen3.5-0.8B-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const TOOLS_JSON: &str = r#"[ { @@ -29,17 +24,21 @@ const TOOLS_JSON: &str = r#"[ const PLAIN_CONTENT: &str = "Sorry, I cannot help with that."; -#[test] -fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested() --> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN35_REPO, QWEN35_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let outcome = model.parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_requested( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let outcome = fixture + .model + .parse_chat_message(TOOLS_JSON, PLAIN_CONTENT, false)?; let ChatMessageParseOutcome::Recognized(parsed) = outcome else { bail!( @@ -55,3 +54,5 @@ fn qwen35_recognizes_empty_tool_calls_when_input_is_plain_content_with_tools_req Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs b/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs index f402f0be..233cef95 100644 --- a/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs +++ b/llama-cpp-bindings-tests/tests/qwen36_chat_inference_emits_reasoning_when_template_auto_opens.rs @@ -2,32 
+2,35 @@ use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::model::LlamaChatMessage; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -const QWEN36_REPO: &str = "unsloth/Qwen3.6-35B-A3B-GGUF"; -const QWEN36_FILE: &str = "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; -#[test] -fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN36_REPO, QWEN36_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; - - let context_params = LlamaContextParams::default(); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + 
(*fixture.context_params).into_llama_context_params(), + )?; let chat_template = model.chat_template(None)?; let messages = vec![LlamaChatMessage::new( @@ -51,7 +54,7 @@ fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<() let mut sampler = LlamaSampler::greedy(); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -61,52 +64,24 @@ fn qwen36_chat_inference_emits_reasoning_when_template_auto_opens() -> Result<() } .run()?; - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.6 must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "Qwen3.6 chat template auto-opens reasoning, so the classifier must emit at \ - least one Reasoning token; outcome={outcome:?}" - ); - assert!( - outcome.observed_content > 0, - "Qwen3.6 must emit at least one Content token after ; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.6 chat template auto-opens reasoning, so the classifier must never emit \ - Undeterminable; outcome={outcome:?}" - ); - assert_eq!( - outcome.observed_tool_call, 0, - "chat without tool definitions must not produce ToolCall tokens; outcome={outcome:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(outcome.observed_content > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(outcome.observed_tool_call, 0); let parse_outcome = model.parse_chat_message("[]", &outcome.generated_raw, false)?; let ChatMessageParseOutcome::Recognized(parsed) = parse_outcome else { bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !parsed.content.is_empty(), - "parser must see post- content in generated text; generated={:?}", - outcome.generated_raw - ); + assert!(!parsed.content.is_empty()); let usage = classifier.into_usage(); - assert_eq!( 
- usage.prompt_tokens, prompt_token_count, - "prompt_tokens must equal the tokenizer's prompt length" - ); - assert_eq!( - usage.reasoning_tokens, outcome.observed_reasoning, - "reasoning_tokens must equal observed Reasoning variants" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.6 with auto-opening reasoning must never produce Undeterminable" - ); + assert_eq!(usage.prompt_tokens, prompt_token_count); + assert_eq!(usage.reasoning_tokens, outcome.observed_reasoning); + assert_eq!(usage.undeterminable_tokens, 0); Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs index aee03a2a..2b57fa17 100644 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs +++ b/llama-cpp-bindings-tests/tests/qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt.rs @@ -1,26 +1,15 @@ -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN36_REPO: &str = "unsloth/Qwen3.6-35B-A3B-GGUF"; -const QWEN36_FILE: &str = "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const 
MAX_GENERATED_TOKENS: i32 = 200; -// Mirrors what Qwen3.6's chat template renders when `enable_thinking=false`: -// the assistant header is followed by a closed empty `...` -// block, so generation begins in CONTENT. const QWEN36_THINKING_DISABLED_PROMPT: &str = "\ <|im_start|>user What is 2 + 2?<|im_end|> @@ -33,14 +22,20 @@ What is 2 + 2?<|im_end|> const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN36_REPO, QWEN36_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(QWEN36_THINKING_DISABLED_PROMPT, AddBos::Never)?; @@ -49,8 +44,11 @@ fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -67,7 +65,7 @@ fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R ]); let initial_position = 
batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -79,50 +77,19 @@ fn qwen36_classifier_does_not_emit_reasoning_for_thinking_disabled_prompt() -> R let usage = classifier.usage(); - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.6 must generate at least one token" - ); - assert_eq!( - outcome.observed_reasoning, 0, - "Qwen3.6 thinking-disabled: classifier must not emit any Reasoning token \ - when the prompt closes the think block before generation begins; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.6 thinking-disabled: prompt-token replay must move section to Content \ - before generation, so no Undeterminable tokens may be emitted; \ - generated={:?}", - outcome.generated_raw - ); - assert_eq!( - usage.reasoning_tokens, 0, - "Qwen3.6 thinking-disabled: usage.reasoning_tokens must be zero; usage={usage:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.6 thinking-disabled: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); - assert!( - outcome.observed_content > 0, - "Qwen3.6 thinking-disabled: classifier must emit at least one Content token" - ); - assert_eq!( - usage.completion_tokens(), - outcome.observed_content, - "Qwen3.6 thinking-disabled: completion tokens must equal observed Content tokens" - ); + assert!(!outcome.generated_raw.is_empty()); + assert_eq!(outcome.observed_reasoning, 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.reasoning_tokens, 0); + assert_eq!(usage.undeterminable_tokens, 0); + assert!(outcome.observed_content > 0); + assert_eq!(usage.completion_tokens(), outcome.observed_content); for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.content_stream.contains(forbidden), - "Qwen3.6 thinking-disabled: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + 
assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs index 19596fa6..c9c16a64 100644 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs +++ b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning.rs @@ -1,30 +1,17 @@ -use std::num::NonZeroU32; - use anyhow::Result; use anyhow::bail; use llama_cpp_bindings::ChatMessageParseOutcome; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; - -const QWEN36_REPO: &str = "unsloth/Qwen3.6-35B-A3B-GGUF"; -const QWEN36_FILE: &str = "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 1500; -// Qwen3.6's chat template injects `\n` directly into the generation prompt -// when `enable_thinking=true` (the default). The legacy `llama_chat_apply_template` -// path bypasses that jinja branch, so we craft the prompt manually to faithfully -// reproduce the production case where the model resumes generation already inside -// the reasoning block. 
const QWEN36_THINKING_PROMPT: &str = "\ <|im_start|>user What is 2 + 2?<|im_end|> @@ -34,14 +21,20 @@ What is 2 + 2?<|im_end|> const FORBIDDEN_MARKERS: &[&str] = &["", ""]; -#[test] -fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let path = download_file_from(QWEN36_REPO, QWEN36_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &path, ¶ms)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 2048, + n_ubatch = 512, +)] +fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; let mut classifier = model.sampled_token_classifier(); let prompt_tokens = model.str_to_token(QWEN36_THINKING_PROMPT, AddBos::Never)?; @@ -50,8 +43,11 @@ fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> let mut batch = LlamaBatch::new(2048, 1)?; classifier.feed_prompt_sequence_to_batch(&mut batch, &prompt_tokens, 0, false)?; - let context_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(8192)); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; context.decode(&mut batch)?; @@ -68,7 +64,7 @@ fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> ]); let initial_position = batch.n_tokens(); let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -84,68 +80,29 @@ fn qwen36_classifier_emits_reasoning_for_thinking_enabled_prompt() -> Result<()> 
bail!("Qwen3.6 chat template must be recognised by the parser; got Unrecognized"); }; - assert!( - !outcome.generated_raw.is_empty(), - "Qwen3.6 must generate at least one token" - ); - assert!( - outcome.observed_reasoning > 0, - "Qwen3.6: classifier must emit at least one Reasoning token when the prompt \ - opens a block; outcome={outcome:?}", - ); - assert!( - usage.reasoning_tokens > 0, - "Qwen3.6: usage.reasoning_tokens must be non-zero when the prompt opens a \ - block; usage was {usage:?}" - ); - assert_eq!( - outcome.observed_undeterminable, 0, - "Qwen3.6: prompt-token replay must move section to Reasoning before generation, \ - so no Undeterminable tokens may be emitted; outcome={outcome:?}" - ); - assert_eq!( - usage.undeterminable_tokens, 0, - "Qwen3.6: usage.undeterminable_tokens must be zero; usage={usage:?}" - ); + assert!(!outcome.generated_raw.is_empty()); + assert!(outcome.observed_reasoning > 0); + assert!(usage.reasoning_tokens > 0); + assert_eq!(outcome.observed_undeterminable, 0); + assert_eq!(usage.undeterminable_tokens, 0); assert_eq!( usage.completion_tokens(), outcome.observed_content + outcome.observed_reasoning, - "Qwen3.6: completion tokens must equal observed Content + Reasoning" ); if parsed.reasoning_content.is_empty() { - eprintln!( - "Qwen3.6 parser returned empty reasoning_content (likely a partial parse \ - over `<|im_end|>`-truncated output) — relying on the FORBIDDEN_MARKERS \ - substring check below for leak detection." 
- ); + eprintln!("Qwen3.6 parser returned empty reasoning_content — relying on FORBIDDEN_MARKERS"); } else { - assert_eq!( - outcome.reasoning_stream, parsed.reasoning_content, - "Qwen3.6: per-token reasoning stream must equal parser-side reasoning_content \ - (any difference means a marker leaked into the user-visible stream)", - ); - assert_eq!( - outcome.content_stream, parsed.content, - "Qwen3.6: per-token content stream must equal parser-side content \ - (any difference means a marker leaked into the user-visible stream)", - ); + assert_eq!(outcome.reasoning_stream, parsed.reasoning_content); + assert_eq!(outcome.content_stream, parsed.content); } for forbidden in FORBIDDEN_MARKERS { - assert!( - !outcome.reasoning_stream.contains(forbidden), - "Qwen3.6: reasoning_stream leaked marker {forbidden:?}; \ - reasoning_stream={:?}", - outcome.reasoning_stream - ); - assert!( - !outcome.content_stream.contains(forbidden), - "Qwen3.6: content_stream leaked marker {forbidden:?}; \ - content_stream={:?}", - outcome.content_stream - ); + assert!(!outcome.reasoning_stream.contains(forbidden)); + assert!(!outcome.content_stream.contains(forbidden)); } Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs index 1d9c1621..cf43adfd 100644 --- a/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs +++ b/llama-cpp-bindings-tests/tests/qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt.rs @@ -1,57 +1,48 @@ -#![cfg(feature = "multimodal_capable")] - -use std::num::NonZeroU32; - use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; -use llama_cpp_bindings::llama_backend::LlamaBackend; use llama_cpp_bindings::llama_batch::LlamaBatch; -use 
llama_cpp_bindings::model::LlamaModel; use llama_cpp_bindings::mtmd::MtmdBitmap; -use llama_cpp_bindings::mtmd::MtmdContext; -use llama_cpp_bindings::mtmd::MtmdContextParams; use llama_cpp_bindings::mtmd::MtmdInputText; use llama_cpp_bindings::mtmd::mtmd_default_marker; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; -use llama_cpp_bindings_tests::gpu_backend::inference_model_params; -use llama_cpp_bindings_tests::gpu_backend::require_compiled_backends_present; -use llama_cpp_bindings_tests::test_model::download_file_from; use llama_cpp_bindings_tests::test_model::fixtures_dir; - -const QWEN36_REPO: &str = "unsloth/Qwen3.6-35B-A3B-GGUF"; -const QWEN36_FILE: &str = "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"; -const QWEN36_MMPROJ_FILE: &str = "mmproj-F16.gguf"; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; const MAX_GENERATED_TOKENS: i32 = 200; -#[test] -fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result<()> { - let backend = LlamaBackend::init()?; - require_compiled_backends_present()?; - - let model_path = download_file_from(QWEN36_REPO, QWEN36_FILE)?; - let mmproj_path = download_file_from(QWEN36_REPO, QWEN36_MMPROJ_FILE)?; - let params = inference_model_params(); - let model = LlamaModel::load_from_file(&backend, &model_path, ¶ms)?; - - let mtmd_params = MtmdContextParams::default(); - let mmproj_str = mmproj_path - .to_str() - .ok_or_else(|| anyhow::anyhow!("mmproj path is not valid UTF-8"))?; - let mtmd_ctx = MtmdContext::init_from_file(mmproj_str, &model, &mtmd_params)?; - - let context_params = LlamaContextParams::default() - .with_n_ctx(NonZeroU32::new(8192)) - .with_n_batch(512); - let mut context = LlamaContext::from_model(&model, &backend, context_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 
999, + use_mmap = true, + use_mlock = false, + n_ctx = 8192, + n_batch = 512, + n_ubatch = 512, + mmproj_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "mmproj-F16.gguf"), +)] +fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mtmd_ctx = fixture + .mtmd_context + .expect("mmproj_file declared in attribute"); + + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let image_path = fixtures_dir().join("llamas.jpg"); let image_path_str = image_path .to_str() .ok_or_else(|| anyhow::anyhow!("image path is not valid UTF-8"))?; - let bitmap = MtmdBitmap::from_file(&mtmd_ctx, image_path_str)?; + let bitmap = MtmdBitmap::from_file(mtmd_ctx, image_path_str)?; let marker = mtmd_default_marker(); let prompt = format!( @@ -67,8 +58,7 @@ fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< let chunks = mtmd_ctx.tokenize(input_text, &[&bitmap])?; let mut classifier = model.sampled_token_classifier(); - let n_past = - classifier.eval_multimodal_chunks(&chunks, &mtmd_ctx, &context, 0, 0, 512, true)?; + let n_past = classifier.eval_multimodal_chunks(&chunks, mtmd_ctx, &context, 0, 0, 512, true)?; let mut sampler = LlamaSampler::chain_simple([ LlamaSampler::penalties(64, 1.1, 0.0, 0.0), @@ -81,7 +71,7 @@ fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< let mut batch = LlamaBatch::new(2048, 1)?; let outcome = ClassifySampleLoop { - model: &model, + model, classifier: &mut classifier, sampler: &mut sampler, context: &mut context, @@ -95,8 +85,7 @@ fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< if outcome.observed_reasoning == 0 { anyhow::bail!( - "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token \ - when the prompt opens a `` block; 
outcome={outcome:?}" + "Qwen 3.6 multimodal + thinking: classifier must emit at least one Reasoning token; outcome={outcome:?}" ); } if usage.reasoning_tokens == 0 { @@ -107,3 +96,5 @@ fn qwen36_classifier_emits_reasoning_for_multimodal_thinking_prompt() -> Result< Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/reranker.rs b/llama-cpp-bindings-tests/tests/reranker.rs index 08f0de6a..d08de7eb 100644 --- a/llama-cpp-bindings-tests/tests/reranker.rs +++ b/llama-cpp-bindings-tests/tests/reranker.rs @@ -2,11 +2,12 @@ use std::time::Duration; use anyhow::{Context, Result, bail}; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::ggml_time_us; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; -use llama_cpp_bindings_tests::FixtureSession; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; fn normalize(input: &[f32]) -> Vec { let magnitude = input @@ -25,11 +26,20 @@ fn cosine_similarity(vec_a: &[f32], vec_b: &[f32]) -> f32 { .sum::() } -#[test] -fn reranking_produces_scores() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.embedding_model()?; +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, + n_seq_max = 2, + n_threads_batch = 8, + embeddings = true, +)] +fn reranking_produces_scores(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let query = "What is machine learning?"; let documents = [ @@ -38,13 +48,18 @@ fn reranking_produces_scores() -> Result<()> { ]; let document_count = documents.len(); + assert_eq!( + u32::try_from(document_count)?, + fixture.context_params.n_seq_max, + 
"attribute n_seq_max must match the document count this trial expects", + ); - let ctx_params = LlamaContextParams::default() - .with_n_threads_batch(std::thread::available_parallelism()?.get().try_into()?) - .with_n_seq_max(u32::try_from(document_count)?) - .with_embeddings(true); - let mut ctx = LlamaContext::from_model(model, backend, ctx_params) - .with_context(|| "unable to create context")?; + let mut ctx = LlamaContext::from_model( + model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; let prompt_lines: Vec = documents .iter() @@ -139,3 +154,5 @@ fn reranking_produces_scores() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs b/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs index e1c4fef3..4127fc58 100644 --- a/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs +++ b/llama-cpp-bindings-tests/tests/sampled_token_classifier_markers.rs @@ -1,48 +1,148 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "trial fns share the harness LlamaTestFn signature even when their bodies never propagate" +)] + use anyhow::Result; use llama_cpp_bindings::SampledToken; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::sampled_token_classifier::SampledTokenClassifier; use llama_cpp_bindings::sampled_token_section::SampledTokenSection; use llama_cpp_bindings::streaming_markers::StreamingMarkers; -use llama_cpp_bindings_tests::FixtureSession; - -#[test] -fn classifier_starts_in_pending_section_for_default_fixture() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let classifier = model.sampled_token_classifier(); +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = 
HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn classifier_starts_in_pending_section_for_default_fixture( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let classifier = fixture.model.sampled_token_classifier(); assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) } -#[test] -fn classifier_construction_is_idempotent_across_calls() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - - let first = model.sampled_token_classifier(); - let second = model.sampled_token_classifier(); +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + 
use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn classifier_construction_is_idempotent_across_calls(fixture: &LlamaFixture<'_>) -> Result<()> { + let first = fixture.model.sampled_token_classifier(); + let second = fixture.model.sampled_token_classifier(); assert_eq!(first.current_section(), second.current_section()); assert_eq!(first.usage(), second.usage()); -} - -#[test] -fn diagnose_tool_call_synthetic_renders_runs_without_panic() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - - let _ = model.diagnose_tool_call_synthetic_renders()?; - Ok(()) } -#[test] -fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn 
ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let outcomes = classifier.ingest(model.token_bos()); @@ -55,26 +155,96 @@ fn ingest_with_no_markers_emits_undeterminable_with_visible_and_raw_piece() { )); assert_eq!(outcome.visible_piece, outcome.raw_piece); assert_eq!(classifier.usage().undeterminable_tokens, 1); + Ok(()) } -#[test] -fn ingest_with_no_markers_decodes_each_token_independently() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn ingest_with_no_markers_decodes_each_token_independently( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let _ = classifier.ingest(model.token_bos()); let _ = classifier.ingest(model.token_eos()); assert_eq!(classifier.usage().undeterminable_tokens, 2); + Ok(()) } -#[test] -fn 
ingest_prompt_token_with_no_markers_is_a_noop() { - let fixture = FixtureSession::open().expect("open fixture"); - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn ingest_prompt_token_with_no_markers_is_a_noop(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let usage_before = *classifier.usage(); @@ -83,13 +253,47 @@ fn ingest_prompt_token_with_no_markers_is_a_noop() { assert_eq!(*classifier.usage(), usage_before); assert_eq!(classifier.current_section(), SampledTokenSection::Pending); + Ok(()) } -#[test] -fn feed_prompt_to_batch_increments_pending_prompt_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = 
HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn feed_prompt_to_batch_increments_pending_prompt_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let mut batch = LlamaBatch::new(8, 1)?; @@ -102,11 +306,44 @@ fn feed_prompt_to_batch_increments_pending_prompt_tokens() -> Result<()> { Ok(()) } -#[test] -fn feed_prompt_sequence_to_batch_stages_all_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn feed_prompt_sequence_to_batch_stages_all_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let mut batch = LlamaBatch::new(8, 1)?; @@ -119,11 +356,46 @@ fn feed_prompt_sequence_to_batch_stages_all_tokens() -> Result<()> { Ok(()) } -#[test] -fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let mut batch = LlamaBatch::new(8, 1)?; @@ -139,11 +411,46 @@ fn commit_prompt_tokens_promotes_pending_count_to_usage_and_clears() -> Result<( Ok(()) } -#[test] -fn discard_pending_prompt_tokens_clears_count_without_recording_usage() -> Result<()> { - let 
fixture = FixtureSession::open()?; - let model = fixture.default_model(); - +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn discard_pending_prompt_tokens_clears_count_without_recording_usage( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let model = fixture.model; let mut classifier = SampledTokenClassifier::new(model, StreamingMarkers::default()); let mut batch = LlamaBatch::new(8, 1)?; @@ -157,3 +464,50 @@ fn discard_pending_prompt_tokens_clears_count_without_recording_usage() -> Resul Ok(()) } + +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock 
= false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, +)] +fn diagnose_tool_call_synthetic_renders_returns_a_pair_of_strings( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let (left, right) = fixture.model.diagnose_tool_call_synthetic_renders()?; + let _ = left; + let _ = right; + Ok(()) +} + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/sampling.rs b/llama-cpp-bindings-tests/tests/sampling.rs index 5c1120fe..d03e965e 100644 --- a/llama-cpp-bindings-tests/tests/sampling.rs +++ b/llama-cpp-bindings-tests/tests/sampling.rs @@ -1,144 +1,246 @@ -use std::num::NonZeroU32; +#![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" +)] use anyhow::Result; use llama_cpp_bindings::GrammarError; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::sampling::LlamaSampler; use llama_cpp_bindings::token::LlamaToken; -use llama_cpp_bindings_tests::FixtureSession; -use serial_test::serial; - -#[test] -#[serial] -fn dry_sampler_with_model() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_model(fixture: &LlamaFixture<'_>) -> Result<()> { let breakers: Vec<&[u8]> = vec![b"\n", b"\t"]; - 
let _sampler = LlamaSampler::dry(model, 1.5, 2.0, 128, 2, &breakers); + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); Ok(()) } -#[test] -#[serial] -fn dry_sampler_with_null_byte_in_seq_breakers_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_null_byte_in_seq_breakers_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { let breakers: Vec<&[u8]> = vec![b"hello\0world"]; - let result = LlamaSampler::dry(model, 1.5, 2.0, 128, 2, breakers); + let result = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, breakers); assert!(result.is_err()); Ok(()) } -#[test] -#[serial] -fn grammar_returns_sampler_for_valid_grammar() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let sampler = LlamaSampler::grammar(model, "root ::= \"hello\"", "root"); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_returns_sampler_for_valid_grammar(fixture: &LlamaFixture<'_>) -> Result<()> { + let sampler = LlamaSampler::grammar(fixture.model, "root ::= \"hello\"", "root"); assert!(sampler.is_ok()); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_returns_sampler_for_valid_grammar_with_triggers() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn 
grammar_lazy_returns_sampler_for_valid_grammar_with_triggers( + fixture: &LlamaFixture<'_>, +) -> Result<()> { let trigger_words: Vec<&[u8]> = vec![b"function"]; - let sampler = - LlamaSampler::grammar_lazy(model, "root ::= \"hello\"", "root", trigger_words, &[]); + let sampler = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); assert!(sampler.is_ok()); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let patterns = vec!["\\{.*".to_string()]; - let sampler = - LlamaSampler::grammar_lazy_patterns(model, "root ::= \"hello\"", "root", &patterns, &[]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_returns_sampler_for_valid_grammar_with_patterns( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let sampler = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); assert!(sampler.is_ok()); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_with_root_not_found_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_with_root_not_found_returns_error(fixture: &LlamaFixture<'_>) -> Result<()> { let trigger_words: Vec<&[u8]> = vec![b"function"]; - let result = - LlamaSampler::grammar_lazy(model, "expr ::= \"hello\"", "root", trigger_words, &[]); + let result = LlamaSampler::grammar_lazy( + 
fixture.model, + "expr ::= \"hello\"", + "root", + trigger_words, + &[], + ); assert!(matches!(result, Err(GrammarError::RootNotFound))); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_with_null_byte_in_trigger_word_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_with_null_byte_in_trigger_word_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { let trigger_words: Vec<&[u8]> = vec![b"hel\0lo"]; - let result = - LlamaSampler::grammar_lazy(model, "root ::= \"hello\"", "root", trigger_words, &[]); + let result = LlamaSampler::grammar_lazy( + fixture.model, + "root ::= \"hello\"", + "root", + trigger_words, + &[], + ); assert!(matches!(result, Err(GrammarError::TriggerWordNullBytes(_)))); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_patterns_with_root_not_found_returns_error() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let patterns = vec!["\\{.*".to_string()]; - let result = - LlamaSampler::grammar_lazy_patterns(model, "expr ::= \"hello\"", "root", &patterns, &[]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_root_not_found_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["\\{.*".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "expr ::= \"hello\"", + "root", + &patterns, + &[], + ); assert!(matches!(result, Err(GrammarError::RootNotFound))); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error() 
-> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let patterns = vec!["hel\0lo".to_string()]; - let result = - LlamaSampler::grammar_lazy_patterns(model, "root ::= \"hello\"", "root", &patterns, &[]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_null_byte_in_pattern_returns_error( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["hel\0lo".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); assert!(matches!(result, Err(GrammarError::GrammarNullBytes(_)))); Ok(()) } -#[test] -#[serial] -fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let patterns = vec!["[".to_string()]; - let result = - LlamaSampler::grammar_lazy_patterns(model, "root ::= \"hello\"", "root", &patterns, &[]); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern( + fixture: &LlamaFixture<'_>, +) -> Result<()> { + let patterns = vec!["[".to_owned()]; + let result = LlamaSampler::grammar_lazy_patterns( + fixture.model, + "root ::= \"hello\"", + "root", + &patterns, + &[], + ); assert!(matches!( result, @@ -148,79 +250,121 @@ fn grammar_lazy_patterns_with_malformed_regex_returns_invalid_trigger_pattern() Ok(()) } -#[test] -#[serial] -fn llguidance_method_creates_sampler() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); - let result 
= LlamaSampler::llguidance(model, "regex", r"yes|no"); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn llguidance_method_creates_sampler(fixture: &LlamaFixture<'_>) -> Result<()> { + let result = LlamaSampler::llguidance(fixture.model, "regex", r"yes|no"); assert!(result.is_ok()); Ok(()) } -#[test] -#[serial] -fn logit_bias_with_empty_biases_succeeds() { +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn logit_bias_with_empty_biases_succeeds(_fixture: &LlamaFixture<'_>) -> Result<()> { let result = LlamaSampler::logit_bias(0, &[]); assert!(result.is_ok()); + + Ok(()) } -#[test] -#[serial] -fn dry_sampler_with_root_not_found_grammar_does_not_apply() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn dry_sampler_with_root_not_found_grammar_does_not_apply( + fixture: &LlamaFixture<'_>, +) -> Result<()> { let breakers: Vec<&[u8]> = vec![b"\n"]; - let _sampler = LlamaSampler::dry(model, 1.5, 2.0, 128, 2, &breakers); + let _sampler = LlamaSampler::dry(fixture.model, 1.5, 2.0, 128, 2, &breakers); Ok(()) } -#[test] -#[serial] -fn accept_many_iterates_over_borrowed_tokens() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + 
n_ubatch = 512, +)] +fn accept_many_iterates_over_borrowed_tokens(fixture: &LlamaFixture<'_>) -> Result<()> { let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = vec![model.token_bos(), model.token_eos()]; + let tokens = vec![fixture.model.token_bos(), fixture.model.token_eos()]; sampler.accept_many(&tokens)?; Ok(()) } -#[test] -#[serial] -fn with_tokens_returns_self_after_accepting_each_token() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn with_tokens_returns_self_after_accepting_each_token(fixture: &LlamaFixture<'_>) -> Result<()> { let sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - let tokens = [model.token_bos(), model.token_eos()]; + let tokens = [fixture.model.token_bos(), fixture.model.token_eos()]; let _consumed = sampler.with_tokens(tokens.iter().copied())?; Ok(()) } -#[test] -#[serial] -fn accept_consumes_a_single_token() -> Result<()> { - let fixture = FixtureSession::open()?; - let model = fixture.default_model(); +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn accept_consumes_a_single_token(fixture: &LlamaFixture<'_>) -> Result<()> { let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); - sampler.accept(model.token_bos())?; + sampler.accept(fixture.model.token_bos())?; Ok(()) } -#[test] -#[serial] -fn try_accept_returns_ok_for_a_valid_token() -> Result<()> { - let _fixture = FixtureSession::open()?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + 
use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn try_accept_returns_ok_for_a_valid_token(_fixture: &LlamaFixture<'_>) -> Result<()> { let mut sampler = LlamaSampler::chain_simple([LlamaSampler::greedy()]); sampler.try_accept(LlamaToken::new(0))?; @@ -228,21 +372,22 @@ fn try_accept_returns_ok_for_a_valid_token() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn apply_runs_sampler_over_token_data_array() -> Result<()> { - use std::num::NonZeroU32; - - use llama_cpp_bindings::context::params::LlamaContextParams; - use llama_cpp_bindings::llama_batch::LlamaBatch; - use llama_cpp_bindings::model::AddBos; - - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("Hi", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn apply_runs_sampler_over_token_data_array(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hi", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -254,15 +399,22 @@ fn apply_runs_sampler_over_token_data_array() -> Result<()> { Ok(()) } -#[test] -#[serial] -fn sample_returns_token_after_decode() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - let ctx_params = LlamaContextParams::default().with_n_ctx(NonZeroU32::new(512)); - let mut context = 
LlamaContext::from_model(model, backend, ctx_params)?; - let tokens = model.str_to_token("Hello", AddBos::Always)?; +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 2048, + n_ubatch = 512, +)] +fn sample_returns_token_after_decode(fixture: &LlamaFixture<'_>) -> Result<()> { + let mut context = LlamaContext::from_model( + fixture.model, + fixture.backend, + (*fixture.context_params).into_llama_context_params(), + )?; + let tokens = fixture.model.str_to_token("Hello", AddBos::Always)?; let mut batch = LlamaBatch::new(512, 1)?; batch.add_sequence(&tokens, 0, false)?; context.decode(&mut batch)?; @@ -273,3 +425,5 @@ fn sample_returns_token_after_decode() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-tests/tests/text_generation.rs b/llama-cpp-bindings-tests/tests/text_generation.rs index ad59463b..57fd54d7 100644 --- a/llama-cpp-bindings-tests/tests/text_generation.rs +++ b/llama-cpp-bindings-tests/tests/text_generation.rs @@ -4,25 +4,62 @@ use std::time::Duration; use anyhow::Context as _; use anyhow::Result; use llama_cpp_bindings::context::LlamaContext; -use llama_cpp_bindings::context::params::LlamaContextParams; use llama_cpp_bindings::ggml_time_us; use llama_cpp_bindings::llama_batch::LlamaBatch; use llama_cpp_bindings::model::AddBos; use llama_cpp_bindings::model::LlamaChatMessage; use llama_cpp_bindings::sampled_token::SampledToken; use llama_cpp_bindings::sampling::LlamaSampler; -use llama_cpp_bindings_tests::FixtureSession; use llama_cpp_bindings_tests::classify_sample_loop::ClassifySampleLoop; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::llama_tests_main; -#[test] -fn raw_prompt_completion_with_timing() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = 
fixture.default_model(); - - let ctx_params = LlamaContextParams::default(); - let mut ctx = LlamaContext::from_model(model, backend, ctx_params) - .with_context(|| "unable to create context")?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", "DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 512, + n_ubatch = 128, +)] +fn raw_prompt_completion_with_timing(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut ctx = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + ) + .with_context(|| "unable to create context")?; let prompt = "Hello my name is"; let max_generated_tokens: i32 = 64; @@ -130,14 +167,50 @@ fn raw_prompt_completion_with_timing() -> Result<()> { Ok(()) } -#[test] -fn chat_inference_produces_coherent_output() -> Result<()> { - let fixture = FixtureSession::open()?; - let backend = fixture.backend(); - let model = fixture.default_model(); - - let context_params = LlamaContextParams::default(); - let mut context = LlamaContext::from_model(model, backend, context_params)?; +#[llama_test( + model_source = HuggingFace("unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF", 
"DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/GLM-4.7-Flash-GGUF", "GLM-4.7-Flash-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.6-35B-A3B-GGUF", "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 2048, + n_batch = 512, + n_ubatch = 128, +)] +fn chat_inference_produces_coherent_output(fixture: &LlamaFixture<'_>) -> Result<()> { + let model = fixture.model; + let backend = fixture.backend; + let mut context = LlamaContext::from_model( + model, + backend, + (*fixture.context_params).into_llama_context_params(), + )?; let chat_template = model.chat_template(None)?; let messages = vec![LlamaChatMessage::new( @@ -221,3 +294,5 @@ fn chat_inference_produces_coherent_output() -> Result<()> { Ok(()) } + +llama_tests_main!(); diff --git a/llama-cpp-bindings-types/src/token_usage.rs b/llama-cpp-bindings-types/src/token_usage.rs index 7bf67448..78036eb8 100644 --- a/llama-cpp-bindings-types/src/token_usage.rs +++ b/llama-cpp-bindings-types/src/token_usage.rs @@ -201,26 +201,28 @@ mod tests { } #[test] - fn record_cached_below_prompt_succeeds_and_accumulates() -> Result<(), TokenUsageError> { + fn record_cached_below_prompt_succeeds_and_accumulates() { let mut usage = TokenUsage::new(); usage.record_prompt_tokens(10); - usage.record_cached_prompt_tokens(3)?; - usage.record_cached_prompt_tokens(4)?; + usage + .record_cached_prompt_tokens(3) + .expect("3 cached <= 10 prompt is valid"); + usage + 
.record_cached_prompt_tokens(4) + .expect("3+4 cached <= 10 prompt is valid"); assert_eq!(usage.cached_prompt_tokens, 7); - - Ok(()) } #[test] - fn record_cached_equal_to_prompt_succeeds() -> Result<(), TokenUsageError> { + fn record_cached_equal_to_prompt_succeeds() { let mut usage = TokenUsage::new(); usage.record_prompt_tokens(5); - usage.record_cached_prompt_tokens(5)?; + usage + .record_cached_prompt_tokens(5) + .expect("5 cached == 5 prompt is valid (boundary)"); assert_eq!(usage.cached_prompt_tokens, 5); - - Ok(()) } #[test] @@ -333,10 +335,11 @@ mod tests { } #[test] - fn add_combines_field_by_field() -> Result<(), TokenUsageError> { + fn add_combines_field_by_field() { let mut left = TokenUsage::new(); left.record_prompt_tokens(2); - left.record_cached_prompt_tokens(1)?; + left.record_cached_prompt_tokens(1) + .expect("1 cached <= 2 prompt is valid"); left.record_content_token(); left.record_reasoning_token(); left.record_tool_call_token(); @@ -344,7 +347,9 @@ mod tests { let mut right = TokenUsage::new(); right.record_prompt_tokens(5); - right.record_cached_prompt_tokens(2)?; + right + .record_cached_prompt_tokens(2) + .expect("2 cached <= 5 prompt is valid"); right.record_content_token(); right.record_tool_call_token(); @@ -356,8 +361,6 @@ mod tests { assert_eq!(combined.reasoning_tokens, 1); assert_eq!(combined.tool_call_tokens, 2); assert_eq!(combined.undeterminable_tokens, 1); - - Ok(()) } #[test] diff --git a/llama-cpp-test-harness-macros/Cargo.toml b/llama-cpp-test-harness-macros/Cargo.toml new file mode 100644 index 00000000..21635a4d --- /dev/null +++ b/llama-cpp-test-harness-macros/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "llama-cpp-test-harness-macros" +description = "Procedural macros for llama-cpp-test-harness" +version.workspace = true +edition.workspace = true +license.workspace = true +publish = false + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = { workspace = true } +quote = { workspace = true } +syn = { workspace = 
true } + +[lints.rust] +unsafe_op_in_unsafe_fn = "warn" +unused_qualifications = "warn" + +[lints.clippy] +all = { level = "deny", priority = -1 } +pedantic = { level = "warn", priority = -1 } +nursery = { level = "warn", priority = -1 } +module_name_repetitions = "allow" diff --git a/llama-cpp-test-harness-macros/src/expand.rs b/llama-cpp-test-harness-macros/src/expand.rs new file mode 100644 index 00000000..f4da0bd2 --- /dev/null +++ b/llama-cpp-test-harness-macros/src/expand.rs @@ -0,0 +1,443 @@ +use proc_macro2::Ident; +use proc_macro2::Span; +use proc_macro2::TokenStream; +use quote::quote; +use syn::Item; +use syn::ItemFn; +use syn::ReturnType; +use syn::parse::Parse; +use syn::parse::ParseStream; +use syn::parse2; + +use crate::parsed_args::ParsedArgs; +use crate::parsed_source::ParsedSource; + +struct StackedItems { + items: Vec<Item>, +} + +impl Parse for StackedItems { + fn parse(input: ParseStream<'_>) -> syn::Result<Self> { + let mut items = Vec::new(); + while !input.is_empty() { + items.push(input.parse()?); + } + Ok(Self { items }) + } +} + +fn validate_signature(item_fn: &ItemFn) -> syn::Result<()> { + if item_fn.sig.inputs.len() != 1 { + return Err(syn::Error::new_spanned( + &item_fn.sig.inputs, + "llama_test functions must take exactly one argument: `&LlamaFixture<'_>`", + )); + } + if matches!(item_fn.sig.output, ReturnType::Default) { + return Err(syn::Error::new_spanned( + &item_fn.sig, + "llama_test functions must return `anyhow::Result<()>`", + )); + } + if item_fn.sig.asyncness.is_some() { + return Err(syn::Error::new_spanned( + &item_fn.sig, + "llama_test functions must be synchronous", + )); + } + if !item_fn.sig.generics.params.is_empty() { + return Err(syn::Error::new_spanned( + &item_fn.sig.generics, + "llama_test functions must not be generic", + )); + } + Ok(()) +} + +fn split_fn_and_pass_through(items: Vec<Item>) -> syn::Result<(ItemFn, Vec<Item>)> { + let mut found_fn: Option<ItemFn> = None; + let mut pass_through: Vec<Item> = Vec::new(); + for parsed_item in items { +
match parsed_item { + Item::Fn(item_fn) => { + if found_fn.is_some() { + return Err(syn::Error::new_spanned( + &item_fn, + "llama_test expects exactly one fn definition", + )); + } + found_fn = Some(item_fn); + } + other => pass_through.push(other), + } + } + let item_fn = found_fn + .ok_or_else(|| syn::Error::new(Span::call_site(), "llama_test expects an fn definition"))?; + Ok((item_fn, pass_through)) +} + +fn build_model_source_literal(source: &ParsedSource) -> TokenStream { + match source { + ParsedSource::HuggingFace { repo, file } => quote! { + ::llama_cpp_test_harness::ModelSource::HuggingFace { + repo: #repo, + file: #file, + } + }, + ParsedSource::LocalPath(path) => quote! { + ::llama_cpp_test_harness::ModelSource::LocalPath(#path) + }, + } +} + +fn build_mmproj_source_literal(source: Option<&ParsedSource>) -> TokenStream { + match source { + None => quote! { ::core::option::Option::None }, + Some(ParsedSource::HuggingFace { repo, file }) => quote! { + ::core::option::Option::Some(::llama_cpp_test_harness::MmprojSource::HuggingFace { + repo: #repo, + file: #file, + }) + }, + Some(ParsedSource::LocalPath(path)) => quote! 
{ + ::core::option::Option::Some(::llama_cpp_test_harness::MmprojSource::LocalPath(#path)) + }, + } +} + +fn build_registration(args: &ParsedArgs, fn_name: &Ident) -> TokenStream { + let trial_name = format!( + "{fn_name}[{suffix}]", + suffix = args.model_source.display_suffix() + ); + let model_source_literal = build_model_source_literal(&args.model_source); + let mmproj_source_literal = build_mmproj_source_literal(args.mmproj_source.as_ref()); + let gpu_layers = args.model_load_params.n_gpu_layers; + let use_mmap = args.model_load_params.use_mmap; + let use_mlock = args.model_load_params.use_mlock; + let context_size = args.context_params.n_ctx; + let logical_batch = args.context_params.n_batch; + let physical_batch = args.context_params.n_ubatch; + let embeddings_flag = args.context_params.embeddings; + let sequence_max = args.context_params.n_seq_max; + let void_logs_flag = args.void_logs; + let threads_batch = args.context_params.n_threads_batch.map_or_else( + || quote! { ::core::option::Option::None }, + |value| quote! { ::core::option::Option::Some(#value) }, + ); + + quote! { + ::llama_cpp_test_harness::inventory::submit! 
{ + ::llama_cpp_test_harness::LlamaTestRegistration { + name: #trial_name, + key: ::llama_cpp_test_harness::LoadKey { + model_source: #model_source_literal, + mmproj_source: #mmproj_source_literal, + model_load_params: ::llama_cpp_test_harness::ModelLoadParams { + n_gpu_layers: #gpu_layers, + use_mmap: #use_mmap, + use_mlock: #use_mlock, + }, + }, + context_params: ::llama_cpp_test_harness::ContextParams { + n_ctx: #context_size, + n_batch: #logical_batch, + n_ubatch: #physical_batch, + n_seq_max: #sequence_max, + n_threads_batch: #threads_batch, + embeddings: #embeddings_flag, + }, + void_logs: #void_logs_flag, + func: #fn_name, + } + } + } +} + +pub fn expand(attribute: TokenStream, item: TokenStream) -> syn::Result<TokenStream> { + let args: ParsedArgs = parse2(attribute)?; + let StackedItems { items } = parse2(item)?; + let (item_fn, pass_through) = split_fn_and_pass_through(items)?; + validate_signature(&item_fn)?; + + let fn_name = &item_fn.sig.ident; + let new_submission = build_registration(&args, fn_name); + + Ok(quote! { + #item_fn + #(#pass_through)* + #new_submission + }) +} + +#[cfg(test)] +mod tests { + use proc_macro2::TokenStream; + use quote::quote; + + use super::expand; + + fn well_formed_attribute() -> TokenStream { + quote! { + model_source = HuggingFace("foo", "bar.gguf"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1 + } + } + + fn well_formed_function() -> TokenStream { + quote!
{ + fn my_test(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + } + } + + #[test] + fn well_formed_input_expands_to_function_plus_submission() { + let expanded = expand(well_formed_attribute(), well_formed_function()) + .expect("well-formed input must expand") + .to_string(); + + assert!( + expanded.contains("fn my_test"), + "expansion missing the original fn: {expanded}" + ); + assert!( + expanded.contains("LlamaTestRegistration"), + "expansion missing LlamaTestRegistration: {expanded}", + ); + assert!( + expanded.contains("\"my_test[bar.gguf]\""), + "expansion missing the trial-name literal with file suffix: {expanded}", + ); + assert!( + expanded.contains("ModelSource :: HuggingFace"), + "expansion missing ModelSource::HuggingFace variant: {expanded}", + ); + assert!( + expanded.contains("func : my_test"), + "expansion missing func wire-up: {expanded}", + ); + } + + #[test] + fn expansion_with_local_path_model_source_emits_local_variant() { + let attribute = quote! { + model_source = LocalPath("/abs/local.gguf"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1 + }; + let expanded = expand(attribute, well_formed_function()) + .expect("LocalPath must expand") + .to_string(); + + assert!( + expanded.contains("ModelSource :: LocalPath"), + "expansion missing ModelSource::LocalPath variant: {expanded}", + ); + assert!( + expanded.contains("\"my_test[local.gguf]\""), + "trial name must use the path's filename component: {expanded}", + ); + } + + #[test] + fn expansion_with_mmproj_source_emits_some_variant() { + let attribute = quote! 
{ + model_source = HuggingFace("r", "f"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1, + mmproj_source = HuggingFace("r", "mmproj.gguf") + }; + let expanded = expand(attribute, well_formed_function()) + .expect("mmproj_source must expand") + .to_string(); + + assert!( + expanded.contains("MmprojSource :: HuggingFace"), + "expansion missing MmprojSource::HuggingFace: {expanded}", + ); + assert!( + expanded.contains("Some"), + "expansion missing Option::Some wrap: {expanded}", + ); + } + + #[test] + fn expansion_with_local_path_mmproj_emits_local_variant() { + let attribute = quote! { + model_source = HuggingFace("r", "f"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1, + mmproj_source = LocalPath("/abs/mmproj.gguf") + }; + let expanded = expand(attribute, well_formed_function()) + .expect("LocalPath mmproj must expand") + .to_string(); + + assert!( + expanded.contains("MmprojSource :: LocalPath"), + "expansion missing MmprojSource::LocalPath: {expanded}", + ); + } + + #[test] + fn expansion_without_mmproj_emits_none_variant() { + let expanded = expand(well_formed_attribute(), well_formed_function()) + .expect("no-mmproj must expand") + .to_string(); + + assert!( + expanded.contains("None"), + "expansion missing Option::None for absent mmproj: {expanded}" + ); + } + + #[test] + fn malformed_attribute_propagates_parser_error() { + let attribute = quote! { totally_wrong = "x" }; + let error = expand(attribute, well_formed_function()) + .expect_err("malformed must fail") + .to_string(); + + assert!(error.contains("unknown field"), "got: {error}"); + } + + #[test] + fn function_with_too_many_arguments_is_rejected() { + let item = quote! 
{ + fn my_test(a: &LlamaFixture<'_>, b: i32) -> anyhow::Result<()> { Ok(()) } + }; + let error = expand(well_formed_attribute(), item) + .expect_err("two-arg must fail") + .to_string(); + + assert!(error.contains("exactly one argument"), "got: {error}"); + } + + #[test] + fn function_with_zero_arguments_is_rejected() { + let item = quote! { fn my_test() -> anyhow::Result<()> { Ok(()) } }; + let error = expand(well_formed_attribute(), item) + .expect_err("zero-arg must fail") + .to_string(); + + assert!(error.contains("exactly one argument"), "got: {error}"); + } + + #[test] + fn impl_block_input_is_rejected_at_parse_time() { + let item = quote! { + impl S { + fn my_test(&self) -> anyhow::Result<()> { Ok(()) } + } + }; + let result = expand(well_formed_attribute(), item); + + assert!(result.is_err(), "impl block must fail"); + } + + #[test] + fn function_without_return_type_is_rejected() { + let item = quote! { fn my_test(fixture: &LlamaFixture<'_>) { } }; + let error = expand(well_formed_attribute(), item) + .expect_err("missing return type must fail") + .to_string(); + + assert!(error.contains("anyhow::Result"), "got: {error}"); + } + + #[test] + fn async_function_is_rejected() { + let item = quote! { + async fn my_test(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + }; + let error = expand(well_formed_attribute(), item) + .expect_err("async must fail") + .to_string(); + + assert!(error.contains("synchronous"), "got: {error}"); + } + + #[test] + fn generic_function_is_rejected() { + let item = quote! { + fn my_test<T>(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + }; + let error = expand(well_formed_attribute(), item) + .expect_err("generic must fail") + .to_string(); + + assert!(error.contains("generic"), "got: {error}"); + } + + #[test] + fn malformed_item_token_stream_is_rejected() { + let item = quote!
{ this is not a function }; + let result = expand(well_formed_attribute(), item); + + assert!(result.is_err(), "non-fn item must fail"); + } + + #[test] + fn stacked_invocation_preserves_prior_submission() { + let prior_layer_output = expand(well_formed_attribute(), well_formed_function()) + .expect("first layer must expand"); + + let second_attribute = quote! { + model_source = HuggingFace("second", "second.gguf"), + n_gpu_layers = 1, + use_mmap = false, + use_mlock = false, + n_ctx = 2, + n_batch = 2, + n_ubatch = 2 + }; + + let second_layer_output = expand(second_attribute, prior_layer_output) + .expect("stacked second invocation must expand") + .to_string(); + + assert!( + second_layer_output.contains("\"my_test[bar.gguf]\""), + "stacked output missing first trial name: {second_layer_output}", + ); + assert!( + second_layer_output.contains("\"my_test[second.gguf]\""), + "stacked output missing second trial name: {second_layer_output}", + ); + let occurrences = second_layer_output.matches("LlamaTestRegistration").count(); + assert!( + occurrences >= 2, + "stacked output should contain two LlamaTestRegistration submissions, found {occurrences}: {second_layer_output}", + ); + } + + #[test] + fn two_fn_definitions_in_input_are_rejected() { + let item = quote! { + fn first(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + fn second(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + }; + let error = expand(well_formed_attribute(), item) + .expect_err("two fns must fail") + .to_string(); + + assert!(error.contains("exactly one fn"), "got: {error}"); + } +} diff --git a/llama-cpp-test-harness-macros/src/lib.rs b/llama-cpp-test-harness-macros/src/lib.rs new file mode 100644 index 00000000..b36048fc --- /dev/null +++ b/llama-cpp-test-harness-macros/src/lib.rs @@ -0,0 +1,78 @@ +//! Procedural macros for `llama-cpp-test-harness`. +//! +//! Provides the `#[llama_test(...)]` attribute that declaratively binds a test function to a +//! 
specific GGUF model and inference parameter set. The macro emits the original function plus +//! an `inventory::submit!` block that registers the test with the harness runtime. + +mod expand; +mod parsed_args; +mod parsed_context_params; +mod parsed_model_load_params; +mod parsed_source; + +use proc_macro::TokenStream; +use proc_macro2::TokenStream as TokenStream2; + +use crate::expand::expand; + +fn dispatch(attribute: TokenStream2, item: TokenStream2) -> TokenStream2 { + match expand(attribute, item) { + Ok(tokens) => tokens, + Err(error) => error.to_compile_error(), + } +} + +/// Registers a function as a llama-cpp test with explicit model + inference parameters. +/// +/// See the `llama-cpp-test-harness` crate for the full attribute schema and usage. +#[proc_macro_attribute] +pub fn llama_test(attribute: TokenStream, item: TokenStream) -> TokenStream { + dispatch(attribute.into(), item.into()).into() +} + +#[cfg(test)] +mod tests { + use quote::quote; + + use super::dispatch; + + #[test] + fn dispatch_on_invalid_attribute_emits_compile_error_tokens() { + let attribute = quote! { totally_wrong = "x" }; + let item = quote! { + fn my_test(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + }; + let emitted = dispatch(attribute, item).to_string(); + + assert!( + emitted.contains("compile_error"), + "expected compile_error! tokens in emitted output: {emitted}", + ); + } + + #[test] + fn dispatch_on_valid_input_emits_inventory_submission() { + let attribute = quote! { + model_source = HuggingFace("r", "f"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1 + }; + let item = quote! 
{ + fn my_test(fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { Ok(()) } + }; + let emitted = dispatch(attribute, item).to_string(); + + assert!( + emitted.contains("inventory"), + "expected inventory::submit in emitted output: {emitted}", + ); + assert!( + !emitted.contains("compile_error"), + "valid input should not emit compile_error: {emitted}", + ); + } +} diff --git a/llama-cpp-test-harness-macros/src/parsed_args.rs b/llama-cpp-test-harness-macros/src/parsed_args.rs new file mode 100644 index 00000000..795261f3 --- /dev/null +++ b/llama-cpp-test-harness-macros/src/parsed_args.rs @@ -0,0 +1,881 @@ +use std::collections::HashSet; + +use proc_macro2::Ident; +use proc_macro2::Span; +use syn::Expr; +use syn::ExprLit; +use syn::Lit; +use syn::Meta; +use syn::Token; +use syn::parse::Parse; +use syn::parse::ParseStream; +use syn::punctuated::Punctuated; + +use crate::parsed_context_params::ParsedContextParams; +use crate::parsed_model_load_params::ParsedModelLoadParams; +use crate::parsed_source::ParsedSource; + +const REQUIRED_FIELDS: &[&str] = &[ + "model_source", + "n_gpu_layers", + "use_mmap", + "use_mlock", + "n_ctx", + "n_batch", + "n_ubatch", +]; + +const OPTIONAL_FIELDS: &[&str] = &[ + "mmproj_source", + "embeddings", + "n_seq_max", + "n_threads_batch", + "void_logs", +]; + +fn literal_from_expression(expression: &Expr) -> syn::Result<&Lit> { + if let Expr::Lit(ExprLit { lit, .. 
}) = expression { + Ok(lit) + } else { + Err(syn::Error::new_spanned( + expression, + "expected a literal (string, integer, or bool)", + )) + } +} + +fn require_int_lit(literal: &Lit, field: &str) -> syn::Result { + if let Lit::Int(int_literal) = literal { + int_literal.base10_parse::().map_err(|parse_error| { + syn::Error::new_spanned( + literal, + format!( + "field `{field}` expects a non-negative integer that fits in u32: {parse_error}" + ), + ) + }) + } else { + Err(syn::Error::new_spanned( + literal, + format!("field `{field}` expects an integer literal"), + )) + } +} + +fn require_i32_lit(literal: &Lit, field: &str) -> syn::Result { + if let Lit::Int(int_literal) = literal { + int_literal.base10_parse::().map_err(|parse_error| { + syn::Error::new_spanned( + literal, + format!("field `{field}` expects an integer that fits in i32: {parse_error}"), + ) + }) + } else { + Err(syn::Error::new_spanned( + literal, + format!("field `{field}` expects an integer literal"), + )) + } +} + +fn require_bool_lit(literal: &Lit, field: &str) -> syn::Result { + if let Lit::Bool(bool_literal) = literal { + Ok(bool_literal.value()) + } else { + Err(syn::Error::new_spanned( + literal, + format!("field `{field}` expects a bool literal (`true` or `false`)"), + )) + } +} + +fn require(value: Option, field: &str, span: Span) -> syn::Result { + value.ok_or_else(|| syn::Error::new(span, format!("missing required field `{field}`"))) +} + +#[derive(Default)] +struct AttributeAccumulator { + model_source: Option, + mmproj_source: Option, + n_gpu_layers: Option, + use_mmap: Option, + use_mlock: Option, + n_ctx: Option, + n_batch: Option, + n_ubatch: Option, + embeddings: Option, + n_seq_max: Option, + n_threads_batch: Option, + void_logs: Option, +} + +fn dispatch_field( + accumulator: &mut AttributeAccumulator, + identifier: &Ident, + name: &str, + value: &Expr, +) -> syn::Result<()> { + match name { + "model_source" => { + accumulator.model_source = Some(ParsedSource::parse(value, 
"model_source")?); + } + "mmproj_source" => { + accumulator.mmproj_source = Some(ParsedSource::parse(value, "mmproj_source")?); + } + "n_gpu_layers" => { + accumulator.n_gpu_layers = Some(require_int_lit( + literal_from_expression(value)?, + "n_gpu_layers", + )?); + } + "n_ctx" => { + accumulator.n_ctx = Some(require_int_lit(literal_from_expression(value)?, "n_ctx")?); + } + "n_batch" => { + accumulator.n_batch = + Some(require_int_lit(literal_from_expression(value)?, "n_batch")?); + } + "n_ubatch" => { + accumulator.n_ubatch = Some(require_int_lit( + literal_from_expression(value)?, + "n_ubatch", + )?); + } + "use_mmap" => { + accumulator.use_mmap = Some(require_bool_lit( + literal_from_expression(value)?, + "use_mmap", + )?); + } + "use_mlock" => { + accumulator.use_mlock = Some(require_bool_lit( + literal_from_expression(value)?, + "use_mlock", + )?); + } + "embeddings" => { + accumulator.embeddings = Some(require_bool_lit( + literal_from_expression(value)?, + "embeddings", + )?); + } + "n_seq_max" => { + accumulator.n_seq_max = Some(require_int_lit( + literal_from_expression(value)?, + "n_seq_max", + )?); + } + "n_threads_batch" => { + accumulator.n_threads_batch = Some(require_i32_lit( + literal_from_expression(value)?, + "n_threads_batch", + )?); + } + "void_logs" => { + accumulator.void_logs = Some(require_bool_lit( + literal_from_expression(value)?, + "void_logs", + )?); + } + "repo" | "file" | "mmproj_file" => { + return Err(syn::Error::new_spanned( + identifier, + format!( + "field `{name}` was removed; use `model_source = HuggingFace(repo, file)` or `model_source = LocalPath(path)` (and `mmproj_source` for mmproj)" + ), + )); + } + other => { + return Err(syn::Error::new_spanned( + identifier, + format!( + "unknown field `{other}`; expected one of: {}, {}", + REQUIRED_FIELDS.join(", "), + OPTIONAL_FIELDS.join(", "), + ), + )); + } + } + Ok(()) +} + +#[derive(Debug)] +pub struct ParsedArgs { + pub model_source: ParsedSource, + pub mmproj_source: Option, + 
pub model_load_params: ParsedModelLoadParams, + pub context_params: ParsedContextParams, + pub void_logs: bool, +} + +impl Parse for ParsedArgs { + fn parse(input: ParseStream) -> syn::Result { + let metas = Punctuated::::parse_terminated(input)?; + let mut seen: HashSet = HashSet::new(); + let mut accumulator = AttributeAccumulator::default(); + + for meta in metas { + let Meta::NameValue(name_value) = meta else { + return Err(syn::Error::new_spanned( + meta, + "expected `name = value` form", + )); + }; + let identifier = name_value.path.get_ident().ok_or_else(|| { + syn::Error::new_spanned(&name_value.path, "expected a simple identifier") + })?; + let name = identifier.to_string(); + + if !seen.insert(name.clone()) { + return Err(syn::Error::new_spanned( + identifier, + format!("duplicate field `{name}`"), + )); + } + dispatch_field(&mut accumulator, identifier, &name, &name_value.value)?; + } + + let span = Span::call_site(); + Ok(Self { + model_source: require(accumulator.model_source, "model_source", span)?, + mmproj_source: accumulator.mmproj_source, + model_load_params: ParsedModelLoadParams { + n_gpu_layers: require(accumulator.n_gpu_layers, "n_gpu_layers", span)?, + use_mmap: require(accumulator.use_mmap, "use_mmap", span)?, + use_mlock: require(accumulator.use_mlock, "use_mlock", span)?, + }, + context_params: ParsedContextParams { + n_ctx: require(accumulator.n_ctx, "n_ctx", span)?, + n_batch: require(accumulator.n_batch, "n_batch", span)?, + n_ubatch: require(accumulator.n_ubatch, "n_ubatch", span)?, + n_seq_max: accumulator.n_seq_max.unwrap_or(1), + n_threads_batch: accumulator.n_threads_batch, + embeddings: accumulator.embeddings.unwrap_or(false), + }, + void_logs: accumulator.void_logs.unwrap_or(false), + }) + } +} + +#[cfg(test)] +mod tests { + use syn::parse_str; + + use super::ParsedArgs; + use crate::parsed_source::ParsedSource; + + const ALL_REQUIRED: &str = "\ + model_source = HuggingFace(\"foo\", \"bar.gguf\"), \ + n_gpu_layers = 0, \ + 
use_mmap = true, \ + use_mlock = false, \ + n_ctx = 512, \ + n_batch = 128, \ + n_ubatch = 64"; + + fn parse(source: &str) -> syn::Result { + parse_str(source) + } + + #[test] + fn parses_all_required_fields() { + let parsed = parse(ALL_REQUIRED).expect("required-only must parse"); + + assert_eq!( + parsed.model_source, + ParsedSource::HuggingFace { + repo: "foo".to_owned(), + file: "bar.gguf".to_owned(), + }, + ); + assert_eq!(parsed.model_load_params.n_gpu_layers, 0); + assert!(parsed.model_load_params.use_mmap); + assert!(!parsed.model_load_params.use_mlock); + assert_eq!(parsed.context_params.n_ctx, 512); + assert_eq!(parsed.context_params.n_batch, 128); + assert_eq!(parsed.context_params.n_ubatch, 64); + assert!(parsed.mmproj_source.is_none()); + } + + #[test] + fn parses_local_path_model_source() { + let source = "\ + model_source = LocalPath(\"/abs/local/model.gguf\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let parsed = parse(source).expect("LocalPath must parse"); + + assert_eq!( + parsed.model_source, + ParsedSource::LocalPath("/abs/local/model.gguf".to_owned()), + ); + } + + #[test] + fn parses_optional_mmproj_source_huggingface() { + let source = + format!("{ALL_REQUIRED}, mmproj_source = HuggingFace(\"foo\", \"mmproj-F16.gguf\")"); + let parsed = parse(&source).expect("with mmproj_source must parse"); + + assert_eq!( + parsed.mmproj_source, + Some(ParsedSource::HuggingFace { + repo: "foo".to_owned(), + file: "mmproj-F16.gguf".to_owned(), + }), + ); + } + + #[test] + fn parses_optional_mmproj_source_local_path() { + let source = format!("{ALL_REQUIRED}, mmproj_source = LocalPath(\"/abs/mmproj.gguf\")"); + let parsed = parse(&source).expect("with mmproj_source LocalPath must parse"); + + assert_eq!( + parsed.mmproj_source, + Some(ParsedSource::LocalPath("/abs/mmproj.gguf".to_owned())), + ); + } + + #[test] + fn legacy_repo_field_is_rejected_with_migration_hint() { + let source 
= "repo = \"foo\", file = \"bar\", n_gpu_layers = 0, use_mmap = true, \ + use_mlock = false, n_ctx = 1, n_batch = 1, n_ubatch = 1"; + let message = parse(source) + .expect_err("legacy repo must be rejected") + .to_string(); + + assert!(message.contains("model_source"), "got: {message}"); + } + + #[test] + fn legacy_mmproj_file_field_is_rejected_with_migration_hint() { + let source = format!("{ALL_REQUIRED}, mmproj_file = \"mmproj.gguf\""); + let message = parse(&source) + .expect_err("legacy mmproj_file must be rejected") + .to_string(); + + assert!(message.contains("mmproj_source"), "got: {message}"); + } + + #[test] + fn missing_model_source_is_rejected() { + let source = "n_gpu_layers = 0, use_mmap = true, use_mlock = false, \ + n_ctx = 1, n_batch = 1, n_ubatch = 1"; + let message = parse(source) + .expect_err("missing model_source must fail") + .to_string(); + + assert!( + message.contains("missing required field `model_source`"), + "got: {message}" + ); + } + + #[test] + fn missing_n_ctx_is_rejected() { + let source = "model_source = HuggingFace(\"x\", \"y\"), n_gpu_layers = 0, use_mmap = true, \ + use_mlock = false, n_batch = 1, n_ubatch = 1"; + let message = parse(source) + .expect_err("missing n_ctx must fail") + .to_string(); + + assert!( + message.contains("missing required field `n_ctx`"), + "got: {message}" + ); + } + + #[test] + fn unknown_field_is_rejected() { + let source = format!("{ALL_REQUIRED}, surprise = 1"); + let message = parse(&source) + .expect_err("unknown field must fail") + .to_string(); + + assert!( + message.contains("unknown field `surprise`"), + "got: {message}" + ); + } + + #[test] + fn duplicate_field_is_rejected() { + let source = format!("{ALL_REQUIRED}, model_source = HuggingFace(\"other\", \"o.gguf\")"); + let message = parse(&source).expect_err("duplicate must fail").to_string(); + + assert!( + message.contains("duplicate field `model_source`"), + "got: {message}" + ); + } + + #[test] + fn non_name_value_form_is_rejected() { + 
let source = "model_source, file = \"x\""; + let message = parse(source).expect_err("bare ident must fail").to_string(); + + assert!(message.contains("name = value"), "got: {message}"); + } + + #[test] + fn non_literal_value_for_scalar_field_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = some_const, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("non-literal value must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn wrong_literal_kind_for_int_field_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = \"nine\", \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("string for int field must fail") + .to_string(); + + assert!(message.contains("integer literal"), "got: {message}"); + } + + #[test] + fn wrong_literal_kind_for_bool_field_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mmap = 1, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("int for bool field must fail") + .to_string(); + + assert!(message.contains("bool literal"), "got: {message}"); + } + + #[test] + fn negative_int_for_u32_field_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = -1, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("negative int must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn complex_path_field_name_is_rejected() { + let source = "\ + foo::bar = 1, \ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + 
use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("path field name must fail") + .to_string(); + + assert!(message.contains("simple identifier"), "got: {message}"); + } + + #[test] + fn overflowing_int_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 99999999999, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source).expect_err("overflow must fail").to_string(); + + assert!(message.contains("u32"), "got: {message}"); + } + + #[test] + fn overflowing_i32_for_n_threads_batch_is_rejected() { + let source = format!("{ALL_REQUIRED}, n_threads_batch = 99999999999"); + let message = parse(&source) + .expect_err("i32 overflow must fail") + .to_string(); + + assert!(message.contains("i32"), "got: {message}"); + } + + #[test] + fn missing_n_gpu_layers_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("missing n_gpu_layers must fail") + .to_string(); + + assert!( + message.contains("missing required field `n_gpu_layers`"), + "got: {message}" + ); + } + + #[test] + fn missing_use_mmap_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("missing use_mmap must fail") + .to_string(); + + assert!( + message.contains("missing required field `use_mmap`"), + "got: {message}" + ); + } + + #[test] + fn missing_use_mlock_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("missing use_mlock must fail") + .to_string(); + 
+ assert!( + message.contains("missing required field `use_mlock`"), + "got: {message}" + ); + } + + #[test] + fn missing_n_batch_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("missing n_batch must fail") + .to_string(); + + assert!( + message.contains("missing required field `n_batch`"), + "got: {message}" + ); + } + + #[test] + fn missing_n_ubatch_is_rejected() { + let source = "\ + model_source = HuggingFace(\"x\", \"y\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1"; + let message = parse(source) + .expect_err("missing n_ubatch must fail") + .to_string(); + + assert!( + message.contains("missing required field `n_ubatch`"), + "got: {message}" + ); + } + + #[test] + fn optional_embeddings_defaults_to_false_when_absent() { + let parsed = parse(ALL_REQUIRED).expect("required-only must parse"); + + assert!(!parsed.context_params.embeddings); + } + + #[test] + fn optional_embeddings_true_is_parsed() { + let source = format!("{ALL_REQUIRED}, embeddings = true"); + let parsed = parse(&source).expect("embeddings = true must parse"); + + assert!(parsed.context_params.embeddings); + } + + #[test] + fn optional_embeddings_false_is_parsed() { + let source = format!("{ALL_REQUIRED}, embeddings = false"); + let parsed = parse(&source).expect("embeddings = false must parse"); + + assert!(!parsed.context_params.embeddings); + } + + #[test] + fn optional_embeddings_rejects_non_bool_literal() { + let source = format!("{ALL_REQUIRED}, embeddings = 1"); + let message = parse(&source) + .expect_err("embeddings with int must fail") + .to_string(); + + assert!(message.contains("bool literal"), "got: {message}"); + } + + #[test] + fn optional_n_seq_max_defaults_to_one_when_absent() { + let parsed = parse(ALL_REQUIRED).expect("required-only must parse"); + + 
assert_eq!(parsed.context_params.n_seq_max, 1); + } + + #[test] + fn optional_n_seq_max_is_parsed() { + let source = format!("{ALL_REQUIRED}, n_seq_max = 4"); + let parsed = parse(&source).expect("n_seq_max = 4 must parse"); + + assert_eq!(parsed.context_params.n_seq_max, 4); + } + + #[test] + fn optional_n_threads_batch_defaults_to_none_when_absent() { + let parsed = parse(ALL_REQUIRED).expect("required-only must parse"); + + assert_eq!(parsed.context_params.n_threads_batch, None); + } + + #[test] + fn optional_n_threads_batch_is_parsed_when_positive() { + let source = format!("{ALL_REQUIRED}, n_threads_batch = 8"); + let parsed = parse(&source).expect("n_threads_batch = 8 must parse"); + + assert_eq!(parsed.context_params.n_threads_batch, Some(8)); + } + + #[test] + fn optional_n_threads_batch_rejects_non_integer_literal() { + let source = format!("{ALL_REQUIRED}, n_threads_batch = \"eight\""); + let message = parse(&source) + .expect_err("string for n_threads_batch must fail") + .to_string(); + + assert!(message.contains("integer literal"), "got: {message}"); + } + + #[test] + fn optional_void_logs_defaults_to_false_when_absent() { + let parsed = parse(ALL_REQUIRED).expect("required-only must parse"); + + assert!(!parsed.void_logs); + } + + #[test] + fn optional_void_logs_true_is_parsed() { + let source = format!("{ALL_REQUIRED}, void_logs = true"); + let parsed = parse(&source).expect("void_logs = true must parse"); + + assert!(parsed.void_logs); + } + + fn override_field(field: &str, replacement: &str) -> String { + let parts: [(&str, &str); 7] = [ + ("model_source", "HuggingFace(\"foo\", \"bar.gguf\")"), + ("n_gpu_layers", "0"), + ("use_mmap", "true"), + ("use_mlock", "false"), + ("n_ctx", "512"), + ("n_batch", "128"), + ("n_ubatch", "64"), + ]; + parts + .iter() + .map(|(name, value)| { + let resolved = if *name == field { replacement } else { *value }; + format!("{name} = {resolved}") + }) + .collect::>() + .join(", ") + } + + fn append_field(field: &str, 
value: &str) -> String { + format!("{ALL_REQUIRED}, {field} = {value}") + } + + #[test] + fn each_int_dispatch_arm_rejects_non_literal_value() { + for field in ["n_gpu_layers", "n_ctx", "n_batch", "n_ubatch"] { + let source = override_field(field, "some_const"); + let message = parse(&source).expect_err(field).to_string(); + + assert!(message.contains("literal"), "{field}: {message}"); + } + } + + #[test] + fn each_bool_dispatch_arm_rejects_non_literal_value() { + for field in ["use_mmap", "use_mlock"] { + let source = override_field(field, "some_const"); + let message = parse(&source).expect_err(field).to_string(); + + assert!(message.contains("literal"), "{field}: {message}"); + } + } + + #[test] + fn each_int_dispatch_arm_rejects_wrong_literal_kind() { + for field in ["n_gpu_layers", "n_ctx", "n_batch", "n_ubatch"] { + let source = override_field(field, "\"not-an-int\""); + let message = parse(&source).expect_err(field).to_string(); + + assert!(message.contains("integer literal"), "{field}: {message}"); + } + } + + #[test] + fn each_bool_dispatch_arm_rejects_wrong_literal_kind() { + for field in ["use_mmap", "use_mlock"] { + let source = override_field(field, "0"); + let message = parse(&source).expect_err(field).to_string(); + + assert!(message.contains("bool literal"), "{field}: {message}"); + } + } + + #[test] + fn optional_n_seq_max_rejects_non_literal_value() { + let source = append_field("n_seq_max", "some_const"); + let message = parse(&source) + .expect_err("n_seq_max non-literal must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn optional_n_seq_max_rejects_wrong_literal_kind() { + let source = append_field("n_seq_max", "\"four\""); + let message = parse(&source) + .expect_err("n_seq_max wrong-kind must fail") + .to_string(); + + assert!(message.contains("integer literal"), "got: {message}"); + } + + #[test] + fn optional_n_threads_batch_rejects_non_literal_value() { + let source = 
append_field("n_threads_batch", "some_const"); + let message = parse(&source) + .expect_err("n_threads_batch non-literal must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn optional_embeddings_rejects_non_literal_value() { + let source = append_field("embeddings", "some_const"); + let message = parse(&source) + .expect_err("embeddings non-literal must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn optional_void_logs_rejects_non_literal_value() { + let source = append_field("void_logs", "some_const"); + let message = parse(&source) + .expect_err("void_logs non-literal must fail") + .to_string(); + + assert!(message.contains("literal"), "got: {message}"); + } + + #[test] + fn optional_void_logs_rejects_wrong_literal_kind() { + let source = append_field("void_logs", "1"); + let message = parse(&source) + .expect_err("void_logs wrong-kind must fail") + .to_string(); + + assert!(message.contains("bool literal"), "got: {message}"); + } + + #[test] + fn optional_mmproj_source_rejects_unknown_variant() { + let source = append_field("mmproj_source", "Mystery(\"a\", \"b\")"); + let message = parse(&source) + .expect_err("mmproj_source unknown variant must fail") + .to_string(); + + assert!(message.contains("unknown source variant"), "got: {message}"); + } + + #[test] + fn model_source_with_unknown_variant_is_rejected() { + let source = "\ + model_source = Mystery(\"a\", \"b\"), \ + n_gpu_layers = 0, \ + use_mmap = true, \ + use_mlock = false, \ + n_ctx = 1, \ + n_batch = 1, \ + n_ubatch = 1"; + let message = parse(source) + .expect_err("model_source unknown variant must fail") + .to_string(); + + assert!(message.contains("unknown source variant"), "got: {message}"); + } + + #[test] + fn unparseable_attribute_token_stream_is_rejected() { + // `Punctuated::parse_terminated` rejects input that can't be split into Meta items by + // commas; passing a stray symbol 
surfaces that `?` Err arm in `ParsedArgs::parse`. + let result = parse("@&^!"); + + assert!( + result.is_err(), + "garbage attribute body must fail to parse as Punctuated" + ); + } +} diff --git a/llama-cpp-test-harness-macros/src/parsed_context_params.rs b/llama-cpp-test-harness-macros/src/parsed_context_params.rs new file mode 100644 index 00000000..d9c32c4b --- /dev/null +++ b/llama-cpp-test-harness-macros/src/parsed_context_params.rs @@ -0,0 +1,9 @@ +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct ParsedContextParams { + pub n_ctx: u32, + pub n_batch: u32, + pub n_ubatch: u32, + pub n_seq_max: u32, + pub n_threads_batch: Option, + pub embeddings: bool, +} diff --git a/llama-cpp-test-harness-macros/src/parsed_model_load_params.rs b/llama-cpp-test-harness-macros/src/parsed_model_load_params.rs new file mode 100644 index 00000000..5cce5426 --- /dev/null +++ b/llama-cpp-test-harness-macros/src/parsed_model_load_params.rs @@ -0,0 +1,6 @@ +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct ParsedModelLoadParams { + pub n_gpu_layers: u32, + pub use_mmap: bool, + pub use_mlock: bool, +} diff --git a/llama-cpp-test-harness-macros/src/parsed_source.rs b/llama-cpp-test-harness-macros/src/parsed_source.rs new file mode 100644 index 00000000..66f7669a --- /dev/null +++ b/llama-cpp-test-harness-macros/src/parsed_source.rs @@ -0,0 +1,245 @@ +use syn::Expr; +use syn::ExprCall; +use syn::ExprLit; +use syn::ExprPath; +use syn::Lit; + +fn require_string_argument(expression: &Expr, field: &str, position: &str) -> syn::Result { + match expression { + Expr::Lit(ExprLit { + lit: Lit::Str(string_literal), + .. 
+ }) => Ok(string_literal.value()), + _ => Err(syn::Error::new_spanned( + expression, + format!("`{field}` argument `{position}` expects a string literal"), + )), + } +} + +fn parse_huggingface_source(call: &ExprCall, field: &str) -> syn::Result { + let args: Vec<&Expr> = call.args.iter().collect(); + let [repo_expr, file_expr] = args.as_slice() else { + return Err(syn::Error::new_spanned( + &call.args, + format!( + "`HuggingFace` expects exactly 2 string arguments (repo, file); got {got}", + got = call.args.len() + ), + )); + }; + let repo = require_string_argument(repo_expr, field, "HuggingFace.repo")?; + let file = require_string_argument(file_expr, field, "HuggingFace.file")?; + Ok(ParsedSource::HuggingFace { repo, file }) +} + +fn parse_local_path_source(call: &ExprCall, field: &str) -> syn::Result { + let args: Vec<&Expr> = call.args.iter().collect(); + let [path_expr] = args.as_slice() else { + return Err(syn::Error::new_spanned( + &call.args, + format!( + "`LocalPath` expects exactly 1 string argument (path); got {got}", + got = call.args.len() + ), + )); + }; + let path = require_string_argument(path_expr, field, "LocalPath.path")?; + Ok(ParsedSource::LocalPath(path)) +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ParsedSource { + HuggingFace { repo: String, file: String }, + LocalPath(String), +} + +impl ParsedSource { + pub fn display_suffix(&self) -> String { + match self { + Self::HuggingFace { file, .. } => file.clone(), + Self::LocalPath(path) => std::path::Path::new(path) + .file_name() + .and_then(|name| name.to_str()) + .map_or_else(|| path.clone(), str::to_owned), + } + } + + pub fn parse(expression: &Expr, field: &str) -> syn::Result { + let Expr::Call(call) = expression else { + return Err(syn::Error::new_spanned( + expression, + format!("field `{field}` expects `HuggingFace(repo, file)` or `LocalPath(path)`"), + )); + }; + let Expr::Path(ExprPath { path, .. 
}) = call.func.as_ref() else { + return Err(syn::Error::new_spanned( + call.func.as_ref(), + format!("field `{field}` expects `HuggingFace(...)` or `LocalPath(...)`"), + )); + }; + let variant_ident = path.get_ident().ok_or_else(|| { + syn::Error::new_spanned( + path, + format!( + "field `{field}` expects the bare variant name `HuggingFace` or `LocalPath`" + ), + ) + })?; + match variant_ident.to_string().as_str() { + "HuggingFace" => parse_huggingface_source(call, field), + "LocalPath" => parse_local_path_source(call, field), + other => Err(syn::Error::new_spanned( + variant_ident, + format!("unknown source variant `{other}`; expected `HuggingFace` or `LocalPath`"), + )), + } + } +} + +#[cfg(test)] +mod tests { + use syn::parse_str; + + use super::ParsedSource; + + fn parse(source: &str) -> syn::Result { + let expression: syn::Expr = parse_str(source)?; + ParsedSource::parse(&expression, "model_source") + } + + #[test] + fn parses_huggingface_with_two_string_args() { + let parsed = parse("HuggingFace(\"org/name\", \"file.gguf\")").expect("valid"); + + assert_eq!( + parsed, + ParsedSource::HuggingFace { + repo: "org/name".to_owned(), + file: "file.gguf".to_owned(), + }, + ); + } + + #[test] + fn parses_local_path_with_one_string_arg() { + let parsed = parse("LocalPath(\"/abs/local.gguf\")").expect("valid"); + + assert_eq!(parsed, ParsedSource::LocalPath("/abs/local.gguf".to_owned())); + } + + #[test] + fn unknown_variant_is_rejected() { + let message = parse("Mystery(\"a\", \"b\")") + .expect_err("unknown variant must fail") + .to_string(); + + assert!(message.contains("unknown source variant"), "got: {message}"); + } + + #[test] + fn non_call_expression_is_rejected() { + let message = parse("\"plain\"") + .expect_err("non-call must fail") + .to_string(); + + assert!(message.contains("HuggingFace"), "got: {message}"); + assert!(message.contains("LocalPath"), "got: {message}"); + } + + #[test] + fn huggingface_with_wrong_arity_is_rejected() { + let message = 
parse("HuggingFace(\"only-one\")") + .expect_err("arity must fail") + .to_string(); + + assert!(message.contains("HuggingFace"), "got: {message}"); + assert!(message.contains("2 string"), "got: {message}"); + } + + #[test] + fn local_path_with_wrong_arity_is_rejected() { + let message = parse("LocalPath(\"a\", \"b\")") + .expect_err("arity must fail") + .to_string(); + + assert!(message.contains("LocalPath"), "got: {message}"); + assert!(message.contains("1 string"), "got: {message}"); + } + + #[test] + fn non_string_argument_is_rejected() { + let message = parse("HuggingFace(42, \"file\")") + .expect_err("non-string arg must fail") + .to_string(); + + assert!(message.contains("string literal"), "got: {message}"); + } + + #[test] + fn huggingface_with_non_string_second_argument_is_rejected() { + let message = parse("HuggingFace(\"repo\", 42)") + .expect_err("non-string second arg must fail") + .to_string(); + + assert!(message.contains("string literal"), "got: {message}"); + } + + #[test] + fn local_path_with_non_string_argument_is_rejected() { + let message = parse("LocalPath(42)") + .expect_err("non-string LocalPath arg must fail") + .to_string(); + + assert!(message.contains("string literal"), "got: {message}"); + } + + #[test] + fn unparseable_input_returns_err() { + let result = parse("@&^!"); + + assert!(result.is_err(), "garbage input must fail to parse as syn::Expr"); + } + + #[test] + fn non_path_function_expression_is_rejected() { + let message = parse("(closure)(\"a\")") + .expect_err("non-path func must fail") + .to_string(); + + assert!(message.contains("HuggingFace"), "got: {message}"); + } + + #[test] + fn qualified_path_variant_is_rejected() { + let message = parse("some::Other::HuggingFace(\"a\", \"b\")") + .expect_err("qualified path must fail") + .to_string(); + + assert!(message.contains("bare variant name"), "got: {message}"); + } + + #[test] + fn display_suffix_huggingface_returns_file() { + let source = ParsedSource::HuggingFace { + repo: 
"org/name".to_owned(), + file: "model.gguf".to_owned(), + }; + + assert_eq!(source.display_suffix(), "model.gguf"); + } + + #[test] + fn display_suffix_local_path_returns_basename() { + let source = ParsedSource::LocalPath("/abs/dir/model.gguf".to_owned()); + + assert_eq!(source.display_suffix(), "model.gguf"); + } + + #[test] + fn display_suffix_local_path_without_file_name_returns_full_path() { + let source = ParsedSource::LocalPath("/abs/dir/..".to_owned()); + + assert_eq!(source.display_suffix(), "/abs/dir/.."); + } +} diff --git a/llama-cpp-test-harness/Cargo.toml b/llama-cpp-test-harness/Cargo.toml new file mode 100644 index 00000000..041ea779 --- /dev/null +++ b/llama-cpp-test-harness/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "llama-cpp-test-harness" +description = "Declarative, deterministic integration-test harness for llama-cpp-bindings" +version.workspace = true +edition.workspace = true +license.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +hf-hub = { workspace = true } +inventory = { workspace = true } +libtest-mimic = { workspace = true } +llama-cpp-bindings = { workspace = true } +llama-cpp-test-harness-macros = { workspace = true } + +[features] +cuda = ["llama-cpp-bindings/cuda"] +cuda-no-vmm = ["llama-cpp-bindings/cuda-no-vmm"] +metal = ["llama-cpp-bindings/metal"] +vulkan = ["llama-cpp-bindings/vulkan"] +rocm = ["llama-cpp-bindings/rocm"] + +[[test]] +name = "harness_self_test" +harness = false + +[lints.rust] +unsafe_op_in_unsafe_fn = "warn" +unused_qualifications = "warn" + +[lints.clippy] +all = { level = "deny", priority = -1 } +pedantic = { level = "warn", priority = -1 } +nursery = { level = "warn", priority = -1 } +module_name_repetitions = "allow" diff --git a/llama-cpp-test-harness/src/context_params.rs b/llama-cpp-test-harness/src/context_params.rs new file mode 100644 index 00000000..e135b977 --- /dev/null +++ b/llama-cpp-test-harness/src/context_params.rs @@ -0,0 +1,191 @@ +use 
std::num::NonZeroU32; + +use llama_cpp_bindings::context::params::LlamaContextParams; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ContextParams { + pub n_ctx: u32, + pub n_batch: u32, + pub n_ubatch: u32, + pub n_seq_max: u32, + pub n_threads_batch: Option, + pub embeddings: bool, +} + +impl ContextParams { + #[must_use] + pub fn into_llama_context_params(self) -> LlamaContextParams { + let Self { + n_ctx, + n_batch, + n_ubatch, + n_seq_max, + n_threads_batch, + embeddings, + } = self; + let mut params = LlamaContextParams::default() + .with_n_ctx(NonZeroU32::new(n_ctx)) + .with_n_batch(n_batch) + .with_n_ubatch(n_ubatch) + .with_n_seq_max(n_seq_max) + .with_embeddings(embeddings); + if let Some(threads) = n_threads_batch { + params = params.with_n_threads_batch(threads); + } + params + } +} + +#[cfg(test)] +mod tests { + use std::num::NonZeroU32; + + use llama_cpp_bindings::context::params::LlamaContextParams; + + use super::ContextParams; + + const BASELINE: ContextParams = ContextParams { + n_ctx: 512, + n_batch: 128, + n_ubatch: 64, + n_seq_max: 1, + n_threads_batch: None, + embeddings: false, + }; + + #[test] + fn into_llama_context_params_carries_all_fields() { + let params = ContextParams { + n_ctx: 1024, + n_batch: 256, + n_ubatch: 128, + ..BASELINE + } + .into_llama_context_params(); + + assert_eq!(params.n_ctx(), NonZeroU32::new(1024)); + assert_eq!(params.n_batch(), 256); + assert_eq!(params.n_ubatch(), 128); + } + + #[test] + fn into_llama_context_params_propagates_embeddings_flag() { + let off = ContextParams { + embeddings: false, + ..BASELINE + } + .into_llama_context_params(); + let on = ContextParams { + embeddings: true, + ..BASELINE + } + .into_llama_context_params(); + + assert!(!off.embeddings()); + assert!(on.embeddings()); + } + + #[test] + fn into_llama_context_params_propagates_n_seq_max() { + let params = ContextParams { + n_seq_max: 4, + ..BASELINE + } + .into_llama_context_params(); + + 
assert_eq!(params.context_params.n_seq_max, 4); + } + + #[test] + fn into_llama_context_params_applies_n_threads_batch_when_some() { + let params = ContextParams { + n_threads_batch: Some(8), + ..BASELINE + } + .into_llama_context_params(); + + assert_eq!(params.context_params.n_threads_batch, 8); + } + + #[test] + fn into_llama_context_params_leaves_n_threads_batch_default_when_none() { + let default_params = LlamaContextParams::default(); + let params = ContextParams { + n_threads_batch: None, + ..BASELINE + } + .into_llama_context_params(); + + assert_eq!( + params.context_params.n_threads_batch, + default_params.context_params.n_threads_batch + ); + } + + #[test] + fn zero_n_ctx_means_model_default() { + let params = ContextParams { + n_ctx: 0, + ..BASELINE + } + .into_llama_context_params(); + + assert_eq!(params.n_ctx(), None); + } + + #[test] + fn differing_n_ctx_compares_unequal() { + let one = ContextParams { + n_ctx: 512, + ..BASELINE + }; + let two = ContextParams { + n_ctx: 1024, + ..BASELINE + }; + + assert_ne!(one, two); + } + + #[test] + fn differing_embeddings_compares_unequal() { + let off = ContextParams { + embeddings: false, + ..BASELINE + }; + let on = ContextParams { + embeddings: true, + ..BASELINE + }; + + assert_ne!(off, on); + } + + #[test] + fn differing_n_seq_max_compares_unequal() { + let one = ContextParams { + n_seq_max: 1, + ..BASELINE + }; + let two = ContextParams { + n_seq_max: 4, + ..BASELINE + }; + + assert_ne!(one, two); + } + + #[test] + fn differing_n_threads_batch_compares_unequal() { + let none = ContextParams { + n_threads_batch: None, + ..BASELINE + }; + let some = ContextParams { + n_threads_batch: Some(8), + ..BASELINE + }; + + assert_ne!(none, some); + } +} diff --git a/llama-cpp-test-harness/src/deterministic_arguments.rs b/llama-cpp-test-harness/src/deterministic_arguments.rs new file mode 100644 index 00000000..353053dd --- /dev/null +++ b/llama-cpp-test-harness/src/deterministic_arguments.rs @@ -0,0 +1,50 @@ +use 
/// Returns `arguments` with `test_threads` forced to `Some(1)`.
///
/// Any previous `test_threads` value (set or unset) is overwritten; every
/// other field is passed through unchanged.
const fn build_deterministic_arguments(mut arguments: Arguments) -> Arguments {
    arguments.test_threads = Some(1);
    arguments
}

/// Builds the harness arguments from `Arguments::from_args()` and then forces
/// `test_threads` to `Some(1)` via [`build_deterministic_arguments`].
#[must_use]
pub fn deterministic_arguments_from_cli() -> Arguments {
    build_deterministic_arguments(Arguments::from_args())
}
+pub fn download_model(repo: &str, file: &str) -> Result { + let api = hf_hub::api::sync::ApiBuilder::from_env() + .with_progress(true) + .build()?; + Ok(api.model(repo.to_owned()).get(file)?) +} + +#[cfg(test)] +mod tests { + use super::download_model; + + #[test] + fn missing_file_in_real_repo_returns_error() { + let result = download_model("unsloth/Qwen3.5-0.8B-GGUF", "this-file-does-not-exist.gguf"); + + assert!(result.is_err()); + } +} diff --git a/llama-cpp-test-harness/src/execution_phase.rs b/llama-cpp-test-harness/src/execution_phase.rs new file mode 100644 index 00000000..6892889d --- /dev/null +++ b/llama-cpp-test-harness/src/execution_phase.rs @@ -0,0 +1,133 @@ +use std::sync::Arc; + +use libtest_mimic::Arguments; +use libtest_mimic::Conclusion; +use libtest_mimic::Failed; +use libtest_mimic::Trial; +use llama_cpp_bindings::llama_backend::LlamaBackend; + +use crate::ModelSource; +use crate::llama_fixture::LlamaFixture; +use crate::llama_test_registration::LlamaTestRegistration; +use crate::load_key::LoadKey; +use crate::phase_state::PhaseState; + +fn source_label(source: ModelSource) -> String { + match source { + ModelSource::HuggingFace { repo, file } => format!("{repo} / {file}"), + ModelSource::LocalPath(path) => format!("local:{path}"), + } +} + +pub struct ExecutionPhase { + pub key: LoadKey, + pub registrations: Vec<&'static LlamaTestRegistration>, +} + +impl ExecutionPhase { + #[must_use] + pub fn header_line(&self, index: usize, total: usize) -> String { + format!( + "--- phase {phase_number}/{total_phases}: {source_label} (n_gpu_layers={n_gpu_layers}) ({trial_count} tests) ---", + phase_number = index + 1, + total_phases = total, + source_label = source_label(self.key.model_source), + n_gpu_layers = self.key.model_load_params.n_gpu_layers, + trial_count = self.registrations.len(), + ) + } + + pub fn print_header(&self, index: usize, total: usize) { + eprintln!("{}", self.header_line(index, total)); + } + + pub fn run(&self, backend: &Arc, 
arguments: &Arguments) -> Conclusion { + let trials = match self.key.load_phase_state(backend) { + Ok(state) => self.passing_trials(&Arc::new(state)), + Err(error) => self.failing_trials(&format!("phase setup failed: {error:#}")), + }; + libtest_mimic::run(arguments, trials) + } + + fn passing_trials(&self, state: &Arc) -> Vec { + self.registrations + .iter() + .map(|registration| { + let state_for_trial = Arc::clone(state); + let registration: &'static LlamaTestRegistration = registration; + let func = registration.func; + Trial::test(registration.name, move || { + let fixture = LlamaFixture { + model: &state_for_trial.model, + backend: &state_for_trial.backend, + context_params: ®istration.context_params, + mtmd_context: state_for_trial.mtmd_context.as_ref(), + model_path: &state_for_trial.model_path, + }; + func(&fixture).map_err(|error| Failed::from(format!("{error:#}"))) + }) + }) + .collect() + } + + fn failing_trials(&self, error_message: &str) -> Vec { + self.registrations + .iter() + .map(|registration| { + let message = error_message.to_owned(); + Trial::test(registration.name, move || Err(Failed::from(message))) + }) + .collect() + } +} + +#[cfg(test)] +mod tests { + use crate::ModelSource; + use crate::load_key::LoadKey; + use crate::model_load_params::ModelLoadParams; + + use super::ExecutionPhase; + + fn phase_with_source(source: ModelSource) -> ExecutionPhase { + ExecutionPhase { + key: LoadKey { + model_source: source, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 7, + use_mmap: true, + use_mlock: false, + }, + }, + registrations: Vec::new(), + } + } + + #[test] + fn header_line_for_huggingface_source_formats_repo_and_file() { + let phase = phase_with_source(ModelSource::HuggingFace { + repo: "org/name", + file: "model.gguf", + }); + + let line = phase.header_line(0, 4); + + assert_eq!( + line, + "--- phase 1/4: org/name / model.gguf (n_gpu_layers=7) (0 tests) ---" + ); + } + + #[test] + fn 
header_line_for_local_path_source_uses_local_prefix() { + let phase = phase_with_source(ModelSource::LocalPath("/abs/model.gguf")); + + let line = phase.header_line(2, 3); + + assert_eq!( + line, + "--- phase 3/3: local:/abs/model.gguf (n_gpu_layers=7) (0 tests) ---" + ); + } +} diff --git a/llama-cpp-test-harness/src/execution_plan.rs b/llama-cpp-test-harness/src/execution_plan.rs new file mode 100644 index 00000000..52f6dd4c --- /dev/null +++ b/llama-cpp-test-harness/src/execution_plan.rs @@ -0,0 +1,268 @@ +//! Deterministic execution plan for the test harness. +//! +//! [`ExecutionPlan::from_registrations`] takes the registrations collected from `inventory` and +//! groups them into [`ExecutionPhase`]s by [`crate::LoadKey`]. The result is a sorted list of +//! phases — each phase corresponds to exactly one model-load cycle (load → run trials → drop). +//! +//! # Invariants +//! +//! - For every distinct [`crate::LoadKey`] the planner produces exactly one +//! [`ExecutionPhase`]; the same key never produces two phases. +//! - Phases are sorted by [`crate::LoadKey`] (lexicographic order on the full key tuple). +//! - Registrations inside a phase are sorted by their `name`. +//! - [`crate::ContextParams`] differences within registrations sharing a key do **not** split a +//! phase — the model loads once and each trial constructs its own `LlamaContext`. 
+ +use std::collections::BTreeMap; +use std::sync::Arc; + +use libtest_mimic::Conclusion; +use llama_cpp_bindings::llama_backend::LlamaBackend; + +use crate::deterministic_arguments::deterministic_arguments_from_cli; +use crate::execution_phase::ExecutionPhase; +use crate::llama_test_registration::LlamaTestRegistration; + +fn collect_inventory_registrations() -> Vec<&'static LlamaTestRegistration> { + inventory::iter:: + .into_iter() + .collect() +} + +pub struct ExecutionPlan { + pub phases: Vec, +} + +impl ExecutionPlan { + #[must_use] + pub fn from_registrations(registrations: &[&'static LlamaTestRegistration]) -> Self { + let mut by_key: BTreeMap<_, Vec<&'static LlamaTestRegistration>> = BTreeMap::new(); + for registration in registrations { + by_key + .entry(registration.key) + .or_default() + .push(*registration); + } + let mut phases = Vec::with_capacity(by_key.len()); + for (key, mut registrations) in by_key { + registrations.sort_by_key(|registration| registration.name); + phases.push(ExecutionPhase { key, registrations }); + } + Self { phases } + } + + #[must_use] + pub fn from_inventory() -> Self { + let registrations = collect_inventory_registrations(); + Self::from_registrations(®istrations) + } + + #[must_use] + pub fn requests_void_logs(&self) -> bool { + self.phases + .iter() + .any(|phase| phase.registrations.iter().any(|reg| reg.void_logs)) + } + + #[must_use] + pub fn run(&self, backend: &Arc) -> Vec { + let arguments = deterministic_arguments_from_cli(); + let total = self.phases.len(); + let mut conclusions = Vec::with_capacity(total); + for (index, phase) in self.phases.iter().enumerate() { + phase.print_header(index, total); + conclusions.push(phase.run(backend, &arguments)); + } + conclusions + } +} + +#[cfg(test)] +mod tests { + use crate::context_params::ContextParams; + use crate::llama_test_registration::LlamaTestRegistration; + use crate::load_key::LoadKey; + use crate::model_load_params::ModelLoadParams; + use 
crate::model_source::ModelSource; + use crate::no_op::no_op; + + use super::ExecutionPlan; + + const TRIVIAL_CONTEXT_PARAMS: ContextParams = ContextParams { + n_ctx: 1, + n_batch: 1, + n_ubatch: 1, + n_seq_max: 1, + n_threads_batch: None, + embeddings: false, + }; + + const ALTERNATE_CONTEXT_PARAMS: ContextParams = ContextParams { + n_ctx: 4096, + n_batch: 1, + n_ubatch: 1, + n_seq_max: 1, + n_threads_batch: None, + embeddings: false, + }; + + static REG_BETA_A: LlamaTestRegistration = LlamaTestRegistration { + name: "alpha", + key: LoadKey { + model_source: ModelSource::HuggingFace { + repo: "beta", + file: "f", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }, + context_params: TRIVIAL_CONTEXT_PARAMS, + void_logs: false, + func: no_op, + }; + static REG_BETA_B: LlamaTestRegistration = LlamaTestRegistration { + name: "bravo", + key: LoadKey { + model_source: ModelSource::HuggingFace { + repo: "beta", + file: "f", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }, + context_params: TRIVIAL_CONTEXT_PARAMS, + void_logs: false, + func: no_op, + }; + static REG_ALPHA_Z: LlamaTestRegistration = LlamaTestRegistration { + name: "zulu", + key: LoadKey { + model_source: ModelSource::HuggingFace { + repo: "alpha", + file: "f", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }, + context_params: TRIVIAL_CONTEXT_PARAMS, + void_logs: false, + func: no_op, + }; + static REG_BETA_DIFFERENT_CONTEXT: LlamaTestRegistration = LlamaTestRegistration { + name: "charlie", + key: LoadKey { + model_source: ModelSource::HuggingFace { + repo: "beta", + file: "f", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }, + context_params: ALTERNATE_CONTEXT_PARAMS, + 
void_logs: false, + func: no_op, + }; + + static REG_VOID_LOGS: LlamaTestRegistration = LlamaTestRegistration { + name: "void-logs-trial", + key: LoadKey { + model_source: ModelSource::HuggingFace { + repo: "beta", + file: "f", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }, + context_params: TRIVIAL_CONTEXT_PARAMS, + void_logs: true, + func: no_op, + }; + + #[test] + fn from_registrations_with_empty_input_yields_empty_plan() { + let plan = ExecutionPlan::from_registrations(&[]); + + assert!(plan.phases.is_empty()); + } + + #[test] + fn registrations_with_same_load_key_collapse_to_one_phase() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_A, ®_BETA_B]); + + assert_eq!(plan.phases.len(), 1); + assert_eq!(plan.phases[0].registrations.len(), 2); + } + + #[test] + fn registrations_with_distinct_load_keys_form_phases_in_load_key_sort_order() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_A, ®_ALPHA_Z]); + + assert_eq!(plan.phases.len(), 2); + assert!(matches!( + plan.phases[0].key.model_source, + ModelSource::HuggingFace { repo: "alpha", .. } + )); + assert!(matches!( + plan.phases[1].key.model_source, + ModelSource::HuggingFace { repo: "beta", .. 
} + )); + } + + #[test] + fn within_a_phase_registrations_sort_by_name() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_B, ®_BETA_A]); + + assert_eq!(plan.phases.len(), 1); + assert_eq!(plan.phases[0].registrations[0].name, "alpha"); + assert_eq!(plan.phases[0].registrations[1].name, "bravo"); + } + + #[test] + fn requests_void_logs_false_when_no_registration_opts_in() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_A, ®_ALPHA_Z]); + + assert!(!plan.requests_void_logs()); + } + + #[test] + fn requests_void_logs_true_when_any_registration_opts_in() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_A, ®_VOID_LOGS]); + + assert!(plan.requests_void_logs()); + } + + #[test] + fn registrations_sharing_a_load_key_but_differing_context_params_stay_in_one_phase() { + let plan = ExecutionPlan::from_registrations(&[®_BETA_A, ®_BETA_DIFFERENT_CONTEXT]); + + assert_eq!(plan.phases.len(), 1); + assert_eq!(plan.phases[0].registrations.len(), 2); + let context_lengths: Vec = plan.phases[0] + .registrations + .iter() + .map(|registration| registration.context_params.n_ctx) + .collect(); + assert!(context_lengths.contains(&1)); + assert!(context_lengths.contains(&4096)); + } +} diff --git a/llama-cpp-test-harness/src/lib.rs b/llama-cpp-test-harness/src/lib.rs new file mode 100644 index 00000000..fb0c1230 --- /dev/null +++ b/llama-cpp-test-harness/src/lib.rs @@ -0,0 +1,47 @@ +//! Declarative, deterministic, phase-batched integration-test harness for `llama-cpp-bindings`. +//! +//! Tests tag their functions with `#[llama_test(model_source = HuggingFace("…", "…"), …)]` +//! (or `model_source = LocalPath("…")` for a local GGUF). The harness groups tests with +//! identical [`LoadKey`]s into [`ExecutionPhase`]s, loads each phase's model exactly once, and +//! runs every test in the phase sequentially against the shared [`LlamaFixture`]. +//! +//! See the workspace README and `tests/` directory for usage examples. 
/// Borrowed view of everything a test function receives from the harness.
///
/// All fields are references, so a fixture cannot outlive the state it was
/// built from.
pub struct LlamaFixture<'fixture> {
    /// The loaded model.
    pub model: &'fixture LlamaModel,
    /// The llama.cpp backend handle.
    pub backend: &'fixture LlamaBackend,
    /// Context parameters for the test receiving this fixture.
    pub context_params: &'fixture ContextParams,
    /// Multimodal context, when one is available; `None` otherwise.
    pub mtmd_context: Option<&'fixture MtmdContext>,
    /// On-disk path of the model file.
    pub model_path: &'fixture Path,
}
/// One test registered with the harness.
pub struct LlamaTestRegistration {
    /// Test name.
    pub name: &'static str,
    /// Identifies which model load this test needs.
    pub key: LoadKey,
    /// Context parameters for this test.
    pub context_params: ContextParams,
    /// Whether this test asks for llama.cpp logs to be voided.
    pub void_logs: bool,
    /// The test function itself.
    pub func: LlamaTestFn,
}

// Makes `LlamaTestRegistration` values collectable through `inventory`.
inventory::collect!(LlamaTestRegistration);
Only the fields of [`LoadKey`]: the model source ([`crate::ModelSource`]), the mmproj source +//! (optional [`crate::MmprojSource`]), and the [`crate::ModelLoadParams`] (`n_gpu_layers`, +//! `use_mmap`, `use_mlock`). +//! +//! # What is runtime-flexible +//! +//! Every `LlamaContextParams` setter (`n_ctx`, `n_batch`, `n_ubatch`, `n_seq_max`, +//! `n_threads_batch`, `embeddings`, and the further setters not yet surfaced in the attribute +//! schema). The harness builds a fresh `LlamaContext` per trial from `fixture.context_params`, +//! so differences here never reload the model. + +use std::sync::Arc; + +use anyhow::Result; +use llama_cpp_bindings::llama_backend::LlamaBackend; +use llama_cpp_bindings::model::LlamaModel; +use llama_cpp_bindings::mtmd::MtmdContext; +use llama_cpp_bindings::mtmd::MtmdContextParams; + +use crate::mmproj_source::MmprojSource; +use crate::model_load_params::ModelLoadParams; +use crate::model_source::ModelSource; +use crate::phase_state::PhaseState; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct LoadKey { + pub model_source: ModelSource, + pub mmproj_source: Option, + pub model_load_params: ModelLoadParams, +} + +impl LoadKey { + /// Downloads (or resolves) the model and optional mmproj, loads them, and returns the live + /// [`PhaseState`] that the harness keeps alive for the duration of the phase. + /// + /// # Errors + /// + /// Returns an error if any of: source resolution fails, loading the model into llama.cpp + /// fails, or initializing the `MtmdContext` fails. 
+ pub fn load_phase_state(&self, backend: &Arc) -> Result { + let model_path = self.model_source.resolve_path()?; + let model_params = self.model_load_params.into_llama_model_params(); + let model = LlamaModel::load_from_file(backend, &model_path, &model_params)?; + + let mtmd_context = match self.mmproj_source { + Some(mmproj_source) => { + let mmproj_path = mmproj_source.resolve_path()?; + let mmproj_path_str = mmproj_path.to_string_lossy(); + let params = MtmdContextParams::default(); + Some(MtmdContext::init_from_file( + mmproj_path_str.as_ref(), + &model, + ¶ms, + )?) + } + None => None, + }; + + Ok(PhaseState { + mtmd_context, + model, + backend: Arc::clone(backend), + model_path, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::mmproj_source::MmprojSource; + use crate::model_load_params::ModelLoadParams; + use crate::model_source::ModelSource; + + use super::LoadKey; + + fn baseline() -> LoadKey { + LoadKey { + model_source: ModelSource::HuggingFace { + repo: "repo", + file: "file", + }, + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + } + } + + #[test] + fn identical_keys_compare_equal() { + assert_eq!(baseline(), baseline()); + } + + #[test] + fn different_model_sources_compare_unequal() { + let mut other = baseline(); + other.model_source = ModelSource::HuggingFace { + repo: "other", + file: "file", + }; + + assert_ne!(baseline(), other); + } + + #[test] + fn huggingface_and_local_path_compare_unequal() { + let mut other = baseline(); + other.model_source = ModelSource::LocalPath("/some/local.gguf"); + + assert_ne!(baseline(), other); + } + + #[test] + fn different_mmproj_sources_compare_unequal() { + let mut other = baseline(); + other.mmproj_source = Some(MmprojSource::HuggingFace { + repo: "repo", + file: "mmproj-F16.gguf", + }); + + assert_ne!(baseline(), other); + } + + #[test] + fn different_model_load_params_compare_unequal() { + let mut other = baseline(); + 
other.model_load_params.n_gpu_layers = 999; + + assert_ne!(baseline(), other); + } + + // The next three tests exercise the three error-propagation paths inside + // `load_phase_state` — model load failure, mmproj download failure, and mmproj load failure. + // Each constructs a LoadKey whose resolution succeeds (so the path is computed) but whose + // subsequent load step deliberately fails, then asserts the appropriate `Err` propagates. + // + // They share BACKEND_INIT_GATE because `LlamaBackend::init` is once-per-process. + + use std::sync::Arc; + + use llama_cpp_bindings::llama_backend::LlamaBackend; + + use crate::test_backend_gate::BACKEND_INIT_GATE; + + /// Path to the workspace `Cargo.toml`, which exists at test time but isn't a valid GGUF and + /// isn't a valid mmproj — perfect for exercising the `load_from_file` / `init_from_file` + /// error arms in `load_phase_state`. + const NON_GGUF_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/Cargo.toml"); + + #[test] + fn load_phase_state_propagates_model_load_failure() { + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let backend = Arc::new(LlamaBackend::init().expect("backend init must succeed")); + let key = LoadKey { + model_source: ModelSource::LocalPath(NON_GGUF_PATH), + mmproj_source: None, + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }; + + let result = key.load_phase_state(&backend); + + assert!( + result.is_err(), + "LoadKey pointing at a non-GGUF file must fail to load" + ); + } + + #[test] + fn load_phase_state_propagates_mmproj_download_failure() { + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let backend = Arc::new(LlamaBackend::init().expect("backend init must succeed")); + let key = LoadKey { + model_source: ModelSource::LocalPath(NON_GGUF_PATH), + mmproj_source: Some(MmprojSource::HuggingFace { + repo: 
"intentee-test-harness/does-not-exist", + file: "no-such-mmproj.gguf", + }), + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }; + + let result = key.load_phase_state(&backend); + + assert!( + result.is_err(), + "LoadKey with bogus mmproj HF repo must fail; the error must surface either at model \ + load (the non-GGUF model fails first) or at mmproj download" + ); + } + + #[test] + fn load_phase_state_propagates_mmproj_local_load_failure() { + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let backend = Arc::new(LlamaBackend::init().expect("backend init must succeed")); + let key = LoadKey { + model_source: ModelSource::LocalPath(NON_GGUF_PATH), + mmproj_source: Some(MmprojSource::LocalPath(NON_GGUF_PATH)), + model_load_params: ModelLoadParams { + n_gpu_layers: 0, + use_mmap: true, + use_mlock: false, + }, + }; + + let result = key.load_phase_state(&backend); + + assert!( + result.is_err(), + "LoadKey pointing at a non-mmproj LocalPath must fail at MtmdContext init" + ); + } +} diff --git a/llama-cpp-test-harness/src/mmproj_source.rs b/llama-cpp-test-harness/src/mmproj_source.rs new file mode 100644 index 00000000..ff4bf18f --- /dev/null +++ b/llama-cpp-test-harness/src/mmproj_source.rs @@ -0,0 +1,51 @@ +//! Identity of the mmproj GGUF file the harness optionally loads for a phase. +//! +//! Same shape and semantics as [`crate::ModelSource`], but for the multimodal projection file. +//! Independent of the model's source — a test may mix any combination (HF model + local mmproj, +//! local model + HF mmproj, both local, both HF). 
+ +use std::path::PathBuf; + +use anyhow::Result; + +use crate::download_model::download_model; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum MmprojSource { + HuggingFace { + repo: &'static str, + file: &'static str, + }, + LocalPath(&'static str), +} + +impl MmprojSource { + /// Resolves the source to an on-disk path. + /// + /// # Errors + /// + /// Returns an error if the HF download fails. `LocalPath` is infallible here — file + /// existence is checked at load time by the mtmd context init. + pub fn resolve_path(self) -> Result { + match self { + Self::HuggingFace { repo, file } => download_model(repo, file), + Self::LocalPath(path) => Ok(PathBuf::from(path)), + } + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::MmprojSource; + + #[test] + fn resolve_path_for_local_path_returns_the_literal_path() { + let source = MmprojSource::LocalPath("/abs/mmproj.gguf"); + + let resolved = source.resolve_path().expect("LocalPath resolve is infallible"); + + assert_eq!(resolved, PathBuf::from("/abs/mmproj.gguf")); + } +} diff --git a/llama-cpp-test-harness/src/model_load_params.rs b/llama-cpp-test-harness/src/model_load_params.rs new file mode 100644 index 00000000..da0f67d5 --- /dev/null +++ b/llama-cpp-test-harness/src/model_load_params.rs @@ -0,0 +1,86 @@ +use llama_cpp_bindings::model::params::LlamaModelParams; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct ModelLoadParams { + pub n_gpu_layers: u32, + pub use_mmap: bool, + pub use_mlock: bool, +} + +impl ModelLoadParams { + #[must_use] + pub fn into_llama_model_params(self) -> LlamaModelParams { + let Self { + n_gpu_layers, + use_mmap, + use_mlock, + } = self; + LlamaModelParams::default() + .with_n_gpu_layers(n_gpu_layers) + .with_use_mmap(use_mmap) + .with_use_mlock(use_mlock) + } +} + +#[cfg(test)] +mod tests { + use super::ModelLoadParams; + + #[test] + fn into_llama_model_params_carries_all_three_fields() { + let 
params = ModelLoadParams { + n_gpu_layers: 7, + use_mmap: false, + use_mlock: true, + } + .into_llama_model_params(); + + assert_eq!(params.n_gpu_layers(), 7); + assert!(!params.use_mmap()); + assert!(params.use_mlock()); + } + + #[test] + fn into_llama_model_params_clamps_n_gpu_layers_to_i32_max() { + let params = ModelLoadParams { + n_gpu_layers: u32::MAX, + use_mmap: true, + use_mlock: false, + } + .into_llama_model_params(); + + assert_eq!(params.n_gpu_layers(), i32::MAX); + } + + #[test] + fn identical_values_compare_equal() { + let one = ModelLoadParams { + n_gpu_layers: 1, + use_mmap: true, + use_mlock: false, + }; + let two = ModelLoadParams { + n_gpu_layers: 1, + use_mmap: true, + use_mlock: false, + }; + + assert_eq!(one, two); + } + + #[test] + fn differing_n_gpu_layers_compare_unequal() { + let one = ModelLoadParams { + n_gpu_layers: 1, + use_mmap: true, + use_mlock: false, + }; + let two = ModelLoadParams { + n_gpu_layers: 2, + use_mmap: true, + use_mlock: false, + }; + + assert_ne!(one, two); + } +} diff --git a/llama-cpp-test-harness/src/model_source.rs b/llama-cpp-test-harness/src/model_source.rs new file mode 100644 index 00000000..b937e9c7 --- /dev/null +++ b/llama-cpp-test-harness/src/model_source.rs @@ -0,0 +1,56 @@ +//! Identity of the GGUF file the harness loads for a phase. +//! +//! Two variants, mutually exclusive by construction: +//! - [`ModelSource::HuggingFace`] — pull via `hf-hub` (cached); the on-disk path is wherever the +//! cache resolves to. +//! - [`ModelSource::LocalPath`] — use the file at the given absolute path verbatim; no download, +//! no cache. +//! +//! Mutual exclusion is enforced at compile time by the enum's variant set. There is no string +//! heuristic anywhere — the proc-macro dispatches on syntactic path identifiers. 
+ +use std::path::PathBuf; + +use anyhow::Result; + +use crate::download_model::download_model; + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub enum ModelSource { + HuggingFace { + repo: &'static str, + file: &'static str, + }, + LocalPath(&'static str), +} + +impl ModelSource { + /// Resolves the source to an on-disk path. + /// + /// # Errors + /// + /// Returns an error if the HF download fails. `LocalPath` is infallible here — file + /// existence is checked at load time by llama.cpp. + pub fn resolve_path(self) -> Result { + match self { + Self::HuggingFace { repo, file } => download_model(repo, file), + Self::LocalPath(path) => Ok(PathBuf::from(path)), + } + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::ModelSource; + + #[test] + fn resolve_path_for_local_path_returns_the_literal_path() { + let source = ModelSource::LocalPath("/abs/example.gguf"); + + let resolved = source.resolve_path().expect("LocalPath resolve is infallible"); + + assert_eq!(resolved, PathBuf::from("/abs/example.gguf")); + } +} diff --git a/llama-cpp-test-harness/src/no_op.rs b/llama-cpp-test-harness/src/no_op.rs new file mode 100644 index 00000000..7672de54 --- /dev/null +++ b/llama-cpp-test-harness/src/no_op.rs @@ -0,0 +1,14 @@ +use crate::llama_fixture::LlamaFixture; + +/// No-op test function with the [`crate::LlamaTestFn`] signature. Always returns `Ok(())`. +/// +/// Useful as a placeholder for [`crate::LlamaTestRegistration`] in unit tests that exercise +/// grouping/sorting logic without needing real trial bodies. Also covered by a self-test +/// trial so the function shows up in coverage. +/// +/// # Errors +/// +/// Never; always returns `Ok(())`. The `Result` return type matches `LlamaTestFn`. 
+pub const fn no_op(_fixture: &LlamaFixture<'_>) -> anyhow::Result<()> { + Ok(()) +} diff --git a/llama-cpp-test-harness/src/phase_state.rs b/llama-cpp-test-harness/src/phase_state.rs new file mode 100644 index 00000000..1993d6a0 --- /dev/null +++ b/llama-cpp-test-harness/src/phase_state.rs @@ -0,0 +1,13 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use llama_cpp_bindings::llama_backend::LlamaBackend; +use llama_cpp_bindings::model::LlamaModel; +use llama_cpp_bindings::mtmd::MtmdContext; + +pub struct PhaseState { + pub mtmd_context: Option, + pub model: LlamaModel, + pub backend: Arc, + pub model_path: PathBuf, +} diff --git a/llama-cpp-test-harness/src/run.rs b/llama-cpp-test-harness/src/run.rs new file mode 100644 index 00000000..6d13b1b4 --- /dev/null +++ b/llama-cpp-test-harness/src/run.rs @@ -0,0 +1,122 @@ +use std::process::ExitCode; +use std::sync::Arc; + +use libtest_mimic::Conclusion; +use llama_cpp_bindings::llama_backend::LlamaBackend; + +use crate::execution_plan::ExecutionPlan; + +fn aggregate_exit_code(conclusions: &[Conclusion]) -> ExitCode { + if conclusions.iter().any(Conclusion::has_failed) { + ExitCode::from(101) + } else { + ExitCode::SUCCESS + } +} + +#[must_use] +pub fn run() -> ExitCode { + let mut backend = match LlamaBackend::init() { + Ok(backend) => backend, + Err(error) => { + eprintln!("llama-cpp-test-harness: backend init failed: {error}"); + return ExitCode::from(2); + } + }; + let plan = ExecutionPlan::from_inventory(); + if plan.requests_void_logs() { + backend.void_logs(); + } + let backend = Arc::new(backend); + aggregate_exit_code(&plan.run(&backend)) +} + +#[cfg(test)] +mod tests { + use std::process::ExitCode; + + use libtest_mimic::Conclusion; + use llama_cpp_bindings::llama_backend::LlamaBackend; + + use crate::run_to_conclusions::run_to_conclusions; + use crate::test_backend_gate::BACKEND_INIT_GATE; + + use super::aggregate_exit_code; + use super::run; + + fn passing_conclusion() -> Conclusion { + Conclusion { + 
num_filtered_out: 0, + num_passed: 1, + num_failed: 0, + num_ignored: 0, + num_measured: 0, + } + } + + fn failing_conclusion() -> Conclusion { + Conclusion { + num_filtered_out: 0, + num_passed: 0, + num_failed: 1, + num_ignored: 0, + num_measured: 0, + } + } + + fn as_u8(code: ExitCode) -> u8 { + let formatted = format!("{code:?}"); + formatted + .chars() + .filter(char::is_ascii_digit) + .collect::() + .parse::() + .unwrap_or(255) + } + + #[test] + fn aggregate_exit_code_zero_when_all_pass() { + let code = aggregate_exit_code(&[passing_conclusion(), passing_conclusion()]); + + assert_eq!(as_u8(code), 0); + } + + #[test] + fn aggregate_exit_code_non_zero_when_any_fails() { + let code = aggregate_exit_code(&[passing_conclusion(), failing_conclusion()]); + + assert_eq!(as_u8(code), 101); + } + + #[test] + fn aggregate_exit_code_empty_input_succeeds() { + let code = aggregate_exit_code(&[]); + + assert_eq!(as_u8(code), 0); + } + + #[test] + fn run_to_conclusions_panics_when_backend_init_fails() { + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _hold = LlamaBackend::init().expect("first init must succeed"); + let outcome = std::panic::catch_unwind(run_to_conclusions); + + assert!( + outcome.is_err(), + "expected panic from re-initialised backend" + ); + } + + #[test] + fn run_returns_exit_code_two_when_backend_init_fails() { + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _hold = LlamaBackend::init().expect("first init must succeed"); + let code = run(); + + assert_eq!(as_u8(code), 2); + } +} diff --git a/llama-cpp-test-harness/src/run_to_conclusions.rs b/llama-cpp-test-harness/src/run_to_conclusions.rs new file mode 100644 index 00000000..8de67e11 --- /dev/null +++ b/llama-cpp-test-harness/src/run_to_conclusions.rs @@ -0,0 +1,54 @@ +use std::sync::Arc; + +use libtest_mimic::Conclusion; +use llama_cpp_bindings::llama_backend::LlamaBackend; + +use 
crate::execution_plan::ExecutionPlan; + +/// Runs every registered test against its declared model and returns one [`Conclusion`] per phase. +/// +/// Self-tests use this entry point to inspect pass/fail counts without surrendering the +/// binary's exit code to libtest-mimic. Initializes the backend; panics with a descriptive +/// message if init fails (that's a programming error in test setup). +/// +/// # Panics +/// +/// Panics if [`LlamaBackend::init`] fails. The harness is meaningless without a backend; a +/// crash is the loudest possible failure signal. +#[must_use] +pub fn run_to_conclusions() -> Vec { + let mut backend = match LlamaBackend::init() { + Ok(backend) => backend, + Err(error) => panic!("llama-cpp-test-harness: backend init failed: {error}"), + }; + let plan = ExecutionPlan::from_inventory(); + if plan.requests_void_logs() { + backend.void_logs(); + } + let backend = Arc::new(backend); + plan.run(&backend) +} + +#[cfg(test)] +mod tests { + use crate::test_backend_gate::BACKEND_INIT_GATE; + + use super::run_to_conclusions; + + #[test] + fn empty_inventory_yields_no_conclusions_and_skips_void_logs() { + // The lib's own inventory has no #[llama_test] registrations, so + // ExecutionPlan::from_inventory() returns an empty plan. requests_void_logs() returns + // false → the `backend.void_logs()` branch is skipped — this test covers that path. + let _gate = BACKEND_INIT_GATE + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + + let conclusions = run_to_conclusions(); + + assert!( + conclusions.is_empty(), + "empty inventory must yield zero phase conclusions" + ); + } +} diff --git a/llama-cpp-test-harness/src/test_backend_gate.rs b/llama-cpp-test-harness/src/test_backend_gate.rs new file mode 100644 index 00000000..74fa6245 --- /dev/null +++ b/llama-cpp-test-harness/src/test_backend_gate.rs @@ -0,0 +1,8 @@ +//! Process-wide serialization for tests that need to initialize `LlamaBackend`. +//! +//! 
`LlamaBackend::init` is a once-per-process operation; concurrent attempts collide. Tests in +//! multiple modules each need access to a shared mutex so they take turns. This module exports +//! that shared mutex. + +#[cfg(test)] +pub static BACKEND_INIT_GATE: std::sync::Mutex<()> = std::sync::Mutex::new(()); diff --git a/llama-cpp-test-harness/tests/harness_self_test.rs b/llama-cpp-test-harness/tests/harness_self_test.rs new file mode 100644 index 00000000..eea30660 --- /dev/null +++ b/llama-cpp-test-harness/tests/harness_self_test.rs @@ -0,0 +1,199 @@ +#![expect( + clippy::unnecessary_wraps, + reason = "every trial returns anyhow::Result<()> to match the LlamaTestFn signature" +)] + +use std::process::ExitCode; + +use anyhow::Result; +use anyhow::bail; +use llama_cpp_test_harness::LlamaFixture; +use llama_cpp_test_harness::llama_test; +use llama_cpp_test_harness::no_op; +use llama_cpp_test_harness::run_to_conclusions; + +// Phase A: small Qwen text model, three trials sharing the exact same attribute tuple. +// Two of these pass, one bails — exercising both branches of trial-body dispatch on the same +// loaded model. 
+ +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + void_logs = true, +)] +fn phase_a_first_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + let formatted = format!("{:?}", fixture.model); + assert!(formatted.contains("LlamaModel")); + no_op(fixture) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64 +)] +fn phase_a_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + assert_eq!(fixture.context_params.n_ctx, 512); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64 +)] +fn phase_a_intentionally_failing_trial(_fixture: &LlamaFixture<'_>) -> Result<()> { + bail!("intentional failure to exercise the trial-failure dispatch path"); +} + +// Phase B: distinct model (smaller embedding GGUF). Two trials share this key. 
+ +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64 +)] +fn phase_b_first_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!(fixture.mtmd_context.is_none()); + Ok(()) +} + +#[llama_test( + model_source = HuggingFace("Qwen/Qwen3-Embedding-0.6B-GGUF", "Qwen3-Embedding-0.6B-Q8_0.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64 +)] +fn phase_b_second_passing_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + let _markers = fixture.model.tool_call_markers(); + Ok(()) +} + +// Phase C: intentionally invalid HF repo. The phase-setup path fails to download the model, +// which routes the trial through `failing_trials` (one failed trial per registration). +// +// The trial function is shared with an additional Phase A registration so that the function +// itself is exercised at least once (Phase A's setup succeeds and dispatches into the body). +// Phase C's setup fails before reaching the body, but the registration still exercises the +// `failing_trials` path in `ExecutionPhase::run`. 
+ +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64 +)] +#[llama_test( + model_source = HuggingFace("intentee-test-harness/does-not-exist", "no-such-file.gguf"), + n_gpu_layers = 0, + use_mmap = true, + use_mlock = false, + n_ctx = 1, + n_batch = 1, + n_ubatch = 1 +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("intentee-test-harness/does-not-exist", "no-such-mmproj.gguf"), +)] +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = LocalPath("/nonexistent/llama-cpp-test-harness/no-such-mmproj.gguf"), +)] +fn shared_setup_failure_and_phase_a_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + // Phase A reaches the body and verifies the fixture is wired up; the failure phases + // (Phase C model download, mmproj download, mmproj load) never reach it. + assert!(fixture.model_path.exists()); + Ok(()) +} + +// Phase D: same text model as Phase A but with mmproj — exercises the multimodal-load path +// in LoadKey::load_phase_state. Distinct LoadKey (mmproj_source differs) → distinct phase + +// distinct model load. 
+ +#[llama_test( + model_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "Qwen3.5-0.8B-Q4_K_M.gguf"), + n_gpu_layers = 999, + use_mmap = true, + use_mlock = false, + n_ctx = 512, + n_batch = 128, + n_ubatch = 64, + mmproj_source = HuggingFace("unsloth/Qwen3.5-0.8B-GGUF", "mmproj-F16.gguf") +)] +fn phase_d_mmproj_trial(fixture: &LlamaFixture<'_>) -> Result<()> { + assert!( + fixture.mtmd_context.is_some(), + "mmproj_file declared, but fixture.mtmd_context is None", + ); + Ok(()) +} + +const EXPECTED_PHASES: usize = 6; +const EXPECTED_PASSED: u64 = 6; +const EXPECTED_FAILED: u64 = 4; + +fn main() -> ExitCode { + let conclusions = run_to_conclusions(); + let phases = conclusions.len(); + let total_passed: u64 = conclusions + .iter() + .map(|conclusion| conclusion.num_passed) + .sum(); + let total_failed: u64 = conclusions + .iter() + .map(|conclusion| conclusion.num_failed) + .sum(); + + if phases == EXPECTED_PHASES + && total_passed == EXPECTED_PASSED + && total_failed == EXPECTED_FAILED + { + eprintln!( + "harness_self_test: as expected — phases={phases}, passed={total_passed}, failed={total_failed}" + ); + ExitCode::SUCCESS + } else { + eprintln!( + "harness_self_test: UNEXPECTED — phases={phases} (want {EXPECTED_PHASES}), \ + passed={total_passed} (want {EXPECTED_PASSED}), \ + failed={total_failed} (want {EXPECTED_FAILED})" + ); + ExitCode::FAILURE + } +} From 495f422a93e606821e8c14c15ad99cdab04a646f Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Sat, 23 May 2026 16:52:23 +0200 Subject: [PATCH 2/9] refactor tests so coverage of variant assertions does not depend on unreachable arms --- .../tests/model_properties.rs | 4 +- .../src/chat_message_parse_outcome.rs | 70 ++++++++++--------- llama-cpp-bindings/src/gguf_context.rs | 48 +++++++++---- .../src/json_schema_to_grammar.rs | 39 +++++++---- llama-cpp-bindings/src/llama_backend.rs | 27 +++++-- llama-cpp-bindings/src/llama_token_attrs.rs | 12 ++-- .../src/load_backends_from_path.rs | 18 +++-- 
.../src/mtmd/mtmd_input_chunks.rs | 8 ++- .../src/sampled_token_classifier.rs | 36 +++++----- llama-cpp-bindings/src/sampling.rs | 68 ++++++++++-------- .../src/streaming_json_probe.rs | 10 +++ .../src/tool_call_format/bracketed_args.rs | 61 +++++++++++++--- .../src/tool_call_format/json_object.rs | 20 ++++-- .../tool_call_template_overrides/detect.rs | 20 ++---- .../gemma4_call_block.rs | 19 +++-- .../glm47_key_value_tags.rs | 20 +++--- .../known_marker_candidates.rs | 7 +- .../mistral3_arrow_args.rs | 14 ++-- .../qwen3_json_inside_tool_call.rs | 13 ++-- .../qwen_xml_tags.rs | 17 +++-- .../src/parsed_source.rs | 10 ++- llama-cpp-test-harness/src/mmproj_source.rs | 4 +- llama-cpp-test-harness/src/model_source.rs | 4 +- 23 files changed, 347 insertions(+), 202 deletions(-) diff --git a/llama-cpp-bindings-tests/tests/model_properties.rs b/llama-cpp-bindings-tests/tests/model_properties.rs index ec872710..bd33ef6b 100644 --- a/llama-cpp-bindings-tests/tests/model_properties.rs +++ b/llama-cpp-bindings-tests/tests/model_properties.rs @@ -278,9 +278,7 @@ fn is_recurrent_returns_false_for_transformer(fixture: &LlamaFixture<'_>) -> Res n_batch = 128, n_ubatch = 64, )] -fn is_hybrid_returns_false_for_non_hybrid_default_models( - fixture: &LlamaFixture<'_>, -) -> Result<()> { +fn is_hybrid_returns_false_for_non_hybrid_default_models(fixture: &LlamaFixture<'_>) -> Result<()> { assert!( !fixture.model.is_hybrid(), "DeepSeek-R1-Distill-Llama-8B and GLM-4.7-Flash are pure transformers, not hybrid; got is_hybrid=true" diff --git a/llama-cpp-bindings/src/chat_message_parse_outcome.rs b/llama-cpp-bindings/src/chat_message_parse_outcome.rs index 12550664..aede6a36 100644 --- a/llama-cpp-bindings/src/chat_message_parse_outcome.rs +++ b/llama-cpp-bindings/src/chat_message_parse_outcome.rs @@ -15,42 +15,44 @@ mod tests { use crate::raw_chat_message::RawChatMessage; #[test] - fn recognized_variant_exposes_parsed_chat_message() { - let parsed = - 
ParsedChatMessage::new("content".to_owned(), "reasoning".to_owned(), Vec::new()); - let outcome = ChatMessageParseOutcome::Recognized(parsed); - - match outcome { - ChatMessageParseOutcome::Recognized(parsed) => { - assert_eq!(parsed.content, "content"); - assert_eq!(parsed.reasoning_content, "reasoning"); - assert!(parsed.tool_calls.is_empty()); - } - ChatMessageParseOutcome::Unrecognized(_) => { - panic!("expected Recognized variant"); + fn both_variants_destructure_to_their_inner_payloads() { + let outcomes = [ + ChatMessageParseOutcome::Recognized(ParsedChatMessage::new( + "content".to_owned(), + "reasoning".to_owned(), + Vec::new(), + )), + ChatMessageParseOutcome::Unrecognized(RawChatMessage { + tools_json: "[]".to_owned(), + text: "raw input".to_owned(), + is_partial: false, + ffi_error_message: "parser bailed".to_owned(), + }), + ]; + + let mut saw_recognized = false; + let mut saw_unrecognized = false; + for outcome in outcomes { + match outcome { + ChatMessageParseOutcome::Recognized(parsed) => { + assert_eq!(parsed.content, "content"); + assert_eq!(parsed.reasoning_content, "reasoning"); + assert!(parsed.tool_calls.is_empty()); + saw_recognized = true; + } + ChatMessageParseOutcome::Unrecognized(raw) => { + assert_eq!(raw.tools_json, "[]"); + assert_eq!(raw.text, "raw input"); + assert!(!raw.is_partial); + assert_eq!(raw.ffi_error_message, "parser bailed"); + saw_unrecognized = true; + } } } - } - - #[test] - fn unrecognized_variant_exposes_raw_chat_message() { - let outcome = ChatMessageParseOutcome::Unrecognized(RawChatMessage { - tools_json: "[]".to_owned(), - text: "raw input".to_owned(), - is_partial: false, - ffi_error_message: "parser bailed".to_owned(), - }); - match outcome { - ChatMessageParseOutcome::Unrecognized(raw) => { - assert_eq!(raw.tools_json, "[]"); - assert_eq!(raw.text, "raw input"); - assert!(!raw.is_partial); - assert_eq!(raw.ffi_error_message, "parser bailed"); - } - ChatMessageParseOutcome::Recognized(_) => { - panic!("expected 
Unrecognized variant"); - } - } + assert!( + saw_recognized && saw_unrecognized, + "both variants must dispatch through the match" + ); } } diff --git a/llama-cpp-bindings/src/gguf_context.rs b/llama-cpp-bindings/src/gguf_context.rs index 329bb5ac..45a9b6c0 100644 --- a/llama-cpp-bindings/src/gguf_context.rs +++ b/llama-cpp-bindings/src/gguf_context.rs @@ -13,6 +13,7 @@ use crate::gguf_type::GgufType; /// /// Opens a GGUF file in metadata-only mode (`no_alloc = true`), allowing /// inspection of key-value pairs and tensor metadata without loading tensor data. +#[derive(Debug)] pub struct GgufContext { context: NonNull, } @@ -169,16 +170,37 @@ impl Drop for GgufContext { #[cfg(test)] mod tests { + use std::ffi::CString; + use std::mem::Discriminant; + use std::path::PathBuf; + use super::GgufContext; use crate::gguf_context_error::GgufContextError; use crate::gguf_type::GgufType; - fn fixture_path() -> std::path::PathBuf { - std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + fn fixture_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("fixtures") .join("ggml-vocab-bert-bge.gguf") } + fn init_failed_disc() -> Discriminant { + std::mem::discriminant(&GgufContextError::InitFailed(PathBuf::new())) + } + + fn key_not_found_disc() -> Discriminant { + std::mem::discriminant(&GgufContextError::KeyNotFound { key: String::new() }) + } + + fn nul_error_disc() -> Discriminant { + let nul_err = CString::new(b"a\0b".to_vec()).unwrap_err(); + std::mem::discriminant(&GgufContextError::NulError(nul_err)) + } + + fn path_to_str_error_disc() -> Discriminant { + std::mem::discriminant(&GgufContextError::PathToStrError(PathBuf::new())) + } + #[test] fn from_file_opens_valid_gguf() { let context = GgufContext::from_file(fixture_path()); @@ -188,9 +210,9 @@ mod tests { #[test] fn from_file_nonexistent_returns_init_failed() { - let result = GgufContext::from_file("/nonexistent/file.gguf"); + let err = GgufContext::from_file("/nonexistent/file.gguf").unwrap_err(); - 
assert!(matches!(result, Err(GgufContextError::InitFailed(_)))); + assert_eq!(std::mem::discriminant(&err), init_failed_disc()); } #[test] @@ -219,9 +241,9 @@ mod tests { #[test] fn find_key_returns_error_for_missing_key() { let context = GgufContext::from_file(fixture_path()).unwrap(); - let result = context.find_key("nonexistent.key"); + let err = context.find_key("nonexistent.key").unwrap_err(); - assert!(matches!(result, Err(GgufContextError::KeyNotFound { .. }))); + assert_eq!(std::mem::discriminant(&err), key_not_found_disc()); } #[test] @@ -258,24 +280,24 @@ mod tests { use std::os::unix::ffi::OsStrExt; let non_utf8_path = std::path::Path::new(OsStr::from_bytes(b"/tmp/\xff\xfe.gguf")); - let result = GgufContext::from_file(non_utf8_path); + let err = GgufContext::from_file(non_utf8_path).unwrap_err(); - assert!(matches!(result, Err(GgufContextError::PathToStrError(_)))); + assert_eq!(std::mem::discriminant(&err), path_to_str_error_disc()); } #[test] fn from_file_with_null_byte_in_path_returns_error() { - let result = GgufContext::from_file("/tmp/foo\0bar.gguf"); + let err = GgufContext::from_file("/tmp/foo\0bar.gguf").unwrap_err(); - assert!(matches!(result, Err(GgufContextError::NulError(_)))); + assert_eq!(std::mem::discriminant(&err), nul_error_disc()); } #[test] fn find_key_with_null_byte_in_key_returns_error() { let context = GgufContext::from_file(fixture_path()).unwrap(); - let result = context.find_key("foo\0bar"); + let err = context.find_key("foo\0bar").unwrap_err(); - assert!(matches!(result, Err(GgufContextError::NulError(_)))); + assert_eq!(std::mem::discriminant(&err), nul_error_disc()); } #[test] @@ -290,7 +312,7 @@ mod tests { } struct SyntheticGgufFile { - path: std::path::PathBuf, + path: PathBuf, } impl SyntheticGgufFile { diff --git a/llama-cpp-bindings/src/json_schema_to_grammar.rs b/llama-cpp-bindings/src/json_schema_to_grammar.rs index 6949e549..558e7496 100644 --- a/llama-cpp-bindings/src/json_schema_to_grammar.rs +++ 
b/llama-cpp-bindings/src/json_schema_to_grammar.rs @@ -59,13 +59,18 @@ mod tests { #[test] fn null_byte_returns_schema_contains_nul_byte_error() { + use std::ffi::CString; + let schema = "{\x00}"; - let result = json_schema_to_grammar(schema); + let err = json_schema_to_grammar(schema).unwrap_err(); + let representative = JsonSchemaToGrammarError::SchemaContainsNulByte( + CString::new(b"a\0b".to_vec()).unwrap_err(), + ); - assert!(matches!( - result, - Err(JsonSchemaToGrammarError::SchemaContainsNulByte(_)), - )); + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) + ); } #[test] @@ -79,22 +84,28 @@ mod tests { #[test] fn invalid_json_returns_reported() { let schema = "not valid json at all"; - let result = json_schema_to_grammar(schema); + let err = json_schema_to_grammar(schema).unwrap_err(); + let representative = JsonSchemaToGrammarError::Reported { + message: String::new(), + }; - assert!(matches!( - result, - Err(JsonSchemaToGrammarError::Reported { .. }), - )); + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) + ); } #[test] fn unresolved_ref_returns_invalid_schema() { let schema = r##"{"$ref": "#/$defs/Missing"}"##; - let result = json_schema_to_grammar(schema); + let err = json_schema_to_grammar(schema).unwrap_err(); + let representative = JsonSchemaToGrammarError::InvalidSchema { + message: String::new(), + }; - assert!( - matches!(result, Err(JsonSchemaToGrammarError::InvalidSchema { .. 
})), - "expected InvalidSchema, got {result:?}", + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) ); } } diff --git a/llama-cpp-bindings/src/llama_backend.rs b/llama-cpp-bindings/src/llama_backend.rs index ff6b09f9..30d83cf0 100644 --- a/llama-cpp-bindings/src/llama_backend.rs +++ b/llama-cpp-bindings/src/llama_backend.rs @@ -160,11 +160,28 @@ mod tests { #[serial] fn double_init_returns_error() { let _backend = LlamaBackend::init().unwrap(); - let second = LlamaBackend::init(); - assert!(matches!( - second.unwrap_err(), - LlamaCppError::BackendAlreadyInitialized - )); + let second_err = LlamaBackend::init().unwrap_err(); + + assert_eq!( + std::mem::discriminant(&second_err), + std::mem::discriminant(&LlamaCppError::BackendAlreadyInitialized), + "expected BackendAlreadyInitialized, got {second_err:?}" + ); + } + + #[test] + #[serial] + fn init_numa_returns_error_when_backend_already_initialized() { + use crate::llama_backend_numa_strategy::NumaStrategy; + + let _backend = LlamaBackend::init().unwrap(); + let second_err = LlamaBackend::init_numa(NumaStrategy::Disabled).unwrap_err(); + + assert_eq!( + std::mem::discriminant(&second_err), + std::mem::discriminant(&LlamaCppError::BackendAlreadyInitialized), + "expected BackendAlreadyInitialized, got {second_err:?}" + ); } #[test] diff --git a/llama-cpp-bindings/src/llama_token_attrs.rs b/llama-cpp-bindings/src/llama_token_attrs.rs index 688d228f..872aeb4e 100644 --- a/llama-cpp-bindings/src/llama_token_attrs.rs +++ b/llama-cpp-bindings/src/llama_token_attrs.rs @@ -73,13 +73,13 @@ mod tests { #[test] fn try_from_invalid_bits_returns_error() { - let result = LlamaTokenAttrs::try_from(!0); + let err = LlamaTokenAttrs::try_from(!0).unwrap_err(); + let LlamaTokenAttrsFromIntError::UnknownValue(invalid_bits) = err; - assert!(result.is_err()); - assert!(matches!( - result.expect_err("should fail"), - LlamaTokenAttrsFromIntError::UnknownValue(_), - )); + assert!( + invalid_bits > 0, + 
"passing !0 must produce at least one unknown bit" + ); } #[test] diff --git a/llama-cpp-bindings/src/load_backends_from_path.rs b/llama-cpp-bindings/src/load_backends_from_path.rs index f94b2f90..7af9cce4 100644 --- a/llama-cpp-bindings/src/load_backends_from_path.rs +++ b/llama-cpp-bindings/src/load_backends_from_path.rs @@ -34,13 +34,19 @@ mod tests { #[test] #[cfg(unix)] fn load_backends_from_path_returns_path_null_byte_for_embedded_null() { + use std::ffi::CString; use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; let path = PathBuf::from(OsStr::from_bytes(b"/tmp/foo\0bar")); - let result = load_backends_from_path(&path); + let err = load_backends_from_path(&path).unwrap_err(); + let representative = + LoadBackendsError::PathNullByte(CString::new(b"a\0b".to_vec()).unwrap_err()); - assert!(matches!(result, Err(LoadBackendsError::PathNullByte(_)))); + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) + ); } #[test] @@ -50,8 +56,12 @@ mod tests { use std::os::unix::ffi::OsStrExt; let path = PathBuf::from(OsStr::from_bytes(b"/tmp/\xff\xfe")); - let result = load_backends_from_path(&path); + let err = load_backends_from_path(&path).unwrap_err(); + let representative = LoadBackendsError::PathNotUtf8(PathBuf::new()); - assert!(matches!(result, Err(LoadBackendsError::PathNotUtf8(_)))); + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) + ); } } diff --git a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs index aee5d5b0..9ac2705b 100644 --- a/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs +++ b/llama-cpp-bindings/src/mtmd/mtmd_input_chunks.rs @@ -177,8 +177,12 @@ mod tests { use super::MtmdEvalError; use super::check_eval_result; - let result = check_eval_result(7); + let err = check_eval_result(7).unwrap_err(); + let representative = MtmdEvalError::EvalFailed { code: 0 }; - assert!(matches!(result, Err(MtmdEvalError::EvalFailed { code: 7 
}))); + assert_eq!( + std::mem::discriminant(&err), + std::mem::discriminant(&representative) + ); } } diff --git a/llama-cpp-bindings/src/sampled_token_classifier.rs b/llama-cpp-bindings/src/sampled_token_classifier.rs index 83d0d108..5d5b0e82 100644 --- a/llama-cpp-bindings/src/sampled_token_classifier.rs +++ b/llama-cpp-bindings/src/sampled_token_classifier.rs @@ -33,12 +33,12 @@ struct PendingToken { is_held_for_probe: bool, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] struct JsonProbeState { held_text: String, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] enum ProbeMode { Idle, Active(JsonProbeState), @@ -937,10 +937,10 @@ mod tests { let outcomes = classifier.drain_overflow(); assert_eq!(outcomes.len(), 1); - assert!(matches!( - outcomes[0].sampled_token, - SampledToken::Reasoning(_) - )); + assert_eq!( + std::mem::discriminant(&outcomes[0].sampled_token), + std::mem::discriminant(&SampledToken::Reasoning(LlamaToken::new(0))) + ); assert_eq!(outcomes[0].visible_piece, ""); assert_eq!(outcomes[0].raw_piece, "k>"); @@ -1002,10 +1002,10 @@ mod tests { let outcomes = classifier.drain_overflow(); assert_eq!(outcomes.len(), 1); - assert!(matches!( - outcomes[0].sampled_token, - SampledToken::Content(_) - )); + assert_eq!( + std::mem::discriminant(&outcomes[0].sampled_token), + std::mem::discriminant(&SampledToken::Content(LlamaToken::new(0))) + ); assert_eq!(outcomes[0].visible_piece, "hi"); assert_eq!(classifier.usage().content_tokens, 1); assert_eq!(classifier.usage().reasoning_tokens, 0); @@ -1160,7 +1160,7 @@ mod tests { push_and_probe(&mut classifier, 1, "{"); - assert!(matches!(classifier.probe_mode, ProbeMode::Active(_))); + assert_ne!(classifier.probe_mode, ProbeMode::Idle); } #[test] @@ -1179,7 +1179,7 @@ mod tests { "every emitted outcome should be ToolCall, got {:?}", outcome_sections(&outcomes), ); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); 
} #[test] @@ -1197,7 +1197,7 @@ mod tests { "every emitted outcome should be Content, got {:?}", outcome_sections(&outcomes), ); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); } #[test] @@ -1337,7 +1337,7 @@ mod tests { let outcomes = feed_json_string(&mut classifier, "}}", 100); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); assert!( outcomes .iter() @@ -1358,7 +1358,7 @@ mod tests { push_and_probe(&mut classifier, 1, "{"); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); } #[test] @@ -1369,7 +1369,7 @@ mod tests { push_and_probe(&mut classifier, 1, "{"); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); } #[test] @@ -1487,7 +1487,7 @@ mod tests { push_and_probe(&mut classifier, 1, "{"); push_and_probe(&mut classifier, 2, r#""name""#); - assert!(matches!(classifier.probe_mode, ProbeMode::Active(_))); + assert_ne!(classifier.probe_mode, ProbeMode::Idle); let outcomes = classifier.flush(); @@ -1498,6 +1498,6 @@ mod tests { "mid-probe flush must release held tokens as Content, got {:?}", outcome_sections(&outcomes), ); - assert!(matches!(classifier.probe_mode, ProbeMode::Idle)); + assert_eq!(classifier.probe_mode, ProbeMode::Idle); } } diff --git a/llama-cpp-bindings/src/sampling.rs b/llama-cpp-bindings/src/sampling.rs index a390d6b7..ac1bfb5c 100644 --- a/llama-cpp-bindings/src/sampling.rs +++ b/llama-cpp-bindings/src/sampling.rs @@ -757,9 +757,28 @@ impl Drop for LlamaSampler { #[cfg(test)] mod tests { + use std::ffi::CString; + use std::mem::Discriminant; + use super::LlamaSampler; use crate::GrammarError; + fn nul_error() -> std::ffi::NulError { + CString::new(b"a\0b".to_vec()).unwrap_err() + } + + fn root_not_found_disc() -> Discriminant { + std::mem::discriminant(&GrammarError::RootNotFound) + } + + 
fn grammar_null_bytes_disc() -> Discriminant { + std::mem::discriminant(&GrammarError::GrammarNullBytes(nul_error())) + } + + fn trigger_word_null_bytes_disc() -> Discriminant { + std::mem::discriminant(&GrammarError::TriggerWordNullBytes(nul_error())) + } + #[test] fn sanitize_grammar_strings_valid() { let result = LlamaSampler::sanitize_grammar_strings("root ::= \"hello\"", "root"); @@ -769,29 +788,24 @@ mod tests { #[test] fn sanitize_grammar_strings_root_not_found() { - let result = LlamaSampler::sanitize_grammar_strings("expr ::= \"hello\"", "root"); + let err = LlamaSampler::sanitize_grammar_strings("expr ::= \"hello\"", "root").unwrap_err(); - assert!(matches!(result.err(), Some(GrammarError::RootNotFound))); + assert_eq!(std::mem::discriminant(&err), root_not_found_disc()); } #[test] fn sanitize_grammar_strings_null_byte_in_grammar() { - let result = LlamaSampler::sanitize_grammar_strings("root ::= \"\0\"", "root"); + let err = LlamaSampler::sanitize_grammar_strings("root ::= \"\0\"", "root").unwrap_err(); - assert!(matches!( - result.err(), - Some(GrammarError::GrammarNullBytes(_)) - )); + assert_eq!(std::mem::discriminant(&err), grammar_null_bytes_disc()); } #[test] fn sanitize_grammar_strings_null_byte_in_root() { - let result = LlamaSampler::sanitize_grammar_strings("ro\0ot ::= \"hello\"", "ro\0ot"); + let err = + LlamaSampler::sanitize_grammar_strings("ro\0ot ::= \"hello\"", "ro\0ot").unwrap_err(); - assert!(matches!( - result.err(), - Some(GrammarError::GrammarNullBytes(_)) - )); + assert_eq!(std::mem::discriminant(&err), grammar_null_bytes_disc()); } #[test] @@ -815,12 +829,9 @@ mod tests { #[test] fn sanitize_trigger_words_null_byte() { let words: Vec<&[u8]> = vec![b"hel\0lo"]; - let result = LlamaSampler::sanitize_trigger_words(words); + let err = LlamaSampler::sanitize_trigger_words(words).unwrap_err(); - assert!(matches!( - result.err(), - Some(GrammarError::TriggerWordNullBytes(_)) - )); + assert_eq!(std::mem::discriminant(&err), 
trigger_word_null_bytes_disc()); } #[test] @@ -844,12 +855,9 @@ mod tests { #[test] fn sanitize_trigger_patterns_null_byte() { let patterns = vec!["hel\0lo".to_string()]; - let result = LlamaSampler::sanitize_trigger_patterns(&patterns); + let err = LlamaSampler::sanitize_trigger_patterns(&patterns).unwrap_err(); - assert!(matches!( - result.err(), - Some(GrammarError::GrammarNullBytes(_)) - )); + assert_eq!(std::mem::discriminant(&err), grammar_null_bytes_disc()); } #[test] @@ -981,14 +989,16 @@ mod tests { #[test] fn check_sampler_accept_status_exception_maps_to_typed_variant() { - let result = super::check_sampler_accept_status( + let err = super::check_sampler_accept_status( llama_cpp_bindings_sys::LLAMA_RS_SAMPLER_ACCEPT_VENDORED_THREW_CXX_EXCEPTION, std::ptr::null_mut(), - ); - - assert!(matches!( - result, - Err(crate::SamplerAcceptError::GrammarStateCorrupted { .. }) - )); + ) + .unwrap_err(); + let grammar_state_corrupted_disc = + std::mem::discriminant(&crate::SamplerAcceptError::GrammarStateCorrupted { + message: String::new(), + }); + + assert_eq!(std::mem::discriminant(&err), grammar_state_corrupted_disc); } } diff --git a/llama-cpp-bindings/src/streaming_json_probe.rs b/llama-cpp-bindings/src/streaming_json_probe.rs index 388b06fb..3560be7b 100644 --- a/llama-cpp-bindings/src/streaming_json_probe.rs +++ b/llama-cpp-bindings/src/streaming_json_probe.rs @@ -446,4 +446,14 @@ mod tests { JsonProbeOutcome::Failed, ); } + + #[test] + fn syntactically_malformed_object_is_failed() { + // Input starts with `{` (passes the cheap prefix check) but cannot parse — the syntax + // error path classifies as `Category::Syntax`, surfacing the `Failed` arm. 
+ assert_eq!( + JsonProbeOutcome::validate_prefix("{,}"), + JsonProbeOutcome::Failed, + ); + } } diff --git a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs index 0020c90a..2ed0cd89 100644 --- a/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs +++ b/llama-cpp-bindings/src/tool_call_format/bracketed_args.rs @@ -199,14 +199,59 @@ mod tests { ); let failure = result.expect_err("malformed JSON must produce a typed failure"); - match failure { - BracketedArgsFailure::InvalidJsonArguments { tool_name, .. } => { - assert_eq!(tool_name, "get_weather"); - } - other @ BracketedArgsFailure::UnterminatedArguments { .. } => { - panic!("expected InvalidJsonArguments, got {other:?}") - } - } + let BracketedArgsFailure::InvalidJsonArguments { tool_name, .. } = failure else { + unreachable!("input was syntactically malformed JSON, never truncated") + }; + + assert_eq!(tool_name, "get_weather"); + } + + #[test] + fn rejects_truncated_json_arguments_with_unterminated_failure() { + // serde_json's iterator returns None when the deserializer has no token to start from. + // Constructing such an input requires whitespace-only input after the separator — the + // iterator finds nothing parseable and yields None, surfacing the Unterminated arm. + let failure = parse( + "[TOOL_CALLS]get_weather[ARGS] ", + &mistral3_markers(), + &mistral3_shape(), + ) + .expect_err("truncated arguments must produce a typed failure"); + let BracketedArgsFailure::UnterminatedArguments { tool_name } = failure else { + unreachable!("input had only whitespace after [ARGS]; iterator yields None") + }; + + assert_eq!(tool_name, "get_weather"); + } + + #[test] + fn returns_empty_vec_for_separator_with_only_whitespace_name() { + // `get_weather` is replaced with whitespace before the separator, so `name.trim()` is + // empty and the parser returns `ParseStep::Done` — covers the empty-name early return. 
+ let parsed = parse( + "[TOOL_CALLS] [ARGS]{\"x\":1}", + &mistral3_markers(), + &mistral3_shape(), + ) + .expect("whitespace-name input must parse"); + + assert!(parsed.is_empty()); + } + + #[test] + fn returns_empty_vec_when_shape_has_empty_separator() { + // When `name_args_separator` is empty, `parse` short-circuits to `Vec::new()` — + // covers the early-return guard. + let mut shape = mistral3_shape(); + shape.name_args_separator.clear(); + let parsed = parse( + "[TOOL_CALLS]get_weather[ARGS]{\"x\":1}", + &mistral3_markers(), + &shape, + ) + .expect("empty-separator shape must parse"); + + assert!(parsed.is_empty()); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_format/json_object.rs b/llama-cpp-bindings/src/tool_call_format/json_object.rs index 08633d72..c9038152 100644 --- a/llama-cpp-bindings/src/tool_call_format/json_object.rs +++ b/llama-cpp-bindings/src/tool_call_format/json_object.rs @@ -179,14 +179,20 @@ mod tests { #[test] fn returns_failure_for_malformed_json() { - let result = parse(r#"{"name": "f", "arguments": {"a": }"#, &qwen3_shape()); + let err = parse(r#"{"name": "f", "arguments": {"a": }"#, &qwen3_shape()).unwrap_err(); + let JsonObjectFailure::InvalidJson { message } = err; - match result { - Err(JsonObjectFailure::InvalidJson { message }) => { - assert!(!message.is_empty()); - } - other => panic!("expected InvalidJson, got {other:?}"), - } + assert!(!message.is_empty()); + } + + #[test] + fn returns_empty_when_object_is_not_a_tool_call_shape() { + // The body opens with `{` (so try_parse_one_object enters the JSON path) but the parsed + // value is a top-level non-object — the early `let Value::Object(map) = value else + // { return Ok(None) };` arm fires. 
+ let parsed = parse("{ \"foo\": 1 }", &qwen3_shape()).expect("must parse"); + + assert!(parsed.is_empty()); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/detect.rs b/llama-cpp-bindings/src/tool_call_template_overrides/detect.rs index 9dab2cdc..6ee29061 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/detect.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/detect.rs @@ -22,8 +22,9 @@ pub fn detect(template: &str) -> Option { #[cfg(test)] mod tests { - use llama_cpp_bindings_types::ToolCallArgsShape; - + use super::Gemma4CallBlockOverride; + use super::Mistral3ArrowArgsOverride; + use super::QwenXmlTagsOverride; use super::detect; #[test] @@ -31,11 +32,7 @@ mod tests { let template = "{{- '<|tool_call>call:' + function['name'] + '{' -}}"; let markers = detect(template).expect("must dispatch to Gemma 4"); - assert_eq!(markers.open, "<|tool_call>call:"); - assert!(matches!( - markers.args_shape, - ToolCallArgsShape::PairedQuote(_) - )); + assert_eq!(markers, Gemma4CallBlockOverride::markers()); } #[test] @@ -43,11 +40,7 @@ mod tests { let template = "{{- name + '[ARGS]' + arguments }}"; let markers = detect(template).expect("must dispatch to Mistral 3"); - assert_eq!(markers.open, "[TOOL_CALLS]"); - assert!(matches!( - markers.args_shape, - ToolCallArgsShape::BracketedJson(_) - )); + assert_eq!(markers, Mistral3ArrowArgsOverride::markers()); } #[test] @@ -55,8 +48,7 @@ mod tests { let template = "{{- '\\n\\n' }}"; let markers = detect(template).expect("must dispatch to Qwen XML tags"); - assert_eq!(markers.open, ""); - assert!(matches!(markers.args_shape, ToolCallArgsShape::XmlTags(_))); + assert_eq!(markers, QwenXmlTagsOverride::markers()); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/gemma4_call_block.rs b/llama-cpp-bindings/src/tool_call_template_overrides/gemma4_call_block.rs index f09a7b42..2f206d99 100644 --- 
a/llama-cpp-bindings/src/tool_call_template_overrides/gemma4_call_block.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/gemma4_call_block.rs @@ -40,18 +40,25 @@ mod tests { #[test] fn detects_gemma4_template_with_tool_call_call_literal() { + use llama_cpp_bindings_types::PairedQuoteShape; + use llama_cpp_bindings_types::ToolCallValueQuote; + let template = "...{{- '<|tool_call>call:' + function['name'] + '{' -}}..."; let markers = Gemma4CallBlockOverride::detect(template).expect("Gemma 4 template must be detected"); assert_eq!(markers.open, "<|tool_call>call:"); assert_eq!(markers.close, "}"); - let ToolCallArgsShape::PairedQuote(shape) = markers.args_shape else { - panic!("expected PairedQuote variant, got {:?}", markers.args_shape); - }; - assert_eq!(shape.name_args_separator, "{"); - assert_eq!(shape.value_quote.open, "<|\"|>"); - assert_eq!(shape.value_quote.close, "<|\"|>"); + assert_eq!( + markers.args_shape, + ToolCallArgsShape::PairedQuote(PairedQuoteShape { + name_args_separator: "{".to_owned(), + value_quote: ToolCallValueQuote { + open: "<|\"|>".to_owned(), + close: "<|\"|>".to_owned(), + }, + }) + ); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/glm47_key_value_tags.rs b/llama-cpp-bindings/src/tool_call_template_overrides/glm47_key_value_tags.rs index 73373472..c10ae862 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/glm47_key_value_tags.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/glm47_key_value_tags.rs @@ -32,6 +32,7 @@ impl Glm47KeyValueTagsOverride { #[cfg(test)] mod tests { + use llama_cpp_bindings_types::KeyValueXmlTagsShape; use llama_cpp_bindings_types::ToolCallArgsShape; use super::Glm47KeyValueTagsOverride; @@ -44,16 +45,15 @@ mod tests { assert_eq!(markers.open, ""); assert_eq!(markers.close, ""); - let ToolCallArgsShape::KeyValueXmlTags(shape) = markers.args_shape else { - panic!( - "expected KeyValueXmlTags variant, got {:?}", - markers.args_shape - ); - }; - 
assert_eq!(shape.key_open, ""); - assert_eq!(shape.key_close, ""); - assert_eq!(shape.value_open, ""); - assert_eq!(shape.value_close, ""); + assert_eq!( + markers.args_shape, + ToolCallArgsShape::KeyValueXmlTags(KeyValueXmlTagsShape { + key_open: "".to_owned(), + key_close: "".to_owned(), + value_open: "".to_owned(), + value_close: "".to_owned(), + }) + ); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/known_marker_candidates.rs b/llama-cpp-bindings/src/tool_call_template_overrides/known_marker_candidates.rs index 9448c866..735836f6 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/known_marker_candidates.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/known_marker_candidates.rs @@ -28,12 +28,7 @@ mod tests { #[test] fn known_marker_candidates_returns_one_per_registered_shape() { let candidates = known_marker_candidates(); - assert_eq!( - candidates.len(), - 5, - "expected exactly five registered shapes, got {}", - candidates.len() - ); + assert_eq!(candidates.len(), 5); let shape_discriminants: HashSet<&'static str> = candidates .iter() diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/mistral3_arrow_args.rs b/llama-cpp-bindings/src/tool_call_template_overrides/mistral3_arrow_args.rs index 3337a120..5b41f827 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/mistral3_arrow_args.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/mistral3_arrow_args.rs @@ -29,6 +29,7 @@ impl Mistral3ArrowArgsOverride { #[cfg(test)] mod tests { + use llama_cpp_bindings_types::BracketedJsonShape; use llama_cpp_bindings_types::ToolCallArgsShape; use super::Mistral3ArrowArgsOverride; @@ -41,13 +42,12 @@ mod tests { assert_eq!(markers.open, "[TOOL_CALLS]"); assert!(markers.close.is_empty()); - let ToolCallArgsShape::BracketedJson(shape) = markers.args_shape else { - panic!( - "expected BracketedJson variant, got {:?}", - markers.args_shape - ); - }; - assert_eq!(shape.name_args_separator, 
"[ARGS]"); + assert_eq!( + markers.args_shape, + ToolCallArgsShape::BracketedJson(BracketedJsonShape { + name_args_separator: "[ARGS]".to_owned(), + }) + ); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/qwen3_json_inside_tool_call.rs b/llama-cpp-bindings/src/tool_call_template_overrides/qwen3_json_inside_tool_call.rs index 7ac4bda6..65909d83 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/qwen3_json_inside_tool_call.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/qwen3_json_inside_tool_call.rs @@ -34,6 +34,7 @@ impl Qwen3JsonInsideToolCallOverride { #[cfg(test)] mod tests { + use llama_cpp_bindings_types::JsonObjectShape; use llama_cpp_bindings_types::ToolCallArgsShape; use super::Qwen3JsonInsideToolCallOverride; @@ -46,11 +47,13 @@ mod tests { assert_eq!(markers.open, ""); assert_eq!(markers.close, ""); - let ToolCallArgsShape::JsonObject(shape) = markers.args_shape else { - panic!("expected JsonObject variant, got {:?}", markers.args_shape); - }; - assert_eq!(shape.name_field, "name"); - assert_eq!(shape.arguments_field, "arguments"); + assert_eq!( + markers.args_shape, + ToolCallArgsShape::JsonObject(JsonObjectShape { + name_field: "name".to_owned(), + arguments_field: "arguments".to_owned(), + }) + ); } #[test] diff --git a/llama-cpp-bindings/src/tool_call_template_overrides/qwen_xml_tags.rs b/llama-cpp-bindings/src/tool_call_template_overrides/qwen_xml_tags.rs index b0d013fe..f2ca276f 100644 --- a/llama-cpp-bindings/src/tool_call_template_overrides/qwen_xml_tags.rs +++ b/llama-cpp-bindings/src/tool_call_template_overrides/qwen_xml_tags.rs @@ -33,6 +33,7 @@ impl QwenXmlTagsOverride { #[cfg(test)] mod tests { use llama_cpp_bindings_types::ToolCallArgsShape; + use llama_cpp_bindings_types::XmlTagsShape; use super::QwenXmlTagsOverride; @@ -44,13 +45,15 @@ mod tests { assert_eq!(markers.open, ""); assert_eq!(markers.close, ""); - let ToolCallArgsShape::XmlTags(shape) = markers.args_shape else { - 
panic!("expected XmlTags variant, got {:?}", markers.args_shape); - }; - assert_eq!(shape.function_open_prefix, ""); - assert_eq!(shape.parameter_open_prefix, ""); + assert_eq!( + markers.args_shape, + ToolCallArgsShape::XmlTags(XmlTagsShape { + function_open_prefix: "".to_owned(), + parameter_open_prefix: "".to_owned(), + }) + ); } #[test] diff --git a/llama-cpp-test-harness-macros/src/parsed_source.rs b/llama-cpp-test-harness-macros/src/parsed_source.rs index 66f7669a..bba3fa03 100644 --- a/llama-cpp-test-harness-macros/src/parsed_source.rs +++ b/llama-cpp-test-harness-macros/src/parsed_source.rs @@ -125,7 +125,10 @@ mod tests { fn parses_local_path_with_one_string_arg() { let parsed = parse("LocalPath(\"/abs/local.gguf\")").expect("valid"); - assert_eq!(parsed, ParsedSource::LocalPath("/abs/local.gguf".to_owned())); + assert_eq!( + parsed, + ParsedSource::LocalPath("/abs/local.gguf".to_owned()) + ); } #[test] @@ -198,7 +201,10 @@ mod tests { fn unparseable_input_returns_err() { let result = parse("@&^!"); - assert!(result.is_err(), "garbage input must fail to parse as syn::Expr"); + assert!( + result.is_err(), + "garbage input must fail to parse as syn::Expr" + ); } #[test] diff --git a/llama-cpp-test-harness/src/mmproj_source.rs b/llama-cpp-test-harness/src/mmproj_source.rs index ff4bf18f..e33fa0c4 100644 --- a/llama-cpp-test-harness/src/mmproj_source.rs +++ b/llama-cpp-test-harness/src/mmproj_source.rs @@ -44,7 +44,9 @@ mod tests { fn resolve_path_for_local_path_returns_the_literal_path() { let source = MmprojSource::LocalPath("/abs/mmproj.gguf"); - let resolved = source.resolve_path().expect("LocalPath resolve is infallible"); + let resolved = source + .resolve_path() + .expect("LocalPath resolve is infallible"); assert_eq!(resolved, PathBuf::from("/abs/mmproj.gguf")); } diff --git a/llama-cpp-test-harness/src/model_source.rs b/llama-cpp-test-harness/src/model_source.rs index b937e9c7..c29d9205 100644 --- a/llama-cpp-test-harness/src/model_source.rs +++ 
b/llama-cpp-test-harness/src/model_source.rs @@ -49,7 +49,9 @@ mod tests { fn resolve_path_for_local_path_returns_the_literal_path() { let source = ModelSource::LocalPath("/abs/example.gguf"); - let resolved = source.resolve_path().expect("LocalPath resolve is infallible"); + let resolved = source + .resolve_path() + .expect("LocalPath resolve is infallible"); assert_eq!(resolved, PathBuf::from("/abs/example.gguf")); } From e8a895bd1e0e0787d5fb5e78f65aae0f054b27bc Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Sat, 23 May 2026 17:49:48 +0200 Subject: [PATCH 3/9] bump toolchain to 1.95.0, simplify CI install, address surfaced clippy lints --- .github/actions/install-rust-toolchain/action.yml | 8 ++++---- llama-cpp-bindings-tests/tests/model_sampling.rs | 4 +--- llama-cpp-bindings-tests/tests/multimodal.rs | 4 +--- llama-cpp-bindings/src/sampled_token_classifier.rs | 5 +---- llama-cpp-bindings/src/timing.rs | 2 +- rust-toolchain.toml | 2 +- 6 files changed, 9 insertions(+), 16 deletions(-) diff --git a/.github/actions/install-rust-toolchain/action.yml b/.github/actions/install-rust-toolchain/action.yml index 124e13b4..3786a69f 100644 --- a/.github/actions/install-rust-toolchain/action.yml +++ b/.github/actions/install-rust-toolchain/action.yml @@ -1,11 +1,11 @@ name: install-rust-toolchain -description: Install the pinned stable Rust toolchain (with rustfmt and clippy) and configure the cargo build cache. +description: Install the toolchain pinned by rust-toolchain.toml and configure the cargo build cache. 
runs: using: composite steps: - - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable - with: - components: rustfmt, clippy + - name: Install toolchain pinned by rust-toolchain.toml + shell: bash + run: cargo --version - uses: Swatinem/rust-cache@v2 diff --git a/llama-cpp-bindings-tests/tests/model_sampling.rs b/llama-cpp-bindings-tests/tests/model_sampling.rs index 97e1326b..d6b40ba4 100644 --- a/llama-cpp-bindings-tests/tests/model_sampling.rs +++ b/llama-cpp-bindings-tests/tests/model_sampling.rs @@ -416,9 +416,8 @@ fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) - let mut classifier = model.sampled_token_classifier(); let mut sampled_count: u64 = 0; - let mut position = batch.n_tokens(); - for _ in 0..5 { + for (position, _) in (batch.n_tokens()..).zip(0..5) { let (raw_token, _outcomes) = classifier.sample(&mut sampler, &context, -1)?; let raw_as_sampled = SampledToken::Content(raw_token); @@ -430,7 +429,6 @@ fn sample_without_grammar_produces_multiple_tokens(fixture: &LlamaFixture<'_>) - batch.clear(); batch.add(&raw_as_sampled, position, &[0], true)?; - position += 1; context.decode(&mut batch)?; } diff --git a/llama-cpp-bindings-tests/tests/multimodal.rs b/llama-cpp-bindings-tests/tests/multimodal.rs index efd07c35..c1108c4d 100644 --- a/llama-cpp-bindings-tests/tests/multimodal.rs +++ b/llama-cpp-bindings-tests/tests/multimodal.rs @@ -68,9 +68,8 @@ fn drive_sampling_loop( observed_reasoning: 0, }; let mut batch = LlamaBatch::new(512, 1)?; - let mut current_position = starting_position; - for _ in 0..max_tokens { + for (current_position, _) in (starting_position..).zip(0..max_tokens) { let (raw_token, outcomes) = classifier.sample(&mut sampler, ctx, -1)?; for outcome in &outcomes { totals.generated.push_str(&outcome.raw_piece); @@ -88,7 +87,6 @@ fn drive_sampling_loop( batch.clear(); batch.add(&raw_as_sampled, current_position, &[0], true)?; - current_position += 1; ctx.decode(&mut batch) 
.with_context(|| "failed to decode generated token")?; diff --git a/llama-cpp-bindings/src/sampled_token_classifier.rs b/llama-cpp-bindings/src/sampled_token_classifier.rs index 5d5b0e82..aae24fc3 100644 --- a/llama-cpp-bindings/src/sampled_token_classifier.rs +++ b/llama-cpp-bindings/src/sampled_token_classifier.rs @@ -254,10 +254,7 @@ impl<'model> SampledTokenClassifier<'model> { let lookback = self.markers.max_token_len().saturating_sub(1); let mut outcomes = Vec::new(); - loop { - let Some(front) = self.pending.front() else { - break; - }; + while let Some(front) = self.pending.front() { if front.is_held_for_probe { break; } diff --git a/llama-cpp-bindings/src/timing.rs b/llama-cpp-bindings/src/timing.rs index a4e02966..5c07eab8 100644 --- a/llama-cpp-bindings/src/timing.rs +++ b/llama-cpp-bindings/src/timing.rs @@ -138,7 +138,7 @@ fn write_timings(timings: &LlamaTimings, writer: &mut dyn std::fmt::Write) -> st 1e3 / timings.t_eval_ms() * f64::from(timings.n_eval()) )?; } else { - writeln!(writer, "eval time = {:.2} ms / 0 runs", timings.t_eval_ms(),)?; + writeln!(writer, "eval time = {:.2} ms / 0 runs", timings.t_eval_ms())?; } Ok(()) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index fb5449af..38ab2c6b 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.93.0" +channel = "1.95.0" components = ["clippy", "rustfmt"] From 2f0afc9c20e4b5bd6073b355dd77a8253f5feb52 Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Sat, 23 May 2026 18:08:42 +0200 Subject: [PATCH 4/9] stop bindgen from leaking libc FILE internals into the FFI surface --- llama-cpp-bindings-build/src/bindgen_config.rs | 7 +++++++ llama-cpp-bindings-sys/src/lib.rs | 16 ---------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/llama-cpp-bindings-build/src/bindgen_config.rs b/llama-cpp-bindings-build/src/bindgen_config.rs index 85b9810c..a2d249de 100644 --- a/llama-cpp-bindings-build/src/bindgen_config.rs +++ 
b/llama-cpp-bindings-build/src/bindgen_config.rs @@ -53,6 +53,13 @@ fn create_base_builder(llama_src: &Path) -> bindgen::Builder { .allowlist_type("llama_rs_.*") .allowlist_function("mtmd_.*") .allowlist_type("mtmd_.*") + .blocklist_function("ggml_fopen") + .blocklist_function("gguf_init_from_file_ptr") + .blocklist_function("gguf_write_to_file_ptr") + .blocklist_function("llama_model_load_from_file_ptr") + .blocklist_type("FILE") + .blocklist_type("_IO_.*") + .blocklist_type("__BindgenBitfieldUnit") .prepend_enum_name(false) } diff --git a/llama-cpp-bindings-sys/src/lib.rs b/llama-cpp-bindings-sys/src/lib.rs index 898d1d22..e3dbbeba 100644 --- a/llama-cpp-bindings-sys/src/lib.rs +++ b/llama-cpp-bindings-sys/src/lib.rs @@ -4,25 +4,9 @@ non_camel_case_types, reason = "bindgen emits C struct and enum names verbatim and they don't follow Rust naming" )] -#![expect( - non_snake_case, - reason = "bindgen emits C function names verbatim and they don't always follow Rust naming" -)] #![expect( unpredictable_function_pointer_comparisons, reason = "bindgen-generated FFI function pointers are opaque and the lint cannot reason about them" )] -#![expect( - unnecessary_transmutes, - reason = "bindgen generates transmutes to bridge between C and Rust integer/enum representations" -)] -#![expect( - clippy::missing_safety_doc, - reason = "bindgen emits raw FFI declarations; safety contracts live on the wrapper API in llama-cpp-bindings" -)] -#![expect( - clippy::ptr_offset_with_cast, - reason = "bindgen emits standard FFI pointer-arithmetic patterns that this lint flags" -)] include!(concat!(env!("OUT_DIR"), "/bindings.rs")); From a4a06b7691c67fbca691bdfa56d78841119bcdcc Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Mon, 25 May 2026 12:32:30 +0200 Subject: [PATCH 5/9] block MSVC _iobuf so bindgen does not leak its _Placeholder field on Windows --- llama-cpp-bindings-build/src/bindgen_config.rs | 1 + 1 file changed, 1 insertion(+) diff --git 
a/llama-cpp-bindings-build/src/bindgen_config.rs b/llama-cpp-bindings-build/src/bindgen_config.rs index a2d249de..549e0bd8 100644 --- a/llama-cpp-bindings-build/src/bindgen_config.rs +++ b/llama-cpp-bindings-build/src/bindgen_config.rs @@ -59,6 +59,7 @@ fn create_base_builder(llama_src: &Path) -> bindgen::Builder { .blocklist_function("llama_model_load_from_file_ptr") .blocklist_type("FILE") .blocklist_type("_IO_.*") + .blocklist_type("_iobuf") .blocklist_type("__BindgenBitfieldUnit") .prepend_enum_name(false) } From 125b63a33c4c309842a574ffd44b3590c70675bf Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Mon, 25 May 2026 12:47:00 +0200 Subject: [PATCH 6/9] gate path_to_str_error_disc helper to unix to match its only caller --- llama-cpp-bindings/src/gguf_context.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/llama-cpp-bindings/src/gguf_context.rs b/llama-cpp-bindings/src/gguf_context.rs index 45a9b6c0..7ef7114c 100644 --- a/llama-cpp-bindings/src/gguf_context.rs +++ b/llama-cpp-bindings/src/gguf_context.rs @@ -197,6 +197,7 @@ mod tests { std::mem::discriminant(&GgufContextError::NulError(nul_err)) } + #[cfg(unix)] fn path_to_str_error_disc() -> Discriminant { std::mem::discriminant(&GgufContextError::PathToStrError(PathBuf::new())) } From 8a4b360dc6f26beadb09bc14f3da8ecbc05a1b29 Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Mon, 25 May 2026 13:29:06 +0200 Subject: [PATCH 7/9] move harness self-test out of CI by shifting test.harness from test.unit to test.llms --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 57ee7a3e..573a24c9 100644 --- a/Makefile +++ b/Makefile @@ -58,9 +58,9 @@ test.harness: clippy cargo test -p llama-cpp-test-harness-macros -p llama-cpp-test-harness $(DEVICE_FEATURE) .PHONY: test.llms -test.llms: clippy +test.llms: clippy test.harness cargo test --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) .PHONY: test.unit -test.unit: clippy test.harness 
+test.unit: clippy cargo test -p llama-cpp-log-decoder -p llama-cpp-bindings $(DEVICE_FEATURE) From 160483c09c09cc0fe4f879a9be36b90ae5e1a43f Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Mon, 25 May 2026 14:49:00 +0200 Subject: [PATCH 8/9] include unit tests in test.llms so it runs the full suite --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 573a24c9..10c4e4c8 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ test.harness: clippy cargo test -p llama-cpp-test-harness-macros -p llama-cpp-test-harness $(DEVICE_FEATURE) .PHONY: test.llms -test.llms: clippy test.harness +test.llms: clippy test.harness test.unit cargo test --no-fail-fast -p llama-cpp-bindings-tests $(DEVICE_FEATURE) .PHONY: test.unit From e9dcecbf0890e3675668b3aac74d5fa90ebd4d2e Mon Sep 17 00:00:00 2001 From: Mateusz Charytoniuk Date: Mon, 25 May 2026 14:49:05 +0200 Subject: [PATCH 9/9] simplify run-all-tests skill and add run-coverage skill --- .claude/skills/run-all-tests/SKILL.md | 24 +------------- .claude/skills/run-coverage/SKILL.md | 47 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 23 deletions(-) create mode 100644 .claude/skills/run-coverage/SKILL.md diff --git a/.claude/skills/run-all-tests/SKILL.md b/.claude/skills/run-all-tests/SKILL.md index cbf1584e..72e32efe 100644 --- a/.claude/skills/run-all-tests/SKILL.md +++ b/.claude/skills/run-all-tests/SKILL.md @@ -26,19 +26,6 @@ echo "Device: $DEVICE" ## Step 2: run the suites -Sequentially, from the workspace root. - -Copy this checklist and tick each item as the suite completes: - -``` -Test progress: -- [ ] make test.unit -- [ ] make test.qwen3.5_0.8B -- [ ] make test.qwen3.6_35b_a3b -- [ ] make test.glm4_7_flash -- [ ] make test.deepseek_r1_distill_llama_8b -``` - Translate `$DEVICE` into the value the Makefile expects. 
`TEST_DEVICE` holds **only** the backend name (`cuda` / `metal` / `vulkan` / `rocm`), or empty for CPU since there is no `cpu` feature: ```bash @@ -48,20 +35,11 @@ Translate `$DEVICE` into the value the Makefile expects. `TEST_DEVICE` holds **o Then run exactly: ```bash -make test.unit TEST_DEVICE="$FEAT" -make test.qwen3.5_0.8B TEST_DEVICE="$FEAT" -make test.qwen3.6_35b_a3b TEST_DEVICE="$FEAT" -make test.glm4_7_flash TEST_DEVICE="$FEAT" -make test.deepseek_r1_distill_llama_8b TEST_DEVICE="$FEAT" +make test.llms TEST_DEVICE="$FEAT" ``` -The Makefile's `$(if $(TEST_DEVICE),--features $(TEST_DEVICE),)` already skips the `--features` flag when `$FEAT` is empty, so the CPU path needs no further special-casing. - -Do not run `make test.llms` or `make test`. Those bundle every LLM suite into one cargo invocation, which loses per-suite failure attribution and breaks the checklist above. - ## Step 3: rules during the run -- **Serialize GPU suites.** When `$DEVICE` is `cuda` or `metal`, run test suites sequentially to avoid device contention. - **Per-test 30 s budget.** Flag any individual test that exceeds 30 s wall-clock. That is a real bug — production or test — not flakiness. ## Step 4: report diff --git a/.claude/skills/run-coverage/SKILL.md b/.claude/skills/run-coverage/SKILL.md new file mode 100644 index 00000000..908603a3 --- /dev/null +++ b/.claude/skills/run-coverage/SKILL.md @@ -0,0 +1,47 @@ +--- +name: run-coverage +description: Runs code coverage checker on the fastest available device. Use when the user asks to run the coverage, or to check the code coverage. +--- + +# Checking the code coverage + +Run every instrumented test suite in the workspace, picking the fastest compiled device backend for the host, then make sure everything is within required limits. + +Makefile is the source of truth for the gated values, and the code coverage setup. 
+ +## Step 1: detect the device + +Run this once at the start and echo the chosen device: + +```bash +if [[ "$OSTYPE" == "darwin"* ]]; then + DEVICE=metal +elif command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then + DEVICE=cuda +else + DEVICE=cpu +fi +echo "Device: $DEVICE" +``` + +`$DEVICE` selects the backend feature for every suite in Step 2, including `test.unit`. Passing the same device through every target keeps the cmake hash stable, so llama.cpp is compiled once and reused across all suites. + +## Step 2: run the suites + +Translate `$DEVICE` into the value the Makefile expects. `TEST_DEVICE` holds **only** the backend name (`cuda` / `metal` / `vulkan` / `rocm`), or empty for CPU since there is no `cpu` feature: + +```bash +[ "$DEVICE" = "cpu" ] && FEAT= || FEAT="$DEVICE" +``` + +Then run exactly: + +```bash +make coverage TEST_DEVICE="$FEAT" +``` + +## Step 3: report + +After all suites finish, sum up the results in an actionable report. Make sure all code coverage gates are met. + +