diff --git a/Cargo.lock b/Cargo.lock
index a1e508d94..37ed6d833 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -98,7 +98,7 @@ dependencies = [
  "mt-air",
  "mt-fiat-shamir",
  "mt-field",
- "mt-koala-bear",
+ "mt-goldilocks",
  "mt-poly",
  "mt-sumcheck",
  "mt-symetric",
@@ -532,7 +532,6 @@ dependencies = [
  "rand",
  "rec_aggregation",
  "serde",
- "serde_json",
  "sub_protocols",
  "tracing",
  "utils",
@@ -617,7 +616,7 @@ name = "mt-fiat-shamir"
 version = "0.1.0"
 dependencies = [
  "mt-field",
- "mt-koala-bear",
+ "mt-goldilocks",
  "mt-symetric",
  "mt-utils",
  "rayon",
@@ -639,6 +638,21 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "mt-goldilocks"
+version = "0.1.0"
+dependencies = [
+ "itertools",
+ "mt-field",
+ "mt-utils",
+ "num-bigint",
+ "paste",
+ "rand",
+ "rayon",
+ "serde",
+ "tracing",
+]
+
 [[package]]
 name = "mt-koala-bear"
 version = "0.1.0"
@@ -660,7 +674,7 @@ version = "0.1.0"
 dependencies = [
  "itertools",
  "mt-field",
- "mt-koala-bear",
+ "mt-goldilocks",
  "mt-utils",
  "rand",
  "rayon",
@@ -686,7 +700,7 @@ name = "mt-symetric"
 version = "0.1.0"
 dependencies = [
  "mt-field",
- "mt-koala-bear",
+ "mt-goldilocks",
  "rayon",
 ]
 
@@ -704,7 +718,7 @@ dependencies = [
  "itertools",
  "mt-fiat-shamir",
  "mt-field",
- "mt-koala-bear",
+ "mt-goldilocks",
  "mt-poly",
  "mt-sumcheck",
  "mt-symetric",
diff --git a/Cargo.toml b/Cargo.toml
index f8e2ada76..8bc19c7d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,7 +12,7 @@ members = [
     "crates/*",
     "crates/backend/utils",
     "crates/backend/field",
-    "crates/backend/koala-bear",
+    "crates/backend/goldilocks",
     "crates/backend/poly",
     "crates/backend/symetric",
     "crates/backend/air",
diff --git a/README.md b/README.md
index b945bd705..c80e9458b 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,6 @@ Minimal hash-based zkVM, for a Post-Quantum Ethereum.
 <p align="center">
   <a href="https://github.com/leanEthereum/leanVM/releases/download/spec-latest/minimal_zkVM.pdf"><img src="https://img.shields.io/badge/Documentation-blue?style=for-the-badge&logo=data:image/svg%2bxml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyNCAyNCIgZmlsbD0id2hpdGUiPjxwYXRoIGQ9Ik0xNCAySDZjLTEuMSAwLTIgLjktMiAydjE2YzAgMS4xLjg5IDIgMS45OSAySDE4YzEuMSAwIDItLjkgMi0yVjhsLTYtNnpNOC41IDE0LjVoMS4yNWMuOTcgMCAxLjc1LS43OCAxLjc1LTEuNzVTMTAuNzIgMTEgOS43NSAxMUg3LjV2Nmgxdi0yLjV6bTAtMVYxMmgxLjI1Yy40MSAwIC43NS4zNC43NS43NXMtLjM0Ljc1LS43NS43NUg4LjV6bTUuNSAzLjVoMnYtMWgtMnYtMWgydi0xaC0ydi0xLjVjMC0uMjguMjItLjUuNS0uNUgxN3YtMWgtMmMtLjgzIDAtMS41LjY3LTEuNSAxLjVWMTd6TTEzIDlWMy41TDE4LjUgOUgxM3oiLz48L3N2Zz4=" alt="Documentation"></a>
   <a href="crates/lean_compiler/zkDSL.md"><img src="https://img.shields.io/badge/zkDSL%20reference-7c3aed?style=for-the-badge&logo=markdown&logoColor=white" alt="zkDSL reference"></a>
-  <a href="crates/lean_prover/python-verifier/verifier.py"><img src="https://img.shields.io/badge/Python%20verifier-d97706?style=for-the-badge&logo=python&logoColor=white" alt="Python verifier"></a>
 </p>
 
 ## Proving System
@@ -44,7 +43,7 @@ cargo run --release -- xmss --n-signatures 1550 --log-inv-rate 1
 
 ### Recursion
 
-Aggregating together n previously aggregated signatures, each containing 700 XMSS.
+Aggregating together n previously aggregated signatures, each containing 775 XMSS.
 
 
 ```bash
@@ -81,13 +80,11 @@ cargo run --release -- fancy-aggregation
 
 ### snark
 
-≈ 124 bits of provable security, given by Johnson bound + degree 5 extension of koala-bear. (128 bits requires bigger hash digests (8 koalabears ≈ 248 bits) -> TODO). In the benchmarks, we also display performance with conjectured security, even though leanVM targets the proven regime by default.
+≈ 128 bits of provable security, given by Johnson bound + degree 3 extension of goldilocks (with hash digests of 4 goldilocks ≈ 256 bits). In the benchmarks, we also display performance with conjectured security, even though leanVM targets the proven regime by default.
 
 ### XMSS
 
-Currently, we use an [XMSS](crates/xmss/xmss.md) with hash digests of 4 field elements ≈ 124 bits. Tweaks and public parameters ensure domain separation. An analysis in the ROM (resp. QROM), inspired by the section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103) would lead to ≈ 124 (resp. 62) bits of classical (resp. quantum) security. Going to 128 / 64 bits of classical / quantum security, i.e. NIST level 1 (in the ROM/QROM), is an ongoing effort. It requires either:
-- hash digests of 5 field elements (drawback: we need to double the hash chain length from 8 to 16 if we want to stay below one IPv6 MTU = 1280 bytes)
-- a new prime, close to 32 bits (typically p = 125.2^25 + 1) or 64 bits ([goldilocks](https://2π.com/22/goldilocks/))
+Currently, we use an [XMSS](crates/xmss/xmss.md) with hash digests of 2 Goldilocks field elements ≈ 128 bits. Tweaks and public parameters ensure domain separation. An analysis in the ROM (resp. QROM), inspired by Section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103), would lead to ≈ 128 (resp. 64) bits of classical (resp. quantum) security, i.e. NIST level 1.
 
 It's important to mention that a security analysis in the ROM / QROM is not the most conservative. In particular, [eprint 2025/055](https://eprint.iacr.org/2025/055.pdf)'s security proof holds in the standard model (at the cost of bigger hash digests): the implementation is available in the [leanSig](https://github.com/leanEthereum/leanSig) repository. A compatible version of leanVM can be found in the [devnet4](https://github.com/leanEthereum/leanVM/tree/devnet4) branch.
 
diff --git a/TODO.md b/TODO.md
index 5bd14c0e3..1d194c540 100644
--- a/TODO.md
+++ b/TODO.md
@@ -9,7 +9,6 @@
 
 ## Security:
 
-- 128 bits security? (currently 124)
 - Fiat Shamir: add a claim tracing feature, to ensure all the claims are indeed checked (Lev)
 - Double Check AIR constraints, logup overflows etc
 - Do we need to enforce some values at the first row of the dot-product table?
diff --git a/crates/backend/Cargo.toml b/crates/backend/Cargo.toml
index 3f61957af..7557dcbb4 100644
--- a/crates/backend/Cargo.toml
+++ b/crates/backend/Cargo.toml
@@ -13,5 +13,5 @@ rayon.workspace = true
 whir = { path = "../whir", package = "mt-whir" }
 tracing.workspace = true
 fiat-shamir = { path = "fiat-shamir", package = "mt-fiat-shamir" }
-koala-bear = { path = "koala-bear", package = "mt-koala-bear" }
+goldilocks = { path = "goldilocks", package = "mt-goldilocks" }
 utils = { path = "utils", package = "mt-utils" }
diff --git a/crates/backend/air/src/constraint_folder/packed.rs b/crates/backend/air/src/constraint_folder/packed.rs
index bad2d76b0..7ff9f5bb3 100644
--- a/crates/backend/air/src/constraint_folder/packed.rs
+++ b/crates/backend/air/src/constraint_folder/packed.rs
@@ -9,10 +9,6 @@ pub struct ConstraintFolderPacked<'a, IF, EF: ExtensionField<PF<EF>>, ExtraData:
     pub extra_data: &'a ExtraData,
     pub accumulator: EFPacking<EF>,
     pub constraint_index: usize,
-    pub skip_low: bool,
-    pub accumulator_low: EFPacking<EF>,
-    pub cached_state: Option<Vec<IF>>,
-    pub low_ci_count: usize,
 }
 
 impl<'a, IF, EF, ExtraData> ConstraintFolderPacked<'a, IF, EF, ExtraData>
@@ -28,10 +24,6 @@ where
             extra_data,
             accumulator: EFPacking::<EF>::ZERO,
             constraint_index: 0,
-            skip_low: false,
-            accumulator_low: EFPacking::<EF>::ZERO,
-            cached_state: None,
-            low_ci_count: 0,
         }
     }
 }
@@ -70,30 +62,4 @@ where
         self.accumulator += EFPacking::<EF>::from(alpha_power) * x;
         self.constraint_index += 1;
     }
-
-    #[inline]
-    fn assert_eq_low(&mut self, x: IF, y: IF) {
-        let alpha_power = self.extra_data.alpha_powers()[self.constraint_index];
-        let contrib = EFPacking::<EF>::from(alpha_power) * (x - y);
-        self.accumulator += contrib;
-        self.accumulator_low += contrib;
-        self.constraint_index += 1;
-    }
-
-    #[inline]
-    fn low_degree_block<F>(&mut self, state: &mut [IF], block: F)
-    where
-        F: FnOnce(&mut Self, &mut [IF]),
-    {
-        if self.skip_low {
-            state.copy_from_slice(self.cached_state.as_ref().unwrap());
-            self.constraint_index += self.low_ci_count;
-        } else {
-            block(self, state);
-            if let Some(cache) = &mut self.cached_state {
-                cache.clear();
-                cache.extend_from_slice(state);
-            }
-        }
-    }
 }
diff --git a/crates/backend/air/src/lib.rs b/crates/backend/air/src/lib.rs
index b8b3361f1..f295c5c06 100644
--- a/crates/backend/air/src/lib.rs
+++ b/crates/backend/air/src/lib.rs
@@ -24,11 +24,6 @@ pub trait Air: Send + Sync + 'static {
     fn n_shift_columns(&self) -> usize;
 
     fn eval<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData);
-
-    /// If the AIR contains a `low_degree_block` sub-region, returns `(degree, n_constraints)`
-    fn low_degree_air(&self) -> Option<(usize, usize)> {
-        None
-    }
 }
 
 pub trait AirBuilder: Sized {
@@ -64,19 +59,6 @@ pub trait AirBuilder: Sized {
         self.assert_zero(x.bool_check());
     }
 
-    fn assert_eq_low(&mut self, x: Self::IF, y: Self::IF) {
-        self.assert_eq(x, y);
-    }
-
-    /// Execute `block` as a low-degree sub-region whose post-state is "cacheable"
-    /// = linear in z without the low-degree constraints
-    fn low_degree_block<F>(&mut self, state: &mut [Self::IF], block: F)
-    where
-        F: FnOnce(&mut Self, &mut [Self::IF]),
-    {
-        block(self, state);
-    }
-
     /// useful to build the recursion program
     #[inline(always)]
     fn declare_values(&mut self, values: &[Self::IF]) {
diff --git a/crates/backend/fiat-shamir/Cargo.toml b/crates/backend/fiat-shamir/Cargo.toml
index ec8649bc2..5797a1596 100644
--- a/crates/backend/fiat-shamir/Cargo.toml
+++ b/crates/backend/fiat-shamir/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 
 [dependencies]
 field = { path = "../field", package = "mt-field" }
-koala-bear = { path = "../koala-bear", package = "mt-koala-bear" }
+goldilocks = { path = "../goldilocks", package = "mt-goldilocks" }
 symetric = { path = "../symetric", package = "mt-symetric" }
 utils = { path = "../utils", package = "mt-utils" }
 tracing.workspace = true
diff --git a/crates/backend/fiat-shamir/src/challenger.rs b/crates/backend/fiat-shamir/src/challenger.rs
index 25e116e6b..32deb1dd5 100644
--- a/crates/backend/fiat-shamir/src/challenger.rs
+++ b/crates/backend/fiat-shamir/src/challenger.rs
@@ -1,7 +1,7 @@
 use field::PrimeField64;
-use koala_bear::symmetric::Permutation;
+use symetric::Permutation;
 
-pub const RATE: usize = 8;
+pub const RATE: usize = 4;
 pub const WIDTH: usize = RATE * 2;
 pub const CAPACITY: usize = WIDTH - RATE;
 
diff --git a/crates/backend/fiat-shamir/src/prover.rs b/crates/backend/fiat-shamir/src/prover.rs
index 80bb6d13e..691e0e9b3 100644
--- a/crates/backend/fiat-shamir/src/prover.rs
+++ b/crates/backend/fiat-shamir/src/prover.rs
@@ -8,11 +8,11 @@ use field::PackedValue;
 use field::PrimeCharacteristicRing;
 use field::integers::QuotientMap;
 use field::{ExtensionField, PrimeField64};
-use koala_bear::symmetric::Permutation;
 use rayon::prelude::*;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::Duration;
 use std::{fmt::Debug, sync::Mutex, time::Instant};
+use symetric::Permutation;
 
 static POW_GRINDING_NANOS: AtomicU64 = AtomicU64::new(0);
 
diff --git a/crates/backend/fiat-shamir/src/transcript.rs b/crates/backend/fiat-shamir/src/transcript.rs
index 612c2d109..b9c2e946e 100644
--- a/crates/backend/fiat-shamir/src/transcript.rs
+++ b/crates/backend/fiat-shamir/src/transcript.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 
 use crate::PrunedMerklePaths;
 
-pub const DIGEST_LEN_FE: usize = 8;
+pub const DIGEST_LEN_FE: usize = 4;
 
 #[derive(Debug, Clone)]
 pub struct MerkleOpening<F> {
diff --git a/crates/backend/fiat-shamir/src/utils.rs b/crates/backend/fiat-shamir/src/utils.rs
index fcdafa5bb..94efed6eb 100644
--- a/crates/backend/fiat-shamir/src/utils.rs
+++ b/crates/backend/fiat-shamir/src/utils.rs
@@ -1,5 +1,5 @@
 use field::{BasedVectorSpace, ExtensionField, Field, PrimeCharacteristicRing, PrimeField64};
-use koala_bear::symmetric::Permutation;
+use symetric::Permutation;
 
 use crate::challenger::{Challenger, RATE, WIDTH};
 
diff --git a/crates/backend/fiat-shamir/src/verifier.rs b/crates/backend/fiat-shamir/src/verifier.rs
index bf4b9e6f0..b6077fd80 100644
--- a/crates/backend/fiat-shamir/src/verifier.rs
+++ b/crates/backend/fiat-shamir/src/verifier.rs
@@ -10,8 +10,8 @@ use crate::{
 };
 use field::PrimeCharacteristicRing;
 use field::{ExtensionField, PrimeField64};
-use koala_bear::symmetric::Permutation;
-use koala_bear::{KoalaBear, default_koalabear_poseidon1_16};
+use goldilocks::{Goldilocks, default_goldilocks_poseidon1_8};
+use symetric::Permutation;
 
 pub struct VerifierState<EF: ExtensionField<PF<EF>>, P> {
     challenger: Challenger<PF<EF>, P>,
@@ -75,16 +75,16 @@ where
 
     #[allow(clippy::missing_transmute_annotations)]
     fn restore_merkle_paths(paths: PrunedMerklePaths<PF<EF>, PF<EF>>) -> Option<Vec<MerkleOpening<PF<EF>>>> {
-        assert_eq!(TypeId::of::<PF<EF>>(), TypeId::of::<KoalaBear>());
-        // SAFETY: We've confirmed PF<EF> == KoalaBear
-        let paths: PrunedMerklePaths<KoalaBear, KoalaBear> = unsafe { std::mem::transmute(paths) };
-        let perm = default_koalabear_poseidon1_16();
-        let hash_fn = |data: &[KoalaBear]| symetric::hash_slice_rtl::<_, _, 16, 8, DIGEST_LEN_FE>(&perm, data);
-        let combine_fn = |left: &[KoalaBear; DIGEST_LEN_FE], right: &[KoalaBear; DIGEST_LEN_FE]| {
+        assert_eq!(TypeId::of::<PF<EF>>(), TypeId::of::<Goldilocks>());
+        // SAFETY: We've confirmed PF<EF> == Goldilocks
+        let paths: PrunedMerklePaths<Goldilocks, Goldilocks> = unsafe { std::mem::transmute(paths) };
+        let perm = default_goldilocks_poseidon1_8();
+        let hash_fn = |data: &[Goldilocks]| symetric::hash_slice_rtl::<_, _, 8, 4, DIGEST_LEN_FE>(&perm, data);
+        let combine_fn = |left: &[Goldilocks; DIGEST_LEN_FE], right: &[Goldilocks; DIGEST_LEN_FE]| {
             symetric::compress(&perm, [*left, *right])
         };
-        let restored: MerklePaths<KoalaBear, KoalaBear> = paths.restore(&hash_fn, &combine_fn)?;
-        let openings: Vec<MerkleOpening<KoalaBear>> = restored
+        let restored: MerklePaths<Goldilocks, Goldilocks> = paths.restore(&hash_fn, &combine_fn)?;
+        let openings: Vec<MerkleOpening<Goldilocks>> = restored
             .0
             .into_iter()
             .map(|path| MerkleOpening {
@@ -92,7 +92,7 @@ where
                 path: path.sibling_hashes,
             })
             .collect();
-        // SAFETY: PF<EF> == KoalaBear
+        // SAFETY: PF<EF> == Goldilocks
         Some(unsafe { std::mem::transmute(openings) })
     }
 }
diff --git a/crates/backend/fiat-shamir/tests/grinding.rs b/crates/backend/fiat-shamir/tests/grinding.rs
index 894089646..afb2ce853 100644
--- a/crates/backend/fiat-shamir/tests/grinding.rs
+++ b/crates/backend/fiat-shamir/tests/grinding.rs
@@ -1,23 +1,23 @@
-use koala_bear::{QuinticExtensionFieldKB, default_koalabear_poseidon1_16};
+use goldilocks::{CubicExtensionFieldGL, default_goldilocks_poseidon1_8};
 use mt_fiat_shamir::{FSProver, FSVerifier, ProverState, VerifierState};
 use std::time::Instant;
 
-type EF = QuinticExtensionFieldKB;
+type EF = CubicExtensionFieldGL;
 
 #[test]
 #[ignore]
 fn bench_grinding() {
     let n_reps = 100;
-    let perm = default_koalabear_poseidon1_16();
+    let perm = default_goldilocks_poseidon1_8();
     for grinding_bits in 20..=20 {
-        let mut prover_state = ProverState::<EF, _>::new(perm.clone(), Default::default());
+        let mut prover_state = ProverState::<EF, _>::new(perm, Default::default());
         let time = Instant::now();
         for _ in 0..n_reps {
             prover_state.pow_grinding(grinding_bits);
         }
         let elapsed = time.elapsed();
         let mut verifier_state =
-            VerifierState::<EF, _>::new(prover_state.into_proof(), perm.clone(), Default::default()).unwrap();
+            VerifierState::<EF, _>::new(prover_state.into_proof(), perm, Default::default()).unwrap();
         for _ in 0..n_reps {
             verifier_state.check_pow_grinding(grinding_bits).unwrap()
         }
diff --git a/crates/backend/field/src/exponentiation.rs b/crates/backend/field/src/exponentiation.rs
index 2e9f567e4..92fb17f69 100644
--- a/crates/backend/field/src/exponentiation.rs
+++ b/crates/backend/field/src/exponentiation.rs
@@ -8,7 +8,7 @@ pub(crate) const fn bits_u64(n: u64) -> usize {
 
 /// Compute the exponential `x -> x^1420470955` using a custom addition chain.
 ///
-/// This map computes the third root of `x` if `x` is a member of the field `KoalaBear`.
+/// This map computes the third root of `x` if `x` is a member of the field `KoalaBear` (`p = 2^31 - 2^24 + 1`); it does not apply to `Goldilocks`, where 3 divides `p - 1`.
 /// This follows from the computation: `3 * 1420470955 = 2*(2^31 - 2^24) + 1 = 1 mod (p - 1)`.
 #[must_use]
 pub fn exp_1420470955<R: PrimeCharacteristicRing>(val: R) -> R {
diff --git a/crates/backend/field/src/field.rs b/crates/backend/field/src/field.rs
index b44ed45ed..e028e30f4 100644
--- a/crates/backend/field/src/field.rs
+++ b/crates/backend/field/src/field.rs
@@ -71,7 +71,7 @@ pub trait PrimeCharacteristicRing:
     + PartialEq
 {
     /// The field `ℤ/p` where the characteristic of this ring is p.
-    type PrimeSubfield: PrimeField32;
+    type PrimeSubfield: PrimeField64;
 
     /// The additive identity of the ring.
     ///
diff --git a/crates/backend/goldilocks/Cargo.toml b/crates/backend/goldilocks/Cargo.toml
new file mode 100644
index 000000000..d602351bc
--- /dev/null
+++ b/crates/backend/goldilocks/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "mt-goldilocks"
+version.workspace = true
+edition.workspace = true
+
+[dependencies]
+field = { path = "../field", package = "mt-field" }
+utils = { path = "../utils", package = "mt-utils" }
+
+rand.workspace = true
+rayon.workspace = true
+serde.workspace = true
+itertools.workspace = true
+tracing.workspace = true
+num-bigint = "*"
+paste = "1"
diff --git a/crates/backend/goldilocks/src/aarch64_neon/mod.rs b/crates/backend/goldilocks/src/aarch64_neon/mod.rs
new file mode 100644
index 000000000..730a8675b
--- /dev/null
+++ b/crates/backend/goldilocks/src/aarch64_neon/mod.rs
@@ -0,0 +1,5 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+mod packing;
+
+pub use packing::*;
diff --git a/crates/backend/goldilocks/src/aarch64_neon/packing.rs b/crates/backend/goldilocks/src/aarch64_neon/packing.rs
new file mode 100644
index 000000000..6f0bb93af
--- /dev/null
+++ b/crates/backend/goldilocks/src/aarch64_neon/packing.rs
@@ -0,0 +1,270 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+use alloc::vec::Vec;
+use core::arch::aarch64::{
+    uint64x2_t, vaddq_u64, vandq_u64, vdupq_n_u64, vgetq_lane_u64, vsetq_lane_u64, vshrq_n_u64, vsubq_u64,
+};
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, impl_packed_value,
+    impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, ring_sum,
+};
+use field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, PermutationMonomial,
+    PrimeCharacteristicRing, PrimeField64,
+};
+use rand::Rng;
+use rand::distr::{Distribution, StandardUniform};
+use utils::reconstitute_from_base;
+
+use crate::helpers::exp_10540996611094048183;
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 2;
+
+/// Equal to `2^32 - 1 = 2^64 mod P`.
+const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg();
+
+/// Hand-scheduled inline-asm variant tuned for the **scalar / single-lane Mul**
+/// path on aarch64. Saves one ALU op vs the LLVM-emitted form by collapsing `lsr+subs` into the
+/// shifted-register `subs xT, lo, hi, lsr #32` form.
+#[inline(always)]
+pub(super) fn mul_reduce_asm(a: u64, b: u64) -> u64 {
+    let result: u64;
+    // SAFETY: integer ALU only; `pure, nomem, nostack` lets LLVM schedule, CSE, DCE.
+    unsafe {
+        core::arch::asm!(
+            "mul     {lo},        {a},  {b}",
+            "umulh   {hi},        {a},  {b}",
+            "subs    {tmp},       {lo}, {hi}, lsr #32",
+            "csel    {corr1},     {p},  xzr,  lo",
+            "add     {tmp},       {corr1}, {tmp}",
+            "lsl     {hi_lo_eps}, {hi}, #32",
+            "sub     {hi_lo_eps}, {hi_lo_eps}, {hi:w}, uxtw",
+            "adds    {res},       {tmp}, {hi_lo_eps}",
+            "csel    {corr2},     {eps}, xzr,  hs",
+            "add     {result},    {corr2}, {res}",
+            a = in(reg) a,
+            b = in(reg) b,
+            lo = out(reg) _,
+            hi = out(reg) _,
+            tmp = out(reg) _,
+            corr1 = out(reg) _,
+            hi_lo_eps = out(reg) _,
+            res = out(reg) _,
+            corr2 = out(reg) _,
+            result = lateout(reg) result,
+            p = in(reg) Goldilocks::ORDER_U64,
+            eps = in(reg) EPSILON,
+            options(pure, nomem, nostack),
+        );
+    }
+    result
+}
+
+/// Vectorized NEON implementation of `Goldilocks` arithmetic.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)]
+#[must_use]
+pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksNeon {
+    #[inline]
+    #[must_use]
+    pub(crate) fn to_vector(self) -> uint64x2_t {
+        unsafe { transmute(self) }
+    }
+
+    #[inline]
+    pub(crate) fn from_vector(vector: uint64x2_t) -> Self {
+        unsafe { transmute(vector) }
+    }
+
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        Self([value; WIDTH])
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksNeon {
+    fn from(x: Goldilocks) -> Self {
+        Self::broadcast(x)
+    }
+}
+
+// Add/Sub/Neg are emulated as two independent scalar Goldilocks ops. On Apple Silicon's wide
+// scalar pipeline, two pipelined scalar adds beat the NEON modular-reduction sequence (XOR-shift
+// + signed compare + conditional add) per element. Storage stays as `[Goldilocks; 2]` (16 bytes)
+// so the compiler keeps elements in either GPRs or NEON regs as needed; only `mul`/`square` use
+// the hand-scheduled scalar `mul_reduce_asm`, invoked once per lane.
+impl Add for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Self([self.0[0] + rhs.0[0], self.0[1] + rhs.0[1]])
+    }
+}
+
+impl Sub for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        Self([self.0[0] - rhs.0[0], self.0[1] - rhs.0[1]])
+    }
+}
+
+impl Neg for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        Self([-self.0[0], -self.0[1]])
+    }
+}
+
+impl Mul for PackedGoldilocksNeon {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        // Hand-scheduled `mul_reduce_asm` saves one ALU op per lane vs LLVM's pure-Rust form.
+        Self([
+            Goldilocks::new(mul_reduce_asm(self.0[0].value, rhs.0[0].value)),
+            Goldilocks::new(mul_reduce_asm(self.0[1].value, rhs.0[1].value)),
+        ])
+    }
+}
+
+impl_add_assign!(PackedGoldilocksNeon);
+impl_sub_assign!(PackedGoldilocksNeon);
+impl_mul_methods!(PackedGoldilocksNeon);
+ring_sum!(PackedGoldilocksNeon);
+impl_rng!(PackedGoldilocksNeon);
+
+impl PrimeCharacteristicRing for PackedGoldilocksNeon {
+    type PrimeSubfield = Goldilocks;
+
+    const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
+    const ONE: Self = Self::broadcast(Goldilocks::ONE);
+    const TWO: Self = Self::broadcast(Goldilocks::TWO);
+    const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f.into()
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(self.to_vector()))
+    }
+
+    #[inline]
+    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
+        Self::from_fn(|lane| {
+            let lhs_lane: [Goldilocks; N] = core::array::from_fn(|i| lhs[i].as_slice()[lane]);
+            let rhs_lane: [Goldilocks; N] = core::array::from_fn(|i| rhs[i].as_slice()[lane]);
+            Goldilocks::dot_product(&lhs_lane, &rhs_lane)
+        })
+    }
+
+    #[inline]
+    fn square(&self) -> Self {
+        // Same rationale as `Mul`: scalar reduction avoids NEON<->GPR moves.
+        let x0 = self.0[0].value;
+        let x1 = self.0[1].value;
+        Self([
+            Goldilocks::new(mul_reduce_asm(x0, x0)),
+            Goldilocks::new(mul_reduce_asm(x1, x1)),
+        ])
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+impl InjectiveMonomial<7> for PackedGoldilocksNeon {}
+
+impl PermutationMonomial<7> for PackedGoldilocksNeon {
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self)
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksNeon, Goldilocks);
+impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks);
+impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks);
+impl_div_methods!(PackedGoldilocksNeon, Goldilocks);
+impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks);
+
+impl Algebra<Goldilocks> for PackedGoldilocksNeon {}
+
+impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH);
+
+unsafe impl PackedField for PackedGoldilocksNeon {
+    type Scalar = Goldilocks;
+}
+
+/// Interleave two 64-bit vectors at the element level.
+/// For block_len=1: `[a0, a1]` x `[b0, b1]` -> `[a0, b0]`, `[a1, b1]`.
+#[inline]
+pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) {
+    unsafe {
+        let a0 = vgetq_lane_u64::<0>(v0);
+        let a1 = vgetq_lane_u64::<1>(v0);
+        let b0 = vgetq_lane_u64::<0>(v1);
+        let b1 = vgetq_lane_u64::<1>(v1);
+
+        let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0)));
+        let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0)));
+
+        (r0, r1)
+    }
+}
+
+unsafe impl PackedFieldPow2 for PackedGoldilocksNeon {
+    fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
+        let (v0, v1) = (self.to_vector(), other.to_vector());
+        let (res0, res1) = match block_len {
+            1 => interleave_u64(v0, v1),
+            2 => (v0, v1),
+            _ => panic!("unsupported block length"),
+        };
+        (Self::from_vector(res0), Self::from_vector(res1))
+    }
+}
+
+/// Halve a vector of Goldilocks field elements.
+#[inline(always)]
+pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t {
+    unsafe {
+        let one = vdupq_n_u64(1);
+        let zero = vdupq_n_u64(0);
+        let half = vdupq_n_u64(P.div_ceil(2));
+
+        let least_bit = vandq_u64(input, one);
+        let t = vshrq_n_u64::<1>(input);
+        // neg_least_bit is 0 or -1 (all bits 1).
+        let neg_least_bit = vsubq_u64(zero, least_bit);
+        let maybe_half = vandq_u64(half, neg_least_bit);
+        vaddq_u64(t, maybe_half)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{Goldilocks, PackedGoldilocksNeon, WIDTH};
+
+    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]);
+
+    #[test]
+    fn pack_round_trip() {
+        let p = PackedGoldilocksNeon(SPECIAL_VALS);
+        let v = p.to_vector();
+        assert_eq!(PackedGoldilocksNeon::from_vector(v).0, SPECIAL_VALS);
+    }
+}
diff --git a/crates/backend/goldilocks/src/benchmark_poseidons_goldilocks.rs b/crates/backend/goldilocks/src/benchmark_poseidons_goldilocks.rs
new file mode 100644
index 000000000..170c6f078
--- /dev/null
+++ b/crates/backend/goldilocks/src/benchmark_poseidons_goldilocks.rs
@@ -0,0 +1,39 @@
+use std::hint::black_box;
+use std::time::Instant;
+
+use field::Field;
+use field::PackedValue;
+use field::PrimeCharacteristicRing;
+
+use crate::{Goldilocks, default_goldilocks_poseidon1_8};
+
+type FPacking = <Goldilocks as Field>::Packing;
+const PACKING_WIDTH: usize = <FPacking as PackedValue>::WIDTH;
+
+#[test]
+#[ignore]
+fn bench_poseidon() {
+    // cargo test --release --package mt-goldilocks --lib -- benchmark_poseidons_goldilocks::bench_poseidon --exact --nocapture --ignored
+
+    let n = 1 << 23;
+    let poseidon1_8 = default_goldilocks_poseidon1_8();
+
+    // Warm-up: run the permutation repeatedly before starting the timed loop.
+    let mut state_8: [FPacking; 8] = [FPacking::ZERO; 8];
+    for _ in 0..1 << 15 {
+        poseidon1_8.compress_in_place(&mut state_8);
+    }
+    let _ = black_box(state_8);
+
+    let time = Instant::now();
+    for _ in 0..n / PACKING_WIDTH {
+        poseidon1_8.compress_in_place(&mut state_8);
+    }
+    let _ = black_box(state_8);
+    let time_p1_simd = time.elapsed();
+    println!(
+        "Poseidon1 8 SIMD (width {}): {:.2}M hashes/s",
+        PACKING_WIDTH,
+        (n as f64 / time_p1_simd.as_secs_f64() / 1_000_000.0)
+    );
+}
diff --git a/crates/backend/goldilocks/src/cubic_extension.rs b/crates/backend/goldilocks/src/cubic_extension.rs
new file mode 100644
index 000000000..109d884b1
--- /dev/null
+++ b/crates/backend/goldilocks/src/cubic_extension.rs
@@ -0,0 +1,681 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Degree-3 trinomial extension of Goldilocks, `F_p[X] / (X^3 - X - 1)`.
+//!
+//! Elements are `a_0 + a_1*X + a_2*X^2`. Reduction rule: `X^3 = X + 1`,
+//! consequently `X^4 = X^2 + X`.
+
+use alloc::format;
+use alloc::string::ToString;
+use alloc::vec::Vec;
+use core::array;
+use core::fmt::{self, Debug, Display, Formatter};
+use core::iter::{Product, Sum};
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::{
+    Algebra, BasedVectorSpace, ExtensionField, Field, Packable, PrimeCharacteristicRing, RawDataSerializable,
+    TwoAdicField, field_to_array,
+};
+use itertools::Itertools;
+use num_bigint::BigUint;
+use rand::distr::{Distribution, StandardUniform};
+use rand::prelude::Rng;
+use serde::{Deserialize, Serialize};
+use utils::{as_base_slice, as_base_slice_mut, flatten_to_base, reconstitute_from_base};
+
+use crate::Goldilocks;
+
+/// Frobenius coefficients for `X^3 - X - 1` over Goldilocks, each stored as a coefficient triple.
+///
+/// `FROBENIUS_COEFFS[0]` is `X^p mod (X^3 - X - 1)`, `FROBENIUS_COEFFS[1]` is `X^{2p} mod …`.
+///
+/// Values verified by the companion `plonky3/goldilocks` code.
+pub const FROBENIUS_COEFFS: [[Goldilocks; 3]; 2] = [
+    [
+        Goldilocks::new(10615703402128488253),
+        Goldilocks::new(10050274602728160328),
+        Goldilocks::new(11746561000929144102),
+    ],
+    [
+        Goldilocks::new(6700183068485440220),
+        Goldilocks::new(14531223735771536287),
+        Goldilocks::new(8396469466686423992),
+    ],
+];
+
+/// Generator of the multiplicative group of the cubic extension, as a coefficient triple (`2 + X`).
+const EXT_GENERATOR: [Goldilocks; 3] = [Goldilocks::new(2), Goldilocks::new(1), Goldilocks::new(0)];
+
+/// Element of the degree-3 trinomial extension of Goldilocks, `F_p[X] / (X^3 - X - 1)`.
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize, Deserialize, PartialOrd, Ord)]
+#[repr(transparent)]
+#[must_use]
+pub struct CubicExtensionFieldGL {
+    #[serde(with = "utils::array_serialization")]
+    pub(crate) value: [Goldilocks; 3], // coefficients `[a_0, a_1, a_2]` of `1`, `X`, `X^2`
+}
+
+impl CubicExtensionFieldGL {
+    /// Construct from a coefficient triple `[a_0, a_1, a_2]`, i.e. the element `a_0 + a_1*X + a_2*X^2`.
+    #[inline]
+    pub const fn new(value: [Goldilocks; 3]) -> Self {
+        Self { value }
+    }
+}
+
+impl Default for CubicExtensionFieldGL {
+    fn default() -> Self { // the all-zero coefficient triple
+        Self { value: [Goldilocks::ZERO; 3] }
+    }
+}
+
+impl From<Goldilocks> for CubicExtensionFieldGL {
+    fn from(x: Goldilocks) -> Self { // embeds a base-field element into the extension
+        Self::new(field_to_array(x)) // NOTE(review): presumably yields `[x, 0, 0]` — confirm in `field`
+    }
+}
+
+impl From<[Goldilocks; 3]> for CubicExtensionFieldGL {
+    fn from(x: [Goldilocks; 3]) -> Self { // the triple is taken as `[a_0, a_1, a_2]` unchanged
+        Self { value: x }
+    }
+}
+
+impl Packable for CubicExtensionFieldGL {} // empty impl: nothing is overridden here
+
+impl BasedVectorSpace<Goldilocks> for CubicExtensionFieldGL {
+    const DIMENSION: usize = 3; // one coefficient each for `1`, `X`, `X^2`
+
+    #[inline]
+    fn as_basis_coefficients_slice(&self) -> &[Goldilocks] {
+        &self.value // the stored triple `[a_0, a_1, a_2]`
+    }
+
+    #[inline]
+    fn from_basis_coefficients_fn<Fn: FnMut(usize) -> Goldilocks>(f: Fn) -> Self {
+        Self::new(array::from_fn(f)) // coefficient `i` is `f(i)` for `i` in `0..3`
+    }
+
+    #[inline] // yields `None` unless the iterator reports exactly 3 remaining items
+    fn from_basis_coefficients_iter<I: ExactSizeIterator<Item = Goldilocks>>(mut iter: I) -> Option<Self> {
+        (iter.len() == 3).then(|| Self::new(array::from_fn(|_| iter.next().unwrap())))
+    }
+
+    #[inline]
+    fn flatten_to_base(vec: Vec<Self>) -> Vec<Goldilocks> {
+        // SAFETY: `Self` is `repr(transparent)` over `[Goldilocks; 3]`, so each element is exactly 3 base elements.
+        unsafe { flatten_to_base::<Goldilocks, Self>(vec) }
+    }
+
+    #[inline]
+    fn reconstitute_from_base(vec: Vec<Goldilocks>) -> Vec<Self> {
+        // SAFETY: `Self` is `repr(transparent)` over `[Goldilocks; 3]`, so 3 base elements form one `Self`.
+        unsafe { reconstitute_from_base::<Goldilocks, Self>(vec) }
+    }
+}
+
+impl ExtensionField<Goldilocks> for CubicExtensionFieldGL {
+    type ExtensionPacking = crate::packed_cubic_extension::PackedCubicExtensionFieldGL<<Goldilocks as Field>::Packing>;
+
+    #[inline] // true exactly when the `X` and `X^2` coefficients are both zero
+    fn is_in_basefield(&self) -> bool {
+        self.value[1..].iter().all(|coeff| coeff.is_zero())
+    }
+
+    #[inline] // the constant coefficient when `self` lies in the base field, otherwise `None`
+    fn as_base(&self) -> Option<Goldilocks> {
+        <Self as ExtensionField<Goldilocks>>::is_in_basefield(self).then_some(self.value[0])
+    }
+}
+
+impl CubicExtensionFieldGL {
+    /// Apply the Frobenius `x -> x^p`, with the images of `X` and `X^2` read from `FROBENIUS_COEFFS`.
+    ///
+    /// `φ(a) = a_0 + a_1 * X^p + a_2 * X^{2p}`, reduced with the stored coefficients.
+    #[inline]
+    pub fn frobenius(&self) -> Self {
+        let a = &self.value;
+        let fc = &FROBENIUS_COEFFS;
+        let tail = [a[1], a[2]]; // the coefficients that multiply `X^p` and `X^{2p}`
+        let c0 = a[0] + Goldilocks::dot_product::<2>(&tail, &[fc[0][0], fc[1][0]]); // constant term
+        let c1 = Goldilocks::dot_product::<2>(&tail, &[fc[0][1], fc[1][1]]); // `X` term
+        let c2 = Goldilocks::dot_product::<2>(&tail, &[fc[0][2], fc[1][2]]); // `X^2` term
+        Self::new([c0, c1, c2])
+    }
+}
+
+impl PrimeCharacteristicRing for CubicExtensionFieldGL {
+    type PrimeSubfield = <Goldilocks as PrimeCharacteristicRing>::PrimeSubfield;
+
+    const ZERO: Self = Self::new([Goldilocks::ZERO; 3]);
+    const ONE: Self = Self::new(field_to_array(Goldilocks::ONE)); // base-field constants embedded via `field_to_array`
+    const TWO: Self = Self::new(field_to_array(Goldilocks::TWO));
+    const NEG_ONE: Self = Self::new(field_to_array(Goldilocks::NEG_ONE));
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        <Goldilocks as PrimeCharacteristicRing>::from_prime_subfield(f).into() // lift to Goldilocks, then embed
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::new(self.value.map(|x| x.halve())) // halving is applied to each coefficient
+    }
+
+    #[inline]
+    fn square(&self) -> Self {
+        let mut res = Self::default();
+        cubic_square(&self.value, &mut res.value); // dedicated squaring kernel; writes all three coefficients
+        res
+    }
+
+    #[inline]
+    fn mul_2exp_u64(&self, exp: u64) -> Self {
+        Self::new(self.value.map(|x| x.mul_2exp_u64(exp))) // scaling by `2^exp` is applied per coefficient
+    }
+
+    #[inline]
+    fn div_2exp_u64(&self, exp: u64) -> Self {
+        Self::new(self.value.map(|x| x.div_2exp_u64(exp))) // likewise for division by `2^exp`
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `repr(transparent)` over `[Goldilocks; 3]`; `len * 3` base zeros regroup into `len` elements.
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * 3)) }
+    }
+}
+
+impl Algebra<Goldilocks> for CubicExtensionFieldGL {} // empty impl: nothing is overridden here
+
+impl RawDataSerializable for CubicExtensionFieldGL {
+    const NUM_BYTES: usize = <Goldilocks as RawDataSerializable>::NUM_BYTES * 3; // three base-field coefficients
+
+    #[inline]
+    fn into_bytes(self) -> impl IntoIterator<Item = u8> {
+        self.value.into_iter().flat_map(|x| x.into_bytes()) // coefficients in order `a_0`, `a_1`, `a_2`
+    }
+
+    #[inline]
+    fn into_byte_stream(input: impl IntoIterator<Item = Self>) -> impl IntoIterator<Item = u8> {
+        Goldilocks::into_byte_stream(input.into_iter().flat_map(|x| x.value)) // flatten, then defer to the base field
+    }
+
+    #[inline]
+    fn into_u32_stream(input: impl IntoIterator<Item = Self>) -> impl IntoIterator<Item = u32> {
+        Goldilocks::into_u32_stream(input.into_iter().flat_map(|x| x.value)) // same flattening as above
+    }
+
+    #[inline]
+    fn into_u64_stream(input: impl IntoIterator<Item = Self>) -> impl IntoIterator<Item = u64> {
+        Goldilocks::into_u64_stream(input.into_iter().flat_map(|x| x.value)) // same flattening as above
+    }
+
+    #[inline]
+    fn into_parallel_byte_streams<const N: usize>(
+        input: impl IntoIterator<Item = [Self; N]>,
+    ) -> impl IntoIterator<Item = [u8; N]> {
+        Goldilocks::into_parallel_byte_streams(
+            input
+                .into_iter() // each `[Self; N]` becomes 3 rows; row `i` holds coefficient `i` of every element
+                .flat_map(|x| (0..3).map(move |i| array::from_fn(|j| x[j].value[i]))),
+        )
+    }
+
+    #[inline]
+    fn into_parallel_u32_streams<const N: usize>(
+        input: impl IntoIterator<Item = [Self; N]>,
+    ) -> impl IntoIterator<Item = [u32; N]> {
+        Goldilocks::into_parallel_u32_streams(
+            input
+                .into_iter() // same row-per-coefficient transposition as the byte variant
+                .flat_map(|x| (0..3).map(move |i| array::from_fn(|j| x[j].value[i]))),
+        )
+    }
+
+    #[inline]
+    fn into_parallel_u64_streams<const N: usize>(
+        input: impl IntoIterator<Item = [Self; N]>,
+    ) -> impl IntoIterator<Item = [u64; N]> {
+        Goldilocks::into_parallel_u64_streams(
+            input
+                .into_iter() // same row-per-coefficient transposition as the byte variant
+                .flat_map(|x| (0..3).map(move |i| array::from_fn(|j| x[j].value[i]))),
+        )
+    }
+}
+
+impl Field for CubicExtensionFieldGL {
+    type Packing = Self; // no dedicated packed type: an element is its own packing
+
+    const GENERATOR: Self = Self::new(EXT_GENERATOR);
+
+    fn try_inverse(&self) -> Option<Self> {
+        if self.is_zero() {
+            return None; // zero has no inverse
+        }
+        Some(cubic_inv(self))
+    }
+
+    #[inline]
+    fn add_slices(slice_1: &mut [Self], slice_2: &[Self]) {
+        // SAFETY: `repr(transparent)` + addition is base-linear, so adding the flattened coefficients is enough.
+        unsafe {
+            let base_slice_1 = as_base_slice_mut(slice_1);
+            let base_slice_2 = as_base_slice(slice_2);
+            Goldilocks::add_slices(base_slice_1, base_slice_2);
+        }
+    }
+
+    #[inline]
+    fn order() -> BigUint {
+        Goldilocks::order().pow(3) // a degree-3 extension of `F_p` has `p^3` elements
+    }
+}
+
+impl Display for CubicExtensionFieldGL {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // prints the nonzero terms as a polynomial in `X`
+        if self.is_zero() {
+            write!(f, "0")
+        } else {
+            let str = self
+                .value
+                .iter()
+                .enumerate()
+                .filter(|(_, x)| !x.is_zero()) // zero coefficients are skipped
+                .map(|(i, x)| match (i, x.is_one()) {
+                    (0, _) => format!("{x}"), // constant term: the bare coefficient
+                    (1, true) => "X".to_string(), // a coefficient of one is left implicit
+                    (1, false) => format!("{x} X"),
+                    (_, true) => format!("X^{i}"),
+                    (_, false) => format!("{x} X^{i}"),
+                })
+                .join(" + ");
+            write!(f, "{str}")
+        }
+    }
+}
+
+impl Neg for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline] // negation acts on each coefficient independently
+    fn neg(self) -> Self {
+        Self::new(self.value.map(|coeff| -coeff))
+    }
+}
+
+impl Add for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        // Addition in the extension is coefficient-wise:
+        // coefficient `i` of the sum is `self[i] + rhs[i]`.
+        let lhs = self.value;
+        let other = rhs.value;
+        Self::new(array::from_fn(|i| lhs[i] + other[i]))
+    }
+}
+
+impl Add<Goldilocks> for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline]
+    fn add(mut self, rhs: Goldilocks) -> Self {
+        self.value[0] += rhs; // a base-field summand only touches the constant coefficient
+        self
+    }
+}
+
+impl AddAssign for CubicExtensionFieldGL {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        for (acc, term) in self.value.iter_mut().zip(rhs.value) {
+            *acc += term; // coefficient-wise, in place
+        }
+    }
+}
+
+impl AddAssign<Goldilocks> for CubicExtensionFieldGL {
+    #[inline]
+    fn add_assign(&mut self, rhs: Goldilocks) {
+        self.value[0] += rhs; // a base-field summand only touches the constant coefficient
+    }
+}
+
+impl Sum for CubicExtensionFieldGL {
+    #[inline] // an empty iterator sums to `ZERO`
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(<Self as Add>::add).unwrap_or(Self::ZERO)
+    }
+}
+
+impl Sub for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        // Subtraction in the extension is coefficient-wise:
+        // coefficient `i` of the difference is `self[i] - rhs[i]`.
+        let lhs = self.value;
+        let other = rhs.value;
+        Self::new(array::from_fn(|i| lhs[i] - other[i]))
+    }
+}
+
+impl Sub<Goldilocks> for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline]
+    fn sub(mut self, rhs: Goldilocks) -> Self {
+        self.value[0] -= rhs; // a base-field subtrahend only touches the constant coefficient
+        self
+    }
+}
+
+impl SubAssign for CubicExtensionFieldGL {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        for (acc, term) in self.value.iter_mut().zip(rhs.value) {
+            *acc -= term; // coefficient-wise, in place
+        }
+    }
+}
+
+impl SubAssign<Goldilocks> for CubicExtensionFieldGL {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Goldilocks) {
+        self.value[0] -= rhs; // a base-field subtrahend only touches the constant coefficient
+    }
+}
+
+impl Mul for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        let mut product = [Goldilocks::ZERO; 3]; // fully overwritten by the kernel below
+        cubic_mul(&self.value, &rhs.value, &mut product);
+        Self::new(product)
+    }
+}
+
+impl Mul<Goldilocks> for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[inline] // scaling by a base-field element multiplies every coefficient by it
+    fn mul(self, rhs: Goldilocks) -> Self {
+        Self::new(self.value.map(|coeff| coeff * rhs))
+    }
+}
+
+impl MulAssign for CubicExtensionFieldGL {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Self) {
+        *self = *self * rhs; // delegates to `Mul<Self>`
+    }
+}
+
+impl MulAssign<Goldilocks> for CubicExtensionFieldGL {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Goldilocks) {
+        *self = *self * rhs; // delegates to `Mul<Goldilocks>`
+    }
+}
+
+impl Product for CubicExtensionFieldGL {
+    #[inline] // an empty iterator multiplies to `ONE`
+    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(<Self as Mul>::mul).unwrap_or(Self::ONE)
+    }
+}
+
+impl Div for CubicExtensionFieldGL {
+    type Output = Self;
+
+    #[allow(clippy::suspicious_arithmetic_impl)] // the `*` inside `div` is intentional: `a / b = a * b^{-1}`
+    #[inline]
+    fn div(self, rhs: Self) -> Self::Output {
+        self * rhs.inverse()
+    }
+}
+
+impl DivAssign for CubicExtensionFieldGL {
+    #[inline]
+    fn div_assign(&mut self, rhs: Self) {
+        *self = *self / rhs; // delegates to `Div`
+    }
+}
+
+impl Distribution<CubicExtensionFieldGL> for StandardUniform {
+    #[inline]
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> CubicExtensionFieldGL {
+        CubicExtensionFieldGL::new(array::from_fn(|_| self.sample(rng))) // three independent base-field samples
+    }
+}
+
+impl TwoAdicField for CubicExtensionFieldGL {
+    const TWO_ADICITY: usize = Goldilocks::TWO_ADICITY; // `p^2 + p + 1` is odd, so `p^3 - 1` gains no factor of 2
+
+    #[inline]
+    fn two_adic_generator(bits: usize) -> Self {
+        Goldilocks::two_adic_generator(bits).into() // the base-field generator, embedded into the extension
+    }
+}
+
+// `PackedFieldExtension<Goldilocks, CubicExtensionFieldGL>` is implemented by
+// `PackedCubicExtensionFieldGL<<Goldilocks as Field>::Packing>` (see `packed_cubic_extension.rs`).
+
+// ============================================================================
+// Arithmetic kernels for `F_p[X] / (X^3 - X - 1)`.
+// ============================================================================
+
+/// Multiply two cubic extension elements over any algebra `R` over `Goldilocks`; `res` is fully overwritten.
+///
+/// Given `a = a_0 + a_1 X + a_2 X^2` and `b = b_0 + b_1 X + b_2 X^2`, computes the
+/// product reduced by `X^3 - X - 1` (so `X^3 = X + 1`, `X^4 = X^2 + X`).
+///
+/// Uses 3-term Karatsuba: 6 multiplications instead of the 9 of schoolbook.
+/// On Goldilocks each multiply carries a 128->64-bit reduction (the dominant
+/// cost), so trading 3 of them for cheap field adds/subs is a net win — this
+/// is the hottest field op in the prover (sumcheck + poseidon AIR eval).
+#[inline]
+pub fn cubic_mul_generic<R>(a: &[R; 3], b: &[R; 3], res: &mut [R; 3])
+where
+    R: Copy + core::ops::Mul<Output = R> + core::ops::Add<Output = R> + core::ops::Sub<Output = R>,
+{
+    let a0 = a[0];
+    let a1 = a[1];
+    let a2 = a[2];
+    let b0 = b[0];
+    let b1 = b[1];
+    let b2 = b[2];
+
+    // Karatsuba products for the degree-4 polynomial product A(X)*B(X).
+    let m0 = a0 * b0;
+    let m1 = a1 * b1;
+    let m2 = a2 * b2;
+    let m3 = (a0 + a1) * (b0 + b1); // = m0 + m1 + (a0 b1 + a1 b0)
+    let m4 = (a0 + a2) * (b0 + b2); // = m0 + m2 + (a0 b2 + a2 b0)
+    let m5 = (a1 + a2) * (b1 + b2); // = m1 + m2 + (a1 b2 + a2 b1)
+
+    // Coefficients of A*B = c0 + c1 X + c2 X^2 + c3 X^3 + c4 X^4:
+    //   c0 = m0,  c1 = m3-m0-m1,  c2 = m4-m0-m2+m1,  c3 = m5-m1-m2,  c4 = m2.
+    // Reduce by X^3 = X+1, X^4 = X^2+X:
+    //   res0 = c0 + c3      = m0 + m5 - m1 - m2
+    //   res1 = c1 + c3 + c4 = m3 + m5 - m0 - m1 - m1
+    //   res2 = c2 + c4      = m4 + m1 - m0
+    res[0] = m0 + m5 - m1 - m2;
+    res[1] = m3 + m5 - m0 - m1 - m1;
+    res[2] = m4 + m1 - m0;
+}
+
+/// Square a cubic extension element (same reduction rule as `cubic_mul_generic`); `res` is fully overwritten.
+#[inline]
+pub fn cubic_square_generic<R>(a: &[R; 3], res: &mut [R; 3])
+where
+    R: PrimeCharacteristicRing + Copy,
+{
+    let a0 = a[0];
+    let a1 = a[1];
+    let a2 = a[2];
+
+    let a0_sq = a0.square();
+    let a1_sq = a1.square();
+    let a2_sq = a2.square(); // `X^4` coefficient of the unreduced square; folds into `X^2 + X`
+    let two_a0_a1 = (a0 * a1).double();
+    let two_a0_a2 = (a0 * a2).double();
+    let two_a1_a2 = (a1 * a2).double(); // `X^3` coefficient of the unreduced square; folds into `X + 1`
+
+    // constant: a0^2 + 2 a1 a2
+    res[0] = a0_sq + two_a1_a2;
+    // linear: 2 a0 a1 + 2 a1 a2 + a2^2
+    res[1] = two_a0_a1 + two_a1_a2 + a2_sq;
+    // quadratic: 2 a0 a2 + a1^2 + a2^2
+    res[2] = two_a0_a2 + a1_sq + a2_sq;
+}
+
+/// Multiply two cubic extension elements (Goldilocks scalars); thin wrapper over `cubic_mul_generic`.
+#[inline]
+pub fn cubic_mul(a: &[Goldilocks; 3], b: &[Goldilocks; 3], res: &mut [Goldilocks; 3]) {
+    cubic_mul_generic(a, b, res);
+}
+
+/// Square a cubic extension element (Goldilocks scalar); thin wrapper over `cubic_square_generic`.
+#[inline]
+pub fn cubic_square(a: &[Goldilocks; 3], res: &mut [Goldilocks; 3]) {
+    cubic_square_generic(a, res);
+}
+
+/// Invert a nonzero cubic extension element via adjugate/determinant — no Frobenius round trip needed.
+///
+/// The multiplication-by-`a` matrix (in the basis `{1, X, X^2}`, using `X^3 = X + 1`) is
+///
+/// ```text
+/// M = | a0    a2      a1      |
+///     | a1    a0 + a2 a1 + a2 |
+///     | a2    a1      a0 + a2 |
+/// ```
+///
+/// so `a^{-1} = adj(M) · e_0 / det(M)`.
+#[inline]
+fn cubic_inv(a: &CubicExtensionFieldGL) -> CubicExtensionFieldGL {
+    let [a0, a1, a2] = a.value;
+
+    let a0_sq = a0.square();
+    let a1_sq = a1.square();
+    let a2_sq = a2.square();
+    let a0a1 = a0 * a1;
+    let a0a2 = a0 * a2;
+    let a1a2 = a1 * a2;
+
+    // Negated cofactors of the first row of `M` (the sign cancels against `t = -det(M)` below):
+    //   n0 = a1 a2 + a1^2 - a0^2 - 2 a0 a2 - a2^2
+    let n0 = a1a2 + a1_sq - a0_sq - a0a2.double() - a2_sq;
+    //   n1 = a0 a1 - a2^2
+    let n1 = a0a1 - a2_sq;
+    //   n2 = a0 a2 + a2^2 - a1^2
+    let n2 = a0a2 + a2_sq - a1_sq;
+
+    // `t = -det(M) = a0 n0 + a2 n1 + a1 n2`.
+    let t = a0 * n0 + a2 * n1 + a1 * n2;
+    let t_inv = t.inverse(); // the only base-field inversion; the sole caller in this file rejects zero first
+
+    CubicExtensionFieldGL::new([n0 * t_inv, n1 * t_inv, n2 * t_inv])
+}
+
+// ============================================================================
+// Unit tests (inverse, reduction rule, Frobenius, Karatsuba) — exercised during `cargo test`.
+// ============================================================================
+
+#[cfg(test)]
+mod tests {
+    use field::{Field, PrimeCharacteristicRing, PrimeField64};
+    use rand::rngs::StdRng;
+    use rand::{RngExt, SeedableRng};
+
+    use super::*;
+
+    #[test]
+    fn inverse_roundtrip() { // `a * a^{-1}` must equal one for random nonzero `a`
+        let mut rng = StdRng::seed_from_u64(1);
+        for _ in 0..32 {
+            let a: CubicExtensionFieldGL = rng.random();
+            if a.is_zero() {
+                continue;
+            }
+            let a_inv = a.inverse();
+            assert_eq!(a * a_inv, CubicExtensionFieldGL::ONE);
+        }
+    }
+
+    #[test]
+    fn x_cubed_equals_x_plus_one() {
+        // The extension is `F_p[X]/(X^3 - X - 1)`, so `X^3 = X + 1`.
+        let x = CubicExtensionFieldGL::new([Goldilocks::ZERO, Goldilocks::ONE, Goldilocks::ZERO]);
+        let x_cubed = x * x * x;
+        let expected = CubicExtensionFieldGL::new([Goldilocks::ONE, Goldilocks::ONE, Goldilocks::ZERO]);
+        assert_eq!(x_cubed, expected);
+    }
+
+    #[test]
+    fn frobenius_matches_pth_power() { // the table-driven Frobenius must agree with exponentiation by `p`
+        let mut rng = StdRng::seed_from_u64(2);
+        for _ in 0..8 {
+            let a: CubicExtensionFieldGL = rng.random();
+            let a_frob = a.frobenius();
+            let a_pth = a.exp_u64(Goldilocks::ORDER_U64);
+            assert_eq!(a_frob, a_pth);
+        }
+    }
+
+    // Reference schoolbook cubic multiply (9 muls), reduced by `X^3 = X+1` and `X^4 = X^2+X`.
+    fn cubic_mul_schoolbook(a: &[Goldilocks; 3], b: &[Goldilocks; 3]) -> [Goldilocks; 3] {
+        let [a0, a1, a2] = *a;
+        let [b0, b1, b2] = *b;
+        let a1b2 = a1 * b2; // `a1b2 + a2b1` is the `X^3` coefficient
+        let a2b1 = a2 * b1;
+        let a2b2 = a2 * b2; // the `X^4` coefficient
+        [
+            a0 * b0 + a1b2 + a2b1,
+            a0 * b1 + a1 * b0 + a1b2 + a2b1 + a2b2,
+            a0 * b2 + a1 * b1 + a2 * b0 + a2b2,
+        ]
+    }
+
+    #[test]
+    fn karatsuba_matches_schoolbook_scalar() {
+        let mut rng = StdRng::seed_from_u64(7);
+        for _ in 0..10_000 {
+            let a: [Goldilocks; 3] = [rng.random(), rng.random(), rng.random()];
+            let b: [Goldilocks; 3] = [rng.random(), rng.random(), rng.random()];
+            let mut got = [Goldilocks::ZERO; 3];
+            cubic_mul_generic(&a, &b, &mut got);
+            assert_eq!(got, cubic_mul_schoolbook(&a, &b));
+        }
+    }
+
+    #[test]
+    fn karatsuba_matches_schoolbook_packed() {
+        use field::{Field, PackedValue};
+        type P = <Goldilocks as Field>::Packing; // whichever packing the target selects
+        let mut rng = StdRng::seed_from_u64(11);
+        for _ in 0..2_000 {
+            let a: [P; 3] = core::array::from_fn(|_| P::from_fn(|_| rng.random()));
+            let b: [P; 3] = core::array::from_fn(|_| P::from_fn(|_| rng.random()));
+            let mut got = [P::ZERO; 3];
+            cubic_mul_generic(&a, &b, &mut got);
+            // Compare lane-by-lane against the scalar schoolbook reference.
+            for lane in 0..P::WIDTH {
+                let a_s = [a[0].as_slice()[lane], a[1].as_slice()[lane], a[2].as_slice()[lane]];
+                let b_s = [b[0].as_slice()[lane], b[1].as_slice()[lane], b[2].as_slice()[lane]];
+                let want = cubic_mul_schoolbook(&a_s, &b_s);
+                for i in 0..3 {
+                    assert_eq!(got[i].as_slice()[lane], want[i], "lane {lane} coord {i}");
+                }
+            }
+        }
+    }
+}
diff --git a/crates/backend/goldilocks/src/goldilocks.rs b/crates/backend/goldilocks/src/goldilocks.rs
new file mode 100644
index 000000000..9c5db5441
--- /dev/null
+++ b/crates/backend/goldilocks/src/goldilocks.rs
@@ -0,0 +1,603 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+use alloc::vec;
+use alloc::vec::Vec;
+use core::fmt::{Debug, Display, Formatter};
+use core::hash::{Hash, Hasher};
+use core::iter::{Product, Sum};
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+use core::{array, fmt};
+
+use field::integers::QuotientMap;
+use field::op_assign_macros::{impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign};
+use field::{
+    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField, PrimeField64,
+    RawDataSerializable, TwoAdicField, impl_raw_serializable_primefield64, quotient_map_large_iint,
+    quotient_map_large_uint, quotient_map_small_int,
+};
+use num_bigint::BigUint;
+use rand::Rng;
+use rand::distr::{Distribution, StandardUniform};
+use serde::{Deserialize, Serialize};
+use utils::{assume, branch_hint, flatten_to_base};
+
+use crate::helpers::{exp_10540996611094048183, gcd_inner};
+
+/// The Goldilocks prime, `p = 2^64 - 2^32 + 1`.
+pub const P: u64 = 0xFFFF_FFFF_0000_0001;
+
+/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
+///
+/// The internal representation is not necessarily canonical — any `u64` is allowed, including values `>= p`.
+#[derive(Copy, Clone, Default, Serialize, Deserialize)]
+#[repr(transparent)]
+#[must_use]
+pub struct Goldilocks {
+    pub(crate) value: u64, // raw, possibly unreduced representative
+}
+
+impl Goldilocks {
+    /// Create a new field element from any `u64`.
+    ///
+    /// Any `u64` value is accepted. No reduction is performed since Goldilocks uses a
+    /// non-canonical internal representation.
+    #[inline]
+    pub const fn new(value: u64) -> Self {
+        Self { value }
+    }
+
+    /// Convert a `[u64; N]` array to an array of field elements (usable in `const` contexts).
+    #[inline]
+    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
+        let mut output = [Self::ZERO; N];
+        let mut i = 0;
+        while i < N { // index loop written with `while` because this is a `const fn`
+            output[i].value = input[i];
+            i += 1;
+        }
+        output
+    }
+
+    /// Convert a `[[u64; N]; M]` array to a 2D array of field elements (usable in `const` contexts).
+    #[inline]
+    pub const fn new_2d_array<const N: usize, const M: usize>(input: [[u64; N]; M]) -> [[Self; N]; M] {
+        let mut output = [[Self::ZERO; N]; M];
+        let mut i = 0;
+        while i < M { // one `new_array` call per row
+            output[i] = Self::new_array(input[i]);
+            i += 1;
+        }
+        output
+    }
+
+    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
+    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
+
+    /// Generators of the two-adic subgroups: `TWO_ADIC_GENERATORS[0] = 1`,
+    /// `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
+    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
+        0x0000000000000001,
+        0xffffffff00000000, // = p - 1, i.e. -1
+        0x0001000000000000, // = 2^48, whose square 2^96 is -1 mod p
+        0xfffffffeff000001,
+        0xefffffff00000001,
+        0x00003fffffffc000,
+        0x0000008000000000,
+        0xf80007ff08000001,
+        0xbf79143ce60ca966,
+        0x1905d02a5c411f4e,
+        0x9d8f2ad78bfed972,
+        0x0653b4801da1c8cf,
+        0xf2c35199959dfcb6,
+        0x1544ef2335d17997,
+        0xe0ee099310bba1e2,
+        0xf6b2cffe2306baac,
+        0x54df9630bf79450e,
+        0xabd0a6e8aa3d8a0e,
+        0x81281a7b05f9beac,
+        0xfbd41c6b8caa3302,
+        0x30ba2ecd5e93e76d,
+        0xf502aef532322654,
+        0x4b2a18ade67246b5,
+        0xea9d5a1336fbc98b,
+        0x86cdcc31c307e171,
+        0x4bbaf5976ecfefd8,
+        0xed41d05b78d6e286,
+        0x10d78dd8915a171d,
+        0x59049500004a4485,
+        0xdfa8c93ba46d2666,
+        0x7e9bd009b86a0845,
+        0x400a7f755588e659,
+        0x185629dcda58878c,
+    ]);
+
+    /// Powers of two from 2^0 to 2^95 (inclusive).
+    ///
+    /// Note that `2^96 = -1 mod P`, so any power of two can be derived from this table.
+    const POWERS_OF_TWO: [Self; 96] = {
+        let mut powers_of_two = [Self::ONE; 96]; // entry 0 keeps this initial `ONE`
+        let mut i = 1;
+        while i < 64 { // `2^i` still fits in a `u64` here
+            powers_of_two[i] = Self::new(1 << i);
+            i += 1;
+        }
+        let mut var = Self::new(1 << 63);
+        while i < 96 { // from `2^64` on, keep doubling through `const_add`
+            var = const_add(var, var);
+            powers_of_two[i] = var;
+            i += 1;
+        }
+        powers_of_two
+    };
+}
+
+impl PartialEq for Goldilocks {
+    fn eq(&self, other: &Self) -> bool { // compares canonical values, not the raw `value` fields
+        self.as_canonical_u64() == other.as_canonical_u64()
+    }
+}
+
+impl Eq for Goldilocks {} // marker on top of the canonical-value `PartialEq` above
+
+impl Packable for Goldilocks {} // empty impl: nothing is overridden here
+
+impl Hash for Goldilocks {
+    fn hash<H: Hasher>(&self, state: &mut H) { // hashes the canonical value, matching `PartialEq`
+        state.write_u64(self.as_canonical_u64());
+    }
+}
+
+impl Ord for Goldilocks {
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering { // integer order of the canonical values
+        self.as_canonical_u64().cmp(&other.as_canonical_u64())
+    }
+}
+
+impl PartialOrd for Goldilocks {
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> { // always `Some`, via `Ord::cmp`
+        Some(self.cmp(other))
+    }
+}
+
+impl Display for Goldilocks {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // formats the canonical `u64`
+        Display::fmt(&self.as_canonical_u64(), f)
+    }
+}
+
+impl Debug for Goldilocks {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { // formats the canonical `u64`, not the raw field
+        Debug::fmt(&self.as_canonical_u64(), f)
+    }
+}
+
+impl Distribution<Goldilocks> for StandardUniform {
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
+        loop { // rejection sampling: redraw until the value is below the field order
+            let next_u64 = rng.next_u64();
+            if next_u64 < Goldilocks::ORDER_U64 {
+                return Goldilocks::new(next_u64); // hence always a canonical representative
+            }
+        }
+    }
+}
+
+impl PrimeCharacteristicRing for Goldilocks {
+    type PrimeSubfield = Self;
+
+    const ZERO: Self = Self::new(0);
+    const ONE: Self = Self::new(1);
+    const TWO: Self = Self::new(2);
+    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
+
+    #[inline]
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        f // the prime subfield is the field itself
+    }
+
+    #[inline]
+    fn from_bool(b: bool) -> Self {
+        Self::new(b.into()) // `false` -> 0, `true` -> 1
+    }
+
+    #[inline]
+    fn halve(&self) -> Self {
+        Self::new(crate::helpers::halve_u64::<P>(self.value))
+    }
+
+    #[inline]
+    fn mul_2exp_u64(&self, exp: u64) -> Self {
+        // 2^96 = -1 mod P, 2^192 = 1 mod P.
+        match exp {
+            0 => *self,
+            1 => *self + *self,
+            _ => {
+                if exp < 96 {
+                    *self * Self::POWERS_OF_TWO[exp as usize]
+                } else if exp < 192 {
+                    -*self * Self::POWERS_OF_TWO[(exp - 96) as usize] // 2^exp = -2^(exp - 96)
+                } else {
+                    self.mul_2exp_u64(exp % 192) // the powers of two repeat with period 192
+                }
+            }
+        }
+    }
+
+    #[inline]
+    fn div_2exp_u64(&self, mut exp: u64) -> Self {
+        // 2^{-n} = 2^{192 - n} mod P.
+        exp %= 192;
+        match exp {
+            0 => *self,
+            1 => self.halve(),
+            _ => self.mul_2exp_u64(192 - exp),
+        }
+    }
+
+    #[inline]
+    fn sum_array<const N: usize>(input: &[Self]) -> Self {
+        assert_eq!(N, input.len()); // panics when the slice length differs from `N`
+        match N {
+            0 => Self::ZERO,
+            1 => input[0],
+            2 => input[0] + input[1],
+            3 => input[0] + input[1] + input[2],
+            _ => input.iter().copied().sum(),
+        }
+    }
+
+    #[inline]
+    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
+        // OFFSET (= 2^128 - (2^32 - 1)^2) has two key properties:
+        //   1. it's a multiple of P,
+        //   2. it exceeds any single product of two u64s, so it can be subtracted once the sum of two wraps.
+        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
+        const {
+            assert!((N as u32) <= (1 << 31));
+        }
+        match N {
+            0 => Self::ZERO,
+            1 => lhs[0] * rhs[0],
+            2 => {
+                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
+                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
+                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
+                let sum_corr = sum.wrapping_sub(OFFSET); // only used when the 128-bit addition wrapped
+                if over { reduce128(sum_corr) } else { reduce128(sum) }
+            }
+            _ => {
+                let (lo_plus_hi, hi) = lhs
+                    .iter()
+                    .zip(rhs)
+                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
+                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
+                        let val_hi = (val >> 96) as u64; // top 32 bits; at most 2^31 of them cannot overflow a u64
+                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
+                    });
+                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96); // sum of the low 96 bits of every product
+                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) }; // 2^96 = -1 mod P
+                reduce128(sum)
+            }
+        }
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `#[repr(transparent)]` means `Goldilocks` and `u64` share layout.
+        unsafe { flatten_to_base(vec![0u64; len]) }
+    }
+}
+
+/// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. The smallest `D > 1` with `gcd(p - 1, D) = 1` is 7.
+impl InjectiveMonomial<7> for Goldilocks {}
+
+impl PermutationMonomial<7> for Goldilocks {
+    fn injective_exp_root_n(&self) -> Self { // 7 * 10540996611094048183 = 4 * (p - 1) + 1, i.e. 7^{-1} mod (p - 1)
+        exp_10540996611094048183(*self)
+    }
+}
+
+impl RawDataSerializable for Goldilocks {
+    impl_raw_serializable_primefield64!(); // NOTE(review): body comes entirely from this `field` macro — confirm there
+}
+
+impl Field for Goldilocks {
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+    type Packing = crate::PackedGoldilocksAVX2;
+
+    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+    type Packing = crate::PackedGoldilocksAVX512; // AVX-512 takes precedence over AVX2 when both are enabled
+
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    type Packing = crate::PackedGoldilocksNeon;
+
+    #[cfg(not(any(
+        all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+        all(target_arch = "x86_64", target_feature = "avx512f"),
+        all(target_arch = "aarch64", target_feature = "neon"),
+    )))]
+    type Packing = Self; // fallback when none of the SIMD configurations above applies
+
+    const GENERATOR: Self = Self::new(7);
+
+    #[inline]
+    fn is_zero(&self) -> bool {
+        self.value == 0 || self.value == Self::ORDER_U64 // both raw values represent zero
+    }
+
+    fn try_inverse(&self) -> Option<Self> {
+        if self.is_zero() {
+            return None; // zero has no inverse
+        }
+        Some(gcd_inversion(*self))
+    }
+
+    #[inline]
+    fn order() -> BigUint {
+        P.into()
+    }
+}
+
+// Macro-generated conversions from the remaining primitive integer types. The macros are
+// defined outside this chunk; the string arguments look like range descriptions used in the
+// generated documentation — TODO confirm against the macro definitions.
+quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
+quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
+quotient_map_large_uint!(
+    Goldilocks,
+    u64,
+    Goldilocks::ORDER_U64,
+    "`[0, 2^64 - 2^32]`",
+    "`[0, 2^64 - 1]`",
+    [u128]
+);
+quotient_map_large_iint!(
+    Goldilocks,
+    i64,
+    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
+    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
+    [(i128, u128)]
+);
+
+impl QuotientMap<u64> for Goldilocks {
+    /// Wraps any `u64` by forwarding it to `Self::new`.
+    #[inline]
+    fn from_int(int: u64) -> Self {
+        Self::new(int)
+    }
+
+    /// Accepts `int` only when it is strictly below the field order.
+    #[inline]
+    fn from_canonical_checked(int: u64) -> Option<Self> {
+        if int < Self::ORDER_U64 {
+            Some(Self::new(int))
+        } else {
+            None
+        }
+    }
+
+    /// Same as `from_int`; no range check is performed.
+    #[inline(always)]
+    unsafe fn from_canonical_unchecked(int: u64) -> Self {
+        Self::new(int)
+    }
+}
+
+impl QuotientMap<i64> for Goldilocks {
+    /// Maps a signed integer into the field; negative inputs are shifted up by the order.
+    #[inline]
+    fn from_int(int: i64) -> Self {
+        let raw = match int {
+            0.. => int as u64,
+            _ => Self::ORDER_U64.wrapping_add_signed(int),
+        };
+        Self::new(raw)
+    }
+
+    /// Accepts `int` only inside the symmetric range `[-(P >> 1), P >> 1]`.
+    #[inline]
+    fn from_canonical_checked(int: i64) -> Option<Self> {
+        const POS_BOUND: i64 = (P >> 1) as i64;
+        const NEG_BOUND: i64 = -POS_BOUND;
+        if int < NEG_BOUND || int > POS_BOUND {
+            return None;
+        }
+        let raw = if int >= 0 {
+            int as u64
+        } else {
+            Self::ORDER_U64.wrapping_add_signed(int)
+        };
+        Some(Self::new(raw))
+    }
+
+    /// Same as `from_int`; no range check is performed.
+    #[inline(always)]
+    unsafe fn from_canonical_unchecked(int: i64) -> Self {
+        <Self as QuotientMap<i64>>::from_int(int)
+    }
+}
+
+impl PrimeField for Goldilocks {
+    /// Canonical representative, widened to a `BigUint`.
+    fn as_canonical_biguint(&self) -> BigUint {
+        let canonical = self.as_canonical_u64();
+        BigUint::from(canonical)
+    }
+}
+
+impl PrimeField64 for Goldilocks {
+    const ORDER_U64: u64 = P;
+
+    /// Returns the unique representative in `[0, ORDER_U64)`.
+    #[inline]
+    fn as_canonical_u64(&self) -> u64 {
+        let raw = self.value;
+        // One conditional subtraction is enough: `2 * ORDER` does not fit in a `u64`.
+        if raw >= Self::ORDER_U64 {
+            raw - Self::ORDER_U64
+        } else {
+            raw
+        }
+    }
+}
+
+impl TwoAdicField for Goldilocks {
+    const TWO_ADICITY: usize = 32;
+
+    /// Looks up the entry for `bits` in the `TWO_ADIC_GENERATORS` table.
+    ///
+    /// # Panics
+    /// Panics if `bits > Self::TWO_ADICITY`.
+    fn two_adic_generator(bits: usize) -> Self {
+        assert!(bits <= Self::TWO_ADICITY);
+        let table = &Self::TWO_ADIC_GENERATORS;
+        table[bits]
+    }
+}
+
+/// Addition usable in `const` contexts (e.g. when building constant tables).
+///
+/// Performs the same carry handling as the `Add` impl, without the optimizer hints.
+#[inline]
+const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
+    let (partial, carried) = lhs.value.overflowing_add(rhs.value);
+    // A carry out of 64 bits is compensated by adding `NEG_ORDER`.
+    let correction = if carried { Goldilocks::NEG_ORDER } else { 0 };
+    let (reduced, carried_again) = partial.overflowing_add(correction);
+    let total = if carried_again {
+        reduced + Goldilocks::NEG_ORDER
+    } else {
+        reduced
+    };
+    Goldilocks::new(total)
+}
+
+impl Add for Goldilocks {
+    type Output = Self;
+
+    /// Field addition on possibly non-canonical representatives.
+    ///
+    /// The result is congruent to `self + rhs` modulo the field order, but it is not
+    /// necessarily canonical.
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        let (sum, over) = self.value.overflowing_add(rhs.value);
+        // A carry out of 64 bits is compensated by adding `NEG_ORDER`
+        // (presumably `2^64 - ORDER`; the constant is defined outside this chunk).
+        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
+        if over {
+            // A second overflow requires both operands to exceed `ORDER`; this is necessary
+            // but not sufficient, so it is only passed to the optimizer as a hint.
+            // SAFETY: relies on the observation above holding whenever `over` is set here;
+            // `assume` is defined elsewhere — TODO confirm its contract.
+            unsafe {
+                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
+            }
+            branch_hint();
+            sum += Self::NEG_ORDER;
+        }
+        Self::new(sum)
+    }
+}
+
+impl Sub for Goldilocks {
+    type Output = Self;
+
+    /// Field subtraction on possibly non-canonical representatives.
+    ///
+    /// The result is congruent to `self - rhs` modulo the field order, but it is not
+    /// necessarily canonical.
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        let (diff, under) = self.value.overflowing_sub(rhs.value);
+        // A borrow out of 64 bits is compensated by subtracting `NEG_ORDER`
+        // (presumably `2^64 - ORDER`; the constant is defined outside this chunk).
+        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
+        if under {
+            // A second underflow requires `self.value < NEG_ORDER - 1` and `rhs.value > ORDER`;
+            // this is necessary but not sufficient, so it is only an optimizer hint.
+            // SAFETY: relies on the observation above holding whenever `under` is set here;
+            // `assume` is defined elsewhere — TODO confirm its contract.
+            unsafe {
+                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
+            }
+            branch_hint();
+            diff -= Self::NEG_ORDER;
+        }
+        Self::new(diff)
+    }
+}
+
+impl Neg for Goldilocks {
+    type Output = Self;
+
+    /// Additive inverse, computed as `ORDER - canonical(self)`.
+    ///
+    /// Negating zero stores `ORDER_U64`, the non-canonical representation of zero.
+    #[inline]
+    fn neg(self) -> Self::Output {
+        let canonical = self.as_canonical_u64();
+        Self::new(Self::ORDER_U64 - canonical)
+    }
+}
+
+impl Mul for Goldilocks {
+    type Output = Self;
+
+    /// Field multiplication: full 128-bit product followed by one `reduce128`.
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        let wide_lhs = u128::from(self.value);
+        let wide_rhs = u128::from(rhs.value);
+        reduce128(wide_lhs * wide_rhs)
+    }
+}
+
+// Macro-generated operator impls for `Goldilocks`. The macros are defined outside this chunk;
+// judging by their names they derive the assigning/related operator forms — TODO confirm.
+impl_add_assign!(Goldilocks);
+impl_sub_assign!(Goldilocks);
+impl_mul_methods!(Goldilocks);
+impl_div_methods!(Goldilocks, Goldilocks);
+
+impl Sum for Goldilocks {
+    /// Sums by accumulating the raw 64-bit values in a `u128` and reducing once at the end.
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        // Faster than `reduce` for iterators of length > 2; cannot overflow provided len < 2^64.
+        let total = iter.fold(0_u128, |acc, x| acc + u128::from(x.value));
+        reduce128(total)
+    }
+}
+
+/// Reduce to a 64-bit value. Output may be in `[0, 2^64)`, i.e. not necessarily canonical.
+///
+/// Splits `x` as `x_lo + 2^64 * x_hi_lo + 2^96 * x_hi_hi`. For `p = 2^64 - 2^32 + 1` we have
+/// `2^96 = -1 (mod p)` and `2^64 = 2^32 - 1 (mod p)`, so the result is
+/// `x_lo - x_hi_hi + x_hi_lo * (2^32 - 1)`.
+/// NOTE(review): this reading assumes `NEG_ORDER = 2^32 - 1` (defined outside this chunk).
+#[inline]
+pub(crate) fn reduce128(x: u128) -> Goldilocks {
+    let (x_lo, x_hi) = split(x);
+    // Upper and lower 32-bit halves of the high word.
+    let x_hi_hi = x_hi >> 32;
+    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
+
+    // `x_lo - x_hi_hi`, compensating for a borrow out of 64 bits.
+    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
+    if borrow {
+        branch_hint();
+        t0 -= Goldilocks::NEG_ORDER;
+    }
+    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
+    // SAFETY: see the `# Safety` section of `add_no_canonicalize_trashing_input`; `t0` and
+    // `t1` are not reused afterwards.
+    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
+    Goldilocks::new(t2)
+}
+
+/// Splits a `u128` into its `(low, high)` 64-bit halves.
+#[inline]
+#[allow(clippy::cast_possible_truncation)]
+const fn split(x: u128) -> (u64, u64) {
+    let low = x as u64;
+    let high = (x >> 64) as u64;
+    (low, high)
+}
+
+/// Fast addition modulo `ORDER` on x86-64, using CF/SBB to pick the adjustment branchlessly.
+///
+/// # Safety
+/// - Only correct if `x + y < 2^64 + ORDER = 0x1_FFFF_FFFF_0000_0001`.
+/// - Overwrites both inputs in registers on x86; avoid reusing them.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    unsafe {
+        let res_wrapped: u64;
+        let adjustment: u64;
+        core::arch::asm!(
+            // `{0} += {1}`; the carry flag is set iff the 64-bit addition overflowed.
+            "add {0}, {1}",
+            // `sbb r, r` computes `r - r - CF`, i.e. 0 or all-ones in the 32-bit register;
+            // a 32-bit write zero-extends, so `{1}` ends up as 0 or `0xFFFF_FFFF`.
+            "sbb {1:e}, {1:e}",
+            inlateout(reg) x => res_wrapped,
+            inlateout(reg) y => adjustment,
+            options(pure, nomem, nostack),
+        );
+        // Optimizer hints: adding zero cannot overflow, so the sum equals the other operand.
+        assume(x != 0 || (res_wrapped == y && adjustment == 0));
+        assume(y != 0 || (res_wrapped == x && adjustment == 0));
+        res_wrapped + adjustment
+    }
+}
+
+/// Portable fallback for non-x86-64 targets: wrapping add, then add `NEG_ORDER` on carry.
+///
+/// # Safety
+/// Same contract as the x86-64 variant.
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+    match x.overflowing_add(y) {
+        (wrapped, true) => wrapped + Goldilocks::NEG_ORDER,
+        (wrapped, false) => wrapped,
+    }
+}
+
+/// Binary-GCD inversion for Goldilocks.
+///
+/// Follows the "update factor" approach of https://eprint.iacr.org/2020/972.pdf: the factors
+/// are computed up to a known power of two, and a final linear combination corrects for it.
+fn gcd_inversion(input: Goldilocks) -> Goldilocks {
+    // Initially `len(a) + len(b) <= 128`, so 126 iterations bring it down to <= 2.
+    // They are run as two batches of 63.
+    const HALF_ROUNDS: usize = 63;
+
+    let mut a = input.value;
+    let mut b = P;
+
+    let first = gcd_inner::<HALF_ROUNDS>(&mut a, &mut b);
+    let second = gcd_inner::<HALF_ROUNDS>(&mut a, &mut b);
+    let (f00, f10) = (first.0, first.2);
+    let (f11, g11) = (second.2, second.3);
+
+    // `gcd_inner` outputs lie in `(-2^HALF_ROUNDS, 2^HALF_ROUNDS]`, so an `i64` holding
+    // `-2^63` must be read as `2^63`; `from_unusual_int` does exactly that.
+    let [u, v, u_fac11, v_fac11] = [f00, f10, f11, g11].map(from_unusual_int);
+
+    // Every iteration contributed a factor of 2, so the result is off by `2^126`.
+    // Since `2^192 = 1 mod P`, multiplying by `2^66` (192 - 126) undoes it.
+    let combined = u * u_fac11 + v * v_fac11;
+    combined.mul_2exp_u64(66)
+}
+
+/// Maps an `i64` into Goldilocks, reading `i64::MIN` as `+2^63` rather than `-2^63`.
+const fn from_unusual_int(int: i64) -> Goldilocks {
+    // Only genuinely negative inputs are shifted up by the field order.
+    let is_true_negative = int < 0 && int != i64::MIN;
+    let raw = if is_true_negative {
+        Goldilocks::ORDER_U64.wrapping_add_signed(int)
+    } else {
+        int as u64
+    };
+    Goldilocks::new(raw)
+}
+
+// Dead-code helper whose only effect is to reference `array::from_fn`.
+// NOTE(review): presumably kept so the `array` import is not reported as unused — confirm.
+#[allow(dead_code)]
+fn _unused_array_touch() {
+    let _: [u8; 0] = array::from_fn(|_| 0);
+}
diff --git a/crates/backend/goldilocks/src/helpers.rs b/crates/backend/goldilocks/src/helpers.rs
new file mode 100644
index 000000000..65b08b104
--- /dev/null
+++ b/crates/backend/goldilocks/src/helpers.rs
@@ -0,0 +1,73 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Helpers ported from `p3_util` and `p3_field::exponentiation`, scoped to what the
+//! Goldilocks field implementation needs.
+
+use field::PrimeCharacteristicRing;
+
+/// Halve an element `x` of the 64-bit prime field `F_P`, i.e. return `x / 2`.
+///
+/// Even inputs are shifted right; odd inputs additionally get `(P + 1) / 2` added.
+#[inline]
+#[must_use]
+pub const fn halve_u64<const P: u64>(x: u64) -> u64 {
+    let odd_offset = (P + 1) >> 1;
+    let floor_half = x >> 1;
+    match x & 1 {
+        0 => floor_half,
+        _ => floor_half + odd_offset,
+    }
+}
+
+/// Inner loop of the binary-GCD inversion used by Goldilocks.
+///
+/// Runs `NUM_ROUNDS` halving steps on `a` and `b` in place and returns the update factors
+/// `(f0, g0, f1, g1)` accumulated along the way. Background:
+/// https://eprint.iacr.org/2020/972.pdf.
+#[inline]
+pub const fn gcd_inner<const NUM_ROUNDS: usize>(a: &mut u64, b: &mut u64) -> (i64, i64, i64, i64) {
+    let mut f0: i64 = 1;
+    let mut g0: i64 = 0;
+    let mut f1: i64 = 0;
+    let mut g1: i64 = 1;
+
+    // A `while` loop keeps the function usable in `const` contexts.
+    let mut rounds_left = NUM_ROUNDS;
+    while rounds_left != 0 {
+        if *a & 1 != 0 {
+            // Odd `a`: make sure `a >= b`, then subtract so that `a` becomes even.
+            if *a < *b {
+                core::mem::swap(a, b);
+                core::mem::swap(&mut f0, &mut f1);
+                core::mem::swap(&mut g0, &mut g1);
+            }
+            *a -= *b;
+            f0 -= f1;
+            g0 -= g1;
+        }
+        // `a` is even here; halve it and double the second pair of factors.
+        *a >>= 1;
+        f1 <<= 1;
+        g1 <<= 1;
+
+        rounds_left -= 1;
+    }
+
+    (f0, g0, f1, g1)
+}
+
+/// Compute `x -> x^{10540996611094048183}` using a custom addition chain.
+///
+/// This map computes the seventh root of `x` if `x` is a member of the `Goldilocks` field.
+/// It follows from: `7 * 10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`.
+///
+/// With `A = 2^32 + 3` the exponent equals `16 * 153391689 * A + 7`. The chain builds the
+/// multiples `A, 9A, 585A, 2396745A, 153391689A` of `A` in the exponent and finishes with
+/// four squarings and a multiplication by `x^7` (63 squarings and 8 multiplications in total,
+/// assuming `exp_power_of_2(k)` performs `k` successive squarings).
+#[must_use]
+pub fn exp_10540996611094048183<R: PrimeCharacteristicRing>(val: R) -> R {
+    let p1 = val;
+    let p10 = p1.square(); // x^2
+    let p11 = p10 * p1; // x^3
+    let p100 = p10.square(); // x^4
+    let p111 = p100 * p11; // x^7
+    let p1_30 = p100.exp_power_of_2(30); // x^(2^32)
+    let p1_30_11 = p1_30 * p11; // x^A
+    let p1_30_11_000 = p1_30_11.exp_power_of_2(3); // x^(8A)
+    let p1_30_11_011 = p1_30_11_000 * p1_30_11; // x^(9A)
+    let p1_30_11_011_000000 = p1_30_11_011.exp_power_of_2(6); // x^(576A)
+    let p_chunk12 = p1_30_11_011_000000 * p1_30_11_011; // x^(585A)
+    let p_chunk12_000000000000 = p_chunk12.exp_power_of_2(12); // x^(585 * 4096 * A)
+    let p_chunk24 = p_chunk12_000000000000 * p_chunk12; // x^(2396745A)
+    let p_chunk24_000000 = p_chunk24.exp_power_of_2(6); // x^(153391680A)
+    // The multiplier must be x^(9A), not x^A: 153391680 + 9 = 153391689 continues the
+    // `1001001...` bit pattern that the target exponent requires.
+    let p_chunk30 = p_chunk24_000000 * p1_30_11_011; // x^(153391689A)
+    let p_chunk30_0000 = p_chunk30.exp_power_of_2(4); // x^(16 * 153391689 * A)
+    p_chunk30_0000 * p111 // x^(16 * 153391689 * A + 7)
+}
diff --git a/crates/backend/goldilocks/src/lib.rs b/crates/backend/goldilocks/src/lib.rs
new file mode 100644
index 000000000..f8e3e5781
--- /dev/null
+++ b/crates/backend/goldilocks/src/lib.rs
@@ -0,0 +1,37 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! The Goldilocks prime field `F_p` where `p = 2^64 - 2^32 + 1`, and a degree-3 extension.
+//!
+//! This is a port of `plonky3/goldilocks/` adapted to the in-tree `field` trait crate.
+
+extern crate alloc;
+
+mod cubic_extension;
+mod goldilocks;
+mod helpers;
+mod packed_cubic_extension;
+mod poseidon1;
+
+#[cfg(test)]
+mod benchmark_poseidons_goldilocks;
+
+pub use cubic_extension::*;
+pub use goldilocks::*;
+pub use helpers::*;
+pub use packed_cubic_extension::*;
+pub use poseidon1::*;
+
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+mod aarch64_neon;
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+pub use aarch64_neon::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+mod x86_64_avx2;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+pub use x86_64_avx2::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+mod x86_64_avx512;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub use x86_64_avx512::*;
diff --git a/crates/backend/goldilocks/src/packed_cubic_extension.rs b/crates/backend/goldilocks/src/packed_cubic_extension.rs
new file mode 100644
index 000000000..57827fd76
--- /dev/null
+++ b/crates/backend/goldilocks/src/packed_cubic_extension.rs
@@ -0,0 +1,377 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Packed (SIMD) version of the cubic extension `F_p[X] / (X^3 - X - 1)`.
+//!
+//! Mirrors `koala-bear`'s `PackedQuinticExtensionField` shape: a SoA array of
+//! `[PF; 3]` packed-base-field lanes, so each field operation is a SIMD
+//! multiply/add over `PF::WIDTH` extension elements at once.
+
+use alloc::vec::Vec;
+use core::array;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::{
+    Algebra, BasedVectorSpace, Field, PackedField, PackedFieldExtension, PackedValue, Powers, PrimeCharacteristicRing,
+    field_to_array,
+};
+use itertools::Itertools;
+use rand::distr::{Distribution, StandardUniform};
+use serde::{Deserialize, Serialize};
+use utils::{flatten_to_base, reconstitute_from_base};
+
+use crate::Goldilocks;
+use crate::cubic_extension::{CubicExtensionFieldGL, cubic_mul_generic, cubic_square_generic};
+
+/// Extension degree: the number of base-field coefficients stored per extension element.
+const D: usize = 3;
+
+/// Packed cubic extension over `Goldilocks`, parameterized by base field packing `PF`.
+///
+/// `value[i]` holds coefficient `i` of every packed extension element: lane `j` of
+/// `value[i]` is coefficient `i` of the `j`-th element (see `from_ext_slice`).
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize, Deserialize, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct PackedCubicExtensionFieldGL<PF: PackedField<Scalar = Goldilocks>> {
+    // Serialized through `utils::array_serialization`, with explicit serde bounds on `PF`.
+    #[serde(
+        with = "utils::array_serialization",
+        bound(serialize = "PF: Serialize", deserialize = "PF: Deserialize<'de>")
+    )]
+    pub(crate) value: [PF; D],
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> PackedCubicExtensionFieldGL<PF> {
+    /// Wraps an array of `D` packed coefficients without any further processing.
+    const fn new(value: [PF; D]) -> Self {
+        Self { value }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Default for PackedCubicExtensionFieldGL<PF> {
+    /// The default value has every coefficient set to `PF::ZERO`.
+    #[inline]
+    fn default() -> Self {
+        Self::new([PF::ZERO; D])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> From<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    /// Converts each scalar coefficient into `PF` individually.
+    #[inline]
+    fn from(x: CubicExtensionFieldGL) -> Self {
+        let [c0, c1, c2] = x.value;
+        Self::new([c0.into(), c1.into(), c2.into()])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> From<PF> for PackedCubicExtensionFieldGL<PF> {
+    /// Lifts a packed base-field value via `field_to_array`.
+    #[inline]
+    fn from(x: PF) -> Self {
+        Self::new(field_to_array(x))
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Distribution<PackedCubicExtensionFieldGL<PF>> for StandardUniform
+where
+    Self: Distribution<PF>,
+{
+    /// Samples each of the `D` packed coefficients independently, in index order.
+    #[inline]
+    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> PackedCubicExtensionFieldGL<PF> {
+        let coefficients: [PF; D] = array::from_fn(|_| Distribution::<PF>::sample(self, rng));
+        PackedCubicExtensionFieldGL { value: coefficients }
+    }
+}
+
+// Marker impls: the packed extension is an algebra over both the scalar extension field
+// and the packed base field. Both impls are empty.
+impl<PF: PackedField<Scalar = Goldilocks>> Algebra<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Algebra<PF> for PackedCubicExtensionFieldGL<PF> {}
+
+impl<PF: PackedField<Scalar = Goldilocks>> PrimeCharacteristicRing for PackedCubicExtensionFieldGL<PF> {
+    type PrimeSubfield = PF::PrimeSubfield;
+
+    /// Every coefficient is `PF::ZERO`.
+    const ZERO: Self = Self::new([PF::ZERO; D]);
+
+    // The remaining constants lift the matching `PF` constant through `field_to_array`.
+    const ONE: Self = Self::new(field_to_array(PF::ONE));
+
+    const TWO: Self = Self::new(field_to_array(PF::TWO));
+
+    const NEG_ONE: Self = Self::new(field_to_array(PF::NEG_ONE));
+
+    /// Converts in `PF` first, then lifts the result into the extension.
+    #[inline]
+    fn from_prime_subfield(val: Self::PrimeSubfield) -> Self {
+        let base = PF::from_prime_subfield(val);
+        Self::from(base)
+    }
+
+    /// Converts in `PF` first, then lifts the result into the extension.
+    #[inline]
+    fn from_bool(b: bool) -> Self {
+        let base = PF::from_bool(b);
+        Self::from(base)
+    }
+
+    /// Squares via `cubic_square_generic`, writing into a zero-initialized buffer.
+    #[inline(always)]
+    fn square(&self) -> Self {
+        let mut squared = [PF::ZERO; D];
+        cubic_square_generic(&self.value, &mut squared);
+        Self::new(squared)
+    }
+
+    /// Allocates `len * D` base zeros and regroups them into `len` extension elements.
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        let base_zeros = PF::zero_vec(len * D);
+        // SAFETY: this is a repr(transparent) wrapper around an array.
+        unsafe { reconstitute_from_base(base_zeros) }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> BasedVectorSpace<PF> for PackedCubicExtensionFieldGL<PF> {
+    const DIMENSION: usize = D;
+
+    /// Borrows the `D` packed coefficients.
+    #[inline]
+    fn as_basis_coefficients_slice(&self) -> &[PF] {
+        self.value.as_slice()
+    }
+
+    /// Builds an element by calling `f` for indices `0..D` in order.
+    #[inline]
+    fn from_basis_coefficients_fn<Fn: FnMut(usize) -> PF>(f: Fn) -> Self {
+        Self::new(array::from_fn(f))
+    }
+
+    /// Returns `None` unless the iterator reports exactly `D` items.
+    #[inline]
+    fn from_basis_coefficients_iter<I: ExactSizeIterator<Item = PF>>(mut iter: I) -> Option<Self> {
+        if iter.len() != D {
+            return None;
+        }
+        Some(Self::new(array::from_fn(|_| iter.next().unwrap())))
+    }
+
+    #[inline]
+    fn flatten_to_base(vec: Vec<Self>) -> Vec<PF> {
+        let extension_elements = vec;
+        // SAFETY: `Self` is `repr(transparent)` over `[PF; D]`.
+        unsafe { flatten_to_base(extension_elements) }
+    }
+
+    #[inline]
+    fn reconstitute_from_base(vec: Vec<PF>) -> Vec<Self> {
+        let base_elements = vec;
+        // SAFETY: `Self` is `repr(transparent)` over `[PF; D]`.
+        unsafe { reconstitute_from_base(base_elements) }
+    }
+}
+
+impl PackedFieldExtension<Goldilocks, CubicExtensionFieldGL>
+    for PackedCubicExtensionFieldGL<<Goldilocks as Field>::Packing>
+{
+    /// Transposes `WIDTH` extension elements into `D` packed coefficient vectors.
+    ///
+    /// # Panics
+    /// Panics if `ext_slice.len()` differs from the packing width.
+    #[inline]
+    fn from_ext_slice(ext_slice: &[CubicExtensionFieldGL]) -> Self {
+        let width = <Goldilocks as Field>::Packing::WIDTH;
+        assert_eq!(ext_slice.len(), width);
+
+        Self {
+            value: array::from_fn(|coeff| {
+                <Goldilocks as Field>::Packing::from_fn(|lane| ext_slice[lane].value[coeff])
+            }),
+        }
+    }
+
+    /// Unpacks each packed element into its `WIDTH` scalar extension elements, lane by lane.
+    #[inline]
+    fn to_ext_iter(iter: impl IntoIterator<Item = Self>) -> impl Iterator<Item = CubicExtensionFieldGL> {
+        let width = <Goldilocks as Field>::Packing::WIDTH;
+        iter.into_iter().flat_map(move |packed| {
+            (0..width).map(move |lane| {
+                CubicExtensionFieldGL::new(array::from_fn(|coeff| packed.value[coeff].as_slice()[lane]))
+            })
+        })
+    }
+
+    /// Packed power iterator: starts at `[base^0, .., base^(WIDTH-1)]` and steps by `base^WIDTH`.
+    #[inline]
+    fn packed_ext_powers(base: CubicExtensionFieldGL) -> Powers<Self> {
+        let width = <Goldilocks as Field>::Packing::WIDTH;
+        let powers = base.powers().take(width + 1).collect_vec();
+        let (first_powers, step) = (&powers[..width], powers[width]);
+
+        Powers {
+            base: Self::from(step),
+            current: Self::from_ext_slice(first_powers),
+        }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Neg for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Negates every coefficient.
+    #[inline]
+    fn neg(self) -> Self {
+        let [c0, c1, c2] = self.value;
+        Self::new([-c0, -c1, -c2])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Add for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Coefficient-wise addition.
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        let [a0, a1, a2] = self.value;
+        let [b0, b1, b2] = rhs.value;
+        Self::new([a0 + b0, a1 + b1, a2 + b2])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Add<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Adds the scalar coefficients of `rhs` to the matching packed coefficients.
+    #[inline]
+    fn add(self, rhs: CubicExtensionFieldGL) -> Self {
+        let [a0, a1, a2] = self.value;
+        let [b0, b1, b2] = rhs.value;
+        Self::new([a0 + b0, a1 + b1, a2 + b2])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Add<PF> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Adds a packed base-field value; only coefficient 0 changes.
+    #[inline]
+    fn add(self, rhs: PF) -> Self {
+        let mut coefficients = self.value;
+        coefficients[0] += rhs;
+        Self::new(coefficients)
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> AddAssign for PackedCubicExtensionFieldGL<PF> {
+    /// In-place coefficient-wise addition.
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        for (coefficient, addend) in self.value.iter_mut().zip(rhs.value) {
+            *coefficient += addend;
+        }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> AddAssign<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place addition of a scalar extension element, coefficient by coefficient.
+    #[inline]
+    fn add_assign(&mut self, rhs: CubicExtensionFieldGL) {
+        for (coefficient, addend) in self.value.iter_mut().zip(rhs.value) {
+            *coefficient += addend;
+        }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> AddAssign<PF> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place addition of a packed base-field value; only coefficient 0 changes.
+    #[inline]
+    fn add_assign(&mut self, rhs: PF) {
+        let [constant_term, ..] = &mut self.value;
+        *constant_term += rhs;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Sum for PackedCubicExtensionFieldGL<PF> {
+    /// Adds the items left to right; an empty iterator yields `Self::ZERO`.
+    #[inline]
+    fn sum<I: Iterator<Item = Self>>(mut iter: I) -> Self {
+        match iter.next() {
+            Some(first) => iter.fold(first, |acc, x| acc + x),
+            None => Self::ZERO,
+        }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Sub for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Coefficient-wise subtraction.
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        let [a0, a1, a2] = self.value;
+        let [b0, b1, b2] = rhs.value;
+        Self::new([a0 - b0, a1 - b1, a2 - b2])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Sub<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Subtracts the scalar coefficients of `rhs` from the matching packed coefficients.
+    #[inline]
+    fn sub(self, rhs: CubicExtensionFieldGL) -> Self {
+        let [a0, a1, a2] = self.value;
+        let [b0, b1, b2] = rhs.value;
+        Self::new([a0 - b0, a1 - b1, a2 - b2])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Sub<PF> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Subtracts a packed base-field value; only coefficient 0 changes.
+    #[inline]
+    fn sub(self, rhs: PF) -> Self {
+        let mut difference = self;
+        difference.value[0] -= rhs;
+        difference
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> SubAssign for PackedCubicExtensionFieldGL<PF> {
+    /// In-place subtraction, expressed through the by-value `Sub` impl.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Self) {
+        let difference = *self - rhs;
+        *self = difference;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> SubAssign<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place subtraction of a scalar extension element, via the by-value `Sub` impl.
+    #[inline]
+    fn sub_assign(&mut self, rhs: CubicExtensionFieldGL) {
+        let difference = *self - rhs;
+        *self = difference;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> SubAssign<PF> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place subtraction of a packed base-field value, via the by-value `Sub` impl.
+    #[inline]
+    fn sub_assign(&mut self, rhs: PF) {
+        let difference = *self - rhs;
+        *self = difference;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Mul for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Extension-field product, computed by `cubic_mul_generic` into a zeroed buffer.
+    #[inline(always)]
+    fn mul(self, rhs: Self) -> Self {
+        let mut product = [PF::ZERO; D];
+        cubic_mul_generic(&self.value, &rhs.value, &mut product);
+        Self::new(product)
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Mul<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Converts the scalar coefficients of `rhs` into `PF`, then multiplies with
+    /// `cubic_mul_generic`.
+    #[inline(always)]
+    fn mul(self, rhs: CubicExtensionFieldGL) -> Self {
+        let [r0, r1, r2] = rhs.value;
+        let rhs_packed: [PF; D] = [r0.into(), r1.into(), r2.into()];
+        let mut product = [PF::ZERO; D];
+        cubic_mul_generic(&self.value, &rhs_packed, &mut product);
+        Self::new(product)
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Mul<PF> for PackedCubicExtensionFieldGL<PF> {
+    type Output = Self;
+    /// Scales every coefficient by the packed base-field value `rhs`.
+    #[inline]
+    fn mul(self, rhs: PF) -> Self {
+        let [c0, c1, c2] = self.value;
+        Self::new([c0 * rhs, c1 * rhs, c2 * rhs])
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> Product for PackedCubicExtensionFieldGL<PF> {
+    /// Multiplies the items left to right; an empty iterator yields `Self::ONE`.
+    #[inline]
+    fn product<I: Iterator<Item = Self>>(mut iter: I) -> Self {
+        match iter.next() {
+            Some(first) => iter.fold(first, |acc, x| acc * x),
+            None => Self::ONE,
+        }
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> MulAssign for PackedCubicExtensionFieldGL<PF> {
+    /// In-place multiplication, expressed through the by-value `Mul` impl.
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: Self) {
+        let product = *self * rhs;
+        *self = product;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> MulAssign<CubicExtensionFieldGL> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place multiplication by a scalar extension element, via the by-value `Mul` impl.
+    #[inline(always)]
+    fn mul_assign(&mut self, rhs: CubicExtensionFieldGL) {
+        let product = *self * rhs;
+        *self = product;
+    }
+}
+
+impl<PF: PackedField<Scalar = Goldilocks>> MulAssign<PF> for PackedCubicExtensionFieldGL<PF> {
+    /// In-place scaling by a packed base-field value, via the by-value `Mul` impl.
+    #[inline]
+    fn mul_assign(&mut self, rhs: PF) {
+        let product = *self * rhs;
+        *self = product;
+    }
+}
diff --git a/crates/backend/goldilocks/src/poseidon1.rs b/crates/backend/goldilocks/src/poseidon1.rs
new file mode 100644
index 000000000..43b03cf77
--- /dev/null
+++ b/crates/backend/goldilocks/src/poseidon1.rs
@@ -0,0 +1,1008 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Scalar Poseidon1 permutation at width 8 for Goldilocks.
+//!
+//! Parameters:
+//! - S-box `x^7` (smallest `d` with `gcd(d, p - 1) = 1` for Goldilocks)
+//! - `R_F = 8` full rounds (4 initial + 4 terminal)
+//! - `R_P = 22` partial rounds in the middle
+//! - External MDS is the circulant matrix with first row `[7, 1, 3, 8, 8, 3, 4, 9]`
+//!   (Plonky2/upstream-Plonky3 "small MDS" — same matrix the upstream
+//!   `MdsMatrixGoldilocks` uses at width 8).
+//!
+//! The permutation is generic over any algebra `R` over `Goldilocks` that also
+//! implements `InjectiveMonomial<7>`, mirroring the koala-bear crate's
+//! Poseidon1 surface.
+
+#[cfg(any(
+    test,
+    not(any(
+        all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+        all(target_arch = "x86_64", target_feature = "avx512f"),
+    )),
+))]
+use field::PackedValue;
+use field::{Algebra, Field, InjectiveMonomial, PrimeCharacteristicRing};
+
+use crate::Goldilocks;
+
+/// Permutation state width (number of field elements).
+pub const POSEIDON1_WIDTH: usize = 8;
+/// Half of the full rounds: 4 initial and 4 terminal, i.e. `R_F = 8` in total.
+pub const POSEIDON1_HALF_FULL_ROUNDS: usize = 4;
+/// Number of partial rounds (`R_P`).
+pub const POSEIDON1_PARTIAL_ROUNDS: usize = 22;
+/// S-box exponent: the S-box is `x^7`.
+pub const POSEIDON1_SBOX_DEGREE: u64 = 7;
+// NOTE(review): presumably the number of state elements emitted as a digest — confirm at the
+// call sites, which are outside this chunk.
+pub const POSEIDON1_DIGEST_LEN: usize = 4;
+
+/// Total number of rounds: `R_F + R_P = 2 * 4 + 22 = 30`.
+pub const POSEIDON1_N_ROUNDS: usize = 2 * POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS;
+
+// =========================================================================
+// MDS matrix (circulant, width 8)
+// =========================================================================
+//
+// First row of the circulant MDS matrix. `MDS8_COL[i] = r_{(N - i) mod N}` is
+// the first column. Row `i` of the matrix is the first row rotated right by
+// `i` positions, i.e. `M[i][j] = ROW[(j - i) mod N]`
+// (equivalently `COL[(i - j + N) mod N]`).
+pub const MDS8_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
+
+/// Multiply `state` in place by the width-8 circulant MDS matrix, for any algebra `R`
+/// over `Goldilocks`.
+///
+/// All matrix entries are small integers (at most 9), so a plain multiply in `R` is used
+/// with no delayed reduction.
+#[inline]
+fn mds_mul_generic<R: Algebra<Goldilocks>>(state: &mut [R; 8]) {
+    // Lift the integer row into `Goldilocks` once, up front.
+    let mut coeffs = [Goldilocks::ZERO; 8];
+    for (slot, &entry) in coeffs.iter_mut().zip(MDS8_ROW.iter()) {
+        *slot = Goldilocks::new(entry as u64);
+    }
+
+    // Work from a snapshot so every output row sees the original inputs.
+    let snapshot = *state;
+    for (i, out) in state.iter_mut().enumerate() {
+        // `row_i · input = sum_j ROW[(j - i) mod 8] · input[j]`
+        let mut acc = snapshot[0] * coeffs[(8 - i) % 8];
+        for (j, &x) in snapshot.iter().enumerate().skip(1) {
+            acc += x * coeffs[(j + 8 - i) % 8];
+        }
+        *out = acc;
+    }
+}
+
+/// Specialized fast MDS for the concrete `Goldilocks` scalar.
+///
+/// Each output is a dot product `sum_j MDS_ROW[(j-i) mod 8] * state[j]` with
+/// MDS coefficients in `{1, 3, 4, 7, 8, 9}` (all fit in 4 bits). With the
+/// constants spelled out explicitly LLVM strength-reduces `c * s` to shifts
+/// and adds (e.g. `8*s = s<<3`, `7*s = (s<<3)-s`), eliminating the variable
+/// multiplications entirely. We accumulate into `u128` (8·9·2^64 ≈ 2^71 fits
+/// comfortably) and reduce once per output via `reduce128`. The explicit
+/// `1 *` factors keep the circulant structure readable column-by-column.
+#[inline(always)]
+#[allow(clippy::identity_op)]
+fn mds_mul_scalar(state: &mut [Goldilocks; 8]) {
+    // Widen the raw (possibly non-canonical) 64-bit values so the sums below cannot wrap.
+    let s0 = state[0].value as u128;
+    let s1 = state[1].value as u128;
+    let s2 = state[2].value as u128;
+    let s3 = state[3].value as u128;
+    let s4 = state[4].value as u128;
+    let s5 = state[5].value as u128;
+    let s6 = state[6].value as u128;
+    let s7 = state[7].value as u128;
+
+    // MDS_ROW = [7, 1, 3, 8, 8, 3, 4, 9]; row i is MDS_ROW rotated right by i.
+    // The coefficients of every row sum to 43, so each accumulator stays below 43 * 2^64.
+    let acc0 = 7 * s0 + 1 * s1 + 3 * s2 + 8 * s3 + 8 * s4 + 3 * s5 + 4 * s6 + 9 * s7;
+    let acc1 = 9 * s0 + 7 * s1 + 1 * s2 + 3 * s3 + 8 * s4 + 8 * s5 + 3 * s6 + 4 * s7;
+    let acc2 = 4 * s0 + 9 * s1 + 7 * s2 + 1 * s3 + 3 * s4 + 8 * s5 + 8 * s6 + 3 * s7;
+    let acc3 = 3 * s0 + 4 * s1 + 9 * s2 + 7 * s3 + 1 * s4 + 3 * s5 + 8 * s6 + 8 * s7;
+    let acc4 = 8 * s0 + 3 * s1 + 4 * s2 + 9 * s3 + 7 * s4 + 1 * s5 + 3 * s6 + 8 * s7;
+    let acc5 = 8 * s0 + 8 * s1 + 3 * s2 + 4 * s3 + 9 * s4 + 7 * s5 + 1 * s6 + 3 * s7;
+    let acc6 = 3 * s0 + 8 * s1 + 8 * s2 + 3 * s3 + 4 * s4 + 9 * s5 + 7 * s6 + 1 * s7;
+    let acc7 = 1 * s0 + 3 * s1 + 8 * s2 + 8 * s3 + 3 * s4 + 4 * s5 + 9 * s6 + 7 * s7;
+
+    // One reduction per output; results may be non-canonical (see `reduce128`).
+    state[0] = crate::goldilocks::reduce128(acc0);
+    state[1] = crate::goldilocks::reduce128(acc1);
+    state[2] = crate::goldilocks::reduce128(acc2);
+    state[3] = crate::goldilocks::reduce128(acc3);
+    state[4] = crate::goldilocks::reduce128(acc4);
+    state[5] = crate::goldilocks::reduce128(acc5);
+    state[6] = crate::goldilocks::reduce128(acc6);
+    state[7] = crate::goldilocks::reduce128(acc7);
+}
+
+// =========================================================================
+// Round constants (width 8)
+// =========================================================================
+//
+// Layout: [4 initial full][22 partial][4 terminal full].
+// Generated by the Grain LFSR (Poseidon1, Appendix E) with
+// `field_type = 1, alpha = 7, n = 64, t = 8, R_F = 8, R_P = 22`.
+// Values carried over verbatim from `plonky3/goldilocks/src/poseidon1.rs`.
+pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; POSEIDON1_WIDTH]; POSEIDON1_N_ROUNDS] = Goldilocks::new_2d_array([
+    // ---- Initial full rounds (4): rows 0..4, S-box on every state element ----
+    [
+        0xdd5743e7f2a5a5d9,
+        0xcb3a864e58ada44b,
+        0xffa2449ed32f8cdc,
+        0x42025f65d6bd13ee,
+        0x7889175e25506323,
+        0x34b98bb03d24b737,
+        0xbdcc535ecc4faa2a,
+        0x5b20ad869fc0d033,
+    ],
+    [
+        0xf1dda5b9259dfcb4,
+        0x27515210be112d59,
+        0x4227d1718c766c3f,
+        0x26d333161a5bd794,
+        0x49b938957bf4b026,
+        0x4a56b5938b213669,
+        0x1120426b48c8353d,
+        0x6b323c3f10a56cad,
+    ],
+    [
+        0xce57d6245ddca6b2,
+        0xb1fc8d402bba1eb1,
+        0xb5c5096ca959bd04,
+        0x6db55cd306d31f7f,
+        0xc49d293a81cb9641,
+        0x1ce55a4fe979719f,
+        0xa92e60a9d178a4d1,
+        0x002cc64973bcfd8c,
+    ],
+    [
+        0xcea721cce82fb11b,
+        0xe5b55eb8098ece81,
+        0x4e30525c6f1ddd66,
+        0x43c6702827070987,
+        0xaca68430a7b5762a,
+        0x3674238634df9c93,
+        0x88cee1c825e33433,
+        0xde99ae8d74b57176,
+    ],
+    // ---- Partial rounds (22): rows 4..26, S-box on state element 0 only ----
+    [
+        0x488897d85ff51f56,
+        0x1140737ccb162218,
+        0xa7eeb9215866ed35,
+        0x9bd2976fee49fcc9,
+        0xc0c8f0de580a3fcc,
+        0x4fb2dae6ee8fc793,
+        0x343a89f35f37395b,
+        0x223b525a77ca72c8,
+    ],
+    [
+        0x56ccb62574aaa918,
+        0xc4d507d8027af9ed,
+        0xa080673cf0b7e95c,
+        0xf0184884eb70dcf8,
+        0x044f10b0cb3d5c69,
+        0xe9e3f7993938f186,
+        0x1b761c80e772f459,
+        0x606cec607a1b5fac,
+    ],
+    [
+        0x14a0c2e1d45f03cd,
+        0x4eace8855398574f,
+        0xf905ca7103eff3e6,
+        0xf8c8f8d20862c059,
+        0xb524fe8bdd678e5a,
+        0xfbb7865901a1ec41,
+        0x014ef1197d341346,
+        0x9725e20825d07394,
+    ],
+    [
+        0xfdb25aef2c5bae3b,
+        0xbe5402dc598c971e,
+        0x93a5711f04cdca3d,
+        0xc45a9a5b2f8fb97b,
+        0xfe8946a924933545,
+        0x2af997a27369091c,
+        0xaa62c88e0b294011,
+        0x058eb9d810ce9f74,
+    ],
+    [
+        0xb3cb23eced349ae4,
+        0xa3648177a77b4a84,
+        0x43153d905992d95d,
+        0xf4e2a97cda44aa4b,
+        0x5baa2702b908682f,
+        0x082923bdf4f750d1,
+        0x98ae09a325893803,
+        0xf8a6475077968838,
+    ],
+    [
+        0xceb0735bf00b2c5f,
+        0x0a1a5d953888e072,
+        0x2fcb190489f94475,
+        0xb5be06270dec69fc,
+        0x739cb934b09acf8b,
+        0x537750b75ec7f25b,
+        0xe9dd318bae1f3961,
+        0xf7462137299efe1a,
+    ],
+    [
+        0xb1f6b8eee9adb940,
+        0xbdebcc8a809dfe6b,
+        0x40fc1f791b178113,
+        0x3ac1c3362d014864,
+        0x9a016184bdb8aeba,
+        0x95f2394459fbc25e,
+        0xe3f34a07a76a66c2,
+        0x8df25f9ad98b1b96,
+    ],
+    [
+        0x85ffc27171439d9d,
+        0xddcb9a2dcfd26910,
+        0x26b5ba4bf3afb94e,
+        0xffff9cc7c7651e2f,
+        0x8c88364698280b55,
+        0xebc114167b910501,
+        0x2d77b4d89ecfb516,
+        0x332e0828eba151f2,
+    ],
+    [
+        0x46fa6a6450dd4735,
+        0xd00db7dd92384a33,
+        0x5fd4fb751f3a5fc5,
+        0x496fb90c0bb65ea2,
+        0xf3baec0bb87cc5c7,
+        0x862a3c0a7d4c7713,
+        0xbf5f38336a3f47d8,
+        0x41ad9dbc1394a20c,
+    ],
+    [
+        0xcc535945b7dbf0f7,
+        0x82af2bc93685bcec,
+        0x8e4c8d0c8cebfccd,
+        0x17cb39417e84597e,
+        0xd4a965a8c749b232,
+        0xa2cab040f33f3ee5,
+        0xa98811a1fed4e3a6,
+        0x1cc48b54f377e2a1,
+    ],
+    [
+        0xe40cd4f6c5609a27,
+        0x11de79ebca97a4a4,
+        0x9177c73d8b7e929d,
+        0x2a6fe8085797e792,
+        0x3de6e93329f8d5ae,
+        0x3f7af9125da962ff,
+        0xd710682cfc77d3ac,
+        0x48faf05f3b053cf4,
+    ],
+    [
+        0x287db8630da89c8b,
+        0x4d0de32053cb30e9,
+        0x8b37a4f20c5ada7b,
+        0xe7cc6ebe78c84ecf,
+        0x240bdc0a66a2610d,
+        0x8299e7f02caa1650,
+        0x380a53fefb6e754e,
+        0x684a1d8cf8eb6810,
+    ],
+    [
+        0xe839452eb4b8a5e1,
+        0xb03fa62e90626af4,
+        0x11a688602fbc5efc,
+        0x30dda75c355a2d62,
+        0x0f712adcb73810de,
+        0xffdc1102187f1ae1,
+        0x40c34f398254b99c,
+        0xede021b9dc289a4a,
+    ],
+    [
+        0x8b7b05225c4e7dad,
+        0x3bc794346f9d9ff9,
+        0xfccb5a57f2ca86ff,
+        0xbb1502015a7da9d4,
+        0xd7e0a35d4352a015,
+        0x27af7a44f8160931,
+        0xc37442f6782f4615,
+        0xbdf392a9bd095dcb,
+    ],
+    [
+        0xc17f55037cf00de9,
+        0xbcffedd34c71a874,
+        0x5eb45d2a8133d1f2,
+        0xbabe251e1612ebdf,
+        0x3efeb9fbe438c536,
+        0x2d7cef97b4afe1cf,
+        0xe5de1b4660016c0b,
+        0xcdcc26c332f5657c,
+    ],
+    [
+        0xe01dd653daf15809,
+        0xb0a6bdd4b41094b5,
+        0x27eac858b0b03a05,
+        0x51d43b5e93adbdc0,
+        0x8b89a23b0fea5fc9,
+        0xdc8ac3b14f7f2fc1,
+        0xe793f82f1efec039,
+        0x9f6f2cf8969e7b80,
+    ],
+    [
+        0x49d45382e0f21d4a,
+        0x5f4ad1797cd72786,
+        0x4dc3dbebfd45f795,
+        0x03a3ef84dba6e1bc,
+        0x204bc9b3d3fc4c01,
+        0x9ad706081e89b9ba,
+        0x638bfb4d840e9f89,
+        0x5ef2938cd095ae35,
+    ],
+    [
+        0x42cca18ebeb265c8,
+        0xb7b2ec5c29aecbf8,
+        0x0d84f9535dc78f0f,
+        0x04e64ad942e77b8c,
+        0xb4880dffffc9da0b,
+        0x16db16d9c29adeb1,
+        0x09bbaf2a0590cd1e,
+        0x76460e74961fcf8d,
+    ],
+    [
+        0xed12a2276dfa1553,
+        0x0b5acec5de0436fd,
+        0x3c6cfea033a1f0a8,
+        0x2b5ecefe546cac15,
+        0x6e2d82884cd3bf6f,
+        0xc134878d1add7b83,
+        0x997963422eb7a280,
+        0x5e834537ac648cf6,
+    ],
+    [
+        0x89e779214737c0b7,
+        0x1a8c05e8581ad95b,
+        0x8d18b72796437cf7,
+        0xe7252c949e04b106,
+        0x53267c4fd174585a,
+        0xa16ef5d9c81dad47,
+        0xda65191937270a46,
+        0xcb2a5b55f2df664c,
+    ],
+    [
+        0x854aee2dc1924137,
+        0xf37013c9d479ece6,
+        0x0e163bc0630c4696,
+        0x384ee64955048f76,
+        0xf65d814e28ee4ec5,
+        0xe57bc564fd82f1b1,
+        0x4b338937b6876614,
+        0x66ee0b04ed43cd8d,
+    ],
+    [
+        0x49884bf25f4ef15d,
+        0xeb51fe28de1c6f54,
+        0x2cd64e84fce8dfcc,
+        0x29164a96a541a013,
+        0x173ce7558f4cacb8,
+        0xeb5b1ce5877c89e9,
+        0x5faff4b0f5217bf6,
+        0xac42d0b1c20f205e,
+    ],
+    // ---- Terminal full rounds (4): rows 26..30, S-box on every state element ----
+    [
+        0xfb1d6bf0ca43221b,
+        0x97b0a1b01d6a2955,
+        0x08c60bd622952b30,
+        0x43f2be0f9e24147c,
+        0xfa7268b7d3730f5d,
+        0x43a6c419a23983bb,
+        0xcd77c1f7b29b113c,
+        0xcfa43c9db8eec29f,
+    ],
+    [
+        0xcaaa95a6c7365dec,
+        0x0a91193f798f3be0,
+        0x1104497652735dc6,
+        0x35aecb93663b515e,
+        0x8dbc9916065aa858,
+        0xada8f7a0266579ed,
+        0x524dee7bec1ea789,
+        0xa93aee9dd5af9521,
+    ],
+    [
+        0x9d1f1b54750d707e,
+        0x7c9feab87096d5dc,
+        0xa2e1fb19f9d4261b,
+        0xb714deb448de6346,
+        0x225d1f0d011c5403,
+        0x1549b7f1d28cedc0,
+        0xaef3e46f97d43942,
+        0x6dfc7ffe0b38bf08,
+    ],
+    [
+        0x7de853fdc542b663,
+        0xa68ecc96610657b2,
+        0xe88bb5428af289b1,
+        0xd7cfa1504c5569f5,
+        0x78a9aad0d642d30a,
+        0xd68315f2353dce52,
+        0x46e56300f86fcfd5,
+        0x323d95332b145fd6,
+    ],
+]);
+
+// =========================================================================
+// S-box helpers
+// =========================================================================
+
+#[inline(always)]
+fn sbox_full<R: InjectiveMonomial<7>>(x: R) -> R {
+    x.injective_exp_n() // Poseidon S-box: the degree-7 monomial (`x^7`) picked by `InjectiveMonomial<7>`
+}
+
+// =========================================================================
+// Permutation driver
+// =========================================================================
+
+/// Width-8 Poseidon1 permutation for Goldilocks.
+///
+/// Zero-sized — all state lives in the round-constant tables above. Inherent
+/// API: `permute{,_mut,_generic,_in_place}` and `compress_in_place`; build one
+/// with `default_goldilocks_poseidon1_8()` or `Default::default()`.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Poseidon1Goldilocks8;
+
+impl Poseidon1Goldilocks8 {
+    /// By-value scalar permutation: runs [`Self::permute_mut`] on a copy of `state` and returns the copy.
+    pub fn permute(&self, state: [Goldilocks; POSEIDON1_WIDTH]) -> [Goldilocks; POSEIDON1_WIDTH] {
+        let mut permuted = state;
+        self.permute_mut(&mut permuted);
+        permuted
+    }
+
+    /// In-place scalar permutation over plain `Goldilocks` elements.
+    ///
+    /// Each round adds its row of `GOLDILOCKS_POSEIDON1_RC_8`, applies the S-box
+    /// (to every element in full rounds, to element 0 only in partial rounds) and
+    /// finishes with `mds_mul_scalar`.
+    #[inline]
+    pub fn permute_mut(&self, state: &mut [Goldilocks; POSEIDON1_WIDTH]) {
+        let first_terminal_round = POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS;
+
+        // Initial full rounds.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state.iter_mut().for_each(|s| *s = sbox_full::<Goldilocks>(*s));
+            mds_mul_scalar(state);
+        }
+
+        // Partial rounds.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8
+            .iter()
+            .skip(POSEIDON1_HALF_FULL_ROUNDS)
+            .take(POSEIDON1_PARTIAL_ROUNDS)
+        {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state[0] = sbox_full::<Goldilocks>(state[0]);
+            mds_mul_scalar(state);
+        }
+
+        // Terminal full rounds.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8
+            .iter()
+            .take(POSEIDON1_N_ROUNDS)
+            .skip(first_terminal_round)
+        {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state.iter_mut().for_each(|s| *s = sbox_full::<Goldilocks>(*s));
+            mds_mul_scalar(state);
+        }
+    }
+
+    /// Generic permutation over any algebra `R` over `Goldilocks` with `x^7` as an
+    /// injective monomial.
+    ///
+    /// Runs the same round schedule as [`Self::permute_mut`] (full, partial, then
+    /// full rounds over `GOLDILOCKS_POSEIDON1_RC_8`), with `mds_mul_generic` as the
+    /// linear layer instead of `mds_mul_scalar`. Used by the AIR / symbolic trace
+    /// builders.
+    pub fn permute_generic<R>(&self, state: &mut [R; POSEIDON1_WIDTH])
+    where
+        R: Algebra<Goldilocks> + InjectiveMonomial<7> + Copy,
+    {
+        let first_terminal_round = POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS;
+
+        // Initial full rounds: S-box on every element.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state.iter_mut().for_each(|s| *s = sbox_full::<R>(*s));
+            mds_mul_generic(state);
+        }
+
+        // Partial rounds: S-box on element 0 only.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8
+            .iter()
+            .skip(POSEIDON1_HALF_FULL_ROUNDS)
+            .take(POSEIDON1_PARTIAL_ROUNDS)
+        {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state[0] = sbox_full::<R>(state[0]);
+            mds_mul_generic(state);
+        }
+
+        // Terminal full rounds: S-box on every element.
+        for constants in GOLDILOCKS_POSEIDON1_RC_8
+            .iter()
+            .take(POSEIDON1_N_ROUNDS)
+            .skip(first_terminal_round)
+        {
+            state.iter_mut().zip(constants).for_each(|(s, &c)| *s += c);
+            state.iter_mut().for_each(|s| *s = sbox_full::<R>(*s));
+            mds_mul_generic(state);
+        }
+    }
+
+    /// Compression-mode in-place permutation: `output = permute(input) + input`.
+    ///
+    /// Dispatches on the concrete `R` via `TypeId` (hence the `'static` bound): the
+    /// architecture's packed Goldilocks type goes to `simd_core::<true>`, plain
+    /// `Goldilocks` takes the scalar `permute_mut` (avoiding the slower generic
+    /// path), and any other algebra falls back to `permute_generic`.
+    #[inline(always)]
+    pub fn compress_in_place<R>(&self, state: &mut [R; POSEIDON1_WIDTH])
+    where
+        R: Algebra<Goldilocks> + InjectiveMonomial<7> + Copy + 'static,
+    {
+        use core::any::TypeId;
+
+        type Packing = <Goldilocks as Field>::Packing;
+
+        if TypeId::of::<R>() == TypeId::of::<Packing>() {
+            // SAFETY: equal `TypeId`s mean `R` and `Packing` are the same type, so
+            // this cast only renames the element type of the exclusive borrow.
+            let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Packing; POSEIDON1_WIDTH]) };
+            self.simd_core::<true>(s);
+            return;
+        }
+        if TypeId::of::<R>() == TypeId::of::<Goldilocks>() {
+            // SAFETY: `R` is exactly `Goldilocks` here (equal `TypeId`s), so this is an identity cast.
+            let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Goldilocks; POSEIDON1_WIDTH]) };
+            let initial = *s;
+            self.permute_mut(s);
+            for (slot, init) in s.iter_mut().zip(initial) {
+                *slot += init;
+            }
+            return;
+        }
+
+        let initial = *state; // copy of the input, added back after the permutation (feedforward)
+        self.permute_generic(state);
+        for (s, init) in state.iter_mut().zip(initial) {
+            *s += init;
+        }
+    }
+
+    /// Permutation-mode in-place permutation (no feedforward), using the same
+    /// `TypeId` dispatch as [`Self::compress_in_place`]. Used by the overwrite sponge
+    /// for Merkle leaf/node hashing — without this the packed `Permutation` impl
+    /// would fall back to the slow `permute_generic` (fully-reducing packed MDS),
+    /// regressing all Merkle tree building ~4x.
+    #[inline(always)]
+    pub fn permute_in_place<R>(&self, state: &mut [R; POSEIDON1_WIDTH])
+    where
+        R: Algebra<Goldilocks> + InjectiveMonomial<7> + Copy + 'static,
+    {
+        use core::any::TypeId;
+
+        type Packing = <Goldilocks as Field>::Packing;
+
+        if TypeId::of::<R>() == TypeId::of::<Packing>() {
+            // SAFETY: equal `TypeId`s mean `R` is exactly `Packing`, so this is an identity cast.
+            let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Packing; POSEIDON1_WIDTH]) };
+            self.simd_core::<false>(s);
+            return;
+        }
+        if TypeId::of::<R>() == TypeId::of::<Goldilocks>() {
+            // SAFETY: equal `TypeId`s mean `R` is exactly `Goldilocks`, so this is an identity cast.
+            let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Goldilocks; POSEIDON1_WIDTH]) };
+            self.permute_mut(s);
+            return;
+        }
+
+        self.permute_generic(state);
+    }
+
+    /// Packed permutation core over `<Goldilocks as Field>::Packing` (optional feedforward).
+    ///
+    /// On x86_64 (AVX2 or AVX512), keeps state in packed registers throughout
+    /// the rounds. RC adds and sboxes use the packed `Add`/`square`/`Mul`
+    /// (which fully reduce), and the MDS uses the dedicated `mds_mul_simd`
+    /// (delayed reduction via shift+add multiplication by tiny constants).
+    ///
+    /// On other architectures (e.g. aarch64+NEON, scalar fallback), we
+    /// deinterleave to per-lane scalar arrays and run the rounds in lockstep
+    /// across all W lanes. The MDS coefficients are tiny (max 9), so the
+    /// scalar `mds_mul_scalar` (u128 accumulator + single `reduce128` per
+    /// output) is far cheaper than the packed type's fully-reducing `Mul`.
+    ///
+    /// `FEEDFORWARD = true` adds back the original input (compression / Davies-Meyer);
+    /// `FEEDFORWARD = false` is the raw permutation (overwrite sponge).
+    #[inline(always)]
+    fn simd_core<const FEEDFORWARD: bool>(&self, state: &mut [<Goldilocks as Field>::Packing; POSEIDON1_WIDTH]) {
+        #[cfg(any(
+            all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+            all(target_arch = "x86_64", target_feature = "avx512f"),
+        ))]
+        {
+            type P = <Goldilocks as Field>::Packing;
+
+            #[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+            use crate::x86_64_avx2::packing::{add_canonical_scalar, mds_mul_simd};
+            #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+            use crate::x86_64_avx512::packing::{add_canonical_scalar, mds_mul_simd};
+
+            // 8 named SSA scalars rather than an array — otherwise LLVM
+            // re-rolls the (identical-shape) per-slot sboxes back into a loop,
+            // serializing them through a memory-resident state. Naming each
+            // slot keeps each sbox a distinct value, enabling ILP across the
+            // 8 slots and keeping everything in zmm/ymm registers across all
+            // 30 rounds.
+            //
+            // `add_canonical_scalar` skips the `canonicalize` that the generic
+            // packed `Add` applies to its RHS — round constants are canonical
+            // by construction (all `< P`).
+            let initial = *state; // input copy, only read again for the feedforward add
+            let [mut s0, mut s1, mut s2, mut s3, mut s4, mut s5, mut s6, mut s7] = initial;
+
+            // Initial full rounds.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+                s0 = add_canonical_scalar(s0, rc[0]);
+                s1 = add_canonical_scalar(s1, rc[1]);
+                s2 = add_canonical_scalar(s2, rc[2]);
+                s3 = add_canonical_scalar(s3, rc[3]);
+                s4 = add_canonical_scalar(s4, rc[4]);
+                s5 = add_canonical_scalar(s5, rc[5]);
+                s6 = add_canonical_scalar(s6, rc[6]);
+                s7 = add_canonical_scalar(s7, rc[7]);
+                s0 = sbox_full::<P>(s0);
+                s1 = sbox_full::<P>(s1);
+                s2 = sbox_full::<P>(s2);
+                s3 = sbox_full::<P>(s3);
+                s4 = sbox_full::<P>(s4);
+                s5 = sbox_full::<P>(s5);
+                s6 = sbox_full::<P>(s6);
+                s7 = sbox_full::<P>(s7);
+                [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+            }
+
+            // Partial rounds.
+            //
+            // NB: the Appendix-B sparse partial-round decomposition (one dense
+            // `m_i` multiply + per-round rank-1 updates, as used by the AIR and
+            // the KoalaBear-16 permutation) was implemented and measured here and
+            // is ~13% SLOWER for Goldilocks: this circulant MDS has tiny entries
+            // {1,3,4,7,8,9} that strength-reduce to shift/adds and batch 8 terms
+            // into a single `reduce128` per output, whereas the sparse form needs
+            // arbitrary-constant 64x64 multiplies (one `reduce128` each → 15 vs 8
+            // reductions per round). Kept the full circulant MDS.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8
+                .iter()
+                .skip(POSEIDON1_HALF_FULL_ROUNDS)
+                .take(POSEIDON1_PARTIAL_ROUNDS)
+            {
+                s0 = add_canonical_scalar(s0, rc[0]);
+                s1 = add_canonical_scalar(s1, rc[1]);
+                s2 = add_canonical_scalar(s2, rc[2]);
+                s3 = add_canonical_scalar(s3, rc[3]);
+                s4 = add_canonical_scalar(s4, rc[4]);
+                s5 = add_canonical_scalar(s5, rc[5]);
+                s6 = add_canonical_scalar(s6, rc[6]);
+                s7 = add_canonical_scalar(s7, rc[7]);
+                s0 = sbox_full::<P>(s0);
+                [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+            }
+
+            // Terminal full rounds.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8
+                .iter()
+                .take(POSEIDON1_N_ROUNDS)
+                .skip(POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS)
+            {
+                s0 = add_canonical_scalar(s0, rc[0]);
+                s1 = add_canonical_scalar(s1, rc[1]);
+                s2 = add_canonical_scalar(s2, rc[2]);
+                s3 = add_canonical_scalar(s3, rc[3]);
+                s4 = add_canonical_scalar(s4, rc[4]);
+                s5 = add_canonical_scalar(s5, rc[5]);
+                s6 = add_canonical_scalar(s6, rc[6]);
+                s7 = add_canonical_scalar(s7, rc[7]);
+                s0 = sbox_full::<P>(s0);
+                s1 = sbox_full::<P>(s1);
+                s2 = sbox_full::<P>(s2);
+                s3 = sbox_full::<P>(s3);
+                s4 = sbox_full::<P>(s4);
+                s5 = sbox_full::<P>(s5);
+                s6 = sbox_full::<P>(s6);
+                s7 = sbox_full::<P>(s7);
+                [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+            }
+
+            if FEEDFORWARD {
+                // Compression-mode add-back of the original input.
+                state[0] = s0 + initial[0];
+                state[1] = s1 + initial[1];
+                state[2] = s2 + initial[2];
+                state[3] = s3 + initial[3];
+                state[4] = s4 + initial[4];
+                state[5] = s5 + initial[5];
+                state[6] = s6 + initial[6];
+                state[7] = s7 + initial[7];
+            } else {
+                state[0] = s0;
+                state[1] = s1;
+                state[2] = s2;
+                state[3] = s3;
+                state[4] = s4;
+                state[5] = s5;
+                state[6] = s6;
+                state[7] = s7;
+            }
+        }
+
+        #[cfg(not(any(
+            all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+            all(target_arch = "x86_64", target_feature = "avx512f"),
+        )))]
+        {
+            type P = <Goldilocks as Field>::Packing;
+            const W: usize = <P as PackedValue>::WIDTH;
+
+            let mut lanes: [[Goldilocks; POSEIDON1_WIDTH]; W] = [[Goldilocks::ZERO; POSEIDON1_WIDTH]; W];
+            for i in 0..POSEIDON1_WIDTH {
+                let s = state[i].as_slice();
+                for (k, lane) in lanes.iter_mut().enumerate() {
+                    lane[i] = s[k]; // deinterleave: lanes[k][i] is lane k of packed slot i
+                }
+            }
+            let initial = lanes; // per-lane input copy, only read again for the feedforward add
+
+            // Initial full rounds.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+                for lane in lanes.iter_mut() {
+                    for (i, s) in lane.iter_mut().enumerate() {
+                        *s += rc[i];
+                    }
+                }
+                for lane in lanes.iter_mut() {
+                    for s in lane.iter_mut() {
+                        *s = sbox_full::<Goldilocks>(*s);
+                    }
+                }
+                for lane in lanes.iter_mut() {
+                    mds_mul_scalar(lane);
+                }
+            }
+
+            // Partial rounds.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8
+                .iter()
+                .skip(POSEIDON1_HALF_FULL_ROUNDS)
+                .take(POSEIDON1_PARTIAL_ROUNDS)
+            {
+                for lane in lanes.iter_mut() {
+                    for (i, s) in lane.iter_mut().enumerate() {
+                        *s += rc[i];
+                    }
+                }
+                for lane in lanes.iter_mut() {
+                    lane[0] = sbox_full::<Goldilocks>(lane[0]);
+                }
+                for lane in lanes.iter_mut() {
+                    mds_mul_scalar(lane);
+                }
+            }
+
+            // Terminal full rounds.
+            for rc in GOLDILOCKS_POSEIDON1_RC_8
+                .iter()
+                .take(POSEIDON1_N_ROUNDS)
+                .skip(POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS)
+            {
+                for lane in lanes.iter_mut() {
+                    for (i, s) in lane.iter_mut().enumerate() {
+                        *s += rc[i];
+                    }
+                }
+                for lane in lanes.iter_mut() {
+                    for s in lane.iter_mut() {
+                        *s = sbox_full::<Goldilocks>(*s);
+                    }
+                }
+                for lane in lanes.iter_mut() {
+                    mds_mul_scalar(lane);
+                }
+            }
+
+            for i in 0..POSEIDON1_WIDTH {
+                state[i] = if FEEDFORWARD {
+                    P::from_fn(|k| lanes[k][i] + initial[k][i])
+                } else {
+                    P::from_fn(|k| lanes[k][i])
+                };
+            }
+        }
+    }
+}
+
+/// Returns the zero-sized width-8 Poseidon1 permutation (same value as `Poseidon1Goldilocks8::default()`).
+#[inline]
+pub fn default_goldilocks_poseidon1_8() -> Poseidon1Goldilocks8 {
+    Poseidon1Goldilocks8
+}
+
+// =========================================================================
+// Tests
+// =========================================================================
+
+#[cfg(test)]
+#[allow(clippy::needless_range_loop)]
+mod tests {
+    use super::*;
+
+    /// `permute` (scalar fast path) and `permute_generic::<Goldilocks>` must agree
+    /// on a fixed pseudo-random input.
+    #[test]
+    fn scalar_matches_generic() {
+        let input: [Goldilocks; 8] =
+            std::array::from_fn(|i| Goldilocks::new(0xdead_beef_0000_0001u64.wrapping_mul(i as u64 + 1)));
+
+        let via_scalar = Poseidon1Goldilocks8.permute(input);
+
+        let mut via_generic = input;
+        Poseidon1Goldilocks8.permute_generic(&mut via_generic);
+        assert_eq!(via_scalar, via_generic);
+    }
+
+    /// Packed `mds_mul_simd` must match per-lane `mds_mul_scalar`, including on boundary inputs.
+    #[cfg(any(
+        all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+        all(target_arch = "x86_64", target_feature = "avx512f"),
+    ))]
+    #[test]
+    fn simd_mds_matches_scalar_mds() {
+        type P = <Goldilocks as Field>::Packing;
+        let width = <P as PackedValue>::WIDTH;
+
+        // Build packed state with distinct per-lane values; even slots carry the
+        // edge values (field-order boundary, `u64::MAX`) in their first four lanes.
+        let mut packed: [P; 8] = [P::ZERO; 8];
+        let edges: [u64; 4] = [0, 1, crate::P - 1, u64::MAX];
+        for i in 0..8 {
+            packed[i] = P::from_fn(|k| {
+                if k < 4 && i % 2 == 0 {
+                    Goldilocks::new(edges[k])
+                } else {
+                    Goldilocks::new(0xa5a5_0000_0000_0001u64.wrapping_mul((i * 17 + k * 31 + 1) as u64))
+                }
+            });
+        }
+        let initial = packed;
+
+        // Reference: transpose to one scalar state per lane, then scalar MDS on each.
+        let mut expected_lanes: Vec<[Goldilocks; 8]> = (0..width)
+            .map(|k| std::array::from_fn(|i| initial[i].as_slice()[k]))
+            .collect();
+        for lane in expected_lanes.iter_mut() {
+            mds_mul_scalar(lane);
+        }
+
+        #[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+        {
+            packed = crate::x86_64_avx2::packing::mds_mul_simd(packed);
+        }
+        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+        {
+            packed = crate::x86_64_avx512::packing::mds_mul_simd(packed);
+        }
+
+        for i in 0..8 {
+            for k in 0..width {
+                assert_eq!(
+                    packed[i].as_slice()[k],
+                    expected_lanes[k][i],
+                    "mismatch at slot {i}, lane {k}"
+                );
+            }
+        }
+    }
+
+    /// Packed `compress_in_place` must give, in every lane, the same result as scalar
+    /// `compress_in_place` on that lane's inputs. Exercises the SIMD dispatch branch.
+    #[test]
+    fn compress_in_place_dispatches_packed_correctly() {
+        type P = <Goldilocks as Field>::Packing;
+        let lane_count = <P as PackedValue>::WIDTH;
+        let hasher = Poseidon1Goldilocks8;
+
+        // Every (slot, lane) pair gets its own value, so a swapped or duplicated
+        // lane would show up as a mismatch.
+        let mut packed: [P; 8] = std::array::from_fn(|i| {
+            P::from_fn(|k| Goldilocks::new(0xa5a5_0000_0000_0001u64.wrapping_mul((i * 17 + k * 31 + 1) as u64)))
+        });
+        let initial = packed;
+
+        // Reference: pull each lane out of the packed input and compress it alone.
+        let expected_lanes: Vec<[Goldilocks; 8]> = (0..lane_count)
+            .map(|k| {
+                let mut lane: [Goldilocks; 8] = std::array::from_fn(|i| initial[i].as_slice()[k]);
+                hasher.compress_in_place(&mut lane);
+                lane
+            })
+            .collect();
+
+        hasher.compress_in_place(&mut packed);
+
+        for (i, packed_slot) in packed.iter().enumerate() {
+            for (k, expected_lane) in expected_lanes.iter().enumerate() {
+                assert_eq!(
+                    packed_slot.as_slice()[k],
+                    expected_lane[i],
+                    "mismatch at slot {i}, lane {k}"
+                );
+            }
+        }
+    }
+
+    /// Permuting the same input twice gives equal outputs, and the output differs from the input.
+    #[test]
+    fn permutation_is_deterministic() {
+        let input: [Goldilocks; 8] = [
+            Goldilocks::new(1),
+            Goldilocks::new(2),
+            Goldilocks::new(3),
+            Goldilocks::new(4),
+            Goldilocks::new(5),
+            Goldilocks::new(6),
+            Goldilocks::new(7),
+            Goldilocks::new(8),
+        ];
+        let p = Poseidon1Goldilocks8;
+        let a = p.permute(input);
+        let b = p.permute(input);
+        assert_eq!(a, b);
+        assert_ne!(a, input);
+    }
+
+    /// Rough avalanche smoke test: distinct lane-0 inputs must give distinct canonical lane-0 outputs.
+    #[test]
+    fn permutation_is_injective_on_small_inputs() {
+        use field::PrimeField64;
+        let mut seen = std::collections::HashSet::new();
+        for i in 0..64u64 {
+            let mut input = [Goldilocks::ZERO; 8];
+            input[0] = Goldilocks::new(i);
+            // Canonicalize first: the raw `value` word may be a non-canonical representative.
+            assert!(seen.insert(Poseidon1Goldilocks8.permute(input)[0].as_canonical_u64()), "collision at i={i}");
+        }
+    }
+
+    /// Known-answer test pinning compatibility with Plonky3.
+    ///
+    /// Reference: `plonky3/goldilocks/src/poseidon1.rs::tests::test_poseidon_goldilocks_width_8`
+    /// — permuting `[0, 1, ..., 7]` must give the canonical values hardcoded from upstream.
+    #[test]
+    fn test_plonky3_compatibility() {
+        use field::PrimeField64;
+
+        const EXPECTED: [u64; 8] = [
+            2431226948502761687,
+            9427563026145807618,
+            6827549936272051660,
+            16907684411084503785,
+            10131745626715172913,
+            17448305483431576765,
+            9066501914269485014,
+            12095238468458521303,
+        ];
+
+        let mut state: [Goldilocks; 8] = std::array::from_fn(|i| Goldilocks::new(i as u64));
+        default_goldilocks_poseidon1_8().permute_mut(&mut state);
+        // Compare canonical representatives, not raw internal words.
+        assert_eq!(state.map(|x| x.as_canonical_u64()), EXPECTED);
+    }
+}
diff --git a/crates/backend/goldilocks/src/x86_64_avx2/mod.rs b/crates/backend/goldilocks/src/x86_64_avx2/mod.rs
new file mode 100644
index 000000000..4e8ba31a8
--- /dev/null
+++ b/crates/backend/goldilocks/src/x86_64_avx2/mod.rs
@@ -0,0 +1,5 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+pub(crate) mod packing;
+
+pub use packing::*;
diff --git a/crates/backend/goldilocks/src/x86_64_avx2/packing.rs b/crates/backend/goldilocks/src/x86_64_avx2/packing.rs
new file mode 100644
index 000000000..21a206655
--- /dev/null
+++ b/crates/backend/goldilocks/src/x86_64_avx2/packing.rs
@@ -0,0 +1,496 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+use alloc::vec::Vec;
+use core::arch::x86_64::*;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::interleave::{interleave_u64, interleave_u128};
+use field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, impl_packed_value,
+    impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, ring_sum,
+};
+use field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, PermutationMonomial,
+    PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
+};
+use rand::Rng;
+use rand::distr::{Distribution, StandardUniform};
+use utils::reconstitute_from_base;
+
+use crate::helpers::exp_10540996611094048183;
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 4; // Number of `Goldilocks` (u64) lanes in one packed value / `__m256i`.
+
+/// `WIDTH` (= 4) `Goldilocks` elements operated on lane-wise through AVX2 `__m256i` arithmetic.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)] // Needed to make `transmute`s safe.
+#[must_use]
+pub struct PackedGoldilocksAVX2(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksAVX2 {
+    /// Reinterpret the four packed lanes as a single `__m256i` register value.
+    #[inline]
+    #[must_use]
+    pub(crate) fn to_vector(self) -> __m256i {
+        // SAFETY: `Goldilocks` is `repr(transparent)` over `u64` and `Self` is
+        // `repr(transparent)` over `[Goldilocks; 4]`, so both sides share size and layout.
+        unsafe { transmute::<Self, __m256i>(self) }
+    }
+
+    /// Reinterpret a `__m256i` register value as four packed `Goldilocks` lanes.
+    ///
+    /// No validation is needed: elements of `Goldilocks` are allowed to be arbitrary u64s,
+    /// so this function is safe (unlike Mersenne31/MontyField31 variants).
+    #[inline]
+    pub(crate) fn from_vector(vector: __m256i) -> Self {
+        // SAFETY: same-size reinterpretation; every 64-bit lane pattern is a valid `Goldilocks`.
+        unsafe { transmute::<__m256i, Self>(vector) }
+    }
+
+    /// Replicate `value` into every lane of a packed vector.
+    /// This is the `const` counterpart of `From<Goldilocks>::from`.
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        let lanes = [value; WIDTH];
+        Self(lanes)
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksAVX2 {
+    fn from(x: Goldilocks) -> Self {
+        Self([x; WIDTH]) // Same splat as `Self::broadcast`: every lane receives `x`.
+    }
+}
+
+impl Add for PackedGoldilocksAVX2 {
+    type Output = Self; // Lane-wise sum mod `P`; lanes are not canonicalized and may exceed `P`.
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::from_vector(add(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl Sub for PackedGoldilocksAVX2 {
+    type Output = Self; // Lane-wise difference mod `P`; lanes may exceed `P`.
+    #[inline]
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::from_vector(sub(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl Neg for PackedGoldilocksAVX2 {
+    type Output = Self; // Lane-wise `P - x`; a zero lane comes back as `P` (non-canonical zero).
+    #[inline]
+    fn neg(self) -> Self::Output {
+        Self::from_vector(neg(Self::to_vector(self)))
+    }
+}
+
+impl Mul for PackedGoldilocksAVX2 {
+    type Output = Self; // Lane-wise product mod `P`; lanes may exceed `P`.
+    #[inline]
+    fn mul(self, rhs: Self) -> Self::Output {
+        Self::from_vector(mul(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl_add_assign!(PackedGoldilocksAVX2);
+impl_sub_assign!(PackedGoldilocksAVX2);
+impl_mul_methods!(PackedGoldilocksAVX2);
+ring_sum!(PackedGoldilocksAVX2);
+impl_rng!(PackedGoldilocksAVX2);
+
+impl PrimeCharacteristicRing for PackedGoldilocksAVX2 {
+    type PrimeSubfield = Goldilocks;
+
+    const ZERO: Self = Self([Goldilocks::ZERO; WIDTH]);
+    const ONE: Self = Self([Goldilocks::ONE; WIDTH]);
+    const TWO: Self = Self([Goldilocks::TWO; WIDTH]);
+    const NEG_ONE: Self = Self([Goldilocks::NEG_ONE; WIDTH]);
+
+    #[inline] // Splat the scalar into every lane.
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        Self::from(f)
+    }
+
+    #[inline] // Lane-wise division by two via the vector `halve` kernel.
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(Self::to_vector(*self)))
+    }
+
+    #[inline] // Lane-wise squaring via the dedicated `square` kernel.
+    fn square(&self) -> Self {
+        Self::from_vector(square(Self::to_vector(*self)))
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `Self` is a `repr(transparent)` wrapper around `[Goldilocks; WIDTH]`.
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+// Goldilocks: p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537, so 7 is the smallest D > 1 coprime to p - 1.
+impl InjectiveMonomial<7> for PackedGoldilocksAVX2 {}
+
+impl PermutationMonomial<7> for PackedGoldilocksAVX2 {
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self) // 7 * 10540996611094048183 = 4 * (p - 1) + 1, i.e. 7^-1 mod (p - 1).
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_sub_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_mul_base_field!(PackedGoldilocksAVX2, Goldilocks);
+impl_div_methods!(PackedGoldilocksAVX2, Goldilocks);
+impl_sum_prod_base_field!(PackedGoldilocksAVX2, Goldilocks);
+
+impl Algebra<Goldilocks> for PackedGoldilocksAVX2 {}
+
+impl_packed_value!(PackedGoldilocksAVX2, Goldilocks, WIDTH);
+
+unsafe impl PackedField for PackedGoldilocksAVX2 {
+    type Scalar = Goldilocks; // NOTE(review): no SAFETY note; presumably relies on the repr(transparent) array layout — confirm.
+}
+
+impl_packed_field_pow_2!(
+    PackedGoldilocksAVX2;
+    [
+        (1, interleave_u64),
+        (2, interleave_u128),
+    ],
+    WIDTH
+);
+
+// Resources:
+// 1. Intel Intrinsics Guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+// 2. uops.info: https://uops.info/table.html
+//
+// Implementation notes:
+// - AVX has no unsigned 64-bit comparisons. We emulate them via signed comparisons after a
+//   1<<63 shift (`shift`/`canonicalize_s`/etc).
+// - AVX has no add-with-carry; emulated via `result < operand` overflow detection.
+
+const SIGN_BIT: __m256i = unsafe { transmute([i64::MIN; WIDTH]) }; // Bit 63 set in every lane.
+const SHIFTED_FIELD_ORDER: __m256i = unsafe { transmute([Goldilocks::ORDER_U64 ^ (i64::MIN as u64); WIDTH]) }; // `P ^ 2^63` per lane.
+
+/// Equal to `2^32 - 1 = 2^64 mod P`.
+const EPSILON: __m256i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) };
+
+/// Flip bit 63 of every lane (add 2^63 mod 2^64) so that signed compares order lanes as unsigned.
+#[inline]
+fn shift(x: __m256i) -> __m256i {
+    unsafe { _mm256_xor_si256(SIGN_BIT, x) } // XOR is commutative and undoes itself.
+}
+
+/// Reduce each lane to `[0, P)`. Input and output are both in the shifted (`^ 2^63`) domain.
+#[inline]
+unsafe fn canonicalize_s(x_s: __m256i) -> __m256i {
+    unsafe {
+        let is_canonical = _mm256_cmpgt_epi64(SHIFTED_FIELD_ORDER, x_s); // all-ones where x < P
+        let correction = _mm256_andnot_si256(is_canonical, EPSILON); // EPSILON where x >= P
+        _mm256_add_epi64(x_s, correction) // x + EPSILON == x - P (mod 2^64)
+    }
+}
+
+/// Lane-wise `x + y_s` with `y_s` pre-shifted; output is shifted. Assumes `x + y < 2^64 + P`.
+#[inline]
+unsafe fn add_no_double_overflow_64_64s_s(x: __m256i, y_s: __m256i) -> __m256i {
+    unsafe {
+        let sum_s = _mm256_add_epi64(x, y_s);
+        let overflowed = _mm256_cmpgt_epi64(y_s, sum_s); // all-ones where the add wrapped
+        let correction = _mm256_srli_epi64::<32>(overflowed); // EPSILON where it wrapped
+        _mm256_add_epi64(sum_s, correction)
+    }
+}
+
+/// Lane-wise Goldilocks addition. The sum is not canonicalized and may exceed `P`.
+#[inline]
+fn add(x: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        // Canonicalizing `y` guarantees `x + y < 2^64 + P`, as the helper below assumes.
+        let y_canon_s = canonicalize_s(shift(y));
+        shift(add_no_double_overflow_64_64s_s(x, y_canon_s))
+    }
+}
+
+/// Lane-wise Goldilocks subtraction `x - y`. The difference may exceed `P`.
+#[inline]
+fn sub(x: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let x_s = shift(x);
+        let y_canon_s = canonicalize_s(shift(y));
+        // The two 2^63 shifts cancel in the difference, so `diff` is already unshifted.
+        let diff = _mm256_sub_epi64(x_s, y_canon_s);
+        let borrowed = _mm256_cmpgt_epi64(y_canon_s, x_s); // all-ones where y > x
+        let correction = _mm256_srli_epi64::<32>(borrowed); // EPSILON where the sub wrapped
+        _mm256_sub_epi64(diff, correction) // subtracting EPSILON == adding P (mod 2^64)
+    }
+}
+
+/// Lane-wise Goldilocks negation `P - y`. A zero lane yields `P` itself (non-canonical).
+#[inline]
+fn neg(y: __m256i) -> __m256i {
+    unsafe {
+        let y_canon_s = canonicalize_s(shift(y));
+        _mm256_sub_epi64(SHIFTED_FIELD_ORDER, y_canon_s) // the shifts cancel: plain `P - y`
+    }
+}
+
+/// Divide every lane by two in the Goldilocks field.
+#[inline(always)]
+pub(crate) fn halve(input: __m256i) -> __m256i {
+    // Even `v`: `v / 2 = v >> 1`. Odd `v`: `(v + P) / 2 = (v >> 1) + (P + 1) / 2`.
+    unsafe {
+        const LOW_BIT: __m256i = unsafe { transmute([1_i64; WIDTH]) };
+        let half_p_up = _mm256_set1_epi64x(P.div_ceil(2) as i64);
+        let floor_half = _mm256_srli_epi64::<1>(input);
+
+        // `0 - parity` is all-ones for odd lanes and zero for even ones: a select mask.
+        let parity = _mm256_and_si256(input, LOW_BIT);
+        let odd_mask = _mm256_sub_epi64(_mm256_setzero_si256(), parity);
+        let odd_fixup = _mm256_and_si256(half_p_up, odd_mask);
+        _mm256_add_epi64(floor_half, odd_fixup)
+    }
+}
+
+/// Lane-wise full 64x64 -> 128 multiplication from four 32x32 partial products, returning `(hi, lo)`.
+#[inline]
+fn mul64_64(x: __m256i, y: __m256i) -> (__m256i, __m256i) {
+    unsafe {
+        // Move the high 32 bits of each lane into the low 32 bits via a float-domain swizzle.
+        // (vpshufd / movehdup runs on port 5 and doesn't compete with the multiplier on ports 0/1.)
+        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+        let y_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(y)));
+
+        let mul_ll = _mm256_mul_epu32(x, y); // lo(x) * lo(y); `mul_epu32` reads only each lane's low 32 bits
+        let mul_lh = _mm256_mul_epu32(x, y_hi); // lo(x) * hi(y)
+        let mul_hl = _mm256_mul_epu32(x_hi, y); // hi(x) * lo(y)
+        let mul_hh = _mm256_mul_epu32(x_hi, y_hi); // hi(x) * hi(y)
+
+        let mul_ll_hi = _mm256_srli_epi64::<32>(mul_ll);
+        let t0 = _mm256_add_epi64(mul_hl, mul_ll_hi); // no overflow: (2^32 - 1)^2 + (2^32 - 1) < 2^64
+        let t0_lo = _mm256_and_si256(t0, EPSILON); // EPSILON doubles as the low-32-bit mask
+        let t0_hi = _mm256_srli_epi64::<32>(t0);
+        let t1 = _mm256_add_epi64(mul_lh, t0_lo); // middle column; same no-overflow bound as `t0`
+        let t2 = _mm256_add_epi64(mul_hh, t0_hi);
+        let t1_hi = _mm256_srli_epi64::<32>(t1);
+        let res_hi = _mm256_add_epi64(t2, t1_hi); // bits 64..128 of the product
+
+        let t1_lo = _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(t1))); // low half copied into the high half
+        let res_lo = _mm256_blend_epi32::<0xaa>(mul_ll, t1_lo); // odd 32-bit slots from `t1_lo`, even from `mul_ll`
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Lane-wise full 64-bit squaring, returning `(hi, lo)`; needs three 32x32 products instead of four.
+#[inline]
+fn square64(x: __m256i) -> (__m256i, __m256i) {
+    unsafe {
+        let x_hi = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(x)));
+
+        let mul_ll = _mm256_mul_epu32(x, x); // lo^2
+        let mul_lh = _mm256_mul_epu32(x, x_hi); // lo * hi; enters x^2 with weight 2 * 2^32 = 2^33
+        let mul_hh = _mm256_mul_epu32(x_hi, x_hi); // hi^2
+
+        let mul_ll_hi = _mm256_srli_epi64::<33>(mul_ll); // align `mul_ll` to the 2^33 weight of `mul_lh`
+        let t0 = _mm256_add_epi64(mul_lh, mul_ll_hi);
+        let t0_hi = _mm256_srli_epi64::<31>(t0); // 33 + 31 = 64: carry of the lower terms into the high word
+        let res_hi = _mm256_add_epi64(mul_hh, t0_hi);
+
+        let mul_lh_lo = _mm256_slli_epi64::<33>(mul_lh);
+        let res_lo = _mm256_add_epi64(mul_ll, mul_lh_lo); // low word, wrapping mod 2^64
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Lane-wise `x_s + y` with `x_s` pre-shifted by 2^63 and `y <= 2^64 - 2^32`; result is shifted.
+#[inline]
+unsafe fn add_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let sum_s = _mm256_add_epi64(x_s, y);
+        // 32-bit compare: only the mask's high half survives the shift below, becoming EPSILON.
+        let overflowed = _mm256_cmpgt_epi32(x_s, sum_s);
+        _mm256_add_epi64(sum_s, _mm256_srli_epi64::<32>(overflowed))
+    }
+}
+
+/// Lane-wise `x_s - y` with `x_s` pre-shifted by 2^63 and `y <= 2^64 - 2^32`; result is shifted.
+#[inline]
+unsafe fn sub_small_64s_64_s(x_s: __m256i, y: __m256i) -> __m256i {
+    unsafe {
+        let diff_s = _mm256_sub_epi64(x_s, y);
+        // 32-bit compare: only the mask's high half survives the shift below, becoming EPSILON.
+        let borrowed = _mm256_cmpgt_epi32(diff_s, x_s);
+        _mm256_sub_epi64(diff_s, _mm256_srli_epi64::<32>(borrowed))
+    }
+}
+
+/// Reduce the 128-bit value `hi * 2^64 + lo` modulo `P`. The result may exceed `P`.
+#[inline]
+fn reduce128(x: (__m256i, __m256i)) -> __m256i {
+    unsafe {
+        let (hi, lo) = x;
+        let hi_top = _mm256_srli_epi64::<32>(hi); // bits 96..128 of the input
+
+        // Work in the shifted domain, which is what the `*_small_*` helpers expect.
+        let acc_s = shift(lo);
+
+        // 2^96 = -1 mod P, so the top 32 bits are subtracted.
+        let acc_s = sub_small_64s_64_s(acc_s, hi_top);
+
+        // 2^64 = EPSILON mod P, so bits 64..96 contribute `lo32(hi) * EPSILON`.
+        // `_mm256_mul_epu32` reads only the low 32 bits of each lane, so `hi` needs no mask.
+        let hi_low_times_eps = _mm256_mul_epu32(hi, EPSILON);
+
+        shift(add_small_64s_64_s(acc_s, hi_low_times_eps))
+    }
+}
+
+/// Lane-wise Goldilocks product: full 128-bit multiply, then `reduce128`. Result may exceed `P`.
+#[inline]
+fn mul(x: __m256i, y: __m256i) -> __m256i {
+    reduce128(mul64_64(x, y))
+}
+
+/// Lane-wise Goldilocks square: full 128-bit squaring, then `reduce128`. Result may exceed `P`.
+#[inline]
+fn square(x: __m256i) -> __m256i {
+    reduce128(square64(x))
+}
+
+// =========================================================================
+// SIMD-vectorized Poseidon1 MDS multiplication
+// =========================================================================
+//
+// Computes the width-8 circulant MDS matrix-vector product entirely in
+// `__m256i` registers, with delayed reduction. Each output is
+// `sum_j MDS_ROW[(j-i) mod 8] * state[j]`. Coefficients are in
+// {1, 3, 4, 7, 8, 9} (max 9), so per-term products fit in u68 and sums of
+// 8 terms fit comfortably in u71.
+//
+// We multiply via two 32x32 `_mm256_mul_epu32` calls (low half and high
+// half of state). Sums of the low and high halves are accumulated
+// separately into u64s, then we assemble the (hi, lo) u128 pair and call
+// `reduce128`.
+
+use crate::poseidon1::{MDS8_ROW, POSEIDON1_WIDTH};
+
+/// Add the scalar `c` to every lane of `x`, skipping the right-hand-side
+/// canonicalization that the generic `Add` performs.
+///
+/// # Safety contract
+/// The caller must guarantee that `c.value < P`. Round constants from
+/// `GOLDILOCKS_POSEIDON1_RC_8` satisfy this trivially.
+#[inline(always)]
+pub(crate) fn add_canonical_scalar(x: PackedGoldilocksAVX2, c: Goldilocks) -> PackedGoldilocksAVX2 {
+    // `c` is canonical by contract, so its splat is shifted and handed straight to
+    // `add_no_double_overflow_64_64s_s`; `canonicalize_s` is only needed for arbitrary u64s.
+    let c_splat_s = shift(PackedGoldilocksAVX2::broadcast(c).to_vector());
+
+    // SAFETY: with `c < P` (caller contract) the sum stays below `2^64 + P`, which is
+    // the bound the helper assumes.
+    let sum_s = unsafe { add_no_double_overflow_64_64s_s(x.to_vector(), c_splat_s) };
+
+    PackedGoldilocksAVX2::from_vector(shift(sum_s))
+}
+
+/// Compute output lane-vector `I` of the width-8 circulant MDS matrix-vector product.
+///
+/// `I` is a const generic so that every output is its own monomorphized function;
+/// per the original authors, LLVM otherwise re-rolls the 8 computations into a loop,
+/// serializing them and bouncing state through stack memory.
+#[inline(always)]
+unsafe fn mds_output<const I: usize>(s: &[__m256i; 8], s_hi: &[__m256i; 8]) -> __m256i {
+    unsafe {
+        let mut acc_lo = _mm256_setzero_si256(); // sum of coeff * low 32 bits of each state word
+        let mut acc_hi = _mm256_setzero_si256(); // sum of coeff * high 32 bits of each state word
+        let mut col = 0;
+        while col < 8 {
+            let coeff = MDS8_ROW[(col + 8 - I) % 8];
+            let coeff_vec = _mm256_set1_epi64x(coeff);
+            acc_lo = _mm256_add_epi64(acc_lo, _mm256_mul_epu32(s[col], coeff_vec));
+            acc_hi = _mm256_add_epi64(acc_hi, _mm256_mul_epu32(s_hi[col], coeff_vec));
+            col += 1;
+        }
+
+        // Total = acc_lo + acc_hi * 2^32, assembled as a 128-bit `(hi, lo)` pair.
+        let hi_part_lo = _mm256_slli_epi64::<32>(acc_hi);
+        let lo = _mm256_add_epi64(acc_lo, hi_part_lo);
+        // Unsigned `lo < hi_part_lo` (the add wrapped), emulated in the shifted domain.
+        let carry_mask = _mm256_cmpgt_epi64(shift(hi_part_lo), shift(lo));
+        // The mask is -1 on carry, so subtracting it adds the carry into the high word.
+        let hi = _mm256_sub_epi64(_mm256_srli_epi64::<32>(acc_hi), carry_mask);
+
+        reduce128((hi, lo))
+    }
+}
+
+/// SIMD MDS multiplication for the width-8 circulant Poseidon1 matrix.
+///
+/// Takes/returns the state by value so the caller can keep it in named SSA values
+/// (ymm registers) rather than indexing through a `&mut [_; 8]` array reference (which
+/// forces the array through the stack). Each of the 8 outputs is computed by a
+/// distinct const-generic instantiation of `mds_output`, preventing LLVM
+/// from re-rolling them.
+#[inline(always)]
+pub(crate) fn mds_mul_simd(state: [PackedGoldilocksAVX2; POSEIDON1_WIDTH]) -> [PackedGoldilocksAVX2; POSEIDON1_WIDTH] {
+    unsafe {
+        let s: [__m256i; 8] = [
+            state[0].to_vector(),
+            state[1].to_vector(),
+            state[2].to_vector(),
+            state[3].to_vector(),
+            state[4].to_vector(),
+            state[5].to_vector(),
+            state[6].to_vector(),
+            state[7].to_vector(),
+        ];
+        let s_hi: [__m256i; 8] = [ // high 32 bits of every state word, computed once for all 8 outputs
+            _mm256_srli_epi64::<32>(s[0]),
+            _mm256_srli_epi64::<32>(s[1]),
+            _mm256_srli_epi64::<32>(s[2]),
+            _mm256_srli_epi64::<32>(s[3]),
+            _mm256_srli_epi64::<32>(s[4]),
+            _mm256_srli_epi64::<32>(s[5]),
+            _mm256_srli_epi64::<32>(s[6]),
+            _mm256_srli_epi64::<32>(s[7]),
+        ];
+
+        [
+            PackedGoldilocksAVX2::from_vector(mds_output::<0>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<1>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<2>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<3>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<4>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<5>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<6>(&s, &s_hi)),
+            PackedGoldilocksAVX2::from_vector(mds_output::<7>(&s, &s_hi)),
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{Goldilocks, PackedGoldilocksAVX2, WIDTH};
+
+    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
+        0xFFFF_FFFF_0000_0000, // P - 1
+        0xFFFF_FFFF_FFFF_FFFF, // u64::MAX
+        0x0000_0000_0000_0001,
+        0xFFFF_FFFF_0000_0001, // P
+    ]);
+
+    #[test] // Round trip through `__m256i` and back.
+    fn pack_round_trip() {
+        let packed = PackedGoldilocksAVX2(SPECIAL_VALS);
+        let round_tripped = PackedGoldilocksAVX2::from_vector(packed.to_vector());
+        assert_eq!(round_tripped.0, SPECIAL_VALS);
+    }
+}
diff --git a/crates/backend/goldilocks/src/x86_64_avx512/mod.rs b/crates/backend/goldilocks/src/x86_64_avx512/mod.rs
new file mode 100644
index 000000000..4e8ba31a8
--- /dev/null
+++ b/crates/backend/goldilocks/src/x86_64_avx512/mod.rs
@@ -0,0 +1,5 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+pub(crate) mod packing;
+
+pub use packing::*;
diff --git a/crates/backend/goldilocks/src/x86_64_avx512/packing.rs b/crates/backend/goldilocks/src/x86_64_avx512/packing.rs
new file mode 100644
index 000000000..93852147e
--- /dev/null
+++ b/crates/backend/goldilocks/src/x86_64_avx512/packing.rs
@@ -0,0 +1,453 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+use alloc::vec::Vec;
+use core::arch::x86_64::*;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::mem::transmute;
+use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::interleave::{interleave_u64, interleave_u128, interleave_u256};
+use field::op_assign_macros::{
+    impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods, impl_packed_value,
+    impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field, ring_sum,
+};
+use field::{
+    Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue, PermutationMonomial,
+    PrimeCharacteristicRing, PrimeField64, impl_packed_field_pow_2,
+};
+use rand::Rng;
+use rand::distr::{Distribution, StandardUniform};
+use utils::reconstitute_from_base;
+
+use crate::helpers::exp_10540996611094048183;
+use crate::{Goldilocks, P};
+
+const WIDTH: usize = 8; // Number of `Goldilocks` (u64) lanes in one packed value / `__m512i`.
+
+/// `WIDTH` (= 8) `Goldilocks` elements operated on lane-wise through AVX512 `__m512i` arithmetic.
+#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
+#[repr(transparent)] // Needed to make `transmute`s safe.
+#[must_use]
+pub struct PackedGoldilocksAVX512(pub [Goldilocks; WIDTH]);
+
+impl PackedGoldilocksAVX512 {
+    /// Reinterpret the eight packed lanes as a single `__m512i` register value.
+    #[inline]
+    #[must_use]
+    pub(crate) fn to_vector(self) -> __m512i {
+        unsafe { transmute::<Self, __m512i>(self) } // SAFETY: size-checked; `Self` is a transparent array wrapper.
+    }
+
+    /// Reinterpret a `__m512i` register value as eight packed `Goldilocks` lanes.
+    ///
+    /// Goldilocks elements may be arbitrary u64s, so this is always safe.
+    #[inline]
+    pub(crate) fn from_vector(vector: __m512i) -> Self {
+        unsafe { transmute::<__m512i, Self>(vector) } // SAFETY: size-checked; any lane bit pattern is valid.
+    }
+
+    /// Replicate `value` into every lane. `const` counterpart of `From<Goldilocks>`.
+    #[inline]
+    const fn broadcast(value: Goldilocks) -> Self {
+        Self([value; WIDTH]) // array-repeat works in `const` context
+    }
+}
+
+impl From<Goldilocks> for PackedGoldilocksAVX512 {
+    fn from(x: Goldilocks) -> Self {
+        Self([x; WIDTH]) // Same splat as `Self::broadcast`: every lane receives `x`.
+    }
+}
+
+impl Add for PackedGoldilocksAVX512 {
+    type Output = Self; // Lane-wise sum mod `P`; lanes are not canonicalized and may be > P.
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        Self::from_vector(add(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl Sub for PackedGoldilocksAVX512 {
+    type Output = Self; // Lane-wise difference mod `P`; lanes may be > P.
+    #[inline]
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self::from_vector(sub(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl Neg for PackedGoldilocksAVX512 {
+    type Output = Self; // Lane-wise `P - x`; a zero lane comes back as `P` (non-canonical zero).
+    #[inline]
+    fn neg(self) -> Self::Output {
+        Self::from_vector(neg(Self::to_vector(self)))
+    }
+}
+
+impl Mul for PackedGoldilocksAVX512 {
+    type Output = Self; // Lane-wise product mod `P`; lanes may be > P.
+    #[inline]
+    fn mul(self, rhs: Self) -> Self::Output {
+        Self::from_vector(mul(Self::to_vector(self), Self::to_vector(rhs)))
+    }
+}
+
+impl_add_assign!(PackedGoldilocksAVX512);
+impl_sub_assign!(PackedGoldilocksAVX512);
+impl_mul_methods!(PackedGoldilocksAVX512);
+ring_sum!(PackedGoldilocksAVX512);
+impl_rng!(PackedGoldilocksAVX512);
+
+impl PrimeCharacteristicRing for PackedGoldilocksAVX512 {
+    type PrimeSubfield = Goldilocks;
+
+    const ZERO: Self = Self([Goldilocks::ZERO; WIDTH]);
+    const ONE: Self = Self([Goldilocks::ONE; WIDTH]);
+    const TWO: Self = Self([Goldilocks::TWO; WIDTH]);
+    const NEG_ONE: Self = Self([Goldilocks::NEG_ONE; WIDTH]);
+
+    #[inline] // Splat the scalar into every lane.
+    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+        Self::from(f)
+    }
+
+    #[inline] // Lane-wise division by two via the vector `halve` kernel.
+    fn halve(&self) -> Self {
+        Self::from_vector(halve(Self::to_vector(*self)))
+    }
+
+    #[inline] // Lane-wise squaring via the dedicated `square` kernel.
+    fn square(&self) -> Self {
+        Self::from_vector(square(Self::to_vector(*self)))
+    }
+
+    #[inline]
+    fn zero_vec(len: usize) -> Vec<Self> {
+        // SAFETY: `Self` is a `repr(transparent)` wrapper around `[Goldilocks; WIDTH]`.
+        unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
+    }
+}
+
+impl_add_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_sub_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_mul_base_field!(PackedGoldilocksAVX512, Goldilocks);
+impl_div_methods!(PackedGoldilocksAVX512, Goldilocks);
+impl_sum_prod_base_field!(PackedGoldilocksAVX512, Goldilocks);
+
+impl Algebra<Goldilocks> for PackedGoldilocksAVX512 {}
+
+impl InjectiveMonomial<7> for PackedGoldilocksAVX512 {} // 7 is coprime to p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537.
+
+impl PermutationMonomial<7> for PackedGoldilocksAVX512 {
+    fn injective_exp_root_n(&self) -> Self {
+        exp_10540996611094048183(*self) // 7 * 10540996611094048183 = 4 * (p - 1) + 1, i.e. 7^-1 mod (p - 1).
+    }
+}
+
+impl_packed_value!(PackedGoldilocksAVX512, Goldilocks, WIDTH);
+
+unsafe impl PackedField for PackedGoldilocksAVX512 {
+    type Scalar = Goldilocks; // NOTE(review): no SAFETY note; presumably relies on the repr(transparent) array layout — confirm.
+}
+
+impl_packed_field_pow_2!(
+    PackedGoldilocksAVX512;
+    [
+        (1, interleave_u64),
+        (2, interleave_u128),
+        (4, interleave_u256),
+    ],
+    WIDTH
+);
+
+const FIELD_ORDER: __m512i = unsafe { transmute([Goldilocks::ORDER_U64; WIDTH]) }; // `P` in every lane.
+const EPSILON: __m512i = unsafe { transmute([Goldilocks::ORDER_U64.wrapping_neg(); WIDTH]) }; // 2^64 - P = 2^32 - 1 per lane.
+
+#[inline]
+unsafe fn canonicalize(x: __m512i) -> __m512i {
+    // `x - P` wraps to a huge value when `x < P` and is the canonical form otherwise, so an
+    // unsigned `min` selects the right one: one sub + one min instead of cmpge + masked sub.
+    let minus_p = unsafe { _mm512_sub_epi64(x, FIELD_ORDER) };
+    unsafe { _mm512_min_epu64(minus_p, x) }
+}
+
+/// Lane-wise `x + y mod P`; the result is not canonicalized and may be > P.
+///
+/// # Safety
+/// Caller must ensure `x + y < 2^64 + P`.
+#[inline]
+unsafe fn add_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
+    unsafe {
+        let sum = _mm512_add_epi64(x, y);
+        let wrapped = _mm512_cmplt_epu64_mask(sum, y); // bit set where the 64-bit add overflowed
+        _mm512_mask_sub_epi64(sum, wrapped, sum, FIELD_ORDER) // -P == +EPSILON (mod 2^64)
+    }
+}
+
+/// Lane-wise `x - y mod P`; the result is not canonicalized and may be > P.
+///
+/// # Safety
+/// Caller must ensure `x - y > -P`.
+#[inline]
+unsafe fn sub_no_double_overflow_64_64(x: __m512i, y: __m512i) -> __m512i {
+    unsafe {
+        let diff = _mm512_sub_epi64(x, y);
+        let borrowed = _mm512_cmplt_epu64_mask(x, y); // bit set where the 64-bit sub underflowed
+        _mm512_mask_add_epi64(diff, borrowed, diff, FIELD_ORDER) // +P == -EPSILON (mod 2^64)
+    }
+}
+
+#[inline] // Lane-wise `x + y mod P`; `y` is canonicalized first so that `x + y < 2^64 + P` holds. Result may be > P.
+fn add(x: __m512i, y: __m512i) -> __m512i {
+    unsafe { add_no_double_overflow_64_64(x, canonicalize(y)) }
+}
+
+#[inline] // Lane-wise `x - y mod P`; `y` is canonicalized first so that `x - y > -P` holds. Result may be > P.
+fn sub(x: __m512i, y: __m512i) -> __m512i {
+    unsafe { sub_no_double_overflow_64_64(x, canonicalize(y)) }
+}
+
+#[inline] // Lane-wise `P - y` on the canonicalized `y`; a zero lane yields `P` itself (non-canonical).
+fn neg(y: __m512i) -> __m512i {
+    unsafe { _mm512_sub_epi64(FIELD_ORDER, canonicalize(y)) }
+}
+
+/// Divide every lane by two in the Goldilocks field.
+#[inline(always)]
+pub(crate) fn halve(input: __m512i) -> __m512i {
+    // Even `v`: `v / 2 = v >> 1`. Odd `v`: `(v + P) / 2 = (v >> 1) + (P + 1) / 2`.
+    unsafe {
+        const LOW_BIT: __m512i = unsafe { transmute([1_i64; WIDTH]) };
+        let half_p_up = _mm512_set1_epi64(P.div_ceil(2) as i64);
+        let floor_half = _mm512_srli_epi64::<1>(input);
+
+        let is_odd = _mm512_test_epi64_mask(input, LOW_BIT); // one mask bit per odd lane
+        _mm512_mask_add_epi64(floor_half, is_odd, floor_half, half_p_up)
+    }
+}
+
+/// Blend mask selecting the low 32-bit half of every 64-bit lane (`__mmask16` is a plain `u16`).
+const LO_32_BITS_MASK: __mmask16 = 0b0101_0101_0101_0101;
+
+/// Lane-wise full 64x64 -> 128 multiplication from four 32x32 partial products, returning `(hi, lo)`.
+#[inline]
+fn mul64_64(x: __m512i, y: __m512i) -> (__m512i, __m512i) {
+    unsafe {
+        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x))); // high 32 bits copied into the low half
+        let y_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(y)));
+
+        let mul_ll = _mm512_mul_epu32(x, y); // lo(x) * lo(y); `mul_epu32` reads only each lane's low 32 bits
+        let mul_lh = _mm512_mul_epu32(x, y_hi); // lo(x) * hi(y)
+        let mul_hl = _mm512_mul_epu32(x_hi, y); // hi(x) * lo(y)
+        let mul_hh = _mm512_mul_epu32(x_hi, y_hi); // hi(x) * hi(y)
+
+        let mul_ll_hi = _mm512_srli_epi64::<32>(mul_ll);
+        let t0 = _mm512_add_epi64(mul_hl, mul_ll_hi); // no overflow: (2^32 - 1)^2 + (2^32 - 1) < 2^64
+        let t0_lo = _mm512_and_si512(t0, EPSILON); // EPSILON doubles as the low-32-bit mask
+        let t0_hi = _mm512_srli_epi64::<32>(t0);
+        let t1 = _mm512_add_epi64(mul_lh, t0_lo); // middle column; same no-overflow bound as `t0`
+        let t2 = _mm512_add_epi64(mul_hh, t0_hi);
+        let t1_hi = _mm512_srli_epi64::<32>(t1);
+        let res_hi = _mm512_add_epi64(t2, t1_hi); // bits 64..128 of the product
+
+        let t1_lo = _mm512_castps_si512(_mm512_moveldup_ps(_mm512_castsi512_ps(t1))); // low half copied into the high half
+        let res_lo = _mm512_mask_blend_epi32(LO_32_BITS_MASK, t1_lo, mul_ll); // even 32-bit slots from `mul_ll`, odd from `t1_lo`
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Lane-wise full 64-bit squaring, returning `(hi, lo)`; needs three 32x32 products instead of four.
+#[inline]
+fn square64(x: __m512i) -> (__m512i, __m512i) {
+    unsafe {
+        let x_hi = _mm512_castps_si512(_mm512_movehdup_ps(_mm512_castsi512_ps(x)));
+
+        let mul_ll = _mm512_mul_epu32(x, x); // lo^2
+        let mul_lh = _mm512_mul_epu32(x, x_hi); // lo * hi; enters x^2 with weight 2 * 2^32 = 2^33
+        let mul_hh = _mm512_mul_epu32(x_hi, x_hi); // hi^2
+
+        let mul_ll_hi = _mm512_srli_epi64::<33>(mul_ll); // align `mul_ll` to the 2^33 weight of `mul_lh`
+        let t0 = _mm512_add_epi64(mul_lh, mul_ll_hi);
+        let t0_hi = _mm512_srli_epi64::<31>(t0); // 33 + 31 = 64: carry of the lower terms into the high word
+        let res_hi = _mm512_add_epi64(mul_hh, t0_hi);
+
+        let mul_lh_lo = _mm512_slli_epi64::<33>(mul_lh);
+        let res_lo = _mm512_add_epi64(mul_ll, mul_lh_lo); // low word, wrapping mod 2^64
+
+        (res_hi, res_lo)
+    }
+}
+
+/// Reduce the 128-bit value `hi * 2^64 + lo` modulo `P`. Result may be > P.
+#[inline]
+fn reduce128(x: (__m512i, __m512i)) -> __m512i {
+    unsafe {
+        let (hi, lo) = x;
+
+        // 2^96 = -1 mod P, so bits 96..128 are subtracted from the low word.
+        let hi_top = _mm512_srli_epi64::<32>(hi);
+        let acc = sub_no_double_overflow_64_64(lo, hi_top);
+
+        // 2^64 = 2^32 - 1 = EPSILON mod P, so bits 64..96 contribute `lo32(hi) * EPSILON`;
+        // `_mm512_mul_epu32` reads only the low 32 bits of each lane, so `hi` needs no mask.
+        let hi_low_times_eps = _mm512_mul_epu32(hi, EPSILON);
+
+        add_no_double_overflow_64_64(acc, hi_low_times_eps)
+    }
+}
+
+#[inline] // Lane-wise Goldilocks product: full 128-bit multiply, then `reduce128`. Result may be > P.
+fn mul(x: __m512i, y: __m512i) -> __m512i {
+    reduce128(mul64_64(x, y))
+}
+
+#[inline] // Lane-wise Goldilocks square: full 128-bit squaring, then `reduce128`. Result may be > P.
+fn square(x: __m512i) -> __m512i {
+    reduce128(square64(x))
+}
+
+// =========================================================================
+// SIMD-vectorized Poseidon1 MDS multiplication
+// =========================================================================
+//
+// Computes the width-8 circulant MDS matrix-vector product entirely in
+// `__m512i` registers, with delayed reduction. Each output is
+// `sum_j MDS_ROW[(j-i) mod 8] * state[j]`. Coefficients are in
+// {1, 3, 4, 7, 8, 9} (max 9), so per-term products fit in u68 and sums of
+// 8 terms fit comfortably in u71.
+//
+// We multiply via two 32x32 `_mm512_mul_epu32` calls (low half and high
+// half of state), which exploits that the constants fit in 4 bits (so the
+// "high 32 bits" operand of mul_epu32 is zero by construction). Sums of
+// the low and high halves are accumulated separately into u64s, then we
+// assemble the (hi, lo) u128 pair and call `reduce128`.
+
+use crate::poseidon1::{MDS8_ROW, POSEIDON1_WIDTH};
+
+/// Add the scalar `c` to every lane of `x`, skipping the right-hand-side
+/// canonicalization that the generic `Add` performs.
+///
+/// # Safety contract
+/// The caller must guarantee that `c.value < P`. Otherwise `x + c` may exceed
+/// `2^64 + P` and the wrap-detection in `add_no_double_overflow_64_64` will
+/// produce a wrong result. Round constants pulled from
+/// `GOLDILOCKS_POSEIDON1_RC_8` satisfy this trivially.
+#[inline(always)]
+pub(crate) fn add_canonical_scalar(x: PackedGoldilocksAVX512, c: Goldilocks) -> PackedGoldilocksAVX512 {
+    let c_splat = PackedGoldilocksAVX512::broadcast(c).to_vector();
+    // SAFETY: `c < P` (caller contract) keeps `x + c` below `2^64 + P`.
+    let sum = unsafe { add_no_double_overflow_64_64(x.to_vector(), c_splat) };
+    PackedGoldilocksAVX512::from_vector(sum)
+}
+
+/// Compute output lane-vector `I` of the width-8 circulant MDS matrix-vector product.
+///
+/// `I` is a const generic so that every output is its own monomorphized function;
+/// per the original authors, LLVM otherwise re-rolls the 8 computations into a loop,
+/// serializing them and bouncing state through stack memory.
+#[inline(always)]
+unsafe fn mds_output<const I: usize>(s: &[__m512i; 8], s_hi: &[__m512i; 8]) -> __m512i {
+    unsafe {
+        let mut acc_lo = _mm512_setzero_si512();
+        let mut acc_hi = _mm512_setzero_si512();
+        // Row I of the circulant matrix is `MDS8_ROW` rotated right by I.
+        // Bounds and indices are compile-time constants, which (per the original
+        // authors) lets LLVM unroll this loop completely.
+        let mut col = 0;
+        while col < 8 {
+            let coeff = MDS8_ROW[(col + 8 - I) % 8];
+            let coeff_vec = _mm512_set1_epi64(coeff);
+            acc_lo = _mm512_add_epi64(acc_lo, _mm512_mul_epu32(s[col], coeff_vec));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_mul_epu32(s_hi[col], coeff_vec));
+            col += 1;
+        }
+
+        // Total = acc_lo + (acc_hi << 32), assembled as a 128-bit `(hi, lo)` pair.
+        // acc_lo < 2^39 and acc_hi < 2^39, so acc_hi >> 32 < 2^7.
+        let hi_part_lo = _mm512_slli_epi64::<32>(acc_hi);
+        let lo = _mm512_add_epi64(acc_lo, hi_part_lo);
+        // Unsigned overflow check: `lo < hi_part_lo` iff the 64-bit add wrapped.
+        let carried = _mm512_cmplt_epu64_mask(lo, hi_part_lo);
+        let hi_base = _mm512_srli_epi64::<32>(acc_hi);
+        let hi = _mm512_mask_add_epi64(hi_base, carried, hi_base, _mm512_set1_epi64(1));
+
+        reduce128((hi, lo))
+    }
+}
+
+/// SIMD MDS multiplication for the width-8 circulant Poseidon1 matrix.
+///
+/// Takes/returns the state by value so the caller can keep it in named SSA values
+/// (zmm registers) rather than indexing through a `&mut [_; 8]` array reference (which
+/// forces the array through the stack). Each of the 8 outputs is computed by a
+/// distinct const-generic instantiation of `mds_output`, preventing LLVM
+/// from re-rolling them.
+///
+/// Note: an `avx512ifma` variant of this (using `vpmadd52luq` to fuse the
+/// mul-add accumulation) was tried and measured ~15% *slower* on Zen 4 — the
+/// fused IFMA op runs on the multiplier port at no better throughput than
+/// `vpmuludq`, while the `vpaddq` it replaces was happily dual-issuing on the
+/// add ports. Kept the `vpmuludq + vpaddq` form.
+#[inline(always)]
+pub(crate) fn mds_mul_simd(
+    state: [PackedGoldilocksAVX512; POSEIDON1_WIDTH],
+) -> [PackedGoldilocksAVX512; POSEIDON1_WIDTH] {
+    unsafe {
+        let s: [__m512i; 8] = [
+            state[0].to_vector(),
+            state[1].to_vector(),
+            state[2].to_vector(),
+            state[3].to_vector(),
+            state[4].to_vector(),
+            state[5].to_vector(),
+            state[6].to_vector(),
+            state[7].to_vector(),
+        ];
+        // Precompute the high 32 bits of every state slot once; all 8 outputs reuse them.
+        let s_hi: [__m512i; 8] = [
+            _mm512_srli_epi64::<32>(s[0]),
+            _mm512_srli_epi64::<32>(s[1]),
+            _mm512_srli_epi64::<32>(s[2]),
+            _mm512_srli_epi64::<32>(s[3]),
+            _mm512_srli_epi64::<32>(s[4]),
+            _mm512_srli_epi64::<32>(s[5]),
+            _mm512_srli_epi64::<32>(s[6]),
+            _mm512_srli_epi64::<32>(s[7]),
+        ];
+
+        [
+            PackedGoldilocksAVX512::from_vector(mds_output::<0>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<1>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<2>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<3>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<4>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<5>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<6>(&s, &s_hi)),
+            PackedGoldilocksAVX512::from_vector(mds_output::<7>(&s, &s_hi)),
+        ]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{Goldilocks, PackedGoldilocksAVX512, WIDTH};
+
+    const SPECIAL_VALS: [Goldilocks; WIDTH] = Goldilocks::new_array([
+        0xFFFF_FFFF_0000_0001, // P
+        0xFFFF_FFFF_0000_0000, // P - 1
+        0xFFFF_FFFE_FFFF_FFFF, // P - 2
+        0xFFFF_FFFF_FFFF_FFFF, // u64::MAX
+        0x0000_0000_0000_0000,
+        0x0000_0000_0000_0001,
+        0x0000_0000_0000_0002,
+        0x0FFF_FFFF_F000_0000,
+    ]);
+
+    #[test] // Round trip through `__m512i` and back.
+    fn pack_round_trip() {
+        let packed = PackedGoldilocksAVX512(SPECIAL_VALS);
+        let round_tripped = PackedGoldilocksAVX512::from_vector(packed.to_vector());
+        assert_eq!(round_tripped.0, SPECIAL_VALS);
+    }
+}
diff --git a/crates/backend/koala-bear/src/benchmark_poseidons.rs b/crates/backend/koala-bear/src/benchmark_poseidons_koalabear.rs
similarity index 93%
rename from crates/backend/koala-bear/src/benchmark_poseidons.rs
rename to crates/backend/koala-bear/src/benchmark_poseidons_koalabear.rs
index 66c6a5a0d..ec34b729e 100644
--- a/crates/backend/koala-bear/src/benchmark_poseidons.rs
+++ b/crates/backend/koala-bear/src/benchmark_poseidons_koalabear.rs
@@ -13,7 +13,7 @@ const PACKING_WIDTH: usize = <FPacking as PackedValue>::WIDTH;
 #[test]
 #[ignore]
 fn bench_poseidon() {
-    // cargo test --release --package mt-koala-bear --lib -- benchmark_poseidons::bench_poseidon --exact --nocapture --ignored
+    // cargo test --release --package mt-koala-bear --lib -- benchmark_poseidons_koalabear::bench_poseidon --exact --nocapture --ignored
 
     let n = 1 << 23;
     let poseidon1_16 = default_koalabear_poseidon1_16();
diff --git a/crates/backend/koala-bear/src/lib.rs b/crates/backend/koala-bear/src/lib.rs
index 959ed3ada..d329843cc 100644
--- a/crates/backend/koala-bear/src/lib.rs
+++ b/crates/backend/koala-bear/src/lib.rs
@@ -11,7 +11,7 @@ pub mod quintic_extension;
 pub mod symmetric;
 
 #[cfg(test)]
-mod benchmark_poseidons;
+mod benchmark_poseidons_koalabear;
 
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 mod aarch64_neon;
diff --git a/crates/backend/poly/Cargo.toml b/crates/backend/poly/Cargo.toml
index dcdf80aed..10f98a5ed 100644
--- a/crates/backend/poly/Cargo.toml
+++ b/crates/backend/poly/Cargo.toml
@@ -14,4 +14,4 @@ rand.workspace = true
 serde.workspace = true
 
 [dev-dependencies]
-koala-bear = { path = "../koala-bear", package = "mt-koala-bear" }
+goldilocks = { path = "../goldilocks", package = "mt-goldilocks" }
diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs
index 64d3733f5..53848496d 100644
--- a/crates/backend/poly/src/eq_mle.rs
+++ b/crates/backend/poly/src/eq_mle.rs
@@ -1214,12 +1214,12 @@ mod tests {
     use std::time::Instant;
 
     use field::Field;
-    use koala_bear::QuinticExtensionFieldKB;
+    use goldilocks::CubicExtensionFieldGL;
     use rand::{RngExt, SeedableRng, rngs::StdRng};
 
     use super::*;
-    type F = koala_bear::KoalaBear;
-    type EF = QuinticExtensionFieldKB;
+    type F = goldilocks::Goldilocks;
+    type EF = CubicExtensionFieldGL;
 
     #[test]
     fn test_compute_sparse_eval() {
@@ -1304,8 +1304,10 @@ mod tests {
                 compute_eval_eq_packed::<_, true>(&eval, &mut out_2, scalar);
                 println!("EXTENSION PACKED: {:?}", time.elapsed());
 
-                let unpacked_out_2: Vec<EF> =
-                    <EF as ExtensionField<F>>::ExtensionPacking::to_ext_iter_vec(out_2.clone());
+                let unpacked_out_2: Vec<EF> = <<EF as ExtensionField<F>>::ExtensionPacking as PackedFieldExtension<
+                    F,
+                    EF,
+                >>::to_ext_iter_vec(out_2.clone());
                 assert_eq!(out_1, unpacked_out_2);
 
                 let mut out_3 = EF::zero_vec(1 << n_vars);
@@ -1313,7 +1315,7 @@ mod tests {
                 compute_eval_eq::<F, EF, true>(&eval, &mut out_3, scalar);
                 let out_3_packed = out_3
                     .par_chunks_exact(packing_width)
-                    .map(<EF as ExtensionField<F>>::ExtensionPacking::from_ext_slice)
+                    .map(<<EF as ExtensionField<F>>::ExtensionPacking as PackedFieldExtension<F, EF>>::from_ext_slice)
                     .collect::<Vec<_>>();
                 println!("EXTENSION PACKED AFTER: {:?}", time.elapsed());
 
@@ -1339,8 +1341,10 @@ mod tests {
                 compute_eval_eq_base_packed::<F, _, true>(&eval, &mut out_2, scalar);
                 println!("BASE PACKED: {:?}", time.elapsed());
 
-                let unpacked_out_2: Vec<EF> =
-                    <EF as ExtensionField<F>>::ExtensionPacking::to_ext_iter_vec(out_2.clone());
+                let unpacked_out_2: Vec<EF> = <<EF as ExtensionField<F>>::ExtensionPacking as PackedFieldExtension<
+                    F,
+                    EF,
+                >>::to_ext_iter_vec(out_2.clone());
                 assert_eq!(out_1, unpacked_out_2);
 
                 let mut out_3 = EF::zero_vec(1 << n_vars);
@@ -1348,7 +1352,7 @@ mod tests {
                 compute_eval_eq_base::<F, EF, true>(&eval, &mut out_3, scalar);
                 let out_3_packed = out_3
                     .par_chunks_exact(packing_width)
-                    .map(<EF as ExtensionField<F>>::ExtensionPacking::from_ext_slice)
+                    .map(<<EF as ExtensionField<F>>::ExtensionPacking as PackedFieldExtension<F, EF>>::from_ext_slice)
                     .collect::<Vec<_>>();
                 println!("BASE PACKED AFTER: {:?}", time.elapsed());
 
diff --git a/crates/backend/poly/src/evals.rs b/crates/backend/poly/src/evals.rs
index 7e0e07b4f..46926dc6e 100644
--- a/crates/backend/poly/src/evals.rs
+++ b/crates/backend/poly/src/evals.rs
@@ -350,11 +350,11 @@ where
 mod tests {
     use std::time::Instant;
 
-    use koala_bear::QuinticExtensionFieldKB;
+    use goldilocks::CubicExtensionFieldGL;
     use rand::{RngExt, SeedableRng, rngs::StdRng};
 
-    type F = QuinticExtensionFieldKB;
-    type EF = QuinticExtensionFieldKB;
+    type F = CubicExtensionFieldGL;
+    type EF = CubicExtensionFieldGL;
 
     use super::*;
 
diff --git a/crates/backend/poly/src/mle/mle_custom.rs b/crates/backend/poly/src/mle/mle_custom.rs
index 709d8afb8..b234dcae3 100644
--- a/crates/backend/poly/src/mle/mle_custom.rs
+++ b/crates/backend/poly/src/mle/mle_custom.rs
@@ -22,11 +22,11 @@ pub fn mle_of_zeros_then_ones<F: Field>(n_zeros: usize, point: &[F]) -> F {
 mod tests {
     use crate::{EvaluationsList, MultilinearPoint};
     use field::PrimeCharacteristicRing;
-    use koala_bear::KoalaBear;
+    use goldilocks::Goldilocks;
     use rand::{RngExt, SeedableRng, rngs::StdRng};
 
     use super::*;
-    type F = KoalaBear;
+    type F = Goldilocks;
 
     #[test]
     fn test_mle_of_zeros_then_ones() {
diff --git a/crates/backend/poly/src/next_mle.rs b/crates/backend/poly/src/next_mle.rs
index 7c9c687c2..960387e60 100644
--- a/crates/backend/poly/src/next_mle.rs
+++ b/crates/backend/poly/src/next_mle.rs
@@ -56,11 +56,11 @@ where
 #[cfg(test)]
 mod tests {
     use field::PrimeCharacteristicRing;
-    use koala_bear::KoalaBear;
+    use goldilocks::Goldilocks;
 
     use crate::{EvaluationsList, MultilinearPoint, matrix_next_mle_folded, next_mle, to_big_endian_in_field};
 
-    type F = KoalaBear;
+    type F = Goldilocks;
 
     #[test]
     fn test_matrix_down_folded() {
diff --git a/crates/backend/poly/src/utils.rs b/crates/backend/poly/src/utils.rs
index 5bb5fb1b4..4d1f2c313 100644
--- a/crates/backend/poly/src/utils.rs
+++ b/crates/backend/poly/src/utils.rs
@@ -399,89 +399,3 @@ pub fn to_little_endian_in_field<F: Field>(value: usize, bit_count: usize) -> Ve
     res.reverse();
     res
 }
-
-#[cfg(test)]
-mod bench_tests {
-    use std::time::{Duration, Instant};
-
-    use koala_bear::QuinticExtensionFieldKB;
-    use rand::{RngExt, SeedableRng, rngs::StdRng};
-
-    use super::*;
-
-    type EF = QuinticExtensionFieldKB;
-
-    const LOG_SIZES: [usize; 6] = [8, 12, 16, 20, 22, 24];
-    const REPETITIONS: usize = 10;
-
-    fn print_header(name: &str) {
-        println!(
-            "\nBenchmarking {} (packing_width = {}, repetitions = {})",
-            name,
-            packing_width::<EF>(),
-            REPETITIONS
-        );
-        println!(
-            "{:>10} | {:>14} | {:>14} | {:>14} | {:>14}",
-            "log_n", "n_ext_elems", "avg (ms)", "min (ms)", "max (ms)"
-        );
-    }
-
-    fn measure<R>(mut f: impl FnMut() -> R) -> (Duration, Duration, Duration) {
-        let mut total = Duration::ZERO;
-        let mut min_t = Duration::MAX;
-        let mut max_t = Duration::ZERO;
-        for _ in 0..REPETITIONS {
-            let t = Instant::now();
-            let out = f();
-            let d = t.elapsed();
-            std::hint::black_box(out);
-            total += d;
-            if d < min_t {
-                min_t = d;
-            }
-            if d > max_t {
-                max_t = d;
-            }
-        }
-        (total / REPETITIONS as u32, min_t, max_t)
-    }
-
-    fn print_row(log_n: usize, n: usize, avg: Duration, min_t: Duration, max_t: Duration) {
-        println!(
-            "{:>10} | {:>14} | {:>14.3} | {:>14.3} | {:>14.3}",
-            log_n,
-            n,
-            avg.as_secs_f64() * 1000.0,
-            min_t.as_secs_f64() * 1000.0,
-            max_t.as_secs_f64() * 1000.0,
-        );
-    }
-
-    #[test]
-    fn bench_unpack_extension() {
-        let mut rng = StdRng::seed_from_u64(0);
-        print_header("unpack_extension");
-        for &log_n in &LOG_SIZES {
-            let n = 1usize << log_n;
-            let ext_vec: Vec<EF> = (0..n).map(|_| rng.random()).collect();
-            let packed = pack_extension(&ext_vec);
-            let _ = unpack_extension::<EF>(&packed); // warmup
-            let (avg, min_t, max_t) = measure(|| unpack_extension::<EF>(&packed));
-            print_row(log_n, n, avg, min_t, max_t);
-        }
-    }
-
-    #[test]
-    fn bench_pack_extension() {
-        let mut rng = StdRng::seed_from_u64(0);
-        print_header("pack_extension");
-        for &log_n in &LOG_SIZES {
-            let n = 1usize << log_n;
-            let ext_vec: Vec<EF> = (0..n).map(|_| rng.random()).collect();
-            let _ = pack_extension::<EF>(&ext_vec); // warmup
-            let (avg, min_t, max_t) = measure(|| pack_extension::<EF>(&ext_vec));
-            print_row(log_n, n, avg, min_t, max_t);
-        }
-    }
-}
diff --git a/crates/backend/src/lib.rs b/crates/backend/src/lib.rs
index cbd44fb2b..be346c38c 100644
--- a/crates/backend/src/lib.rs
+++ b/crates/backend/src/lib.rs
@@ -1,7 +1,7 @@
 pub use air::*;
 pub use fiat_shamir::*;
 pub use field::*;
-pub use koala_bear::*;
+pub use goldilocks::*;
 pub use poly::*;
 pub use rayon;
 pub use rayon::prelude::*;
diff --git a/crates/backend/sumcheck/src/product_computation.rs b/crates/backend/sumcheck/src/product_computation.rs
index 2828af039..027bb5a3a 100644
--- a/crates/backend/sumcheck/src/product_computation.rs
+++ b/crates/backend/sumcheck/src/product_computation.rs
@@ -45,7 +45,11 @@ pub fn run_product_sumcheck<EF: ExtensionField<PF<EF>>>(
     assert!(n_rounds >= 1);
     let first_sumcheck_poly = match (pol_a, pol_b) {
         (MleRef::BasePacked(evals), MleRef::ExtensionPacked(weights)) => {
-            compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::<EF>::to_ext_iter([e]).collect())
+            if EF::DIMENSION == 3 {
+                compute_product_sumcheck_polynomial_base_ext_packed::<3, _, _, _, EF>(evals, weights, sum)
+            } else {
+                unimplemented!()
+            }
         }
         (MleRef::ExtensionPacked(evals), MleRef::ExtensionPacked(weights)) => {
             compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::<EF>::to_ext_iter([e]).collect())
@@ -164,10 +168,12 @@ pub fn compute_product_sumcheck_polynomial<
     DensePolynomial::new(vec![c0, c1, c2])
 }
 
-// using delayed modular reduction
+// Generic over PrimeField64 (KoalaBear and Goldilocks both qualify). The KoalaBear-specific
+// delayed u128/i128 accumulation path is retained as a specialization candidate for a future
+// pass — see `crates/backend/goldilocks/README.md`.
 pub fn compute_product_sumcheck_polynomial_base_ext_packed<
     const DIM: usize,
-    F: PrimeField32,
+    F: PrimeField64,
     PF: PackedField<Scalar = F>,
     EFP: BasedVectorSpace<PF> + Copy + Send + Sync,
     EF: Field + BasedVectorSpace<F>,
@@ -182,8 +188,6 @@ pub fn compute_product_sumcheck_polynomial_base_ext_packed<
     assert!(n.is_power_of_two());
     let half = n / 2;
 
-    type Acc<const D: usize> = ([u128; D], [i128; D]);
-
     let chunk_size = 1024;
 
     let (c0_acc, c2_acc) = pol_0[..half]
@@ -195,8 +199,8 @@ pub fn compute_product_sumcheck_polynomial_base_ext_packed<
                 .zip(pol_1[half..].par_chunks(chunk_size)),
         )
         .map(|((b_lo, b_hi), (e_lo, e_hi))| {
-            let mut c0 = [0u128; DIM];
-            let mut c2 = [0i128; DIM];
+            let mut c0 = [F::ZERO; DIM];
+            let mut c2 = [F::ZERO; DIM];
             for i in 0..b_lo.len() {
                 let x0_lanes = b_lo[i].as_slice();
                 let x1_lanes = b_hi[i].as_slice();
@@ -206,20 +210,20 @@ pub fn compute_product_sumcheck_polynomial_base_ext_packed<
                     let y0_j = y0_coords[j].as_slice();
                     let y1_j = y1_coords[j].as_slice();
                     for lane in 0..PF::WIDTH {
-                        let x0 = x0_lanes[lane].to_unique_u32() as u64;
-                        let y0 = y0_j[lane].to_unique_u32();
-                        let y1 = y1_j[lane].to_unique_u32();
-                        c0[j] += (y0 as u64 * x0) as u128;
-                        c2[j] += (y1 as i64 - y0 as i64) as i128
-                            * (x1_lanes[lane].to_unique_u32() as i64 - x0 as i64) as i128;
+                        let x0 = x0_lanes[lane];
+                        let x1 = x1_lanes[lane];
+                        let y0 = y0_j[lane];
+                        let y1 = y1_j[lane];
+                        c0[j] += y0 * x0;
+                        c2[j] += (y1 - y0) * (x1 - x0);
                     }
                 }
             }
             (c0, c2)
         })
         .reduce(
-            || ([0u128; DIM], [0i128; DIM]),
-            |(mut a0, mut a2): Acc<DIM>, (b0, b2): Acc<DIM>| {
+            || ([F::ZERO; DIM], [F::ZERO; DIM]),
+            |(mut a0, mut a2): ([F; DIM], [F; DIM]), (b0, b2): ([F; DIM], [F; DIM])| {
                 for j in 0..DIM {
                     a0[j] += b0[j];
                     a2[j] += b2[j];
@@ -228,8 +232,8 @@ pub fn compute_product_sumcheck_polynomial_base_ext_packed<
             },
         );
 
-    let c0 = EF::from_basis_coefficients_fn(|j| F::reduce_product_sum(c0_acc[j]));
-    let c2 = EF::from_basis_coefficients_fn(|j| F::reduce_signed_product_sum(c2_acc[j]));
+    let c0 = EF::from_basis_coefficients_fn(|j| c0_acc[j]);
+    let c2 = EF::from_basis_coefficients_fn(|j| c2_acc[j]);
     let c1 = sum - c0.double() - c2;
 
     DensePolynomial::new(vec![c0, c1, c2])
diff --git a/crates/backend/symetric/Cargo.toml b/crates/backend/symetric/Cargo.toml
index 125fb5535..d959ae0e9 100644
--- a/crates/backend/symetric/Cargo.toml
+++ b/crates/backend/symetric/Cargo.toml
@@ -4,6 +4,6 @@ version.workspace = true
 edition.workspace = true
 
 [dependencies]
-koala-bear = { path = "../koala-bear", package = "mt-koala-bear" }
+goldilocks = { path = "../goldilocks", package = "mt-goldilocks" }
 field = { path = "../field", package = "mt-field" }
 rayon.workspace = true
diff --git a/crates/backend/symetric/src/merkle.rs b/crates/backend/symetric/src/merkle.rs
index 2fe194855..17a3c6681 100644
--- a/crates/backend/symetric/src/merkle.rs
+++ b/crates/backend/symetric/src/merkle.rs
@@ -8,7 +8,7 @@ use rayon::prelude::*;
 
 use crate::Compression;
 
-pub const DIGEST_ELEMS: usize = 8;
+pub const DIGEST_ELEMS: usize = 4;
 
 /// A Merkle tree storing only the digest layers (no leaf data).
 #[derive(Debug, Clone)]
@@ -100,7 +100,7 @@ pub fn merkle_verify<F, LeafPerm, NodeComp, const DIGEST_ELEMS: usize, const WID
 ) -> bool
 where
     F: field::PrimeCharacteristicRing + PartialEq,
-    LeafPerm: koala_bear::symmetric::Permutation<[F; WIDTH]>,
+    LeafPerm: crate::Permutation<[F; WIDTH]>,
     NodeComp: Compression<[F; WIDTH]>,
 {
     if opening_proof.len() != log_height {
diff --git a/crates/backend/symetric/src/permutation.rs b/crates/backend/symetric/src/permutation.rs
index c129a1dc4..381d00c85 100644
--- a/crates/backend/symetric/src/permutation.rs
+++ b/crates/backend/symetric/src/permutation.rs
@@ -1,7 +1,7 @@
 // Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
 
 use field::{Algebra, InjectiveMonomial};
-use koala_bear::{KoalaBear, Poseidon1KoalaBear16};
+use goldilocks::{Goldilocks, Poseidon1Goldilocks8};
 
 pub trait Compression<T: Clone>: Clone + Sync {
     #[inline(always)]
@@ -13,10 +13,29 @@ pub trait Compression<T: Clone>: Clone + Sync {
     fn compress_mut(&self, input: &mut T);
 }
 
-impl<R: Algebra<KoalaBear> + InjectiveMonomial<3> + Send + Sync + 'static> Compression<[R; 16]>
-    for Poseidon1KoalaBear16
+impl<R: Algebra<Goldilocks> + InjectiveMonomial<7> + Copy + Send + Sync + 'static> Compression<[R; 8]>
+    for Poseidon1Goldilocks8
 {
-    fn compress_mut(&self, input: &mut [R; 16]) {
+    fn compress_mut(&self, input: &mut [R; 8]) {
         self.compress_in_place(input);
     }
 }
+
+/// A permutation in the mathematical sense; `permute` is the by-value form of the in-place `permute_mut`.
+pub trait Permutation<T: Clone>: Clone + Sync {
+    #[inline(always)]
+    fn permute(&self, mut input: T) -> T {
+        self.permute_mut(&mut input);
+        input
+    }
+
+    fn permute_mut(&self, input: &mut T);
+}
+
+impl<R: Algebra<Goldilocks> + InjectiveMonomial<7> + Copy + Send + Sync + 'static> Permutation<[R; 8]>
+    for Poseidon1Goldilocks8
+{
+    fn permute_mut(&self, input: &mut [R; 8]) {
+        self.permute_in_place(input);
+    }
+}
diff --git a/crates/backend/symetric/src/sponge.rs b/crates/backend/symetric/src/sponge.rs
index 956ec6088..c7beb9133 100644
--- a/crates/backend/symetric/src/sponge.rs
+++ b/crates/backend/symetric/src/sponge.rs
@@ -1,7 +1,7 @@
 // Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
 
+use crate::Permutation;
 use field::PrimeCharacteristicRing;
-use koala_bear::symmetric::Permutation;
 
 /// Overwrite-sponge
 pub fn hash_slice_rtl<T, Perm, const WIDTH: usize, const RATE: usize, const OUT: usize>(
diff --git a/crates/lean_compiler/snark_lib.py b/crates/lean_compiler/snark_lib.py
index d27d2497f..2c42102bd 100644
--- a/crates/lean_compiler/snark_lib.py
+++ b/crates/lean_compiler/snark_lib.py
@@ -45,47 +45,47 @@ def __len__(self):
         return
 
 
-# Poseidon16 precompiles on input x = m[left..left+8] || m[right..right+8], written at `output`:
+# Poseidon8 precompiles on input x = m[left..left+4] || m[right..right+4], written at `output`:
 #   - `compress_*` adds the input back, i.e. feed-forward (Poseidon(x) + x); `permute_*` is the raw Poseidon(x).
-#   - `_half` keeps 8 elements, `_quarter` keeps 4, plain `permute` keeps 16
-#   - `_hardcoded_left`: the left half is m[offset..offset+4] || m[left..left+4], at the compile-time constant `offset`.
+#   - `_half` keeps 4 elements, `_quarter` keeps 2, plain `permute` keeps 8
+#   - `_hardcoded_left`: the left half is m[offset..offset+2] || m[left..left+2], at the compile-time constant `offset`.
 
 
-def poseidon16_compress_half(left, right, output):
-    """m[output..output+8] = (Poseidon(x) + x)[0..8]."""
+def poseidon8_compress_half(left, right, output):
+    """m[output..output+4] = (Poseidon(x) + x)[0..4]."""
     _ = left, right, output
 
 
-def poseidon16_compress_quarter(left, right, output):
-    """m[output..output+4] = (Poseidon(x) + x)[0..4]."""
+def poseidon8_compress_quarter(left, right, output):
+    """m[output..output+2] = (Poseidon(x) + x)[0..2]."""
     _ = left, right, output
 
 
-def poseidon16_compress_half_hardcoded_left(left, right, output, offset):
-    """`poseidon16_compress_half` with a hardcoded left prefix: the left half of the input is
-    m[offset..offset+4] || m[left..left+4]."""
+def poseidon8_compress_half_hardcoded_left(left, right, output, offset):
+    """`poseidon8_compress_half` with a hardcoded left prefix: the left half of the input is
+    m[offset..offset+2] || m[left..left+2]."""
     _ = left, right, output, offset
 
 
-def poseidon16_compress_quarter_hardcoded_left(left, right, output, offset):
-    """`poseidon16_compress_quarter` with a hardcoded left prefix: the left half of the input is
-    m[offset..offset+4] || m[left..left+4]."""
+def poseidon8_compress_quarter_hardcoded_left(left, right, output, offset):
+    """`poseidon8_compress_quarter` with a hardcoded left prefix: the left half of the input is
+    m[offset..offset+2] || m[left..left+2]."""
     _ = left, right, output, offset
 
 
-def poseidon16_permute(left, right, output):
-    """m[output..output+16] = Poseidon(x) (raw permutation, no feed-forward)."""
+def poseidon8_permute(left, right, output):
+    """m[output..output+8] = Poseidon(x) (raw permutation, no feed-forward)."""
     _ = left, right, output
 
 
-def poseidon16_permute_half(left, right, output):
-    """m[output..output+8] = Poseidon(x)[0..8] (raw permutation, no feed-forward; high 8 discarded)."""
+def poseidon8_permute_half(left, right, output):
+    """m[output..output+4] = Poseidon(x)[0..4] (raw permutation, no feed-forward; high 4 discarded)."""
     _ = left, right, output
 
 
-def poseidon16_permute_half_hardcoded_left(left, right, output, offset):
-    """`poseidon16_permute_half` with a hardcoded left prefix: the left half of the input is
-    m[offset..offset+4] || m[left..left+4]."""
+def poseidon8_permute_half_hardcoded_left(left, right, output, offset):
+    """`poseidon8_permute_half` with a hardcoded left prefix: the left half of the input is
+    m[offset..offset+2] || m[left..left+2]."""
     _ = left, right, output, offset
 
 
diff --git a/crates/lean_compiler/src/a_simplify_lang/mod.rs b/crates/lean_compiler/src/a_simplify_lang/mod.rs
index c824c118c..a9d998947 100644
--- a/crates/lean_compiler/src/a_simplify_lang/mod.rs
+++ b/crates/lean_compiler/src/a_simplify_lang/mod.rs
@@ -1,9 +1,9 @@
 use crate::{F, a_simplify_lang::post_optimization::propagate_copies, lang::*, parser::ConstArrayValue};
 use backend::PrimeCharacteristicRing;
 use lean_vm::{
-    ALL_POSEIDON16_NAMES, Boolean, BooleanExpr, CustomHint, ExtensionOpMode, FunctionName,
-    POSEIDON16_HARDCODED_LEFT_NAME, POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME, POSEIDON16_PERMUTE_HALF_NAME,
-    POSEIDON16_PERMUTE_NAME, POSEIDON16_QUARTER_HARDCODED_LEFT_NAME, POSEIDON16_QUARTER_NAME, PrecompileArgs,
+    ALL_POSEIDON8_NAMES, Boolean, BooleanExpr, CustomHint, ExtensionOpMode, FunctionName,
+    POSEIDON8_HARDCODED_LEFT_NAME, POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME, POSEIDON8_PERMUTE_HALF_NAME,
+    POSEIDON8_PERMUTE_NAME, POSEIDON8_QUARTER_HARDCODED_LEFT_NAME, POSEIDON8_QUARTER_NAME, PrecompileArgs,
     PrecompileCompTimeArgs, SourceLocation,
 };
 use std::{
@@ -1853,30 +1853,30 @@ fn simplify_lines(
                             continue;
                         }
 
-                        // Special handling for poseidon16 precompile (5 variants).
-                        if ALL_POSEIDON16_NAMES.contains(&function_name.as_str()) {
+                        // Special handling for poseidon8 precompile (7 variants).
+                        if ALL_POSEIDON8_NAMES.contains(&function_name.as_str()) {
                             if !targets.is_empty() {
                                 return Err(format!(
                                     "Precompile {function_name} should not return values, at {location}"
                                 ));
                             }
                             let permute = [
-                                POSEIDON16_PERMUTE_NAME,
-                                POSEIDON16_PERMUTE_HALF_NAME,
-                                POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME,
+                                POSEIDON8_PERMUTE_NAME,
+                                POSEIDON8_PERMUTE_HALF_NAME,
+                                POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME,
                             ]
                             .contains(&function_name.as_str());
                             let half_output = [
-                                POSEIDON16_QUARTER_NAME,
-                                POSEIDON16_QUARTER_HARDCODED_LEFT_NAME,
-                                POSEIDON16_PERMUTE_HALF_NAME,
-                                POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME,
+                                POSEIDON8_QUARTER_NAME,
+                                POSEIDON8_QUARTER_HARDCODED_LEFT_NAME,
+                                POSEIDON8_PERMUTE_HALF_NAME,
+                                POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME,
                             ]
                             .contains(&function_name.as_str());
                             let is_hardcoded_left = [
-                                POSEIDON16_HARDCODED_LEFT_NAME,
-                                POSEIDON16_QUARTER_HARDCODED_LEFT_NAME,
-                                POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME,
+                                POSEIDON8_HARDCODED_LEFT_NAME,
+                                POSEIDON8_QUARTER_HARDCODED_LEFT_NAME,
+                                POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME,
                             ]
                             .contains(&function_name.as_str());
                             let expected_args = if is_hardcoded_left { 4 } else { 3 };
@@ -1908,7 +1908,7 @@ fn simplify_lines(
                                 arg_0: simplified_args[0].clone(),
                                 arg_1: simplified_args[1].clone(),
                                 res: simplified_args[2].clone(),
-                                data: PrecompileCompTimeArgs::Poseidon16 {
+                                data: PrecompileCompTimeArgs::Poseidon8 {
                                     half_output,
                                     hardcoded_offset_left,
                                     permute,
@@ -2068,6 +2068,9 @@ fn simplify_lines(
                                             res.push(SimpleLine::equality(target_var, SimpleExpr::Constant(result)));
                                         } else {
                                             if !operation.supports_runtime() {
+                                                eprintln!(
+                                                    "[COMPILE-TIME-OP DEBUG] operation={operation:?}, args={args_simplified:?}, var={var:?}, target_var={target_var:?}, is_mutable={is_mutable}"
+                                                );
                                                 return Err(format!(
                                                     "Operation `{operation}` is compile-time only; all operands must be constants"
                                                 ));
diff --git a/crates/lean_compiler/src/c_compile_final.rs b/crates/lean_compiler/src/c_compile_final.rs
index e10516e11..d69bcd686 100644
--- a/crates/lean_compiler/src/c_compile_final.rs
+++ b/crates/lean_compiler/src/c_compile_final.rs
@@ -216,7 +216,7 @@ fn compile_block(
         let dest = try_as_mem_or_constant(&dest).expect("Fatal: Could not materialize jump destination");
         let label = match dest {
             MemOrConstant::Constant(dest) => hints
-                .get(&usize::try_from(dest.as_canonical_u32()).unwrap())
+                .get(&usize::try_from(dest.as_canonical_u64()).unwrap())
                 .and_then(|hints: &Vec<Hint>| {
                     hints.iter().find_map(|x| match x {
                         Hint::Label { label } => Some(label),
diff --git a/crates/lean_compiler/src/instruction_encoder.rs b/crates/lean_compiler/src/instruction_encoder.rs
index e4db6a610..9bcb1cab0 100644
--- a/crates/lean_compiler/src/instruction_encoder.rs
+++ b/crates/lean_compiler/src/instruction_encoder.rs
@@ -48,17 +48,17 @@ pub fn field_representation(instr: &Instruction) -> [F; N_INSTRUCTION_COLUMNS] {
         }
         Instruction::Precompile(precompile) => {
             let domainsep = match &precompile.data {
-                PrecompileCompTimeArgs::Poseidon16 {
+                PrecompileCompTimeArgs::Poseidon8 {
                     half_output,
                     hardcoded_offset_left,
                     permute,
                 } => {
                     let flag_left = hardcoded_offset_left.is_some() as usize;
                     let offset_left_val = hardcoded_offset_left.unwrap_or(0);
-                    let out8 = (!*half_output && !*permute) || (*half_output && *permute);
+                    let out4 = (!*half_output && !*permute) || (*half_output && *permute);
                     POSEIDON_DOMAINSEP_BASE
                         + POSEIDON_FLAG_PERMUTE_SHIFT * (*permute as usize)
-                        + POSEIDON_FLAG_OUT8_SHIFT * (out8 as usize)
+                        + POSEIDON_FLAG_OUT4_SHIFT * (out4 as usize)
                         + POSEIDON_FLAG_LEFT_SHIFT * flag_left
                         + POSEIDON_OFFSET_LEFT_SHIFT * offset_left_val
                 }
diff --git a/crates/lean_compiler/src/parser/parsers/function.rs b/crates/lean_compiler/src/parser/parsers/function.rs
index 0334f5dc8..240e19458 100644
--- a/crates/lean_compiler/src/parser/parsers/function.rs
+++ b/crates/lean_compiler/src/parser/parsers/function.rs
@@ -8,7 +8,7 @@ use crate::{
         grammar::{ParsePair, Rule},
     },
 };
-use lean_vm::{ALL_POSEIDON16_NAMES, CUSTOM_HINTS, ExtensionOpMode};
+use lean_vm::{ALL_POSEIDON8_NAMES, CUSTOM_HINTS, ExtensionOpMode};
 
 /// Reserved function names that users cannot define.
 pub const RESERVED_FUNCTION_NAMES: &[&str] = &[
@@ -34,7 +34,7 @@ fn is_reserved_function_name(name: &str) -> bool {
     if RESERVED_FUNCTION_NAMES.contains(&name) || CUSTOM_HINTS.iter().any(|hint| hint.name() == name) {
         return true;
     }
-    if ALL_POSEIDON16_NAMES.contains(&name) {
+    if ALL_POSEIDON8_NAMES.contains(&name) {
         return true;
     }
     if ExtensionOpMode::from_name(name).is_some() {
diff --git a/crates/lean_compiler/tests/test_compiler.rs b/crates/lean_compiler/tests/test_compiler.rs
index 0f3b33a98..307c54ae8 100644
--- a/crates/lean_compiler/tests/test_compiler.rs
+++ b/crates/lean_compiler/tests/test_compiler.rs
@@ -4,11 +4,35 @@ use backend::{BasedVectorSpace, PrimeCharacteristicRing};
 use lean_compiler::*;
 use lean_vm::*;
 use rand::{RngExt, SeedableRng, rngs::StdRng};
+use utils::poseidon8_compress;
+
+#[test]
+fn test_poseidon() {
+    // Goldilocks width-8 Poseidon: two 4-element halves in, one 4-element digest out.
+    let program = r#"
+def main():
+    data = Array(8)
+    for i in unroll(0, 8):
+        data[i] = i
+    out = Array(4)
+    poseidon8_compress_half(data, data + 4, out)
+
+    for i in range(0, 4):
+        cc = out[i]
+        print(cc)
+    return
+   "#;
+    let public_input = [F::ZERO; PUBLIC_INPUT_LEN];
+    compile_and_run(&ProgramSource::Raw(program.to_string()), &public_input, false);
+
+    let input: [F; 8] = std::array::from_fn(|i| F::new(i as u64));
+    let _ = dbg!(poseidon8_compress(input));
+}
 
 #[test]
 fn test_div_extension_field() {
     let program = r#"
-DIM = 5
+DIM = 3
 
 def main():
     nd = Array(2 * DIM)
@@ -182,7 +206,7 @@ def main():
 
 @inline
 def func(a, b):
-    poseidon16_compress_half(a, a, b)
+    poseidon8_compress_half(a, a, b)
     return
    "#;
     let bytecode = compile_program(&ProgramSource::Raw(program.to_string()));
@@ -261,18 +285,18 @@ def main():
 fn test_soundness_suite() {
     #[allow(clippy::type_complexity)]
     let cases: &[(&str, &[u32], &[(usize, u32)])] = &[
-        ("soundness_0", &[3, 6, 7, 10, 9, 20, 26, 1], &[(0, 4), (1, 7), (2, 8), (3, 11), (4, 10), (5, 21), (6, 27), (7, 0), (7, 2)]),
-        ("soundness_1", &[5, 10, 6, 7, 42, 9, 5, 4],  &[(0, 6), (1, 11), (2, 7), (3, 8), (4, 43), (5, 10), (6, 6), (7, 5)]),
-        ("soundness_2", &[3, 4, 5, 29, 7, 1, 17, 46], &[(0, 2), (1, 5), (2, 6), (3, 30), (4, 8), (5, 0), (5, 2), (6, 18), (7, 47)]),
-        ("soundness_3", &[4, 2, 14, 120, 5, 10, 50, 55], &[(0, 5), (1, 3), (2, 15), (3, 121), (4, 6), (5, 11), (6, 51), (7, 56)]),
-        ("soundness_4", &[5, 10, 10, 3, 4, 19, 20, 1], &[(0, 6), (1, 11), (2, 11), (3, 4), (4, 5), (5, 20), (6, 50), (7, 0), (7, 2)]),
-        ("soundness_5", &[3, 4, 7, 19, 49, 28, 1, 3],  &[(0, 4), (1, 5), (2, 8), (3, 20), (4, 50), (5, 29), (6, 0), (6, 2), (7, 4)]),
+        ("soundness_0", &[3, 6, 10, 7],   &[(0, 4), (1, 7), (2, 8), (3, 11)]),
+        ("soundness_1", &[5, 10, 4, 16],  &[(0, 6), (1, 11), (2, 5), (3, 17)]),
+        ("soundness_2", &[2, 4, 5, 20],   &[(0, 0), (1, 3), (2, 6), (3, 21)]),
+        ("soundness_3", &[2, 14, 120, 3], &[(0, 3), (1, 15), (2, 121), (3, 4)]),
+        ("soundness_4", &[20, 3, 15, 1],  &[(0, 21), (1, 4), (2, 16), (3, 0)]),
+        ("soundness_5", &[2, 7, 18, 4],   &[(0, 3), (1, 8), (2, 19), (3, 5)]),
     ];
 
     let to_input = |v: &[u32]| -> [F; PUBLIC_INPUT_LEN] {
         let mut out = [F::ZERO; PUBLIC_INPUT_LEN];
         for (slot, &x) in out.iter_mut().zip(v) {
-            *slot = F::new(x);
+            *slot = F::new(x as u64);
         }
         out
     };
diff --git a/crates/lean_compiler/tests/test_data/error_89.py b/crates/lean_compiler/tests/test_data/error_89.py
index da39d4a22..4107e11d1 100644
--- a/crates/lean_compiler/tests/test_data/error_89.py
+++ b/crates/lean_compiler/tests/test_data/error_89.py
@@ -4,7 +4,7 @@
 def choose(flag):
     x: Mut = 7
     if flag != 0:
-        x: Imu
+        x: Imm
         x = 42
     return x
 
diff --git a/crates/lean_compiler/tests/test_data/program_15.py b/crates/lean_compiler/tests/test_data/program_15.py
index 55433c261..0c3f95aa8 100644
--- a/crates/lean_compiler/tests/test_data/program_15.py
+++ b/crates/lean_compiler/tests/test_data/program_15.py
@@ -1,6 +1,6 @@
 from snark_lib import *
 
-ONE_EF_PTR = 8  # right after the 8-cell public input region
+ONE_EF_PTR = 4  # right after the 4-cell public input region, inside the reserved preamble
 
 
 def main():
@@ -10,7 +10,7 @@ def main():
     i, j, k = func_1(x, y)
     assert i == 2
     assert j == 3
-    assert k == 2130706432
+    assert k == 18446744069414584320  # -1 mod P_Goldilocks
 
     g = Array(8)
     h = Array(8)
diff --git a/crates/lean_compiler/tests/test_data/program_179.py b/crates/lean_compiler/tests/test_data/program_179.py
index 521d0af63..f6be744c5 100644
--- a/crates/lean_compiler/tests/test_data/program_179.py
+++ b/crates/lean_compiler/tests/test_data/program_179.py
@@ -1,6 +1,6 @@
 from snark_lib import *
 
-ONE_EF_PTR = 8  # right after the 8-cell public input region
+ONE_EF_PTR = 4  # right after the 4-cell public input region, inside the reserved preamble
 
 
 def main():
diff --git a/crates/lean_compiler/tests/test_data/program_30.py b/crates/lean_compiler/tests/test_data/program_30.py
index 0348faa65..4cb608e91 100644
--- a/crates/lean_compiler/tests/test_data/program_30.py
+++ b/crates/lean_compiler/tests/test_data/program_30.py
@@ -9,7 +9,7 @@ def main():
     for i in unroll(0, 2):
         res = f1(ARR[i])
         buff[i + 1] = res
-    assert buff[2] == 1390320454
+    assert buff[2] == 17401132340371191870  # regenerated for Goldilocks (P=2^64-2^32+1)
     return
 
 
diff --git a/crates/lean_compiler/tests/test_data/soundness_0.py b/crates/lean_compiler/tests/test_data/soundness_0.py
index dfeb620a1..77ce1268c 100644
--- a/crates/lean_compiler/tests/test_data/soundness_0.py
+++ b/crates/lean_compiler/tests/test_data/soundness_0.py
@@ -7,26 +7,10 @@ def main():
     b = p[1]
     c = p[2]
     d = p[3]
-    e = p[4]
-    f = p[5]
-    g = p[6]
-    h = p[7]
 
     assert double(a) == b
-    assert square_plus_one(a) == d
-    assert a + c == 10
-    assert e < 10
-    assert f <= 20
-
-    acc: Mut = 0
-    for i in unroll(0, 4):
-        acc = acc + p[i]
-    assert acc == g
-
-    if h == 1:
-        assert a + b == 9
-    else:
-        assert a == 0
+    assert square_plus_one(a) == c
+    assert a + d == 10
     return
 
 
diff --git a/crates/lean_compiler/tests/test_data/soundness_1.py b/crates/lean_compiler/tests/test_data/soundness_1.py
index 6add781c6..bdc55b56a 100644
--- a/crates/lean_compiler/tests/test_data/soundness_1.py
+++ b/crates/lean_compiler/tests/test_data/soundness_1.py
@@ -6,11 +6,7 @@ def main():
     n = p[0]
     sum_range = p[1]
     x = p[2]
-    y = p[3]
-    prod_xy = p[4]
-    outer = p[5]
-    inner_bound = p[6]
-    v = p[7]
+    prod = p[3]
 
     assert n == 5
 
@@ -19,16 +15,7 @@ def main():
         s = s + i
     assert s == sum_range
 
-    assert mul(x, y) == prod_xy
-
-    nested: Mut = 0
-    for i in unroll(0, 3):
-        for j in unroll(0, 3):
-            nested = nested + i * j
-    assert nested == outer
-
-    assert v < inner_bound
-    assert inner_bound == v + 1
+    assert mul(x, x) == prod
     return
 
 
diff --git a/crates/lean_compiler/tests/test_data/soundness_2.py b/crates/lean_compiler/tests/test_data/soundness_2.py
index 630d756a6..3380f6866 100644
--- a/crates/lean_compiler/tests/test_data/soundness_2.py
+++ b/crates/lean_compiler/tests/test_data/soundness_2.py
@@ -7,10 +7,6 @@ def main():
     x = p[1]
     y = p[2]
     expected = p[3]
-    secondary = p[4]
-    flag = p[5]
-    offset = p[6]
-    total = p[7]
 
     computed: Imm
     match mode:
@@ -23,17 +19,6 @@ def main():
         case 3:
             computed = combined(x, y)
     assert computed == expected
-
-    adjusted: Imm
-    if flag == 0:
-        adjusted = bump(secondary, 1)
-    elif flag == 1:
-        adjusted = bump(secondary, 10)
-    else:
-        adjusted = bump(secondary, 100)
-    assert adjusted == offset
-
-    assert total == expected + offset
     return
 
 
@@ -51,8 +36,3 @@ def mul_op(a, b):
 
 def combined(a, b):
     return mul_op(a, b) + add_op(a, b)
-
-
-@inline
-def bump(v, k):
-    return v + k
diff --git a/crates/lean_compiler/tests/test_data/soundness_3.py b/crates/lean_compiler/tests/test_data/soundness_3.py
index a775b7510..2c24022a5 100644
--- a/crates/lean_compiler/tests/test_data/soundness_3.py
+++ b/crates/lean_compiler/tests/test_data/soundness_3.py
@@ -3,16 +3,10 @@
 
 def main():
     p = 0
-    n = p[0]
-    seed = p[1]
-    sum_expected = p[2]
-    prod_expected = p[3]
-    max_val = p[4]
-    upper = p[5]
-    w = p[6]
-    expected_final = p[7]
-
-    assert n == 4
+    seed = p[0]
+    sum_expected = p[1]
+    prod_expected = p[2]
+    w = p[3]
 
     arr = Array(4)
     for i in unroll(0, 4):
@@ -28,10 +22,7 @@ def main():
         prod = times(prod, arr[i])
     assert prod == prod_expected
 
-    assert max_val < upper
-    assert upper <= 100
-    assert upper == max_val + 5
-    assert w + max_val == expected_final
+    assert w == seed + 1
     return
 
 
diff --git a/crates/lean_compiler/tests/test_data/soundness_4.py b/crates/lean_compiler/tests/test_data/soundness_4.py
index 9f86ccae4..f8ee10f96 100644
--- a/crates/lean_compiler/tests/test_data/soundness_4.py
+++ b/crates/lean_compiler/tests/test_data/soundness_4.py
@@ -3,40 +3,32 @@
 
 def main():
     p = 0
-    n = p[0]
-    expected_sum_pos = p[1]
-    expected_sum_neg = p[2]
-    x = p[3]
-    y = p[4]
-    expected_pipeline = p[5]
-    threshold = p[6]
-    threshold_check = p[7]
-
-    assert n == 5
+    expected_sum = p[0]
+    x = p[1]
+    expected_pipeline = p[2]
+    flag = p[3]
 
     markers = Array(5)
     for i in unroll(0, 5):
         markers[i] = i
 
-    sum_pos: Mut = 0
-    sum_neg: Mut = 0
+    s: Mut = 0
     for i in range(0, 5):
         m = markers[i]
         if m == 0:
-            sum_neg = sum_neg + 10
+            s = s + 10
         else:
-            sum_pos = sum_pos + m
-    assert sum_pos == expected_sum_pos
-    assert sum_neg == expected_sum_neg
+            s = s + m
+    assert s == expected_sum
 
-    assert pipeline(x, y) == expected_pipeline
+    assert pipeline(x, x) == expected_pipeline
 
-    if threshold_check == 1:
-        assert threshold < 50
+    if flag == 1:
+        assert expected_sum < 50
     else:
-        assert threshold == 0
+        assert expected_sum == 0
 
-    assert threshold_check * (1 - threshold_check) == 0
+    assert flag * (1 - flag) == 0
     return
 
 
diff --git a/crates/lean_compiler/tests/test_data/soundness_5.py b/crates/lean_compiler/tests/test_data/soundness_5.py
index eaa31b7cf..4190edef7 100644
--- a/crates/lean_compiler/tests/test_data/soundness_5.py
+++ b/crates/lean_compiler/tests/test_data/soundness_5.py
@@ -4,15 +4,9 @@
 def main():
     p = 0
     seed = p[0]
-    n = p[1]
-    last_write = p[2]
-    match_tally = p[3]
-    pipeline_squared = p[4]
-    paired = p[5]
-    flag = p[6]
-    alt = p[7]
-
-    assert n == 4
+    last_write = p[1]
+    match_tally = p[2]
+    alt = p[3]
 
     counter: Mut = 0
     for i in range(0, 4):
@@ -32,32 +26,10 @@ def main():
                 acc = acc + 7
     assert acc == match_tally
 
-    assert sqr_via_pipeline(seed + n) == pipeline_squared
-
-    assert paired_sum(seed, n) == paired
-
     chosen: Imm
-    if flag == 1:
-        chosen = seed
+    if seed == 0:
+        chosen = 0
     else:
         chosen = seed * 2
     assert chosen == alt
-
-    assert flag * (1 - flag) == 0
     return
-
-
-@inline
-def sqr_via_pipeline(x):
-    return mul_boxed(x, x)
-
-
-def mul_boxed(a, b):
-    return a * b
-
-
-def paired_sum(a, b):
-    total: Mut = 0
-    for i in range(0, 4):
-        total = total + a + b
-    return total
diff --git a/crates/lean_compiler/zkDSL.md b/crates/lean_compiler/zkDSL.md
index f3ade74a8..134b718c9 100644
--- a/crates/lean_compiler/zkDSL.md
+++ b/crates/lean_compiler/zkDSL.md
@@ -402,7 +402,7 @@ a compile error.
 
 ### Arithmetic
 
-`+`, `-`, `*`, `/` are field operations and work at runtime, modulo `p = 2^31 - 2^24 + 1` (koalabear prime).
+`+`, `-`, `*`, `/` are field operations and work at runtime, modulo `p = 2^64 - 2^32 + 1` (Goldilocks prime).
 
 **Division by zero is undefined behaviour.**
 
@@ -626,14 +626,14 @@ assert acc == value
 
 The full list:
 
-| Hint                              | Arguments                                                          | Effect                                                                                                                                  |
-| --------------------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------- |
-| `hint_decompose_bits`             | `(value, ptr, n_bits)`                                             | Writes `n_bits` big-endian 0/1 field elements at `ptr` (MSB at `ptr[0]`). Requires `n_bits <= 31`.                                      |
-| `hint_decompose_bits_merkle_whir` | `(decomposed_ptr, value, chunk_size)`                              | Writes `24 / chunk_size` little-endian `chunk_size`-bit chunks of `value` at `decomposed_ptr` (`chunk_size` must divide 24).            |
-| `hint_decompose_bits_xmss`        | `(decomposed_ptr, to_decompose_ptr, num_to_decompose, chunk_size)` | For each of `num_to_decompose` values at `to_decompose_ptr[..]`, writes its `24 / chunk_size` little-endian chunks at `decomposed_ptr`. |
-| `hint_less_than`                  | `(a, b, result_ptr)`                                               | `1` at `result_ptr` if `a < b` (canonical integer compare), else `0`.                                                                   |
-| `hint_log2_ceil`                  | `(n, result_ptr)`                                                  | `ceil(log2(n))` at `result_ptr`.                                                                                                        |
-| `hint_div_floor`                  | `(a, b, q_ptr, r_ptr)`                                             | `floor(a / b)` at `q_ptr`, `a mod b` at `r_ptr` (requires `b != 0`).                                                                    |
+| Hint                              | Arguments                                          | Effect                                                                                                                                                                                                                                                                              |
+| --------------------------------- | -------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `hint_decompose_bits`             | `(to_decompose, ptr, num_bits)`                    | Writes `num_bits` big-endian 0/1 field elements at `ptr` (MSB at `ptr[0]`). Requires `num_bits <= 64`.                                                                                                                                                                              |
+| `hint_decompose_bits_merkle_whir` | `(decomposed_ptr, value, num_chunks, chunk_size)`  | Writes `num_chunks` little-endian `chunk_size`-bit chunks of `value` at `decomposed_ptr`. Requires `num_chunks * chunk_size <= 64`.                                                                                                                                                 |
+| `hint_decompose_bits_xmss`        | `(chunks_ptr, limbs_ptr, src_value)`               | WOTS-encoding decomposition of one Goldilocks FE: 5 six-bit chunks of the low 30 bits at `chunks_ptr[0..5]` (each packs 2 consecutive chain steps as `step_a + CHAIN_LENGTH * step_b`) + 2 u16 limbs of the high 32 bits at `limbs_ptr[0..2]` (see `crates/lean_vm/src/isa/hint.rs`). |
+| `hint_less_than`                  | `(a, b, result_ptr)`                               | `1` at `result_ptr` if `a < b` (canonical integer compare), else `0`.                                                                                                                                                                                                              |
+| `hint_log2_ceil`                  | `(n, result_ptr)`                                  | `ceil(log2(n))` at `result_ptr`.                                                                                                                                                                                                                                                   |
+| `hint_div_floor`                  | `(a, b, q_ptr, r_ptr)`                             | `floor(a / b)` at `q_ptr`, `a mod b` at `r_ptr` (requires `b != 0`).                                                                                                                                                                                                                |
 
 ## Precompiles
 
diff --git a/crates/lean_prover/Cargo.toml b/crates/lean_prover/Cargo.toml
index 2163ed200..bab7da203 100644
--- a/crates/lean_prover/Cargo.toml
+++ b/crates/lean_prover/Cargo.toml
@@ -27,4 +27,3 @@ serde.workspace = true
 [dev-dependencies]
 xmss.workspace = true
 rec_aggregation.workspace = true
-serde_json.workspace = true
diff --git a/crates/lean_prover/python-verifier/primitives.py b/crates/lean_prover/python-verifier/primitives.py
deleted file mode 100644
index 0e1c8b95f..000000000
--- a/crates/lean_prover/python-verifier/primitives.py
+++ /dev/null
@@ -1,427 +0,0 @@
-# source: https://github.com/leanEthereum/leanSpec
-
-from __future__ import annotations
-from itertools import accumulate, repeat
-from typing import Final, Sequence
-
-P: Final = 2**31 - 2**24 + 1  # Koalabear prime
-TWO_ADICITY = 24
-MDS_FIRST_ROW_16: Final = (1, 1, 51, 1, 11, 17, 2, 1, 101, 63, 15, 2, 67, 22, 13, 3)  # for Poseidon
-KB_TWO_ADIC_GENERATORS: Final = tuple(pow(0x6AC49F88, 1 << (TWO_ADICITY - b), P) for b in range(TWO_ADICITY + 1))
-
-SPONGE_RATE, SPONGE_STATE, DIGEST_ELEMS = 8, 16, 8
-SPONGE_CAPACITY = SPONGE_STATE - SPONGE_RATE
-
-
-class Fp:
-    """An element of the KoalaBear prime field `F_p`."""
-
-    __slots__ = ("value",)
-
-    def __init__(self, value: int) -> None:
-        self.value = value % P
-
-    def __add__(self, other):
-        if not isinstance(other, Fp):
-            return NotImplemented  # let EF.__radd__ / etc. handle mixed-type arithmetic.
-        return Fp(self.value + other.value)
-
-    def __sub__(self, other):
-        if not isinstance(other, Fp):
-            return NotImplemented
-        return Fp(self.value - other.value)
-
-    def __neg__(self) -> "Fp":
-        return Fp(-self.value)
-
-    def __mul__(self, other):
-        if not isinstance(other, Fp):
-            return NotImplemented
-        return Fp(self.value * other.value)
-
-    def __pow__(self, exponent: int) -> "Fp":
-        return Fp(pow(self.value, exponent, P))
-
-    def cube(self) -> "Fp":
-        return self * self * self
-
-    def __eq__(self, other: object) -> bool:
-        return isinstance(other, Fp) and self.value == other.value
-
-    def __hash__(self) -> int:
-        return hash(self.value)
-
-    def __repr__(self) -> str:
-        return f"Fp(value={self.value})"
-
-
-def quintic_mul(a, b, zero):
-    """Schoolbook product in `Fp[X]/(X⁵+X²−1)`"""
-    prod = [zero] * 9
-    for i in range(5):
-        for j in range(5):
-            prod[i + j] = prod[i + j] + a[i] * b[j]
-    for k in range(8, 4, -1):  # X^k = X^(k−5)·(1 − X²) for k ≥ 5.
-        prod[k - 5] = prod[k - 5] + prod[k]
-        prod[k - 3] = prod[k - 3] - prod[k]
-    return prod[:5]
-
-
-class EF:
-    """Quintic extension `Fp[X] / (X⁵ + X² − 1)`."""
-
-    __slots__ = ("c",)
-    DIMENSION = 5
-
-    def __init__(self, value):
-        """Accepts an `int` (lifted via `Fp`), an `Fp` (lifted), or a length-5 `Sequence[Fp]`."""
-        if isinstance(value, int):
-            self.c = (Fp(value), Fp(0), Fp(0), Fp(0), Fp(0))
-        elif isinstance(value, Fp):
-            self.c = (value, Fp(0), Fp(0), Fp(0), Fp(0))
-        else:
-            assert len(value) == 5
-            self.c = tuple(value)
-
-    def __add__(self, o):
-        if isinstance(o, int):
-            return self if o == 0 else self + EF(o)
-        if isinstance(o, Fp):
-            return EF([self.c[0] + o, *self.c[1:]])
-        return EF([a + b for a, b in zip(self.c, o.c)])
-
-    def __sub__(self, o):
-        if isinstance(o, int):
-            return self if o == 0 else self - EF(o)
-        if isinstance(o, Fp):
-            return EF([self.c[0] - o, *self.c[1:]])
-        return EF([a - b for a, b in zip(self.c, o.c)])
-
-    def __neg__(self):
-        return EF([-a for a in self.c])
-
-    __radd__ = __add__
-
-    def __mul__(self, o):
-        if isinstance(o, int):
-            return self if o == 1 else self * EF(o)
-        if isinstance(o, Fp):
-            return EF([a * o for a in self.c])
-        return EF(quintic_mul(self.c, o.c, Fp(0)))
-
-    __rmul__ = __mul__
-
-    def __eq__(self, o):
-        return isinstance(o, EF) and self.c == o.c
-
-    def __hash__(self):
-        return hash(self.c)
-
-    def __repr__(self):
-        return f"EF({[int(x.value) for x in self.c]})"
-
-    def cube(self) -> "EF":
-        return self * self * self
-
-    def inv(self) -> "EF":
-        result, base, n = ONE, self, P**5 - 2
-        while n > 0:
-            if n & 1:
-                result = result * base
-            base = base * base
-            n >>= 1
-        return result
-
-
-ZERO = EF(0)
-ONE = EF(1)
-
-
-def ef_powers(x: EF, n: int) -> list[EF]:
-    """`[1, x, x², …, x^(n−1)]`."""
-    return list(accumulate(repeat(x, n), lambda a, _: a * x, initial=ONE))[:n]
-
-
-def pack_ef(flat: Sequence[Fp]) -> list[EF]:
-    """Pack a length-(n·DIM) Fp vector into n EF elements (5 Fp coordinates per EF)."""
-    return [EF(flat[i : i + EF.DIMENSION]) for i in range(0, len(flat), EF.DIMENSION)]
-
-
-# 448 raw Poseidon1-KoalaBear width-16 round constants generated by the Grain
-# LFSR (Poseidon paper §5.3, parameters field_type=1, α=3, n=31, t=16, R_F=8,
-# R_P=20). Reference: https://github.com/Plonky3/Plonky3/blob/main/poseidon1/generate_constants.py
-# Layout: 4 initial-full rounds × 16 + 20 partial rounds × 16 + 4 terminal-full rounds × 16.
-def _grain_lfsr_round_constants_16() -> tuple[int, ...]:
-    bits_msb = lambda v, w: [(v >> (w - 1 - i)) & 1 for i in range(w)]
-    state = bits_msb(1, 2) + bits_msb(0, 4) + bits_msb(31, 12) + bits_msb(16, 12) + bits_msb(8, 10) + bits_msb(20, 10) + [1] * 30  # fmt: skip
-
-    def step() -> int:
-        nonlocal state
-        new = state[62] ^ state[51] ^ state[38] ^ state[23] ^ state[13] ^ state[0]
-        state = state[1:] + [new]
-        return new
-
-    for _ in range(160):  # spec-mandated warm-up
-        step()
-
-    def next_bit() -> int:  # self-shrinking generator: keep step()'s output only when the prior step was 1
-        while True:
-            if step() == 1:
-                return step()
-            step()
-
-    def next_fe() -> int:  # rejection sampling into [0, P)
-        while True:
-            x = 0
-            for _ in range(31):
-                x = (x << 1) | next_bit()
-            if x < P:
-                return x
-
-    return tuple(next_fe() for _ in range((8 + 20) * 16))
-
-
-P1_ROUND_CONSTANTS_16: Final = _grain_lfsr_round_constants_16()
-
-
-class Poseidon1Params:
-    """Parameters for a Poseidon1 instance."""
-
-    __slots__ = ("width", "rounds_f", "rounds_p", "mds_first_row", "round_constants")
-
-    def __init__(
-        self,
-        width: int,
-        rounds_f: int,
-        rounds_p: int,
-        mds_first_row: Sequence[int],
-        round_constants: Sequence[int],
-    ) -> None:
-        assert len(mds_first_row) == width
-        assert len(round_constants) == (rounds_f + rounds_p) * width
-        self.width = width
-        self.rounds_f = rounds_f
-        self.rounds_p = rounds_p
-        self.mds_first_row = mds_first_row
-        self.round_constants = round_constants
-
-
-class Poseidon1:
-    """Pure-Python Poseidon1 permutation (S-box: x → x^3; dense circulant MDS).
-
-    Round structure: AddRoundConstants → S-box (full state for full rounds, only
-    position 0 for partial rounds) → MDS multiply.
-    """
-
-    __slots__ = ("_width", "_half_rounds_f", "_rounds_p", "_mds", "_rc")
-
-    def __init__(self, params: Poseidon1Params) -> None:
-        self._width = params.width
-        self._half_rounds_f = params.rounds_f // 2
-        self._rounds_p = params.rounds_p
-        n = params.width
-        # Build circulant MDS: M[i][j] = first_row[(j - i) mod n].
-        self._mds = [[params.mds_first_row[(j - i) % n] for j in range(n)] for i in range(n)]
-        self._rc = list(params.round_constants)
-
-    def permute(self, current_state: Sequence[Fp]) -> list[Fp]:
-        assert len(current_state) == self._width
-        s = [x.value for x in current_state]
-        w, p, mds, rc = self._width, P, self._mds, self._rc
-        idx = 0
-
-        def mds_mul() -> None:
-            new = [sum((mds[i][j] * s[j]) % p for j in range(w)) % p for i in range(w)]
-            s[:] = new
-
-        for _ in range(self._half_rounds_f):
-            for i in range(w):
-                s[i] = (s[i] + rc[idx + i]) % p
-            idx += w
-            for i in range(w):
-                s[i] = (s[i] * s[i] % p) * s[i] % p
-            mds_mul()
-        for _ in range(self._rounds_p):
-            for i in range(w):
-                s[i] = (s[i] + rc[idx + i]) % p
-            idx += w
-            s[0] = (s[0] * s[0] % p) * s[0] % p
-            mds_mul()
-        for _ in range(self._half_rounds_f):
-            for i in range(w):
-                s[i] = (s[i] + rc[idx + i]) % p
-            idx += w
-            for i in range(w):
-                s[i] = (s[i] * s[i] % p) * s[i] % p
-            mds_mul()
-
-        return [Fp(v) for v in s]
-
-
-PARAMS_16 = Poseidon1Params(
-    width=16,
-    rounds_f=8,
-    rounds_p=20,
-    mds_first_row=MDS_FIRST_ROW_16,
-    round_constants=P1_ROUND_CONSTANTS_16,
-)
-"""Poseidon1 parameters for width-16 (8 full rounds, 20 partial)."""
-
-
-POSEIDON16 = Poseidon1(PARAMS_16)
-
-
-def poseidon16_compress(left: Sequence[Fp], right: Sequence[Fp]) -> list[Fp]:
-    state = list(left) + list(right)
-    assert len(state) == SPONGE_STATE
-    return [a + b for a, b in zip(POSEIDON16.permute(state), state)][:DIGEST_ELEMS]
-
-
-def log2_ceil(x: int) -> int:
-    return 0 if x <= 1 else (x - 1).bit_length()
-
-
-def log2_strict(x: int) -> int:
-    assert x > 0 and (x & (x - 1)) == 0, f"{x} is not a power of two"
-    return x.bit_length() - 1
-
-
-def next_multiple_of(n: int, k: int) -> int:
-    return (n + k - 1) // k * k
-
-
-def div_ceil(n: int, k: int) -> int:
-    return (n + k - 1) // k
-
-
-# ---------------------------------------------------------------------------
-# Poseidon2-16 sparse optimization for partial rounds (see Appendix B of https://eprint.iacr.org/2019/458.pdf)
-# ---------------------------------------------------------------------------
-
-POSEIDON_FULL_ROUNDS = 8
-POSEIDON_WIDTH = 16
-POSEIDON_PARTIAL_ROUNDS = 20
-POSEIDON_HALF_FULL_ROUNDS = POSEIDON_FULL_ROUNDS // 2  # = 4 full rounds per side
-
-
-def _mat_mul(a: list[list[int]], b: list[list[int]], n: int) -> list[list[int]]:
-    return [[sum(a[i][k] * b[k][j] for k in range(n)) % P for j in range(n)] for i in range(n)]
-
-
-def _mat_vec(m: list[list[int]], v: Sequence[int], n: int) -> list[int]:
-    return [sum(m[i][j] * v[j] for j in range(n)) % P for i in range(n)]
-
-
-def _mat_transpose(m: list[list[int]], n: int) -> list[list[int]]:
-    return [[m[j][i] for j in range(n)] for i in range(n)]
-
-
-def _gauss_jordan_inv(m_in: list[list[int]], n: int) -> list[list[int]]:
-    aug = [row[:] for row in m_in]
-    inv = [[1 if i == j else 0 for j in range(n)] for i in range(n)]
-    for col in range(n):
-        pivot = next(r for r in range(col, n) if aug[r][col] != 0)
-        if pivot != col:
-            aug[col], aug[pivot] = aug[pivot], aug[col]
-            inv[col], inv[pivot] = inv[pivot], inv[col]
-        piv_inv = pow(aug[col][col], P - 2, P)
-        for j in range(n):
-            aug[col][j] = aug[col][j] * piv_inv % P
-            inv[col][j] = inv[col][j] * piv_inv % P
-        for i in range(n):
-            if i == col or aug[i][col] == 0:
-                continue
-            factor = aug[i][col]
-            for j in range(n):
-                aug[i][j] = (aug[i][j] - factor * aug[col][j]) % P
-                inv[i][j] = (inv[i][j] - factor * inv[col][j]) % P
-    return inv
-
-
-def _compute_sparse_constants() -> dict:
-    """Compress partial rounds into per-round (sparse first row, sparse v, scalar rc) triples.
-
-    Output:
-      sparse_m_i: 16×16 — applied once when entering the partial-round phase.
-      sparse_first_row[r], sparse_v[r]: row-r operator that replaces the full MDS matvec.
-      sparse_first_round_constants, sparse_scalar_round_constants: compressed RCs.
-    """
-    w = PARAMS_16.width
-    hf = PARAMS_16.rounds_f // 2
-    rp = PARAMS_16.rounds_p
-    rc = PARAMS_16.round_constants
-
-    mds = [[MDS_FIRST_ROW_16[(j - i) % w] for j in range(w)] for i in range(w)]
-    mds_inv = _gauss_jordan_inv(mds, w)
-    partial_rc = [list(rc[(hf + i) * w : (hf + i + 1) * w]) for i in range(rp)]
-
-    # Backward substitution through MDS^{-1} to collapse each round's RC vector into
-    # one scalar (the lane-0 RC kept inline) plus a constant carry on the next round.
-    scalar_rc: list[int] = [0] * rp
-    tmp = list(partial_rc[rp - 1])
-    for i in range(rp - 2, -1, -1):
-        inv_cip = _mat_vec(mds_inv, tmp, w)
-        scalar_rc[i + 1] = inv_cip[0]
-        tmp = list(partial_rc[i])
-        for j in range(1, w):
-            tmp[j] = (tmp[j] + inv_cip[j]) % P
-    sparse_first_round_constants = tmp
-    sparse_scalar_round_constants = scalar_rc[1:]
-
-    # Factor MDS into per-round sparse matrices (first row + v column).
-    mds_t = _mat_transpose(mds, w)
-    m_mul = [row[:] for row in mds_t]
-    v_collection: list[list[int]] = []
-    w_hat_collection: list[list[int]] = []
-    m_i = [[0] * w for _ in range(w)]
-    for _ in range(rp):
-        v_row = [m_mul[0][j + 1] if j < 15 else 0 for j in range(w)]
-        w_col = [m_mul[i + 1][0] for i in range(15)]
-        sub = [[m_mul[i + 1][j + 1] for j in range(15)] for i in range(15)]
-        m_hat_inv = _gauss_jordan_inv(sub, 15)
-        w_hat = [sum(m_hat_inv[i][k] * w_col[k] for k in range(15)) % P if i < 15 else 0 for i in range(w)]
-        v_collection.append(v_row)
-        w_hat_collection.append(w_hat)
-        m_i = [row[:] for row in m_mul]
-        m_i[0][0] = 1
-        for i in range(1, w):
-            m_i[i][0] = 0
-        for j in range(1, w):
-            m_i[0][j] = 0
-        m_mul = _mat_mul(mds_t, m_i, w)
-    sparse_m_i = _mat_transpose(m_i, w)
-    v_collection.reverse()
-    w_hat_collection.reverse()
-
-    mds_0_0 = mds[0][0]
-    sparse_first_row = [[mds_0_0] + w_hat_collection[r][:15] for r in range(rp)]
-    return {
-        "sparse_m_i": sparse_m_i,
-        "sparse_first_row": sparse_first_row,
-        "sparse_v": v_collection,
-        "sparse_first_round_constants": sparse_first_round_constants,
-        "sparse_scalar_round_constants": sparse_scalar_round_constants,
-    }
-
-
-_HF, _W = POSEIDON_HALF_FULL_ROUNDS, POSEIDON_WIDTH
-_N = len(MDS_FIRST_ROW_16)
-_RCS = PARAMS_16.round_constants
-_SPARSE = _compute_sparse_constants()
-
-# Dense circulant MDS matrix: M[i][j] = MDS_FIRST_ROW_16[(j - i) % 16].
-POSEIDON_AIR_MDS_DENSE: list[list[Fp]] = [[Fp(MDS_FIRST_ROW_16[(j - i) % _N]) for j in range(_N)] for i in range(_N)]
-
-# External full-round constants: first / last POSEIDON_HALF_FULL_ROUNDS slices of round_constants.
-POSEIDON_AIR_INITIAL_CONSTANTS: list[list[Fp]] = [[Fp(v) for v in _RCS[i * _W : (i + 1) * _W]] for i in range(_HF)]
-_TAIL = (_HF + POSEIDON_PARTIAL_ROUNDS) * _W
-POSEIDON_AIR_FINAL_CONSTANTS: list[list[Fp]] = [
-    [Fp(v) for v in _RCS[_TAIL + i * _W : _TAIL + (i + 1) * _W]] for i in range(_HF)
-]
-
-# Sparse partial-round constants (Fp-wrapped).
-POSEIDON_AIR_SPARSE_M_I: list[list[Fp]] = [[Fp(v) for v in row] for row in _SPARSE["sparse_m_i"]]
-POSEIDON_AIR_SPARSE_FIRST_ROW: list[list[Fp]] = [[Fp(v) for v in row] for row in _SPARSE["sparse_first_row"]]
-POSEIDON_AIR_SPARSE_V: list[list[Fp]] = [[Fp(v) for v in row] for row in _SPARSE["sparse_v"]]
-POSEIDON_AIR_SPARSE_FIRST_RC: list[Fp] = [Fp(v) for v in _SPARSE["sparse_first_round_constants"]]
-POSEIDON_AIR_SPARSE_SCALAR_RC: list[Fp] = [Fp(v) for v in _SPARSE["sparse_scalar_round_constants"]]
diff --git a/crates/lean_prover/python-verifier/verifier.py b/crates/lean_prover/python-verifier/verifier.py
deleted file mode 100644
index b61aeb5d5..000000000
--- a/crates/lean_prover/python-verifier/verifier.py
+++ /dev/null
@@ -1,1197 +0,0 @@
-"""Pure-Python verifier for leanVM proofs.
-Setup the test vector (one-time):
-    cargo test --release --package lean_prover --lib -- test_zkvm::dump_test_vector_for_python_verifier --include-ignored
-Run:
-    python3 crates/lean_prover/python-verifier/verifier.py
-Format:
-    ruff format --line-length 120 crates/lean_prover/python-verifier
-"""
-
-from __future__ import annotations
-import array
-import json
-import math
-import sys
-from dataclasses import dataclass
-from enum import IntEnum
-from pathlib import Path
-from typing import Sequence
-from primitives import *
-
-
-PUBLIC_INPUT_SIZE = DIGEST_ELEMS
-SNARK_DOMAIN_SEP = [Fp(v) for v in (130704175, 1303721200, 493664240, 1035493700, 2063844858, 1410214009, 1938905908, 1696767928)]  # fmt: skip
-
-WHIR_INITIAL_FOLDING_FACTOR, WHIR_SUBSEQUENT_FOLDING_FACTOR, WHIR_MAX_NUM_VARIABLES_TO_SEND_COEFFS = 7, 5, 8
-MIN_WHIR_LOG_INV_RATE, MAX_WHIR_LOG_INV_RATE, RS_DOMAIN_INITIAL_REDUCTION_FACTOR = 1, 4, 5
-_WHIR_CONFIGS = ((1,7,1,10,220,16,()),(1,8,1,11,220,16,()),(1,9,1,12,220,16,()),(1,10,1,13,220,16,()),(1,11,1,14,220,16,()),(1,12,1,15,220,16,()),(1,13,1,16,220,16,()),(1,14,1,15,221,16,()),(1,15,1,16,221,16,()),(1,16,1,16,73,16,((222,1,16,11),)),(1,17,1,16,73,16,((223,1,16,12),)),(1,18,1,16,73,16,((224,1,16,13),)),(1,19,1,16,73,16,((225,1,16,14),)),(1,20,1,16,73,16,((227,1,16,15),)),(1,21,2,16,32,16,((229,1,16,16),(73,1,16,9))),(1,22,2,16,32,16,((230,1,16,12),(74,1,16,10))),(1,23,2,16,32,16,((234,1,16,13),(74,1,16,11))),(1,24,2,16,32,16,((235,1,16,14),(74,1,16,12))),(1,25,2,16,32,16,((241,2,16,15),(74,2,16,13))),(1,26,2,16,21,14,((243,2,16,16),(74,2,16,14),(32,2,16,14))),(1,27,2,16,21,14,((248,2,16,15),(75,2,16,15),(32,2,16,15))),(1,28,2,16,21,14,((256,2,16,16),(75,2,16,16),(32,2,16,16))),(1,29,2,16,21,14,((262,2,16,15),(76,2,16,12),(33,2,16,17))),(1,30,2,16,21,14,((270,2,16,16),(76,2,16,13),(33,2,16,18))),(2,7,1,13,109,16,()),(2,8,1,14,109,16,()),(2,9,1,15,109,16,()),(2,10,1,16,109,16,()),(2,11,1,12,110,16,()),(2,12,1,13,110,16,()),(2,13,1,14,110,16,()),(2,14,1,15,110,16,()),(2,15,1,16,110,16,()),(2,16,1,14,55,16,((111,1,16,10),)),(2,17,1,15,55,16,((111,1,16,11),)),(2,18,1,16,55,16,((111,1,16,12),)),(2,19,1,15,55,16,((112,1,16,13),)),(2,20,2,16,55,16,((112,1,16,14),)),(2,21,2,16,28,16,((113,1,16,15),(55,1,16,10))),(2,22,2,15,28,16,((114,1,16,16),(55,1,16,11))),(2,23,2,16,28,16,((114,1,16,13),(56,1,16,12))),(2,24,2,16,28,16,((115,1,16,14),(56,2,16,13))),(2,25,2,15,28,16,((118,2,16,15),(56,2,16,14))),(2,26,2,16,19,15,((118,2,16,16),(56,2,16,15),(28,2,16,17))),(2,27,2,16,19,15,((119,2,16,13),(57,2,16,16),(28,2,16,18))),(2,28,2,16,19,15,((120,2,16,14),(57,2,16,14),(29,2,15,19))),(2,29,2,16,19,15,((123,2,16,15),(57,2,16,15),(29,2,15,20))),(3,7,1,9,73,16,()),(3,8,1,10,73,16,()),(3,9,1,11,73,16,()),(3,10,1,12,73,16,()),(3,11,1,13,73,16,()),(3,12,1,14,73,16,()),(3,13,1,15,73,16,()),(3,14,1,16,73,16,()),(3,15,1,12,74,16,()),(3,16,1,13,44,16,((74,1,16,11),)),(3,17,1,14,44,
16,((74,1,16,12),)),(3,18,2,15,44,16,((74,1,16,13),)),(3,19,2,16,44,16,((74,1,16,14),)),(3,20,2,15,44,16,((75,1,16,15),)),(3,21,2,16,25,16,((75,1,16,16),(44,1,16,11))),(3,22,2,15,25,16,((76,1,16,11),(45,1,16,12))),(3,23,2,16,25,16,((76,1,16,12),(45,2,16,13))),(3,24,2,16,25,16,((77,2,16,13),(45,2,16,14))),(3,25,2,16,25,16,((78,2,15,14),(45,2,16,15))),(3,26,2,16,18,12,((79,2,15,15),(45,2,16,16),(25,2,16,19))),(3,27,2,16,18,12,((80,2,16,16),(45,2,16,15),(26,2,13,20))),(3,28,2,15,18,12,((82,2,15,15),(46,2,16,16),(26,2,13,21))),(4,7,1,8,55,16,()),(4,8,1,9,55,16,()),(4,9,1,10,55,16,()),(4,10,1,11,55,16,()),(4,11,1,12,55,16,()),(4,12,1,13,55,16,()),(4,13,1,14,55,16,()),(4,14,1,15,55,16,()),(4,15,1,16,55,16,()),(4,16,1,13,37,16,((56,1,16,9),)),(4,17,1,14,37,16,((56,1,16,10),)),(4,18,2,15,37,16,((56,1,16,11),)),(4,19,2,16,37,16,((56,1,16,12),)),(4,20,2,13,37,16,((57,1,16,13),)),(4,21,2,14,23,15,((57,2,16,14),(37,2,16,12))),(4,22,2,15,23,15,((57,2,16,15),(37,2,16,13))),(4,23,2,16,23,15,((57,2,16,16),(37,2,16,14))),(4,24,2,15,23,15,((58,2,16,13),(38,2,16,15))),(4,25,2,16,23,15,((58,2,16,14),(38,2,16,16))),(4,26,2,16,16,16,((60,2,15,15),(38,2,16,17),(23,2,15,22))),(4,27,2,15,16,16,((61,2,16,16),(38,2,16,18),(23,2,15,23))))  # fmt: skip
-WHIR_CONFIGS = {
-    (c[0], c[1]): {
-        "log_inv_rate": c[0],
-        "num_variables": c[1],
-        "commitment_ood_samples": c[2],
-        "starting_folding_pow_bits": c[3],
-        "final_queries": c[4],
-        "final_query_pow_bits": c[5],
-        "rounds": [
-            {"num_queries": r[0], "ood_samples": r[1], "query_pow_bits": r[2], "folding_pow_bits": r[3]} for r in c[6]
-        ],
-    }
-    for c in _WHIR_CONFIGS
-}
-
-MIN_LOG_MEMORY_SIZE, MAX_LOG_MEMORY_SIZE = 16, 26
-MIN_LOG_N_ROWS_PER_TABLE, MIN_BYTECODE_LOG_SIZE, MAX_BYTECODE_LOG_SIZE = 8, 8, 22
-N_VARS_TO_SEND_GKR_COEFFS = 5
-
-N_RUNTIME_COLUMNS, N_INSTRUCTION_COLUMNS = 8, 12
-
-LOGUP_MEMORY_DOMAINSEP, LOGUP_BYTECODE_DOMAINSEP = 1, 2
-POSEIDON_DOMAINSEP_BASE = 3  # odd ≥ 3
-POSEIDON_FLAG_PERMUTE_SHIFT, POSEIDON_FLAG_OUT8_SHIFT = 1 << 1, 1 << 2
-POSEIDON_FLAG_LEFT_SHIFT, POSEIDON_OFFSET_LEFT_SHIFT = 1 << 3, 1 << 4
-EXT_OP_FLAG_BE, EXT_OP_FLAG_ADD, EXT_OP_FLAG_DOT_PRODUCT, EXT_OP_FLAG_EQ, EXT_OP_LEN_MULTIPLIER = 4, 8, 16, 32, 64
-
-STARTING_PC = 0  # every program starts at PC = 0, and ends at PC = len(bytecode) - 1
-
-
-class ProofError(Exception):
-    pass
-
-
-class BusDirection(IntEnum):
-    PUSH = 1
-    PULL = -1
-
-
-class BusInteraction(IntEnum):
-    PRECOMPILE = 0
-    BYTECODE = 1
-    MEMORY = 2
-
-
-@dataclass(frozen=True)
-class Table:
-    name: str
-    columns: tuple[str, ...]
-    buses: tuple
-    air_degree: int
-    n_constraints: int
-    n_shift: int  # shift (next-row) columns are always the first ones
-    max_log_height: int
-    air_constraints_fn: object  # (folder, logup_beta_eq) -> None
-
-    @property
-    def n_columns(self) -> int:
-        return len(self.columns)
-
-    @property
-    def n_buses(self) -> int:
-        return sum(b[3] if b[0] == BusInteraction.MEMORY else 1 for b in self.buses)
-
-    @property
-    def precompile_bus_interraction_sign(self) -> EF:
-        return EF(self.buses[0][1])  # precompile interraction is the first, by convention
-
-    def col(self, name: str) -> int:
-        return self.columns.index(name)
-
-    def eval_air(self, col_evals: Sequence[EF], alpha_powers: Sequence[EF], logup_beta_eq: list[EF]) -> EF:
-        folder = ConstraintFolder(col_evals[: self.n_columns], col_evals[self.n_columns :], alpha_powers, self.columns)
-        self.air_constraints_fn(folder, logup_beta_eq)
-        return folder.accumulator
-
-    def boundary_statements(
-        self, stacked_n_vars: int, offset: int, n_vars: int, ending_pc: int
-    ) -> list["SparseStatements"]:
-        if self.name != "execution":
-            return []
-        pc_col_offset = offset + (self.col("pc") << n_vars)
-        return [
-            SparseStatements(stacked_n_vars, [], [(pc_col_offset + idx, EF(pc))])
-            for idx, pc in [(0, STARTING_PC), ((1 << n_vars) - 1, ending_pc)]
-        ]
-
-
-# Overwrite-sponge
-def sponge_hash(data: Sequence[Fp]) -> list[Fp]:
-    assert len(data) % SPONGE_RATE == 0 and len(data) > 0
-    capacity = [Fp(len(data))] + [Fp(0)] * (SPONGE_CAPACITY - 1)
-    full = list(capacity) + [Fp(0)] * SPONGE_RATE
-    for k in range(len(data) // SPONGE_RATE):
-        chunk = data[k * SPONGE_RATE : (k + 1) * SPONGE_RATE]
-        full = POSEIDON16.permute(list(capacity) + list(chunk))
-        capacity = full[:SPONGE_CAPACITY]
-    return full[SPONGE_CAPACITY:]
-
-
-class DuplexSpongeChallenger:  # https://eprint.iacr.org/2025/536.pdf
-    def __init__(self, initial_capacity: Sequence[Fp]) -> None:
-        self.state: list[Fp] = list(initial_capacity) + [Fp(0)] * SPONGE_RATE
-        self.rate_fresh: bool = False
-
-    def observe(self, chunk: Sequence[Fp]) -> None:
-        assert len(chunk) == SPONGE_RATE
-        self.state = POSEIDON16.permute(self.state[:SPONGE_CAPACITY] + list(chunk))
-        self.rate_fresh = True
-
-    def observe_many(self, scalars: Sequence[Fp]) -> None:
-        for i in range(0, len(scalars), SPONGE_RATE):
-            chunk = list(scalars[i : i + SPONGE_RATE])
-            chunk += [Fp(0)] * (SPONGE_RATE - len(chunk))
-            self.observe(chunk)
-
-    def duplex(self) -> None:
-        self.observe([Fp(0)] * SPONGE_RATE)
-
-    def _sample_rate(self) -> list[Fp]:
-        assert self.rate_fresh, "stale rate — insert duplex() before sampling"
-        self.rate_fresh = False
-        return self.state[SPONGE_CAPACITY:]
-
-    def _sample_many(self, n: int) -> list[Fp]:
-        out: list[Fp] = []
-        for i in range(n):
-            if i:
-                self.duplex()
-            out.extend(self._sample_rate())
-        return out
-
-    def sample_many_ef(self, n: int) -> list[EF]:
-        flat = self._sample_many(div_ceil(n * EF.DIMENSION, SPONGE_RATE))[: n * EF.DIMENSION]
-        return pack_ef(flat)
-
-    def sample_ef(self) -> EF:
-        return self.sample_many_ef(1)[0]
-
-    def sample_in_range(self, bits: int, n_samples: int) -> list[int]:
-        assert bits < 31
-        flat = self._sample_many(div_ceil(n_samples, SPONGE_RATE))[:n_samples]
-        return [int(x.value) & ((1 << bits) - 1) for x in flat]
-
-
-@dataclass
-class MerkleOpening:
-    leaf_data: list[Fp]
-    path: list[list[Fp]]
-
-
-@dataclass
-class Proof:
-    transcript: list[Fp]
-    merkle_openings: list[MerkleOpening]
-
-
-class FiatShamir(DuplexSpongeChallenger):
-    def __init__(self, proof: Proof, initial_capacity: Sequence[Fp]) -> None:
-        super().__init__(initial_capacity)
-        self.transcript = list(proof.transcript)
-        self.openings = list(reversed(proof.merkle_openings))
-        self.offset = 0
-
-    def _read_padded(self, n: int) -> list[Fp]:
-        n_pad = next_multiple_of(n, SPONGE_RATE)
-        if self.offset + n_pad > len(self.transcript):
-            raise ProofError("ExceededTranscript")
-        chunk = self.transcript[self.offset : self.offset + n_pad]
-        self.offset += n_pad
-        if any(int(chunk[i].value) for i in range(n, n_pad)):
-            raise ProofError("InvalidTranscript: non-zero padding")
-        self.observe_many(chunk)
-        return chunk
-
-    def observe_scalars(self, scalars: Sequence[Fp]) -> None:
-        self.observe_many(list(scalars))
-
-    def next_base_scalars_vec(self, n: int) -> list[Fp]:
-        return self._read_padded(n)[:n]
-
-    def next_extension_scalars_vec(self, n: int) -> list[EF]:
-        flat = self.next_base_scalars_vec(n * EF.DIMENSION)
-        return pack_ef(flat)
-
-    def next_extension_scalar(self) -> EF:
-        return self.next_extension_scalars_vec(1)[0]
-
-    def next_merkle_opening(self) -> MerkleOpening:
-        if not self.openings:
-            raise ProofError("ExceededTranscript: no more Merkle openings")
-        return self.openings.pop()
-
-    def check_pow_grinding(self, bits: int) -> None:
-        if bits == 0:
-            return
-        self._read_padded(SPONGE_RATE)
-        if int(self.state[SPONGE_CAPACITY].value) & ((1 << bits) - 1) != 0:
-            raise ProofError("InvalidGrindingWitness")
-
-
-def merkle_verify_path(
-    root: list[Fp],
-    log_height: int,
-    index: int,
-    opened_values: Sequence[Fp],
-    opening_proof: Sequence[list[Fp]],
-) -> None:
-    if len(opening_proof) != log_height:
-        raise ProofError("Merkle verification failed: opening proof has wrong length")
-    chunks = [list(opened_values[i : i + SPONGE_RATE]) for i in range(0, len(opened_values), SPONGE_RATE)]
-    current = sponge_hash([x for c in reversed(chunks) for x in c])
-    for sibling in opening_proof:
-        current = poseidon16_compress(current, sibling) if index & 1 == 0 else poseidon16_compress(sibling, current)
-        index >>= 1
-    if root != current:
-        raise ProofError("Merkle verification failed: root mismatch")
-
-
-def expand_from_univariate(x: EF, num_variables: int) -> list[EF]:
-    return list(accumulate(repeat(x, num_variables), lambda a, _: a * a))  # [x, x², x⁴, …, x^(2^(n−1))]
-
-
-def eq_poly(a: Sequence[EF], b: Sequence[EF]) -> EF:
-    assert len(a) == len(b)
-    return math.prod(x * y + (ONE - x) * (ONE - y) for x, y in zip(a, b))
-
-
-def eq_at_index(point: Sequence[EF], idx: int, n: int) -> EF:
-    """eq(point, big-endian-bits(idx, n)). Specialization of eq_poly for boolean points."""
-    return math.prod(point[j] if (idx >> (n - 1 - j)) & 1 else ONE - point[j] for j in range(n))
-
-
-def dot_product(a: Sequence, b: Sequence):
-    return sum(x * y for x, y in zip(a, b))
-
-
-def next_mle(x: Sequence[EF], y: Sequence[EF]) -> EF:
-    assert len(x) == len(y)
-    s, eq_prefix = ZERO, ONE
-    for xi, yi in zip(x, y):
-        s = xi * (ONE - yi) * s + eq_prefix * (ONE - xi) * yi
-        eq_prefix *= xi * yi + (ONE - xi) * (ONE - yi)
-    return s + math.prod([*x, *y])
-
-
-def eval_multilinear_evals(evals: Sequence[Fp | EF], point: Sequence[EF]) -> EF:
-    """Evaluate a multilinear in evaluation form at `point`."""
-    assert len(evals) == 1 << len(point)
-    cur: Sequence = evals
-    for r in reversed(point):
-        cur = [cur[j] + (cur[j + 1] - cur[j]) * r for j in range(0, len(cur), 2)]
-    return cur[0]
-
-
-def eval_multilinear_coeffs(coeffs: Sequence[EF], point: Sequence[EF]) -> EF:
-    """Evaluate a multilinear in coefficient form at `point`."""
-    assert len(coeffs) == 1 << len(point)
-    if not point:
-        return coeffs[0]
-    half = len(coeffs) // 2
-    lo = eval_multilinear_coeffs(coeffs[:half], point[1:])
-    hi = eval_multilinear_coeffs(coeffs[half:], point[1:])
-    return lo + hi * point[0]
-
-
-def eval_univariate_polynomial(coeffs: list[EF], x: EF) -> EF:
-    acc = ZERO
-    for c in reversed(coeffs):
-        acc = acc * x + c
-    return acc
-
-
-def mle_of_01234567_etc(point: Sequence[EF]) -> EF:
-    """evaluate the MLE of `f(i) = i` (big-endian) at `point`."""
-    n = len(point)
-    return sum(p * (1 << (n - 1 - i)) for i, p in enumerate(point))
-
-
-def mle_of_zeros_then_ones(n_zeros: int, point: Sequence[EF]) -> EF:
-    """evaluate the MLE of `[0]*n_zeros ++ [1]*(2^len(point) - n_zeros)` at `point`."""
-    n_values = 1 << len(point)
-    assert n_zeros <= n_values
-    if n_zeros == 0:
-        return ONE
-    if n_zeros == n_values:
-        return ZERO
-    half, tail = n_values >> 1, point[1:]
-    if n_zeros < half:
-        return (ONE - point[0]) * mle_of_zeros_then_ones(n_zeros, tail) + point[0]
-    return point[0] * mle_of_zeros_then_ones(n_zeros - half, tail)
-
-
-def eval_eq(point: Sequence[EF]) -> list[EF]:
-    out = [ONE]
-    for p in point:
-        out = [w for v in out for w in (v * (ONE - p), v * p)]
-    return out
-
-
-@dataclass
-class SparseStatements:
-    total_num_variables: int
-    point: list[EF]
-    values: list[tuple[int, EF]]
-    is_next: bool = False
-
-    @property
-    def selector_num_variables(self) -> int:
-        return self.total_num_variables - len(self.point)
-
-
-def whir_folding_factor_at_round(r: int) -> int:
-    return WHIR_INITIAL_FOLDING_FACTOR if r == 0 else WHIR_SUBSEQUENT_FOLDING_FACTOR
-
-
-def whir_n_rounds_and_final_sumcheck(num_variables: int) -> tuple[int, int]:
-    nv = num_variables - WHIR_INITIAL_FOLDING_FACTOR
-    if nv < WHIR_MAX_NUM_VARIABLES_TO_SEND_COEFFS:
-        return 0, nv
-    n = div_ceil(nv - WHIR_MAX_NUM_VARIABLES_TO_SEND_COEFFS, WHIR_SUBSEQUENT_FOLDING_FACTOR)
-    return n, nv - n * WHIR_SUBSEQUENT_FOLDING_FACTOR
-
-
-@dataclass
-class ParsedCommitment:
-    num_variables: int
-    root: list[Fp]
-    ood_points: list[EF]
-    ood_answers: list[EF]
-
-    def oods_constraints(self) -> list[SparseStatements]:
-        return [
-            SparseStatements(self.num_variables, expand_from_univariate(p, self.num_variables), [(0, ev)])
-            for p, ev in zip(self.ood_points, self.ood_answers)
-        ]
-
-
-def verify_sumcheck(
-    fiat_shamir: FiatShamir, target: EF, n_rounds: int, degree: int, pow_bits: int = 0
-) -> tuple[list[EF], EF]:
-    point: list[EF] = []
-    for _ in range(n_rounds):
-        coeffs = fiat_shamir.next_extension_scalars_vec(degree + 1)
-        s = coeffs[0] + sum(coeffs)
-        if s != target:
-            raise ProofError("Sumcheck identity failed: h(0) + h(1) != target")
-        fiat_shamir.check_pow_grinding(pow_bits)
-        r = fiat_shamir.sample_ef()
-        point.append(r)
-        target = eval_univariate_polynomial(coeffs, r)
-    return point, target
-
-
-def verify_stir_challenges(
-    fiat_shamir: FiatShamir,
-    round_index: int,
-    log_height: int,
-    num_variables: int,
-    num_queries: int,
-    query_pow_bits: int,
-    commitment: ParsedCommitment,
-    folding_randomness: list[EF],
-) -> list[SparseStatements]:
-    gen = Fp(KB_TWO_ADIC_GENERATORS[log_height])
-    fiat_shamir.check_pow_grinding(query_pow_bits)
-    indices = fiat_shamir.sample_in_range(log_height, num_queries)
-    constraints: list[SparseStatements] = []
-    for idx in indices:
-        op = fiat_shamir.next_merkle_opening()
-        merkle_verify_path(commitment.root, log_height, idx, op.leaf_data, op.path)
-        # Round 0 leaves are raw base-field elements; later rounds pack DIM Fp values per EF element.
-        leaf = op.leaf_data
-        if round_index == 0:
-            packed = leaf
-        else:
-            packed = pack_ef(leaf)
-        fold = eval_multilinear_evals(packed, folding_randomness)
-        ef_pt = EF(pow(int(gen.value), idx, P))
-        pt = expand_from_univariate(ef_pt, num_variables)
-        constraints.append(SparseStatements(num_variables, pt, [(0, fold)]))
-    return constraints
-
-
-def whir_verify(
-    fiat_shamir: FiatShamir,
-    cfg: dict,
-    parsed_commitment: ParsedCommitment,
-    statements: list[SparseStatements],
-) -> list[EF]:
-    n_rounds, final_sumcheck_rounds = whir_n_rounds_and_final_sumcheck(cfg["num_variables"])
-    round_constraints: list[tuple[list[EF], list[SparseStatements]]] = []
-    round_folding: list[list[EF]] = []
-    target = ZERO
-
-    def step(constraints: list[SparseStatements], n_fold: int, pow_bits: int) -> None:
-        nonlocal target
-        fiat_shamir.duplex()
-        gamma = fiat_shamir.sample_ef()
-        combo: list[EF] = []
-        g = ONE
-        for smt in constraints:
-            for _, value in smt.values:
-                target += g * value
-                combo.append(g)
-                g *= gamma
-        round_constraints.append((combo, constraints))
-        sc_point, target = verify_sumcheck(fiat_shamir, target, n_fold, 2, pow_bits)
-        round_folding.append(sc_point)
-
-    step(
-        parsed_commitment.oods_constraints() + statements,
-        whir_folding_factor_at_round(0),
-        cfg["starting_folding_pow_bits"],
-    )
-
-    prev_commitment = parsed_commitment
-    current_vars = cfg["num_variables"]
-    log_domain = cfg["num_variables"] + cfg["log_inv_rate"]
-    for r in range(n_rounds):
-        round_params = cfg["rounds"][r]
-        current_vars -= whir_folding_factor_at_round(r)
-        n_ood_samples = round_params["ood_samples"]
-        new_commitment = ParsedCommitment(
-            current_vars,
-            fiat_shamir.next_base_scalars_vec(DIGEST_ELEMS),
-            fiat_shamir.sample_many_ef(n_ood_samples),
-            fiat_shamir.next_extension_scalars_vec(n_ood_samples),
-        )
-        stir = verify_stir_challenges(
-            fiat_shamir,
-            r,
-            log_domain - whir_folding_factor_at_round(r),
-            current_vars,
-            round_params["num_queries"],
-            round_params["query_pow_bits"],
-            prev_commitment,
-            round_folding[-1],
-        )
-        step(
-            new_commitment.oods_constraints() + stir,
-            whir_folding_factor_at_round(r + 1),
-            round_params["folding_pow_bits"],
-        )
-        log_domain -= RS_DOMAIN_INITIAL_REDUCTION_FACTOR if r == 0 else 1
-        prev_commitment = new_commitment
-
-    n_vars_final = current_vars - whir_folding_factor_at_round(n_rounds)
-    final_coeffs = fiat_shamir.next_extension_scalars_vec(1 << n_vars_final)
-    final_stir = verify_stir_challenges(
-        fiat_shamir,
-        n_rounds,
-        log_domain - whir_folding_factor_at_round(n_rounds),
-        n_vars_final,
-        cfg["final_queries"],
-        cfg["final_query_pow_bits"],
-        prev_commitment,
-        round_folding[-1],
-    )
-    # Each STIR constraint's point is `expand_from_univariate(α, n)` = [α, α², α⁴, …]. We check that `Σ coeffs[i]·α^i == value` for each smt
-    for smt in final_stir:
-        univ_eval = eval_univariate_polynomial(final_coeffs, smt.point[0])
-        if any(univ_eval != v[1] for v in smt.values):
-            raise ProofError("Final STIR constraint mismatch")
-
-    final_sc_point, final_sc_value = verify_sumcheck(fiat_shamir, target, final_sumcheck_rounds, 2)
-    round_folding.append(final_sc_point)
-
-    folding_flat = [r for chunk in round_folding for r in chunk]
-
-    eval_weights = ZERO
-    pt = folding_flat
-    for round_idx, (randomness, smts) in enumerate(round_constraints):
-        if round_idx > 0:
-            pt = pt[whir_folding_factor_at_round(round_idx - 1) :]
-        i = 0
-        for smt in smts:
-            inner_pt = pt[len(pt) - len(smt.point) :]
-            common = next_mle(smt.point, inner_pt) if smt.is_next else eq_poly(smt.point, inner_pt)
-            sel_n = smt.selector_num_variables
-            for v in smt.values:
-                lagrange = eq_at_index(pt, v[0], sel_n)
-                eval_weights += lagrange * common * randomness[i]
-                i += 1
-    final_value = eval_multilinear_coeffs(final_coeffs, list(reversed(final_sc_point)))
-    if final_sc_value != eval_weights * final_value:
-        raise ProofError("WHIR final sumcheck check failed")
-
-    return folding_flat
-
-
-def stacked_pcs_global_statements(
-    stacked_n_vars: int,
-    memory_n_vars: int,
-    bytecode_n_vars: int,
-    previous_statements: list[SparseStatements],
-    tables: Sequence[Table],
-    heights: dict[str, int],
-    committed_statements: dict[str, list[tuple[list[EF], dict[int, EF], dict[int, EF]]]],
-    ending_pc: int,
-) -> list[SparseStatements]:
-    tables_sorted = sort_tables_by_height(tables, heights)
-    table_offsets: dict[str, int] = {}
-    layout_offset = (2 << memory_n_vars) + (1 << max(bytecode_n_vars, tables_sorted[0][1]))
-    for table, n_vars in tables_sorted:
-        table_offsets[table.name] = layout_offset
-        layout_offset += table.n_columns << n_vars
-
-    out = list(previous_statements)
-
-    def values_at(d: dict[int, EF], col_base: int) -> list[tuple[int, EF]]:
-        return [(col_base + i, v) for i, v in sorted(d.items())]
-
-    for table in tables:
-        n_vars = heights[table.name]
-        offset = table_offsets[table.name]
-        col_base = offset >> n_vars
-        out.extend(table.boundary_statements(stacked_n_vars, offset, n_vars, ending_pc))
-        for point, eq_values, next_values in committed_statements[table.name]:
-            if next_values:
-                out.append(SparseStatements(stacked_n_vars, list(point), values_at(next_values, col_base), True))
-            out.append(SparseStatements(stacked_n_vars, list(point), values_at(eq_values, col_base)))
-
-    return out
-
-
-def verify_gkr_quotient(fiat_shamir: FiatShamir, n_vars: int) -> tuple[EF, list[EF], EF, EF]:
-    assert n_vars > N_VARS_TO_SEND_GKR_COEFFS
-
-    nums = fiat_shamir.next_extension_scalars_vec(1 << N_VARS_TO_SEND_GKR_COEFFS)
-    dens = fiat_shamir.next_extension_scalars_vec(1 << N_VARS_TO_SEND_GKR_COEFFS)
-    quotient = sum(n * d.inv() for n, d in zip(nums, dens))
-
-    point = fiat_shamir.sample_many_ef(N_VARS_TO_SEND_GKR_COEFFS)
-    claim_num = eval_multilinear_evals(nums, point)
-    claim_den = eval_multilinear_evals(dens, point)
-
-    for layer_n_vars in range(N_VARS_TO_SEND_GKR_COEFFS, n_vars):
-        fiat_shamir.duplex()
-        alpha = fiat_shamir.sample_ef()
-        raw_pt, sc_value = verify_sumcheck(fiat_shamir, claim_num + alpha * claim_den, layer_n_vars, 3)
-        sc_point = list(reversed(raw_pt))
-        nl, nr, dl, dr = fiat_shamir.next_extension_scalars_vec(4)
-        if sc_value != eq_poly(point, sc_point) * (alpha * dl * dr + nl * dr + nr * dl):
-            raise ProofError("GKR step: postponed value mismatch")
-        beta = fiat_shamir.sample_ef()
-        one_minus = ONE - beta
-        claim_num = one_minus * nl + beta * nr
-        claim_den = one_minus * dl + beta * dr
-        point = sc_point + [beta]
-
-    return quotient, point, claim_num, claim_den
-
-
-def finger_print(domainsep: Fp | EF, data: Sequence[EF], beta_eq: Sequence[EF]) -> EF:
-    assert len(beta_eq) > len(data)
-    return dot_product(beta_eq, data) + beta_eq[-1] * domainsep
-
-
-def sort_tables_by_height(tables: Sequence[Table], heights: dict[str, int]) -> list[tuple[Table, int]]:
-    """Descending by height, alphabetical on ties"""
-    return sorted([(t, heights[t.name]) for t in tables], key=lambda x: (-x[1], x[0].name))
-
-
-def verify_generic_logup(
-    fiat_shamir: FiatShamir,
-    gamma: EF,  # quotient denominator challenge
-    beta: list[EF],  # bus-tuple hashing seeds
-    beta_eq: list[EF],  # eq(beta, ·) evaluation table
-    log_memory: int,
-    bytecode_multilinear: list[int],
-    tables: Sequence[Table],
-    heights: dict[str, int],
-) -> dict:
-    ds_mem = Fp(LOGUP_MEMORY_DOMAINSEP)
-    ds_byte = Fp(LOGUP_BYTECODE_DOMAINSEP)
-    log_instr = log2_ceil(N_INSTRUCTION_COLUMNS)
-    log_bytecode = log2_strict(len(bytecode_multilinear)) - log_instr
-
-    tables_sorted = sort_tables_by_height(tables, heights)
-    tallest_h = tables_sorted[0][1]
-
-    total_active_len = (
-        (1 << log_memory) + max(1 << log_bytecode, 1 << tallest_h) + sum(t.n_buses << h for t, h in tables_sorted)
-    )
-    total_gkr_n_vars = log2_ceil(total_active_len)
-
-    quotient, point_gkr, claim_num, claim_den = verify_gkr_quotient(fiat_shamir, total_gkr_n_vars)
-    if quotient != ZERO:
-        raise ProofError("logup: GKR sum != 0")
-
-    def pref_at(offset: int, log_height: int) -> EF:
-        """Lagrange weight for the layout-offset of a section of height 2^log_height."""
-        n_missing = total_gkr_n_vars - log_height
-        return eq_at_index(point_gkr, offset >> log_height, n_missing)
-
-    num = den = ZERO
-
-    # Memory section
-    mem_pt = point_gkr[-log_memory:]
-    pref = pref_at(0, log_memory)
-    value_memory_acc = fiat_shamir.next_extension_scalar()
-    value_memory = fiat_shamir.next_extension_scalar()
-    fp_mem = finger_print(ds_mem, [mle_of_01234567_etc(mem_pt), value_memory], beta_eq)
-    num -= pref * value_memory_acc
-    den += pref * (gamma - fp_mem)
-    offset = 1 << log_memory
-
-    # Bytecode section (padded to the tallest table)
-    log_byte_pad = max(log_bytecode, tallest_h)
-    byte_pt = point_gkr[-log_bytecode:]
-    pref = pref_at(offset, log_bytecode)
-    pref_pad = pref_at(offset, log_byte_pad)
-    value_bytecode_acc = fiat_shamir.next_extension_scalar()
-    bytecode_value = eval_multilinear_evals([Fp(v) for v in bytecode_multilinear], byte_pt + beta[-log_instr:])
-    correction = math.prod(ONE - a for a in beta[: len(beta) - log_instr])
-    fp_byte = (
-        bytecode_value * correction
-        + mle_of_01234567_etc(byte_pt) * beta_eq[N_INSTRUCTION_COLUMNS]
-        + beta_eq[-1] * ds_byte
-    )
-    num -= pref * value_bytecode_acc
-    den += pref * (gamma - fp_byte) + pref_pad * mle_of_zeros_then_ones(1 << log_bytecode, point_gkr[-log_byte_pad:])
-    offset += 1 << log_byte_pad
-
-    # Per-table section
-    table_offsets: dict[str, int] = {}
-    for table, log_n_rows in tables_sorted:
-        table_offsets[table.name] = offset
-        offset += table.n_buses << log_n_rows
-    final_offset = offset
-
-    bus_num_vals: dict[str, EF] = {}
-    bus_den_vals: dict[str, EF] = {}
-    columns_values: dict[str, dict[int, EF]] = {}
-
-    for table in tables:
-        name = table.name
-        log_n_rows = heights[name]
-        row_stride = 1 << log_n_rows
-        offset_within_table = table_offsets[name]
-        table_values: dict[int, EF] = {}
-
-        def read_fresh(cols: list[int]) -> None:
-            """Read one extension scalar per column not yet in `table_values`, in order."""
-            missing = [c for c in cols if c not in table_values]
-            for c, e in zip(missing, fiat_shamir.next_extension_scalars_vec(len(missing))):
-                table_values[c] = e
-
-        for bus in table.buses:
-            pref = pref_at(offset_within_table, log_n_rows)
-            kind = bus[0]
-            if kind == BusInteraction.PRECOMPILE:
-                bus_num_vals[name] = fiat_shamir.next_extension_scalar()
-                bus_den_vals[name] = fiat_shamir.next_extension_scalar()
-                num += pref * bus_num_vals[name]
-                den += pref * bus_den_vals[name]
-                n_sub = 1
-            elif kind == BusInteraction.BYTECODE:
-                cols = list(range(N_RUNTIME_COLUMNS, N_RUNTIME_COLUMNS + N_INSTRUCTION_COLUMNS)) + [table.col("pc")]
-                read_fresh(cols)
-                evals = [table_values[c] for c in cols]
-                num += pref
-                den += pref * (gamma - finger_print(ds_byte, evals, beta_eq))
-                n_sub = 1
-            elif kind == BusInteraction.MEMORY:
-                _, idx_ref, vals_ref, n_sub = bus
-                idx_col, vals_start = table.col(idx_ref), table.col(vals_ref)
-                # One sub-bus per cell in the group; the prover sends only the not-yet-seen
-                # columns per row (idx_col is shared across all n_sub rows).
-                for i in range(n_sub):
-                    val_col = vals_start + i
-                    read_fresh([idx_col, val_col])
-                    pref = pref_at(offset_within_table + i * row_stride, log_n_rows)
-                    fp = finger_print(ds_mem, [table_values[idx_col] + i, table_values[val_col]], beta_eq)
-                    num += pref
-                    den += pref * (gamma - fp)
-            else:
-                raise ProofError(f"unknown bus kind: {kind}")
-            offset_within_table += n_sub * row_stride
-
-        columns_values[name] = table_values
-
-    den += mle_of_zeros_then_ones(final_offset, point_gkr)
-    if num != claim_num:
-        raise ProofError("logup: numerators value mismatch")
-    if den != claim_den:
-        raise ProofError("logup: denominators value mismatch")
-
-    return {
-        "value_memory": value_memory, "value_memory_acc": value_memory_acc,
-        "value_bytecode_acc": value_bytecode_acc, "bus_num": bus_num_vals, "bus_den": bus_den_vals,
-        "gkr_point": point_gkr, "columns_values": columns_values,
-    }  # fmt: skip
-
-
-class Cols(dict):
-    def arr(self, prefix: str, n: int) -> list:
-        return [self[f"{prefix}_{i}"] for i in range(n)]
-
-
-class ConstraintFolder:
-    def __init__(
-        self, flat: Sequence[EF], shift: Sequence[EF], alpha_powers: Sequence[EF], columns: Sequence[str]
-    ) -> None:
-        self.flat = list(flat)
-        self.shift = list(shift)
-        self.alpha_powers = list(alpha_powers)
-        # Shift columns are always the first `n_shift` columns of the table.
-        self.flat = Cols(zip(columns, self.flat))
-        self.next = Cols(zip(columns[: len(self.shift)], self.shift))
-        self.accumulator: EF = ZERO
-        self.i = 0
-
-    def assert_zero(self, x: EF) -> None:
-        self.accumulator = self.accumulator + self.alpha_powers[self.i] * x
-        self.i += 1
-
-    def assert_eq(self, x: EF, y: EF) -> None:
-        self.assert_zero(x - y)
-
-    def assert_bool(self, x: EF) -> None:
-        self.assert_zero(x * (ONE - x))
-
-
-def eval_precompile_bus_virtual_columns(
-    folder: "ConstraintFolder",
-    logup_beta_eq: list[EF],
-    multiplicity: EF,
-    domainsep: EF,
-    data: Sequence[EF],
-) -> None:
-    folder.assert_zero(multiplicity)
-    folder.assert_zero(finger_print(domainsep, data, logup_beta_eq))
-
-
-def eval_air_execution(folder: ConstraintFolder, logup_beta_eq: list[EF]) -> None:
-    c, n = folder.flat, folder.next
-    (pc, fp, addr_a, addr_b, addr_c, value_a, value_b, value_c, operand_a, operand_b, operand_c,
-     flag_a, flag_b, flag_c, flag_c_fp, flag_ab_fp, flag_mul, flag_jump, aux_1, aux_2) = (c[k] for k in (
-        "pc", "fp", "addr_a", "addr_b", "addr_c", "value_a", "value_b", "value_c",
-        "operand_a", "operand_b", "operand_c", "flag_a", "flag_b", "flag_c", "flag_c_fp",
-        "flag_ab_fp", "flag_mul", "flag_jump", "aux_1", "aux_2"))  # fmt: skip
-    pc_shift, fp_shift = n["pc"], n["fp"]
-
-    # nu_x = flag·operand + (1 − flag − flag_ab_fp)·value + flag_ab_fp·(fp + operand)
-    nfa = ONE - flag_a - flag_ab_fp
-    nfb = ONE - flag_b - flag_ab_fp
-    nfc = ONE - flag_c - flag_c_fp
-    nu_a = flag_a * operand_a + nfa * value_a + flag_ab_fp * (fp + operand_a)
-    nu_b = flag_b * operand_b + nfb * value_b + flag_ab_fp * (fp + operand_b)
-    nu_c = flag_c * operand_c + nfc * value_c + flag_c_fp * (fp + operand_c)
-
-    # aux_1 ∈ {0,1,2}: 0=nothing, 1=add, 2=deref.
-    flag_add = aux_1 * 2 - aux_1 * aux_1
-    flag_deref = aux_1 * (aux_1 - ONE) * ((P + 1) // 2)  # (P+1)/2 is the inverse of 2 mod P
-    flag_precompile = ONE - flag_add - flag_mul - flag_deref - flag_jump
-
-    eval_precompile_bus_virtual_columns(folder, logup_beta_eq, flag_precompile, aux_2, [nu_a, nu_b, nu_c])
-    folder.assert_zero(nfa * (addr_a - (fp + operand_a)))
-    folder.assert_zero(nfb * (addr_b - (fp + operand_b)))
-    folder.assert_zero(nfc * (addr_c - (fp + operand_c)))
-    folder.assert_zero(flag_add * (nu_b - (nu_a + nu_c)))
-    folder.assert_zero(flag_mul * (nu_b - nu_a * nu_c))
-    folder.assert_zero(flag_deref * (addr_b - (value_a + operand_b)))
-    folder.assert_zero(flag_deref * (value_b - nu_c))
-    jc = flag_jump * nu_a
-    folder.assert_zero(jc * (nu_a - ONE))
-    folder.assert_zero(jc * (pc_shift - nu_b))
-    folder.assert_zero(jc * (fp_shift - nu_c))
-    not_jc = ONE - jc
-    folder.assert_zero(not_jc * (pc_shift - (pc + ONE)))
-    folder.assert_zero(not_jc * (fp_shift - fp))
-
-
-def eval_air_extension(folder: ConstraintFolder, logup_beta_eq: list[EF]) -> None:
-    c, n = folder.flat, folder.next
-    flag_be, flag_start, len_col = c["flag_be"], c["flag_start"], c["len"]
-    flag_add, flag_dot_product, flag_eq = c["flag_add"], c["flag_dot_product"], c["flag_eq"]
-    idx_a, idx_b, idx_r = c["idx_a"], c["idx_b"], c["idx_r"]
-    acc, v_a, v_b, res = c.arr("acc", 5), c.arr("v_a", 5), c.arr("v_b", 5), c.arr("res", 5)
-    flag_be_sh, flag_start_sh, len_sh = n["flag_be"], n["flag_start"], n["len"]
-    flag_add_sh, flag_dot_product_sh, flag_eq_sh = n["flag_add"], n["flag_dot_product"], n["flag_eq"]
-    idx_a_sh, idx_b_sh = n["idx_a"], n["idx_b"]
-    acc_sh = n.arr("acc", 5)
-
-    aux_2 = (
-        flag_be * EXT_OP_FLAG_BE
-        + flag_add * EXT_OP_FLAG_ADD
-        + flag_dot_product * EXT_OP_FLAG_DOT_PRODUCT
-        + flag_eq * EXT_OP_FLAG_EQ
-        + len_col * EXT_OP_LEN_MULTIPLIER
-    )
-    eval_precompile_bus_virtual_columns(
-        folder, logup_beta_eq, flag_start * (flag_add + flag_dot_product + flag_eq), aux_2, [idx_a, idx_b, idx_r]
-    )
-
-    for x in (flag_be, flag_start, flag_add, flag_dot_product, flag_eq):
-        folder.assert_bool(x)
-
-    is_ee, not_start_sh = ONE - flag_be, ONE - flag_start_sh
-    v_a_tilde = [v_a[0]] + [v_a[k] * is_ee for k in range(1, 5)]
-    acc_tail = [acc_sh[k] * not_start_sh for k in range(5)]
-    v_a_v_b = quintic_mul(v_a_tilde, v_b, ZERO)
-
-    for k in range(5):
-        folder.assert_zero((acc[k] - (v_a_tilde[k] + v_b[k] + acc_tail[k])) * flag_add)
-    for k in range(5):
-        folder.assert_zero((acc[k] - (v_a_v_b[k] + acc_tail[k])) * flag_dot_product)
-
-    # eq: acc ← (2·v_a·v_b − v_a − v_b + 1) · (acc_tail or 1 at group end).
-    e_eq = [2 * v_a_v_b[k] - v_a_tilde[k] - v_b[k] + (ONE if k == 0 else ZERO) for k in range(5)]
-    acc_tail_or_one = [acc_sh[0] * not_start_sh + flag_start_sh] + [acc_sh[k] * not_start_sh for k in range(1, 5)]
-    eq_result = quintic_mul(e_eq, acc_tail_or_one, ZERO)
-    for k in range(5):
-        folder.assert_zero((acc[k] - eq_result[k]) * flag_eq)
-    for k in range(5):
-        folder.assert_zero((acc[k] - res[k]) * flag_start)
-
-    for x, y in [
-        (len_col, len_sh + ONE),
-        (flag_be, flag_be_sh),
-        (flag_add, flag_add_sh),
-        (flag_dot_product, flag_dot_product_sh),
-        (flag_eq, flag_eq_sh),
-    ]:
-        folder.assert_zero(not_start_sh * (x - y))
-
-    folder.assert_zero(not_start_sh * (idx_a_sh - idx_a - (flag_be + is_ee * 5)))
-    folder.assert_zero(not_start_sh * (idx_b_sh - idx_b - 5))
-    folder.assert_zero(flag_start_sh * (len_col - ONE))
-
-
-def _full_round(state: list[EF], rc1: list[Fp], rc2: list[Fp]) -> list[EF]:
-    """Two consecutive Poseidon full rounds, fused as one AIR step."""
-    for rc in (rc1, rc2):
-        sbox = [(s + c).cube() for s, c in zip(state, rc)]
-        state = [dot_product(sbox, row) for row in POSEIDON_AIR_MDS_DENSE]
-    return state
-
-
-def eval_air_poseidon16(folder: ConstraintFolder, logup_beta_eq: list[EF]) -> None:
-    c = folder.flat
-    half_pairs = POSEIDON_HALF_FULL_ROUNDS // 2
-
-    multiplicity = c["multiplicity"]
-    nu_b, nu_c = c["nu_b"], c["nu_c"]
-    flag_out4, flag_out8, flag_left = c["flag_out4"], c["flag_out8"], c["flag_left"]
-    offset_left = c["offset_left"]
-    addr_left_lo, addr_left_hi = c["addr_left_lo"], c["addr_left_hi"]
-    flag_permute = c["flag_permute"]
-    inputs = c.arr("input", POSEIDON_WIDTH)
-    beginning_full_rounds = [c.arr(f"begin_r{r}", POSEIDON_WIDTH) for r in range(half_pairs)]
-    partial_cols = c.arr("partial", POSEIDON_PARTIAL_ROUNDS)
-    ending_full_rounds = [c.arr(f"end_r{r}", POSEIDON_WIDTH) for r in range(half_pairs - 1)]
-    out_lo = c.arr("out_lo", POSEIDON_WIDTH // 2)
-    out_hi = c.arr("out_hi", POSEIDON_WIDTH // 2)
-
-    domainsep = (
-        POSEIDON_DOMAINSEP_BASE
-        + flag_permute * POSEIDON_FLAG_PERMUTE_SHIFT
-        + flag_out8 * POSEIDON_FLAG_OUT8_SHIFT
-        + flag_left * POSEIDON_FLAG_LEFT_SHIFT
-        + flag_left * offset_left * POSEIDON_OFFSET_LEFT_SHIFT
-    )
-    not_flag_left = ONE - flag_left
-    nu_a = addr_left_hi - not_flag_left * (DIGEST_ELEMS // 2)
-
-    eval_precompile_bus_virtual_columns(folder, logup_beta_eq, multiplicity, domainsep, [nu_a, nu_b, nu_c])
-    for f in (multiplicity, flag_out4, flag_out8, flag_left, flag_permute):
-        folder.assert_bool(f)
-    folder.assert_zero(flag_permute * flag_out4)
-    folder.assert_zero(flag_out8 * flag_out4)
-    folder.assert_zero((ONE - flag_permute) * (ONE - flag_out8) * (ONE - flag_out4))
-    folder.assert_zero(flag_left * (offset_left - addr_left_lo))
-    folder.assert_zero(not_flag_left * (nu_a - addr_left_lo))
-
-    # --- Poseidon1-16 permutation AIR: each committed `post` row pins the intermediate
-    # state then re-binds it, capping polynomial degree across the long round sequence.
-    state = list(inputs)
-
-    # Beginning full rounds, paired up.
-    for r in range(half_pairs):
-        state = _full_round(state, POSEIDON_AIR_INITIAL_CONSTANTS[2 * r], POSEIDON_AIR_INITIAL_CONSTANTS[2 * r + 1])
-        for i, post in enumerate(beginning_full_rounds[r]):
-            folder.assert_eq(state[i], post)
-            state[i] = post
-
-    # Transition into sparse partial-round form.
-    state = [s + rc for s, rc in zip(state, POSEIDON_AIR_SPARSE_FIRST_RC)]
-    state = [dot_product(state, row) for row in POSEIDON_AIR_SPARSE_M_I]
-
-    # Partial rounds: one sbox on lane 0, then sparse mat-vec.
-    for r in range(POSEIDON_PARTIAL_ROUNDS):
-        folder.assert_eq(state[0].cube(), partial_cols[r])
-        state[0] = partial_cols[r]
-        if r < POSEIDON_PARTIAL_ROUNDS - 1:
-            state[0] += POSEIDON_AIR_SPARSE_SCALAR_RC[r]
-        old_s0 = state[0]
-        state[0] = dot_product(state, POSEIDON_AIR_SPARSE_FIRST_ROW[r])
-        for i in range(1, POSEIDON_WIDTH):
-            state[i] += old_s0 * POSEIDON_AIR_SPARSE_V[r][i - 1]
-
-    # Ending full rounds (all but the last pair) commit intermediate state.
-    for r in range(half_pairs - 1):
-        state = _full_round(state, POSEIDON_AIR_FINAL_CONSTANTS[2 * r], POSEIDON_AIR_FINAL_CONSTANTS[2 * r + 1])
-        for i, post in enumerate(ending_full_rounds[r]):
-            folder.assert_eq(state[i], post)
-            state[i] = post
-
-    # Last full round: compression feeds `inputs` forward into out_lo (permute does not).
-    # out_lo[4..8] is real unless the output is 4 elements (out4); out_hi (capacity) is only
-    # written by the full 16-element permutation (out16 = neither out8 nor out4).
-    last = 2 * (half_pairs - 1)
-    state = _full_round(state, POSEIDON_AIR_FINAL_CONSTANTS[last], POSEIDON_AIR_FINAL_CONSTANTS[last + 1])
-    not_permute = ONE - flag_permute
-    gate_lo_8 = ONE - flag_out4
-    gate_hi = ONE - flag_out8 - flag_out4
-    for i in range(POSEIDON_WIDTH // 2):
-        value = state[i] + not_permute * inputs[i]
-        if i < (DIGEST_ELEMS // 2):
-            folder.assert_zero(value - out_lo[i])
-        else:
-            folder.assert_zero(gate_lo_8 * (value - out_lo[i]))
-        folder.assert_zero(gate_hi * (state[i + POSEIDON_WIDTH // 2] - out_hi[i]))
-
-
-EXECUTION_COLUMNS = (
-    "pc", "fp", "addr_a", "addr_b", "addr_c", "value_a", "value_b", "value_c", # 8 runtime cols
-    "operand_a", "operand_b", "operand_c", "flag_a", "flag_b", "flag_c", "flag_c_fp", "flag_ab_fp", "flag_mul", "flag_jump", "aux_1", "aux_2", # 12 instruction cols.
-)  # fmt: skip
-
-EXTENSION_COLUMNS = (
-    "flag_be", "flag_start", "len", "flag_add", "flag_dot_product", "flag_eq", "idx_a", "idx_b",
-    *(f"acc_{i}" for i in range(5)),
-    "idx_r",
-    *(f"v_a_{i}" for i in range(5)),
-    *(f"v_b_{i}" for i in range(5)),
-    *(f"res_{i}" for i in range(5)),
-)  # fmt: skip
-
-POSEIDON_COLUMNS = (
-    "multiplicity", "nu_b", "nu_c", "flag_out4", "flag_out8", "flag_left", "offset_left", "addr_left_lo", "addr_left_hi", "flag_permute",
-    *(f"input_{i}" for i in range(POSEIDON_WIDTH)),
-    *(f"begin_r{r}_{i}" for r in range(POSEIDON_HALF_FULL_ROUNDS // 2) for i in range(POSEIDON_WIDTH)),
-    *(f"partial_{i}" for i in range(POSEIDON_PARTIAL_ROUNDS)),
-    *(f"end_r{r}_{i}" for r in range(POSEIDON_HALF_FULL_ROUNDS // 2 - 1) for i in range(POSEIDON_WIDTH)),
-    *(f"out_lo_{i}" for i in range(POSEIDON_WIDTH // 2)),
-    *(f"out_hi_{i}" for i in range(POSEIDON_WIDTH // 2)),
-)  # fmt: skip
-
-TABLES = [
-    Table(
-        name="execution",
-        columns=EXECUTION_COLUMNS,
-        buses=(
-            (BusInteraction.PRECOMPILE, BusDirection.PUSH),
-            (BusInteraction.BYTECODE,),
-            (BusInteraction.MEMORY, "addr_a", "value_a", 1),
-            (BusInteraction.MEMORY, "addr_b", "value_b", 1),
-            (BusInteraction.MEMORY, "addr_c", "value_c", 1),
-        ),
-        air_degree=5,
-        n_constraints=14,
-        n_shift=2,
-        max_log_height=24,
-        air_constraints_fn=eval_air_execution,
-    ),
-    Table(
-        name="extension",
-        columns=EXTENSION_COLUMNS,
-        buses=(
-            (BusInteraction.PRECOMPILE, BusDirection.PULL),
-            (BusInteraction.MEMORY, "idx_a", "v_a_0", 5),
-            (BusInteraction.MEMORY, "idx_b", "v_b_0", 5),
-            (BusInteraction.MEMORY, "idx_r", "res_0", 5),
-        ),
-        air_degree=6,
-        n_constraints=35,
-        n_shift=13,
-        max_log_height=21,
-        air_constraints_fn=eval_air_extension,
-    ),
-    Table(
-        name="poseidon",
-        columns=POSEIDON_COLUMNS,
-        buses=(
-            (BusInteraction.PRECOMPILE, BusDirection.PULL),
-            (BusInteraction.MEMORY, "addr_left_lo", "input_0", 4),
-            (BusInteraction.MEMORY, "addr_left_hi", "input_4", 4),
-            (BusInteraction.MEMORY, "nu_b", "input_8", 8),
-            (BusInteraction.MEMORY, "nu_c", "out_lo_0", 16),
-        ),
-        air_degree=10,
-        n_constraints=101,
-        n_shift=0,
-        max_log_height=21,
-        air_constraints_fn=eval_air_poseidon16,
-    ),
-]
-
-
-def verify_execution(
-    public_input: Sequence[Fp],
-    proof: Proof,
-    bytecode_multilinear: list[int],
-):
-    bytecode_log_size = log2_strict(len(bytecode_multilinear)) - log2_ceil(N_INSTRUCTION_COLUMNS)
-    ending_pc = (1 << bytecode_log_size) - 1
-    bytecode_hash = sponge_hash([Fp(v) for v in bytecode_multilinear])
-    if len(public_input) != PUBLIC_INPUT_SIZE:
-        raise ProofError("InvalidProof: public_input length mismatch")
-
-    state = FiatShamir(proof, poseidon16_compress(bytecode_hash, SNARK_DOMAIN_SEP))  # domain separator across bytecodes
-    state.observe_scalars(public_input)
-    dims = [int(x.value) for x in state.next_base_scalars_vec(2 + len(TABLES))]
-    log_inv_rate, log_memory, *table_log_n_rows = dims
-    if not MIN_WHIR_LOG_INV_RATE <= log_inv_rate <= MAX_WHIR_LOG_INV_RATE:
-        raise ProofError("InvalidRate")
-    if not MIN_LOG_MEMORY_SIZE <= log_memory <= MAX_LOG_MEMORY_SIZE:
-        raise ProofError("InvalidProof: log_memory out of range")
-    if not MIN_BYTECODE_LOG_SIZE <= bytecode_log_size <= MAX_BYTECODE_LOG_SIZE:
-        raise ProofError("InvalidProof: bytecode log_size out of range")
-    if log_memory < max(max(table_log_n_rows, default=0), bytecode_log_size):
-        raise ProofError("InvalidProof: memory smaller than tables/bytecode")
-    for table, log_height in zip(TABLES, table_log_n_rows):
-        if not MIN_LOG_N_ROWS_PER_TABLE <= log_height <= table.max_log_height:
-            raise ProofError(
-                f"InvalidProof: table {table.name} log_n_rows={log_height} not in [{MIN_LOG_N_ROWS_PER_TABLE}, {table.max_log_height}]"
-            )
-
-    log_heights = {t.name: h for t, h in zip(TABLES, table_log_n_rows)}
-    n_max = sort_tables_by_height(TABLES, log_heights)[0][1]
-
-    total_stacked = (
-        (2 << log_memory)
-        + (1 << max(bytecode_log_size, n_max))
-        + sum(t.n_columns << log_heights[t.name] for t in TABLES)
-    )
-
-    stacked_n_vars = log2_ceil(total_stacked)
-    if stacked_n_vars > TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate:
-        raise ProofError("InvalidProof: stacked_n_vars exceeds WHIR domain bound")
-    cfg = WHIR_CONFIGS[(log_inv_rate, stacked_n_vars)]
-    nood = cfg["commitment_ood_samples"]
-    parsed_commitment = ParsedCommitment(
-        stacked_n_vars,
-        state.next_base_scalars_vec(DIGEST_ELEMS),
-        state.sample_many_ef(nood),
-        state.next_extension_scalars_vec(nood),
-    )
-
-    logup_gamma = state.sample_ef()  # the quotient denominator
-    state.duplex()
-    logup_beta = state.sample_many_ef(log2_ceil(N_INSTRUCTION_COLUMNS + 2))  # the bus-tuple hashing seeds
-    logup_beta_eq = eval_eq(logup_beta)
-    logup = verify_generic_logup(
-        state,
-        logup_gamma,
-        logup_beta,
-        logup_beta_eq,
-        log_memory,
-        bytecode_multilinear,
-        TABLES,
-        log_heights,
-    )
-    gkr_point = logup["gkr_point"]
-
-    air_alpha = state.sample_ef()
-    alpha_powers = ef_powers(air_alpha, sum(t.n_constraints for t in TABLES))
-
-    initial_sum, offset = ZERO, 0
-    for table in TABLES:
-        initial_sum += alpha_powers[offset] * (logup["bus_num"][table.name] * table.precompile_bus_interraction_sign)
-        initial_sum += alpha_powers[offset + 1] * (logup_gamma - logup["bus_den"][table.name])
-        offset += table.n_constraints
-    sc_point, sc_value = verify_sumcheck(state, initial_sum, n_max, max(t.air_degree + 1 for t in TABLES))
-
-    committed = {t.name: [(gkr_point[-log_heights[t.name] :], logup["columns_values"][t.name], {})] for t in TABLES}
-    my_air_final, offset = ZERO, 0
-    for table in TABLES:
-        log_n_rows = log_heights[table.name]
-        col_evals = state.next_extension_scalars_vec(table.n_columns + table.n_shift)
-        alphas = alpha_powers[offset : offset + table.n_constraints]
-        offset += table.n_constraints
-        constraint_eval = table.eval_air(col_evals, alphas, logup_beta_eq)
-
-        natural_pt = list(reversed(sc_point[-log_n_rows:])) if log_n_rows else []
-        k_t = math.prod(sc_point[: n_max - log_n_rows])
-        my_air_final += k_t * eq_poly(gkr_point[-log_n_rows:], natural_pt) * constraint_eval
-
-        eq_vals = {i: col_evals[i] for i in range(table.n_columns)}
-        next_vals = {j: col_evals[table.n_columns + j] for j in range(table.n_shift)}
-        committed[table.name].append((natural_pt, eq_vals, next_vals))
-    if my_air_final != sc_value:
-        raise ProofError("AIR sumcheck: claimed value mismatch")
-
-    pm_point = state.sample_many_ef(log2_strict(PUBLIC_INPUT_SIZE))
-    pm_eval = eval_multilinear_evals(public_input, pm_point)
-
-    bytecode_acc_idx = (2 << log_memory) >> bytecode_log_size
-    previous_statements = [
-        SparseStatements(
-            stacked_n_vars,
-            gkr_point[-log_memory:],
-            [(0, logup["value_memory"]), (1, logup["value_memory_acc"])],
-        ),
-        SparseStatements(stacked_n_vars, pm_point, [(0, pm_eval)]),
-        SparseStatements(
-            stacked_n_vars, gkr_point[-bytecode_log_size:], [(bytecode_acc_idx, logup["value_bytecode_acc"])]
-        ),
-    ]
-    global_statements = stacked_pcs_global_statements(
-        stacked_n_vars,
-        log_memory,
-        bytecode_log_size,
-        previous_statements,
-        TABLES,
-        log_heights,
-        committed,
-        ending_pc,
-    )
-    whir_verify(state, cfg, parsed_commitment, global_statements)
-
-    if state.offset != len(state.transcript):
-        raise ProofError(
-            f"InvalidProof: transcript not fully consumed ({state.offset}/{len(state.transcript)} scalars read)"
-        )
-    if state.openings:
-        raise ProofError(f"InvalidProof: {len(state.openings)} Merkle openings unused")
-
-
-def main() -> int:
-    vector_path = Path(__file__).resolve().parents[3] / "target" / "zkvm_test_vectors" / "proof.json"
-    if not vector_path.exists():
-        print(
-            f"Test vector not found at {vector_path}. Please follow the instructions at the beginning of verifier.py file."
-        )
-        return 1
-
-    print(f"Loading {vector_path.name}...")
-    raw = json.loads(vector_path.read_text())
-    print("... done")
-
-    arr = array.array("I")
-    arr.frombytes((vector_path.parent / raw["bytecode_multilinear_path"]).read_bytes())
-    bytecode_multilinear: list[int] = list(arr)
-
-    fp_list = lambda xs: [Fp(v) for v in xs]
-    public_input = fp_list(raw["public_input"])
-    proof = Proof(
-        transcript=fp_list(raw["proof"]["transcript"]),
-        merkle_openings=[
-            MerkleOpening(leaf_data=fp_list(o["leaf_data"]), path=[fp_list(d) for d in o["path"]])
-            for o in raw["proof"]["merkle_openings"]
-        ],
-    )
-
-    try:
-        verify_execution(public_input, proof, bytecode_multilinear)
-    except ProofError as e:
-        print(f"FAIL: {e}")
-        return 1
-
-    print(f"Proof successfully verified")
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/crates/lean_prover/src/lib.rs b/crates/lean_prover/src/lib.rs
index 0d0da08a4..9ea3b7ac3 100644
--- a/crates/lean_prover/src/lib.rs
+++ b/crates/lean_prover/src/lib.rs
@@ -19,20 +19,25 @@ mod test_zkvm;
 use trace_gen::*;
 
 // Right now, hash digests = 8 koala-bear (p = 2^31 - 2^24 + 1, i.e. ≈ 31 bits per field element)
-pub const SECURITY_BITS: usize = 124; // TODO 128 bits security
+pub const SECURITY_BITS: usize = 128; // 128-bit security (NOTE(review): former TODO looks resolved; confirm)
 
 pub const GRINDING_BITS: usize = 16;
 pub const MAX_NUM_VARIABLES_TO_SEND_COEFFS: usize = 8;
-pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 7;
-pub const WHIR_SUBSEQUENT_FOLDING_FACTOR: usize = 5;
+pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 6;
+pub const WHIR_SUBSEQUENT_FOLDING_FACTOR: usize = 4;
 pub const RS_DOMAIN_INITIAL_REDUCTION_FACTOR: usize = 5;
 
-pub const SNARK_DOMAIN_SEP: [F; 8] = F::new_array([
-    130704175, 1303721200, 493664240, 1035493700, 2063844858, 1410214009, 1938905908, 1696767928,
+// Domain-separation digest for the zkVM SNARK. Arbitrary nothing-up-my-sleeve field
+// elements; size matches `DIGEST_LEN = 4` for the Goldilocks width-8 Poseidon.
+pub const SNARK_DOMAIN_SEP: [F; 4] = F::new_array([
+    0x4c45_414e_5f5a_4b56, // "LEAN_ZKV"
+    0x4d5f_534e_4152_4b5f, // "M_SNARK_"
+    0x444f_4d53_4550_3031, // "DOMSEP01"
+    0xcccc_cccc_cccc_cccc, // nothing-up-my-sleeve tail
 ]);
 
-pub fn fiat_shamir_domain_sep(bytecode: &Bytecode) -> [F; 8] {
-    poseidon16_compress_pair(&bytecode.hash, &SNARK_DOMAIN_SEP)
+pub fn fiat_shamir_domain_sep(bytecode: &Bytecode) -> [F; 4] {
+    poseidon8_compress_pair(&bytecode.hash, &SNARK_DOMAIN_SEP)
 }
 
 pub fn default_whir_config(starting_log_inv_rate: usize) -> WhirConfigBuilder {
@@ -95,10 +100,10 @@ impl Display for ProverError {
 
 #[cfg(test)]
 mod tests {
-    use backend::{PrimeCharacteristicRing, default_koalabear_poseidon1_16, hash_slice_rtl};
+    use backend::{PrimeCharacteristicRing, default_goldilocks_poseidon1_8, hash_slice_rtl};
     use lean_vm::F;
     use rec_aggregation::{get_aggregation_bytecode, init_aggregation_bytecode};
-    use utils::poseidon16_compress_pair;
+    use utils::poseidon8_compress_pair;
 
     #[test]
     fn compute_snark_domain_sep() {
@@ -109,19 +114,19 @@ mod tests {
             .iter()
             .map(|b| F::from_u8(*b))
             .collect::<Vec<_>>();
-        let mut prefix_free_name_fe = vec![F::ZERO; 8];
+        let mut prefix_free_name_fe = vec![F::ZERO; 4];
         let len = name_fe.len();
         prefix_free_name_fe.extend(name_fe);
-        while prefix_free_name_fe.len() % 8 != 7 {
+        while prefix_free_name_fe.len() % 4 != 3 {
             prefix_free_name_fe.push(F::ZERO);
         }
         prefix_free_name_fe.push(F::from_u64(len as u64));
-        let comp = default_koalabear_poseidon1_16();
-        let name_hash = hash_slice_rtl::<_, _, _, 8, 8>(&comp, &prefix_free_name_fe);
+        let comp = default_goldilocks_poseidon1_8();
+        let name_hash = hash_slice_rtl::<_, _, _, 4, 4>(&comp, &prefix_free_name_fe);
 
         // We incorporate the recursion program hash, containing all the verifier logic, into fiat shamir domain separator
         // (likely not necessary but why not, is there a cleaner approach?)
-        let domain_sep = poseidon16_compress_pair(&name_hash, &recursion_bytecode_hash);
+        let domain_sep = poseidon8_compress_pair(&name_hash, &recursion_bytecode_hash);
 
         println!("Computed SNARK_DOMAIN_SEP: {:?}", domain_sep); // We dont assert equality here to avoid the pain of having to update the hardcoded SNARK_DOMAIN_SEP every time we change the recursion program
     }
diff --git a/crates/lean_prover/src/prove_execution.rs b/crates/lean_prover/src/prove_execution.rs
index aaf50be3b..9c52387b3 100644
--- a/crates/lean_prover/src/prove_execution.rs
+++ b/crates/lean_prover/src/prove_execution.rs
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 use sub_protocols::*;
 use tracing::info_span;
 use utils::ansi::Colorize;
-use utils::{from_end, get_poseidon16};
+use utils::{from_end, get_poseidon8};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ExecutionProof {
@@ -42,7 +42,7 @@ pub fn prove_execution(
     if memory.len() < min_memory_size {
         memory.resize(min_memory_size, F::ZERO);
     }
-    let mut prover_state = ProverState::new(get_poseidon16().clone(), fiat_shamir_domain_sep(bytecode));
+    let mut prover_state = ProverState::new(*get_poseidon8(), fiat_shamir_domain_sep(bytecode));
     prover_state.observe_scalars(public_input);
     prover_state.add_base_scalars(
         &[
diff --git a/crates/lean_prover/src/test_zkvm.rs b/crates/lean_prover/src/test_zkvm.rs
index cfc6be7a7..eedb6ee76 100644
--- a/crates/lean_prover/src/test_zkvm.rs
+++ b/crates/lean_prover/src/test_zkvm.rs
@@ -1,51 +1,51 @@
-use std::{collections::BTreeMap, io::Write};
+use std::collections::BTreeMap;
 
 use crate::{default_whir_config, prove_execution::prove_execution, verify_execution::verify_execution};
 use backend::*;
 use lean_compiler::*;
 use lean_vm::*;
 use rand::{RngExt, SeedableRng, rngs::StdRng};
-use utils::{init_tracing, poseidon16_compress, poseidon16_permute};
+use utils::{init_tracing, poseidon8_compress, poseidon8_permute};
 
 const N: usize = 11;
 const M: usize = 3;
 
 const ALL_PRECOMPILES_PROGRAM: &str = r#"
-DIM = 5
+DIM = 3
 N = 11
 M = 3
-DIGEST_LEN = 8
-HALF_DIGEST_LEN = 4
+DIGEST_LEN = 4
+HALF_DIGEST_LEN = 2
 SCRATCH_SIZE = 8192
 LOOP_ITERS = LOOP_ITERS_PLACEHOLDER
 
 def main():
     scratch = Array(SCRATCH_SIZE)
     hint_witness("scratch", scratch)
-    poseidon16_compress_half(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, scratch + 6 * DIGEST_LEN)
+    poseidon8_compress_half(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, scratch + 6 * DIGEST_LEN)
 
-    # poseidon16_compress_quarter: only first 4 FE constrained
+    # poseidon8_compress_quarter: only first 2 FE constrained
     full_out = scratch + 6 * DIGEST_LEN
     half_out = scratch + 80
-    poseidon16_compress_quarter(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, half_out)
+    poseidon8_compress_quarter(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, half_out)
     for i in unroll(0, HALF_DIGEST_LEN):
         assert full_out[i] == half_out[i]
 
-    # poseidon16_compress_half_hardcoded_left: the 4-element prefix lives at a compile-time
+    # poseidon8_compress_half_hardcoded_left: the 2-element prefix lives at a compile-time
     # constant memory offset. Public input is the only region with such addresses, so we
-    # place the prefix at public_input[0..4] (= memory address 0..4) and pass offset 0.
+    # place the prefix at public_input[0..2] (= memory address 0..2) and pass offset 0.
     hardcoded_left = scratch + 1496
     hardcoded_full_out = scratch + 1504
-    poseidon16_compress_half_hardcoded_left(
+    poseidon8_compress_half_hardcoded_left(
         hardcoded_left,
         scratch + 5 * DIGEST_LEN,
         hardcoded_full_out,
         0
     )
 
-    # Same, but only first 4 FE of the output are constrained.
+    # Same, but only first 2 FE of the output are constrained.
     hardcoded_half_out = scratch + 1512
-    poseidon16_compress_quarter_hardcoded_left(
+    poseidon8_compress_quarter_hardcoded_left(
         hardcoded_left,
         scratch + 5 * DIGEST_LEN,
         hardcoded_half_out,
@@ -54,21 +54,21 @@ def main():
     for i in unroll(0, HALF_DIGEST_LEN):
         assert hardcoded_full_out[i] == hardcoded_half_out[i]
 
-    # poseidon16_permute: full 16-element permutation (no feed-forward), written in natural order:
-    #   m[res .. res + 16] = poseidon(left || right)
+    # poseidon8_permute: full 8-element permutation (no feed-forward), written in natural order:
+    #   m[res .. res + 8] = poseidon(left || right)
     permute_out = scratch + 1600
-    poseidon16_permute(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, permute_out)
+    poseidon8_permute(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, permute_out)
 
-    # poseidon16_permute_half: same permutation, but only the low 8 elements are written/constrained.
+    # poseidon8_permute_half: same permutation, but only the low 4 elements are written/constrained.
     permute_half_out = scratch + 1620
-    poseidon16_permute_half(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, permute_half_out)
+    poseidon8_permute_half(scratch + 4 * DIGEST_LEN, scratch + 5 * DIGEST_LEN, permute_half_out)
     for i in unroll(0, DIGEST_LEN):
         assert permute_half_out[i] == permute_out[i]
 
-    # poseidon16_permute_half_hardcoded_left: permutation (low 8) with a hardcoded 4-element left prefix.
+    # poseidon8_permute_half_hardcoded_left: permutation (low 4) with a hardcoded 2-element left prefix.
     # Uses the same input as the hardcoded compression above, so it equals the permutation of that input.
     permute_hardcoded_out = scratch + 1640
-    poseidon16_permute_half_hardcoded_left(hardcoded_left, scratch + 5 * DIGEST_LEN, permute_hardcoded_out, 0)
+    poseidon8_permute_half_hardcoded_left(hardcoded_left, scratch + 5 * DIGEST_LEN, permute_hardcoded_out, 0)
 
     base_ptr = scratch + 88
     ext_a_ptr = scratch + 88 + N
@@ -112,53 +112,43 @@ fn all_precompiles_witness() -> ([F; PUBLIC_INPUT_LEN], ExecutionWitness) {
     let mut rng = StdRng::seed_from_u64(0);
     let mut scratch = F::zero_vec(8192);
 
-    // Poseidon test data
-    let poseidon_16_compress_input: [F; 16] = rng.random();
-    scratch[32..48].copy_from_slice(&poseidon_16_compress_input);
-    let poseidon_output = poseidon16_compress(poseidon_16_compress_input);
-    scratch[48..56].copy_from_slice(&poseidon_output[..8]);
-    let poseidon_24_input: [F; 24] = rng.random();
-    scratch[56..80].copy_from_slice(&poseidon_24_input);
-    // poseidon16_compress_quarter output at offset 80: first 4 = hash, last 4 = arbitrary pre-existing data
-    scratch[80..84].copy_from_slice(&poseidon_output[..4]);
-    scratch[84..88].copy_from_slice(&[
-        F::from_usize(111),
-        F::from_usize(222),
-        F::from_usize(333),
-        F::from_usize(444),
-    ]);
-
-    // poseidon16_compress_half_hardcoded_left: prefix lives at public_input[0..4] (compile-time
-    // constant offset 0), data at scratch[1496..1500], expected output at scratch[1504..1512].
-    let hardcoded_prefix: [F; 4] = rng.random();
-    let hardcoded_data: [F; 4] = rng.random();
-    scratch[1496..1500].copy_from_slice(&hardcoded_data);
-    let mut hardcoded_input = [F::ZERO; 16];
-    hardcoded_input[..4].copy_from_slice(&hardcoded_prefix);
-    hardcoded_input[4..8].copy_from_slice(&hardcoded_data);
-    hardcoded_input[8..16].copy_from_slice(&poseidon_16_compress_input[8..16]);
-    let hardcoded_output = poseidon16_compress(hardcoded_input);
-    scratch[1504..1512].copy_from_slice(&hardcoded_output);
-    // half output: first 4 = hash, last 4 = arbitrary pre-existing data
-    scratch[1512..1516].copy_from_slice(&hardcoded_output[..4]);
-    scratch[1516..1520].copy_from_slice(&[
-        F::from_usize(555),
-        F::from_usize(666),
-        F::from_usize(777),
-        F::from_usize(888),
-    ]);
-
-    // poseidon16_permute output at 1600..1616: raw permutation result.
-    let permute_output = poseidon16_permute(poseidon_16_compress_input);
-    scratch[1600..1616].copy_from_slice(&permute_output);
-
-    // poseidon16_permute_half output at 1620..1628: low 8 of the same permutation.
-    scratch[1620..1628].copy_from_slice(&permute_output[..8]);
-
-    // poseidon16_permute_half_hardcoded_left output at 1640..1648: low 8 of the permutation of the
+    // Poseidon test data — width 8 / digest 4 / half-digest 2 for Goldilocks.
+    // DSL uses `scratch + 4*DIGEST_LEN..6*DIGEST_LEN` (positions 16..24) for the input
+    // and `scratch + 6*DIGEST_LEN..7*DIGEST_LEN` (positions 24..28) for the output.
+    let poseidon_8_compress_input: [F; 8] = rng.random();
+    scratch[16..24].copy_from_slice(&poseidon_8_compress_input);
+    let poseidon_output = poseidon8_compress(poseidon_8_compress_input);
+    scratch[24..28].copy_from_slice(&poseidon_output);
+    // poseidon8_compress_quarter output at offset 80: first 2 = hash, last 2 = arbitrary pre-existing data
+    scratch[80..82].copy_from_slice(&poseidon_output[..2]);
+    scratch[82..84].copy_from_slice(&[F::from_usize(111), F::from_usize(222)]);
+
+    // poseidon8_compress_half_hardcoded_left: prefix lives at public_input[0..2] (compile-time
+    // constant offset 0), data at scratch[1496..1498], expected output at scratch[1504..1508].
+    let hardcoded_prefix: [F; 2] = rng.random();
+    let hardcoded_data: [F; 2] = rng.random();
+    scratch[1496..1498].copy_from_slice(&hardcoded_data);
+    let mut hardcoded_input = [F::ZERO; 8];
+    hardcoded_input[..2].copy_from_slice(&hardcoded_prefix);
+    hardcoded_input[2..4].copy_from_slice(&hardcoded_data);
+    hardcoded_input[4..8].copy_from_slice(&poseidon_8_compress_input[4..8]);
+    let hardcoded_output = poseidon8_compress(hardcoded_input);
+    scratch[1504..1508].copy_from_slice(&hardcoded_output);
+    // half output: first 2 = hash, last 2 = arbitrary pre-existing data
+    scratch[1512..1514].copy_from_slice(&hardcoded_output[..2]);
+    scratch[1514..1516].copy_from_slice(&[F::from_usize(555), F::from_usize(666)]);
+
+    // poseidon8_permute output at 1600..1608: raw permutation result.
+    let permute_output = poseidon8_permute(poseidon_8_compress_input);
+    scratch[1600..1608].copy_from_slice(&permute_output);
+
+    // poseidon8_permute_half output at 1620..1624: low 4 of the same permutation.
+    scratch[1620..1624].copy_from_slice(&permute_output[..4]);
+
+    // poseidon8_permute_half_hardcoded_left output at 1640..1644: low 4 of the permutation of the
     // hardcoded-left input (same input as the hardcoded compression above).
-    let permute_hardcoded_output = poseidon16_permute(hardcoded_input);
-    scratch[1640..1648].copy_from_slice(&permute_hardcoded_output[..8]);
+    let permute_hardcoded_output = poseidon8_permute(hardcoded_input);
+    scratch[1640..1644].copy_from_slice(&permute_hardcoded_output[..4]);
 
     // Extension op operands: base[N], ext_a[N], ext_b[N]
     let base_slice: [F; N] = rng.random();
@@ -214,7 +204,7 @@ fn all_precompiles_witness() -> ([F; PUBLIC_INPUT_LEN], ExecutionWitness) {
     scratch[1300..][..DIMENSION].copy_from_slice(poly_eq_ee_result.as_basis_coefficients_slice());
 
     let mut public_input = [F::ZERO; PUBLIC_INPUT_LEN];
-    public_input[..4].copy_from_slice(&hardcoded_prefix);
+    public_input[..2].copy_from_slice(&hardcoded_prefix);
 
     let mut hints = std::collections::HashMap::new();
     hints.insert("scratch".to_string(), vec![scratch]);
@@ -236,57 +226,6 @@ fn test_zk_vm_all_precompiles() {
     );
 }
 
-#[test]
-#[ignore]
-fn dump_test_vector_for_python_verifier() {
-    const LOOP_ITERS: usize = 5000;
-
-    let (public_input, witness) = all_precompiles_witness();
-    let bytecode = compile_program_with_flags(
-        &ProgramSource::Raw(ALL_PRECOMPILES_PROGRAM.to_string()),
-        all_precompiles_flags(LOOP_ITERS),
-    );
-    let exec_proof = prove_execution(&bytecode, &public_input, &witness, &default_whir_config(1), false).unwrap();
-    let (_details, raw_proof) = verify_execution(&bytecode, &public_input, exec_proof.proof).unwrap();
-
-    let f_u32 = |x: F| x.as_canonical_u32();
-    let out_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-        .join("../..")
-        .join(std::env::var("CARGO_TARGET_DIR").unwrap_or_else(|_| "target".into()))
-        .join("zkvm_test_vectors");
-    std::fs::create_dir_all(&out_dir).unwrap();
-
-    let bytecode_path = "proof.bytecode_mle.bin";
-    let mut mle_file = std::fs::File::create(out_dir.join(bytecode_path)).unwrap();
-    for v in &bytecode.instructions_multilinear {
-        mle_file.write_all(&f_u32(*v).to_le_bytes()).unwrap();
-    }
-
-    let opening_json = |o: &MerkleOpening<F>| -> serde_json::Value {
-        serde_json::json!({
-            "leaf_data": o.leaf_data.iter().map(|&f| f_u32(f)).collect::<Vec<_>>(),
-            "path": o.path.iter().map(|d| d.map(f_u32)).collect::<Vec<_>>(),
-        })
-    };
-    let out = serde_json::json!({
-        "bytecode_multilinear_path": bytecode_path,
-        "public_input": public_input.iter().map(|&f| f_u32(f)).collect::<Vec<_>>(),
-        "proof": {
-            "transcript": raw_proof.transcript.iter().map(|&f| f_u32(f)).collect::<Vec<_>>(),
-            "merkle_openings": raw_proof.merkle_openings.iter().map(opening_json).collect::<Vec<_>>(),
-        },
-    });
-    let json_path = out_dir.join("proof.json");
-    std::fs::write(&json_path, serde_json::to_string(&out).unwrap()).unwrap();
-
-    println!(
-        "wrote {} ({:.1} KiB), bytecode_log_size={}",
-        json_path.display(),
-        json_path.metadata().unwrap().len() as f64 / 1024.0,
-        bytecode.log_size(),
-    );
-}
-
 #[test]
 fn test_small_memory() {
     let program_str = r#"
diff --git a/crates/lean_prover/src/trace_gen.rs b/crates/lean_prover/src/trace_gen.rs
index cd0e401be..fe11662cb 100644
--- a/crates/lean_prover/src/trace_gen.rs
+++ b/crates/lean_prover/src/trace_gen.rs
@@ -1,7 +1,7 @@
 use backend::*;
 use lean_vm::*;
 use std::{array, collections::BTreeMap};
-use utils::{ToUsize, get_poseidon_16_of_zero, transposed_par_iter_mut};
+use utils::{ToUsize, get_poseidon_8_of_zero, transposed_par_iter_mut};
 
 #[derive(Debug)]
 pub struct ExecutionTrace {
@@ -100,7 +100,7 @@ pub fn get_execution_trace(
     let padding_zero_vec_ptr = memory_padded.len();
     memory_padded.extend(std::iter::repeat_n(F::ZERO, 16));
     let null_poseidon_16_hash_ptr = memory_padded.len();
-    memory_padded.extend_from_slice(get_poseidon_16_of_zero());
+    memory_padded.extend_from_slice(get_poseidon_8_of_zero());
 
     // IMPORTANT: memory size should always be >= number of VM cycles
     let padded_memory_len = (memory_padded.len().max(n_cycles).max(1 << MIN_LOG_N_ROWS_PER_TABLE)).next_power_of_two();
@@ -108,34 +108,34 @@ pub fn get_execution_trace(
 
     let ExecutionResult { mut traces, .. } = execution_result;
 
-    let poseidon_trace = traces.get_mut(&Table::poseidon16()).unwrap();
-    fill_trace_poseidon_16(&mut poseidon_trace.columns);
+    let poseidon_trace = traces.get_mut(&Table::poseidon8()).unwrap();
+    fill_trace_poseidon_8(&mut poseidon_trace.columns);
 
     // Override the output columns the AIR leaves unconstrained with the actual memory values,
-    // so the 16-cell output lookup matches. out_lo[4..8] is free when the output is only 4
-    // elements (out4); out_hi is free for everything except the full 16-element
+    // so the 8-cell output lookup matches. out_lo[2..4] is free when the output is only 2
+    // elements (out2); out_hi is free for everything except the full 8-element
     // permutation
     {
-        let split = POSEIDON_COL_OUT_LO + HALF_DIGEST_LEN;
+        let split = POSEIDON_8_COL_OUT_LO + HALF_DIGEST_LEN;
         let (left, right) = poseidon_trace.columns.split_at_mut(split);
-        let flag_out4_col = &left[POSEIDON_COL_FLAG_OUT4];
-        let flag_out8_col = &left[POSEIDON_COL_FLAG_OUT8];
-        let nu_c_col = &left[POSEIDON_COL_NU_C];
+        let flag_out2_col = &left[POSEIDON_8_COL_FLAG_OUT2];
+        let flag_out4_col = &left[POSEIDON_8_COL_FLAG_OUT4];
+        let nu_c_col = &left[POSEIDON_8_COL_NU_C];
         const N: usize = HALF_DIGEST_LEN + DIGEST_LEN;
         let cols: &mut [Vec<F>; N] = (&mut right[..N]).try_into().unwrap();
 
         transposed_par_iter_mut(cols)
+            .zip(flag_out2_col)
             .zip(flag_out4_col)
-            .zip(flag_out8_col)
             .zip(nu_c_col)
-            .for_each(|(((row, &flag_out4), &flag_out8), &nu_c)| {
+            .for_each(|(((row, &flag_out2), &flag_out4), &nu_c)| {
                 let base = nu_c.to_usize();
-                if flag_out4 == F::ONE {
+                if flag_out2 == F::ONE {
                     for j in 0..HALF_DIGEST_LEN {
                         *row[j] = memory_padded[base + HALF_DIGEST_LEN + j];
                     }
                 }
-                if flag_out8 == F::ONE || flag_out4 == F::ONE {
+                if flag_out4 == F::ONE || flag_out2 == F::ONE {
                     for j in 0..DIGEST_LEN {
                         *row[HALF_DIGEST_LEN + j] = memory_padded[base + DIGEST_LEN + j];
                     }
diff --git a/crates/lean_prover/src/verify_execution.rs b/crates/lean_prover/src/verify_execution.rs
index f49ae8176..43c984d04 100644
--- a/crates/lean_prover/src/verify_execution.rs
+++ b/crates/lean_prover/src/verify_execution.rs
@@ -4,7 +4,7 @@ use crate::*;
 use backend::{Proof, RawProof, VerifierState};
 use lean_vm::*;
 use sub_protocols::*;
-use utils::{ToUsize, from_end, get_poseidon16};
+use utils::{ToUsize, from_end, get_poseidon8};
 
 #[derive(Debug, Clone)]
 pub struct ProofVerificationDetails {
@@ -24,8 +24,10 @@ pub fn verify_execution(
             max_log_size: MAX_BYTECODE_LOG_SIZE,
         });
     }
-    let mut verifier_state =
-        VerifierState::<EF, _>::new(proof, get_poseidon16().clone(), fiat_shamir_domain_sep(bytecode))?;
+    if public_input.len() != PUBLIC_INPUT_LEN {
+        return Err(ProofError::InvalidProof);
+    }
+    let mut verifier_state = VerifierState::<EF, _>::new(proof, *get_poseidon8(), fiat_shamir_domain_sep(bytecode))?;
     verifier_state.observe_scalars(public_input);
     let dims = verifier_state
         .next_base_scalars_vec(2 + N_TABLES)?
diff --git a/crates/lean_prover/tests/check_whir_configs.rs b/crates/lean_prover/tests/check_whir_configs.rs
deleted file mode 100644
index 60266ef21..000000000
--- a/crates/lean_prover/tests/check_whir_configs.rs
+++ /dev/null
@@ -1,71 +0,0 @@
-use std::fmt::Write;
-use std::fs;
-use std::path::PathBuf;
-
-use backend::{TwoAdicField, WhirConfig};
-use lean_prover::default_whir_config;
-use lean_vm::{EF, F, MAX_WHIR_LOG_INV_RATE, MIN_WHIR_LOG_INV_RATE};
-
-fn expected_whir_configs_line() -> String {
-    let mut entries: Vec<String> = Vec::new();
-
-    for log_inv_rate in MIN_WHIR_LOG_INV_RATE..=MAX_WHIR_LOG_INV_RATE {
-        let builder = default_whir_config(log_inv_rate);
-        let first_ff = builder.folding_factor.at_round(0);
-        let max_nv = F::TWO_ADICITY + first_ff - log_inv_rate;
-
-        for num_variables in first_ff..=max_nv {
-            let cfg: WhirConfig<EF> = WhirConfig::new(&builder, num_variables);
-
-            let mut rounds = String::from("(");
-            for (i, r) in cfg.round_parameters.iter().enumerate() {
-                if i > 0 {
-                    rounds.push(',');
-                }
-                write!(
-                    rounds,
-                    "({},{},{},{})",
-                    r.num_queries, r.ood_samples, r.query_pow_bits, r.folding_pow_bits
-                )
-                .unwrap();
-            }
-            if cfg.round_parameters.len() == 1 {
-                rounds.push(',');
-            }
-            rounds.push(')');
-
-            entries.push(format!(
-                "({},{},{},{},{},{},{})",
-                log_inv_rate,
-                num_variables,
-                cfg.commitment_ood_samples,
-                cfg.starting_folding_pow_bits,
-                cfg.final_queries,
-                cfg.final_query_pow_bits,
-                rounds,
-            ));
-        }
-    }
-
-    format!("WHIR_CONFIGS = ({})", entries.join(","))
-}
-
-fn strip_ws(s: &str) -> String {
-    s.chars().filter(|c| !c.is_whitespace()).collect()
-}
-
-#[test]
-fn check_whir_configs_in_python_verifier() {
-    let expected = expected_whir_configs_line();
-    println!("{expected}");
-
-    let verifier_py = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("python-verifier/verifier.py");
-    let src =
-        fs::read_to_string(&verifier_py).unwrap_or_else(|e| panic!("failed to read {}: {e}", verifier_py.display()));
-
-    assert!(
-        strip_ws(&src).contains(&strip_ws(&expected)),
-        "WHIR_CONFIGS in {} is out of sync with Rust `default_whir_config`. Replace the line with the one printed above.",
-        verifier_py.display(),
-    );
-}
diff --git a/crates/lean_vm/src/core/constants.rs b/crates/lean_vm/src/core/constants.rs
index 8c5baa9fb..8d541522e 100644
--- a/crates/lean_vm/src/core/constants.rs
+++ b/crates/lean_vm/src/core/constants.rs
@@ -4,10 +4,10 @@ use crate::Table;
 pub const LOGUP_MEMORY_DOMAINSEP: usize = 1;
 pub const LOGUP_BYTECODE_DOMAINSEP: usize = 2;
 
-/// Large field = extension field of degree DIMENSION over koala-bear
-pub const DIMENSION: usize = 5;
+/// Large field = extension field of degree DIMENSION over Goldilocks
+pub const DIMENSION: usize = 3;
 
-pub const DIGEST_LEN: usize = 8;
+pub const DIGEST_LEN: usize = 4;
 
 pub const PUBLIC_INPUT_LEN: usize = DIGEST_LEN;
 
@@ -24,9 +24,12 @@ pub const MAX_BYTECODE_LOG_SIZE: usize = 22;
 /// Minimum and maximum number of rows per table (as powers of two), both inclusive
 pub const MIN_LOG_N_ROWS_PER_TABLE: usize = 8; // Zero padding will be added to each at least, if this minimum is not reached, (ensuring AIR / GKR work fine, with SIMD, without too much edge cases). Long term, we should find a more elegant solution.
 pub const MAX_LOG_N_ROWS_PER_TABLE: [(Table, usize); 3] = [
-    (Table::execution(), 24),
+    (Table::execution(), 25),
     (Table::extension_op(), 21),
-    (Table::poseidon16(), 21),
+    // 20 (not 21): the `poseidon8_permute` variant widened the table by 5 columns
+    // (`flag_permute` + 4 `outputs_right`), so 2^21 rows would exceed the WHIR
+    // commitment surface cap (see `ensure_not_too_big_commitment_surface`).
+    (Table::poseidon8(), 20),
 ];
 
 pub fn max_log_n_rows_per_table(table: &Table) -> usize {
diff --git a/crates/lean_vm/src/core/types.rs b/crates/lean_vm/src/core/types.rs
index fbad9af10..393f5e251 100644
--- a/crates/lean_vm/src/core/types.rs
+++ b/crates/lean_vm/src/core/types.rs
@@ -3,13 +3,13 @@ use std::{
     fmt::{Display, Formatter},
 };
 
-use backend::{KoalaBear, QuinticExtensionFieldKB};
+use backend::{CubicExtensionFieldGL, Goldilocks};
 
 /// Base field type for VM operations
-pub type F = KoalaBear;
+pub type F = Goldilocks;
 
 /// Extension field type for VM operations
-pub type EF = QuinticExtensionFieldKB;
+pub type EF = CubicExtensionFieldGL;
 
 /// Line number in source code for debugging
 pub type SourceLineNumber = usize;
diff --git a/crates/lean_vm/src/diagnostics/exec_result.rs b/crates/lean_vm/src/diagnostics/exec_result.rs
index 2024fa083..87e4ab548 100644
--- a/crates/lean_vm/src/diagnostics/exec_result.rs
+++ b/crates/lean_vm/src/diagnostics/exec_result.rs
@@ -52,7 +52,7 @@ impl ExecutionMetadata {
         out.push('\n');
         if self.n_poseidons > 0 {
             out.push_str(&format!(
-                "Poseidon16 calls: {} (1 poseidon per {} instructions)\n",
+                "Poseidon8 calls: {} (1 poseidon per {} instructions)\n",
                 pretty_integer(self.n_poseidons),
                 self.cycles / self.n_poseidons
             ));
diff --git a/crates/lean_vm/src/execution/runner.rs b/crates/lean_vm/src/execution/runner.rs
index f00e04880..0e0358931 100644
--- a/crates/lean_vm/src/execution/runner.rs
+++ b/crates/lean_vm/src/execution/runner.rs
@@ -337,7 +337,7 @@ fn execute_bytecode_helper(
     let metadata = ExecutionMetadata {
         cycles: trace.pcs.len(),
         memory: memory.0.len(),
-        n_poseidons: trace.tables[&Table::poseidon16()].columns[0].len(),
+        n_poseidons: trace.tables[&Table::poseidon8()].columns[0].len(),
         n_extension_ops: trace.tables[&Table::extension_op()].columns[0].len(),
         bytecode_size: bytecode.code.len(),
         public_input_size: PUBLIC_INPUT_LEN,
diff --git a/crates/lean_vm/src/execution/tests.rs b/crates/lean_vm/src/execution/tests.rs
index 60ba768d0..4328ebdcd 100644
--- a/crates/lean_vm/src/execution/tests.rs
+++ b/crates/lean_vm/src/execution/tests.rs
@@ -24,15 +24,23 @@ fn test_memory_already_set_error() {
     // Setting same value should work
     memory.set(0, F::ONE).unwrap();
 
-    // Setting different value should fail
-    assert!(matches!(
-        memory.set(0, F::ZERO),
-        Err(RunnerError::MemoryAlreadySet {
-            address: 0,
-            prev_value: F::ONE,
-            new_value: F::ZERO,
-        })
-    ));
+    // Setting different value should fail.
+    // Goldilocks has two redundant representations for each canonical value
+    // (x and x + ORDER both reduce to x), so it isn't `StructuralPartialEq`
+    // and can't be matched on directly — compare the fields explicitly instead.
+    let err = memory.set(0, F::ZERO).unwrap_err();
+    match err {
+        RunnerError::MemoryAlreadySet {
+            address,
+            prev_value,
+            new_value,
+        } => {
+            assert_eq!(address, 0);
+            assert_eq!(prev_value, F::ONE);
+            assert_eq!(new_value, F::ZERO);
+        }
+        other => panic!("unexpected error variant: {other:?}"),
+    }
 }
 
 #[test]
diff --git a/crates/lean_vm/src/isa/hint.rs b/crates/lean_vm/src/isa/hint.rs
index 9bbc3f6d9..f077c261a 100644
--- a/crates/lean_vm/src/isa/hint.rs
+++ b/crates/lean_vm/src/isa/hint.rs
@@ -101,11 +101,12 @@ impl<T> HintWitnessDestination<T> {
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum CustomHint {
-    // Decompose values into their custom representations:
-    /// each field element x is decomposed to: (a0, a1, a2, ..., a11, b) where:
-    /// x = a0 + a1.4 + a2.4^2 + a3.4^3 + ... + a11.4^11 + b.2^24
-    /// and ai < 4, b < 2^7 - 1
-    /// The decomposition is unique, and always exists (except for x = -1)
+    /// WOTS-encoding decomposition of one Goldilocks field element.
+    /// Args: (chunks_ptr, limbs_ptr, src_value).
+    /// Writes five 6-bit chunks (2W bits each, W = 3) covering the low 30 bits
+    /// to `chunks_ptr[0..5]`; each chunk packs two consecutive W-bit chain steps
+    /// as `step_a + CHAIN_LENGTH * step_b`. Writes the high 32 bits as two
+    /// 16-bit limbs to `limbs_ptr[0..2]`; bits 30..31 are not written anywhere.
     DecomposeBitsXMSS,
     DecomposeBitsMerkleWhir,
     DecomposeBits,
@@ -137,8 +138,8 @@ impl CustomHint {
 
     pub fn n_args(&self) -> usize {
         match self {
-            Self::DecomposeBitsXMSS => 4,
-            Self::DecomposeBitsMerkleWhir => 3,
+            Self::DecomposeBitsXMSS => 3,
+            Self::DecomposeBitsMerkleWhir => 4,
             Self::DecomposeBits => 3,
             Self::LessThan => 3,
             Self::Log2Ceil => 2,
@@ -153,41 +154,37 @@ impl CustomHint {
     ) -> Result<(), RunnerError> {
         match self {
             Self::DecomposeBitsXMSS => {
-                let decomposed_ptr = args[0].read_value(ctx.memory, ctx.fp)?.to_usize();
-                let to_decompose_ptr = args[1].read_value(ctx.memory, ctx.fp)?.to_usize();
-                let num_to_decompose = args[2].read_value(ctx.memory, ctx.fp)?.to_usize();
-                let chunk_size = args[3].read_value(ctx.memory, ctx.fp)?.to_usize();
-                if chunk_size == 0 || !24_usize.is_multiple_of(chunk_size) {
-                    return Err(RunnerError::InvalidHintArguments(format!(
-                        "DecomposeBitsXMSS: chunk_size {chunk_size} must be a nonzero divisor of 24"
-                    )));
-                }
-                let mut memory_index_decomposed = decomposed_ptr;
-                #[allow(clippy::explicit_counter_loop)]
-                for i in 0..num_to_decompose {
-                    let value = ctx.memory.get(to_decompose_ptr + i)?.to_usize();
-                    for i in 0..24 / chunk_size {
-                        let value = F::from_usize((value >> (chunk_size * i)) & ((1 << chunk_size) - 1));
-                        ctx.memory.set(memory_index_decomposed, value)?;
-                        memory_index_decomposed += 1;
-                    }
+                // WOTS-encoding decomposition. Writes:
+                //   chunks_ptr[0..5] = 5 chunks of 2W=6 bits (low bits 0..29).
+                //                       Each chunk packs two consecutive chain
+                //                       steps as `step_a + CHAIN_LENGTH * step_b`.
+                //   limbs_ptr[0..2]   = 2 u16 limbs of the high 32 bits (bits 32..47, 48..63)
+                // The 2 high bits of the low limb are implicit zeros, enforced by
+                // the SNARK constraint structure (and rejected at signing time).
+                let chunks_ptr = args[0].read_value(ctx.memory, ctx.fp)?.to_usize();
+                let limbs_ptr = args[1].read_value(ctx.memory, ctx.fp)?.to_usize();
+                let value = args[2].read_value(ctx.memory, ctx.fp)?.as_canonical_u64();
+                const NUM_CHUNKS: usize = 5;
+                const CHUNK_SIZE: usize = 6;
+                for j in 0..NUM_CHUNKS {
+                    let chunk = (value >> (CHUNK_SIZE * j)) & ((1u64 << CHUNK_SIZE) - 1);
+                    ctx.memory.set(chunks_ptr + j, F::from_u64(chunk))?;
                 }
+                ctx.memory.set(limbs_ptr, F::from_u64((value >> 32) & 0xFFFF))?;
+                ctx.memory.set(limbs_ptr + 1, F::from_u64((value >> 48) & 0xFFFF))?;
             }
             Self::DecomposeBitsMerkleWhir => {
+                // Split a canonical u64 into `num_chunks` chunks of `chunk_size` bits (low first); bad sizes -> Err.
                 let decomposed_ptr = args[0].read_value(ctx.memory, ctx.fp)?.to_usize();
-                let value = args[1].read_value(ctx.memory, ctx.fp)?.to_usize();
-                let chunk_size = args[2].read_value(ctx.memory, ctx.fp)?.to_usize();
-                if chunk_size == 0 || !24_usize.is_multiple_of(chunk_size) {
-                    return Err(RunnerError::InvalidHintArguments(format!(
-                        "DecomposeBitsMerkleWhir: chunk_size {chunk_size} must be a nonzero divisor of 24"
-                    )));
-                }
-                let mut memory_index_decomposed = decomposed_ptr;
-                #[allow(clippy::explicit_counter_loop)]
-                for i in 0..24 / chunk_size {
-                    let value = F::from_usize((value >> (chunk_size * i)) & ((1 << chunk_size) - 1));
-                    ctx.memory.set(memory_index_decomposed, value)?;
-                    memory_index_decomposed += 1;
+                let value = args[1].read_value(ctx.memory, ctx.fp)?.as_canonical_u64();
+                let num_chunks = args[2].read_value(ctx.memory, ctx.fp)?.to_usize();
+                let chunk_size = args[3].read_value(ctx.memory, ctx.fp)?.to_usize();
+                if num_chunks.saturating_mul(chunk_size) > F::bits() {
+                    return Err(RunnerError::InvalidHintArguments("DecomposeBitsMerkleWhir: chunks too wide".into()));
+                }
+                let mask = 1u64.checked_shl(chunk_size as u32).map_or(u64::MAX, |m| m - 1); // 64-bit chunk: all ones
+                for j in 0..num_chunks {
+                    ctx.memory.set(decomposed_ptr + j, F::from_u64((value >> (chunk_size * j)) & mask))?;
                 }
             }
             Self::DecomposeBits => {
diff --git a/crates/lean_vm/src/isa/instruction.rs b/crates/lean_vm/src/isa/instruction.rs
index 2ec0b2575..76ba8740a 100644
--- a/crates/lean_vm/src/isa/instruction.rs
+++ b/crates/lean_vm/src/isa/instruction.rs
@@ -8,9 +8,8 @@ use crate::execution::memory::MemoryAccess;
 use crate::tables::TableT;
 use crate::{ExtensionOpMode, Table, TableTrace};
 use crate::{
-    POSEIDON16_HARDCODED_LEFT_NAME, POSEIDON16_NAME, POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME,
-    POSEIDON16_PERMUTE_HALF_NAME, POSEIDON16_PERMUTE_NAME, POSEIDON16_QUARTER_HARDCODED_LEFT_NAME,
-    POSEIDON16_QUARTER_NAME,
+    POSEIDON8_HARDCODED_LEFT_NAME, POSEIDON8_NAME, POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME,
+    POSEIDON8_PERMUTE_HALF_NAME, POSEIDON8_PERMUTE_NAME, POSEIDON8_QUARTER_HARDCODED_LEFT_NAME, POSEIDON8_QUARTER_NAME,
 };
 use backend::*;
 use std::collections::BTreeMap;
@@ -67,10 +66,10 @@ pub struct PrecompileArgs<V, S> {
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum PrecompileCompTimeArgs<S> {
-    Poseidon16 {
+    Poseidon8 {
         half_output: bool,
-        //   hardcoded_offset_left = None:              left_input = m[arg_a..arg_a+8]
-        //   hardcoded_offset_left = Some(offset_left): left_input = m[offset_left..offset_left+4] | m[arg_a..arg_a+4] (arg_a is the first runtime parameter)
+        //   hardcoded_offset_left = None:              left_input = m[arg_a..arg_a+4]
+        //   hardcoded_offset_left = Some(offset_left): left_input = m[offset_left..offset_left+2] | m[arg_a..arg_a+2] (arg_a is the first runtime parameter)
         hardcoded_offset_left: Option<S>,
         permute: bool, // if false: compression (feedforward), if true: permutation
     },
@@ -83,20 +82,20 @@ pub enum PrecompileCompTimeArgs<S> {
 impl<S> PrecompileCompTimeArgs<S> {
     pub fn table(&self) -> Table {
         match self {
-            Self::Poseidon16 { .. } => Table::poseidon16(),
+            Self::Poseidon8 { .. } => Table::poseidon8(),
             Self::ExtensionOp { .. } => Table::extension_op(),
         }
     }
 
     pub fn map_size<T>(self, mut f: impl FnMut(S) -> T) -> PrecompileCompTimeArgs<T> {
         match self {
-            Self::Poseidon16 {
+            Self::Poseidon8 {
                 half_output,
-                hardcoded_offset_left: hardcoded_left_4,
+                hardcoded_offset_left,
                 permute,
-            } => PrecompileCompTimeArgs::Poseidon16 {
+            } => PrecompileCompTimeArgs::Poseidon8 {
                 half_output,
-                hardcoded_offset_left: hardcoded_left_4.map(&mut f),
+                hardcoded_offset_left: hardcoded_offset_left.map(&mut f),
                 permute,
             },
             Self::ExtensionOp { size, mode } => PrecompileCompTimeArgs::ExtensionOp { size: f(size), mode },
@@ -259,35 +258,32 @@ impl<V: Display, S: Display> Display for PrecompileArgs<V, S> {
             data,
         } = self;
         match data {
-            PrecompileCompTimeArgs::Poseidon16 {
+            PrecompileCompTimeArgs::Poseidon8 {
                 half_output,
-                hardcoded_offset_left: hardcoded_left_4,
+                hardcoded_offset_left,
                 permute,
             } => {
                 if *permute {
-                    match (*half_output, hardcoded_left_4) {
+                    match (*half_output, hardcoded_offset_left) {
                         (true, Some(off)) => {
                             write!(
                                 f,
-                                "{POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})"
+                                "{POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})"
                             )
                         }
-                        (true, None) => write!(f, "{POSEIDON16_PERMUTE_HALF_NAME}({arg_0}, {arg_1}, {res})"),
-                        (false, _) => write!(f, "{POSEIDON16_PERMUTE_NAME}({arg_0}, {arg_1}, {res})"),
+                        (true, None) => write!(f, "{POSEIDON8_PERMUTE_HALF_NAME}({arg_0}, {arg_1}, {res})"),
+                        (false, _) => write!(f, "{POSEIDON8_PERMUTE_NAME}({arg_0}, {arg_1}, {res})"),
                     }
                 } else {
-                    match (*half_output, hardcoded_left_4) {
-                        (false, None) => write!(f, "{POSEIDON16_NAME}({arg_0}, {arg_1}, {res})"),
-                        (true, None) => write!(f, "{POSEIDON16_QUARTER_NAME}({arg_0}, {arg_1}, {res})"),
+                    match (*half_output, hardcoded_offset_left) {
+                        (false, None) => write!(f, "{POSEIDON8_NAME}({arg_0}, {arg_1}, {res})"),
+                        (true, None) => write!(f, "{POSEIDON8_QUARTER_NAME}({arg_0}, {arg_1}, {res})"),
                         (false, Some(off)) => {
-                            write!(
-                                f,
-                                "{POSEIDON16_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})"
-                            )
+                            write!(f, "{POSEIDON8_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})")
                         }
                         (true, Some(off)) => write!(
                             f,
-                            "{POSEIDON16_QUARTER_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})"
+                            "{POSEIDON8_QUARTER_HARDCODED_LEFT_NAME}({arg_0}, {arg_1}, {res}, off={off})"
                         ),
                     }
                 }
diff --git a/crates/lean_vm/src/tables/extension_op/air.rs b/crates/lean_vm/src/tables/extension_op/air.rs
index cd1ad46d5..20005f102 100644
--- a/crates/lean_vm/src/tables/extension_op/air.rs
+++ b/crates/lean_vm/src/tables/extension_op/air.rs
@@ -1,10 +1,12 @@
 use crate::{
-    EF, EXT_OP_FLAG_ADD, EXT_OP_FLAG_BE, EXT_OP_FLAG_DOT_PRODUCT, EXT_OP_FLAG_EQ, ExtraDataForBuses, eval_bus_virtual,
+    DIMENSION, EF, EXT_OP_FLAG_ADD, EXT_OP_FLAG_BE, EXT_OP_FLAG_DOT_PRODUCT, EXT_OP_FLAG_EQ, ExtraDataForBuses,
+    eval_bus_virtual,
     tables::extension_op::{EXT_OP_LEN_MULTIPLIER, ExtensionOpPrecompile},
 };
 use backend::*;
 
-// Shift columns first, in positions 0..13 (see `n_shift_columns` below).
+// ---------- Column layout (cubic extension, DIMENSION = 3) ----------
+// Shift columns first, in positions 0..11 (see `n_shift_columns` below).
 // Flat-only columns follow.
 pub(super) const COL_FLAG_BE: usize = 0;
 pub(super) const COL_FLAG_START: usize = 1;
@@ -14,44 +16,65 @@ pub(super) const COL_FLAG_DOT_PRODUCT: usize = 4;
 pub(super) const COL_FLAG_EQ: usize = 5;
 pub(super) const COL_IDX_A: usize = 6;
 pub(super) const COL_IDX_B: usize = 7;
-/// acc (running accumulator) coordinates (5 columns).
+/// acc (running accumulator) coordinates (3 cols).
 pub(super) const COL_ACC: usize = 8;
 // --- flat-only columns ---
-pub(super) const COL_IDX_RES: usize = 13;
-/// v_A coordinates (5 columns).
-pub(super) const COL_V_A: usize = 14;
-/// v_B coordinates (5 columns).
-pub(super) const COL_V_B: usize = 19;
-/// res coordinates (5 columns).
-pub(super) const COL_RES: usize = 24;
+pub(super) const COL_IDX_RES: usize = 11;
+/// v_A coordinates (3 cols).
+pub(super) const COL_V_A: usize = 12;
+/// v_B coordinates (3 cols).
+pub(super) const COL_V_B: usize = 15;
+/// res coordinates (3 cols).
+pub(super) const COL_RES: usize = 18;
 
 // Virtual columns (not explicitely in AIR)
-pub(super) const COL_MULTIPLICITY_EXTENSION_OP: usize = 29;
-pub(super) const COL_DOMAINSEP_EXTENSION_OP: usize = 30;
+pub(super) const COL_MULTIPLICITY_EXTENSION_OP: usize = 21;
+pub(super) const COL_DOMAINSEP_EXTENSION_OP: usize = 22;
 
-use backend::quintic_extension::extension::quintic_mul;
+pub(super) const AIR_N_COLUMNS: usize = 21;
 
+// ---------- Cubic multiplication gate (`F[X] / (X^3 - X - 1)`, so `X^3 = X + 1`) ----------
+//
+// (a0 + a1·X + a2·X^2)·(b0 + b1·X + b2·X^2), reduced:
+//   c0 = a0·b0 + a1·b2 + a2·b1
+//   c1 = a0·b1 + a1·b0 + a1·b2 + a2·b1 + a2·b2
+//   c2 = a0·b2 + a1·b1 + a2·b0 + a2·b2
 #[inline]
-fn quintic_mul_air<T: PrimeCharacteristicRing>(a: &[T; 5], b: &[T; 5]) -> [T; 5] {
-    quintic_mul(a, b, |x, y| {
-        x[0] * y[0] + x[1] * y[1] + x[2] * y[2] + x[3] * y[3] + x[4] * y[4]
-    })
+fn cubic_mul_air<T: PrimeCharacteristicRing + Copy>(a: &[T; 3], b: &[T; 3]) -> [T; 3] {
+    let a1b2 = a[1] * b[2];
+    let a2b1 = a[2] * b[1];
+    let a2b2 = a[2] * b[2];
+    [
+        a[0] * b[0] + a1b2 + a2b1,
+        a[0] * b[1] + a[1] * b[0] + a1b2 + a2b1 + a2b2,
+        a[0] * b[2] + a[1] * b[1] + a[2] * b[0] + a2b2,
+    ]
 }
 
 impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
     type ExtraData = ExtraDataForBuses<EF>;
 
     fn n_columns(&self) -> usize {
-        29
+        AIR_N_COLUMNS
     }
     fn degree_air(&self) -> usize {
+        // Kept at 6, the bound used before the switch to the cubic extension. `cubic_mul_air` is
+        // bilinear, so `v_a_times_v_b` (with `v_a_tilde = v_a * is_ee`) has degree 3 and the flagged
+        // dot-product gate degree 4. NOTE(review): confirm 6 still covers the eq gate.
         6
     }
     fn n_constraints(&self) -> usize {
-        35
+        // 5 boolean gates
+        // + 3 * flag_add
+        // + 3 * flag_mul
+        // + 3 * flag_poly_eq
+        // + 3 * start (vres vs comp)
+        // + 7 transition gates (len/is_be/flags/idx_a/idx_b) + 1 start-row-length
+        // + 2 bus (multiplicity + fingerprint) if BUS
+        5 + 3 + 3 + 3 + 3 + 8 + if BUS { 2 } else { 0 }
     }
     fn n_shift_columns(&self) -> usize {
-        COL_ACC + 5
+        COL_ACC + 3
     }
 
     #[inline]
@@ -68,12 +91,12 @@ impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
         let idx_a = flat[COL_IDX_A];
         let idx_b = flat[COL_IDX_B];
 
-        let v_a: [AB::IF; 5] = std::array::from_fn(|k| flat[COL_V_A + k]);
-        let v_b: [AB::IF; 5] = std::array::from_fn(|k| flat[COL_V_B + k]);
-        let res: [AB::IF; 5] = std::array::from_fn(|k| flat[COL_RES + k]);
-        let acc: [AB::IF; 5] = std::array::from_fn(|k| flat[COL_ACC + k]);
+        let v_a: [AB::IF; 3] = std::array::from_fn(|k| flat[COL_V_A + k]);
+        let v_b: [AB::IF; 3] = std::array::from_fn(|k| flat[COL_V_B + k]);
+        let res: [AB::IF; 3] = std::array::from_fn(|k| flat[COL_RES + k]);
+        let acc: [AB::IF; 3] = std::array::from_fn(|k| flat[COL_ACC + k]);
 
-        // Shift columns map 1:1 onto the first 13 columns by convention.
+        // Shift columns map 1:1 onto the first 11 columns by convention.
         let flag_be_shift = shift[COL_FLAG_BE];
         let flag_start_shift = shift[COL_FLAG_START];
         let len_shift = shift[COL_LEN];
@@ -82,7 +105,7 @@ impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
         let flag_eq_shift = shift[COL_FLAG_EQ];
         let idx_a_shift = shift[COL_IDX_A];
         let idx_b_shift = shift[COL_IDX_B];
-        let acc_shift: [AB::IF; 5] = std::array::from_fn(|k| shift[COL_ACC + k]);
+        let acc_shift: [AB::IF; 3] = std::array::from_fn(|k| shift[COL_ACC + k]);
 
         let active = flag_add + flag_dot_product + flag_eq;
         let multiplicity = flag_start * active;
@@ -105,9 +128,11 @@ impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
         let is_ee = -(flag_be - AB::F::ONE);
         let not_start_shift = -(flag_start_shift - AB::F::ONE);
 
-        let v_a_tilde: [AB::IF; 5] = std::array::from_fn(|k| if k == 0 { v_a[0] } else { v_a[k] * is_ee });
+        // For "base-extension" ops, v_a is a base-field scalar embedded into EF as
+        // `(v_a[0], 0, 0)`: zero the upper coordinates when `flag_be` is 1.
+        let v_a_tilde: [AB::IF; 3] = std::array::from_fn(|k| if k == 0 { v_a[0] } else { v_a[k] * is_ee });
 
-        let acc_tail: [AB::IF; 5] = std::array::from_fn(|k| acc_shift[k] * not_start_shift);
+        let acc_tail: [AB::IF; 3] = std::array::from_fn(|k| acc_shift[k] * not_start_shift);
 
         builder.assert_bool(flag_be);
         builder.assert_bool(flag_start);
@@ -115,33 +140,35 @@ impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
         builder.assert_bool(flag_dot_product);
         builder.assert_bool(flag_eq);
 
-        for k in 0..5 {
+        for k in 0..3 {
             builder.assert_zero((acc[k] - (v_a_tilde[k] + v_b[k] + acc_tail[k])) * flag_add);
         }
 
-        let v_a_times_v_b = quintic_mul_air(&v_a_tilde, &v_b);
+        let v_a_times_v_b = cubic_mul_air(&v_a_tilde, &v_b);
 
-        for k in 0..5 {
+        for k in 0..3 {
             builder.assert_zero((acc[k] - (v_a_times_v_b[k] + acc_tail[k])) * flag_dot_product);
         }
 
-        let e_eq: [AB::IF; 5] = std::array::from_fn(|k| {
+        // eq per element: `2 a b - a - b + 1` (constant coord only gets +1),
+        // accumulated via multiplication.
+        let e_eq: [AB::IF; 3] = std::array::from_fn(|k| {
             let base = v_a_times_v_b[k].double() - v_a_tilde[k] - v_b[k];
             if k == 0 { base + AB::F::ONE } else { base }
         });
-        let acc_tail_or_one: [AB::IF; 5] = std::array::from_fn(|k| {
+        let acc_tail_or_one: [AB::IF; 3] = std::array::from_fn(|k| {
             if k == 0 {
                 acc_shift[0] * not_start_shift + flag_start_shift
             } else {
                 acc_shift[k] * not_start_shift
             }
         });
-        let eq_result = quintic_mul_air(&e_eq, &acc_tail_or_one);
-        for k in 0..5 {
+        let eq_result = cubic_mul_air(&e_eq, &acc_tail_or_one);
+        for k in 0..3 {
             builder.assert_zero((acc[k] - eq_result[k]) * flag_eq);
         }
 
-        for k in 0..5 {
+        for k in 0..3 {
             builder.assert_zero((acc[k] - res[k]) * flag_start);
         }
 
@@ -150,9 +177,9 @@ impl<const BUS: bool> Air for ExtensionOpPrecompile<BUS> {
         builder.assert_zero(not_start_shift * (flag_add - flag_add_shift));
         builder.assert_zero(not_start_shift * (flag_dot_product - flag_dot_product_shift));
         builder.assert_zero(not_start_shift * (flag_eq - flag_eq_shift));
-        let a_increment = flag_be + is_ee * AB::F::from_usize(crate::DIMENSION);
+        let a_increment = flag_be + is_ee * AB::F::from_usize(DIMENSION);
         builder.assert_zero(not_start_shift * (idx_a_shift - idx_a - a_increment));
-        builder.assert_zero(not_start_shift * (idx_b_shift - idx_b - AB::F::from_usize(crate::DIMENSION)));
+        builder.assert_zero(not_start_shift * (idx_b_shift - idx_b - AB::F::from_usize(DIMENSION)));
 
         builder.assert_zero(flag_start_shift * (len - AB::F::ONE));
     }
diff --git a/crates/lean_vm/src/tables/mod.rs b/crates/lean_vm/src/tables/mod.rs
index 9a7f03e26..64182474f 100644
--- a/crates/lean_vm/src/tables/mod.rs
+++ b/crates/lean_vm/src/tables/mod.rs
@@ -17,11 +17,11 @@ mod utils;
 pub(crate) use utils::*;
 
 // In logup interractions, the `domainsep` is the last entry of every tuple going into
-// the bus. It separates the two precompile tables from each other (Poseidon16 is odd,
-// ExtensionOp is a multiple of 4), and — since every value is odd `>= 3` (Poseidon16) or
+// the bus. It separates the two precompile tables from each other (Poseidon8 is odd,
+// ExtensionOp is a multiple of 4), and — since every value is odd `>= 3` (Poseidon8) or
 // a multiple of 4 (ExtensionOp) — also from the memory and bytecode lookups, whose
 // reserved domainseps are respectively 1 and 2.
 //
-//   Poseidon16  (odd >= 3): 3 + 2·flag_permute + 4·flag_out8 + 8·flag_left + 16·flag_left·offset_left
+//   Poseidon8   (odd >= 3): 3 + 2·flag_permute + 4·flag_out4 + 8·flag_left + 16·flag_left·offset_left
 //   ExtensionOp (0 mod 4):  4·flag_be + 8·flag_add + 16·flag_dot_product + 32·flag_eq + 64·len
 //
diff --git a/crates/lean_vm/src/tables/poseidon/mod.rs b/crates/lean_vm/src/tables/poseidon/mod.rs
index fb1efcb8d..f21905467 100644
--- a/crates/lean_vm/src/tables/poseidon/mod.rs
+++ b/crates/lean_vm/src/tables/poseidon/mod.rs
@@ -1,209 +1,295 @@
-use std::any::TypeId;
-
+use crate::execution::memory::MemoryAccess;
 use crate::*;
-use crate::{execution::memory::MemoryAccess, tables::poseidon::trace_gen::generate_trace_rows_for_perm};
 use backend::*;
-use utils::{ToUsize, poseidon16_compress, poseidon16_permute};
-
-/// Dispatch `mds_fft_16` through concrete types.
-/// For `SymbolicExpression` we use the dense form so the zkDSL generator can
-/// emit `dot_product_be` precompile calls instead of Karatsuba arithmetic.
-#[inline(always)]
-fn mds_air_16<A: PrimeCharacteristicRing + 'static>(state: &mut [A; WIDTH]) {
-    if TypeId::of::<A>() == TypeId::of::<SymbolicExpression<KoalaBear>>() {
-        dense_mat_vec_air_16(mds_dense_16(), state);
-        return;
-    }
-    macro_rules! dispatch {
-        ($t:ty) => {
-            if TypeId::of::<A>() == TypeId::of::<$t>() {
-                mds_fft_16::<$t>(unsafe { &mut *(state as *mut [A; WIDTH] as *mut [$t; WIDTH]) });
-                return;
-            }
+use utils::{ToUsize, poseidon8_compress};
+
+mod sparse;
+mod trace_gen;
+pub use trace_gen::fill_trace_poseidon_8;
+
+use sparse::{PARTIAL_ROUNDS as SPARSE_PARTIAL_ROUNDS, get_partial_constants};
+
+pub(super) const WIDTH: usize = 8;
+pub(super) const DIGEST: usize = DIGEST_LEN; // 4
+pub const HALF_DIGEST_LEN: usize = DIGEST / 2; // 2
+
+// domainsep encoding: see `tables/mod.rs`.
+pub const POSEIDON_DOMAINSEP_BASE: usize = 3;
+pub const POSEIDON_FLAG_PERMUTE_SHIFT: usize = 1 << 1;
+pub const POSEIDON_FLAG_OUT4_SHIFT: usize = 1 << 2;
+pub const POSEIDON_FLAG_LEFT_SHIFT: usize = 1 << 3;
+pub const POSEIDON_OFFSET_LEFT_SHIFT: usize = 1 << 4;
+
+// ---------- I/O columns ----------
+pub const POSEIDON_8_COL_MULTIPLICITY: ColIndex = 0;
+pub const POSEIDON_8_COL_NU_B: ColIndex = 1;
+pub const POSEIDON_8_COL_NU_C: ColIndex = 2;
+// Output width flags (compression only for out2; out4 also covers permute_half):
+//   out2 set  => output is 2 elements (HALF_DIGEST_LEN), compression only.
+//   out4 set  => output is 4 elements (DIGEST); for compression a full digest,
+//                for permutation the low half (permute_half).
+//   neither   => output is 8 elements (WIDTH), full permutation only.
+pub const POSEIDON_8_COL_FLAG_OUT2: ColIndex = 3;
+pub const POSEIDON_8_COL_FLAG_OUT4: ColIndex = 4;
+pub const POSEIDON_8_COL_FLAG_LEFT: ColIndex = 5;
+pub const POSEIDON_8_COL_OFFSET_LEFT: ColIndex = 6;
+pub const POSEIDON_8_COL_ADDR_LEFT_LO: ColIndex = 7;
+pub const POSEIDON_8_COL_ADDR_LEFT_HI: ColIndex = 8;
+pub const POSEIDON_8_COL_FLAG_PERMUTE: ColIndex = 9;
+pub const POSEIDON_8_COL_INPUT_START: ColIndex = 10;
+// Output is the full WIDTH-element permutation state: `out_lo` (WIDTH/2)
+// followed by `out_hi` (WIDTH/2). Compression only uses `out_lo`.
+pub const POSEIDON_8_COL_OUT_LO: ColIndex = POSEIDON_8_COL_INPUT_START + WIDTH; // 18
+pub const POSEIDON_8_COL_OUT_HI: ColIndex = POSEIDON_8_COL_OUT_LO + WIDTH / 2; // 22
+pub const POSEIDON_8_COL_ROUND_START: ColIndex = POSEIDON_8_COL_OUT_LO + WIDTH; // 26
+/// Non-committed columns ("virtual"):
+pub const POSEIDON_8_COL_NU_A: ColIndex = num_cols_poseidon_8();
+pub const POSEIDON_8_COL_DOMAINSEP: ColIndex = num_cols_poseidon_8() + 1;
+
+pub const POSEIDON8_NAME: &str = "poseidon8_compress_half";
+pub const POSEIDON8_QUARTER_NAME: &str = "poseidon8_compress_quarter";
+pub const POSEIDON8_HARDCODED_LEFT_NAME: &str = "poseidon8_compress_half_hardcoded_left";
+pub const POSEIDON8_QUARTER_HARDCODED_LEFT_NAME: &str = "poseidon8_compress_quarter_hardcoded_left";
+pub const POSEIDON8_PERMUTE_NAME: &str = "poseidon8_permute";
+pub const POSEIDON8_PERMUTE_HALF_NAME: &str = "poseidon8_permute_half";
+pub const POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME: &str = "poseidon8_permute_half_hardcoded_left";
+pub const ALL_POSEIDON8_NAMES: [&str; 7] = [
+    POSEIDON8_NAME,
+    POSEIDON8_QUARTER_NAME,
+    POSEIDON8_HARDCODED_LEFT_NAME,
+    POSEIDON8_QUARTER_HARDCODED_LEFT_NAME,
+    POSEIDON8_PERMUTE_NAME,
+    POSEIDON8_PERMUTE_HALF_NAME,
+    POSEIDON8_PERMUTE_HALF_HARDCODED_LEFT_NAME,
+];
+
+// ---------- Per-round aux columns ----------
+//
+// Goldilocks Poseidon1-8 with the Appendix B sparse partial-round decomposition
+// (see `sparse.rs`). The S-box is `x → x⁷` emitted directly as a degree-7
+// expression `x·x²·x⁴`, so we commit only the minimum needed to reset degree
+// between rounds — no `committed_x3` intermediates.
+//
+// Per full round: 8 `post[i]` cols (state after MDS).
+// Per partial round: 1 `post_sbox` col (the x⁷ output for lane 0); lanes 1..W
+// are expressed symbolically as rank-1 updates via `cheap_matmul`.
+//
+// Constraints:
+// - Full round: `post[i] - Σ_j MDS[i][j] · x[j]⁷ = 0`  (deg 7 equality).
+// - Partial round: `post_sbox - x⁷ = 0`               (deg 7 equality).
+// - Davies-Meyer: `outputs[i] - final_state[i] - inputs[i] = 0`  (deg 1).
+
+const FULL_ROUND_COLS: usize = WIDTH; // 8 post-state
+const PARTIAL_ROUND_COLS: usize = 1; // post_sbox
+
+pub const fn is_full_round(r: usize) -> bool {
+    r < POSEIDON1_HALF_FULL_ROUNDS || r >= POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS
+}
+
+/// First column index of round `r`'s data.
+pub const fn round_data_offset(r: usize) -> usize {
+    let mut off = POSEIDON_8_COL_ROUND_START;
+    let mut i = 0;
+    while i < r {
+        off += if is_full_round(i) {
+            FULL_ROUND_COLS
+        } else {
+            PARTIAL_ROUND_COLS
         };
+        i += 1;
     }
-    dispatch!(F);
-    dispatch!(EF);
-    dispatch!(FPacking<F>);
-    dispatch!(EFPacking<EF>);
-    unreachable!()
+    off
 }
 
-fn mds_dense_16() -> &'static [[F; 16]; 16] {
-    use std::sync::OnceLock;
-    static MAT: OnceLock<[[KoalaBear; 16]; 16]> = OnceLock::new();
-    MAT.get_or_init(|| {
-        let cols: [[F; 16]; 16] = std::array::from_fn(|j| {
-            let mut e = [F::ZERO; 16];
-            e[j] = F::ONE;
-            mds_circ_16(&mut e);
-            e
-        });
-        std::array::from_fn(|i| std::array::from_fn(|j| cols[j][i]))
-    })
+pub const fn num_cols_poseidon_8() -> usize {
+    round_data_offset(POSEIDON1_N_ROUNDS)
 }
 
-/// Add a `KoalaBear` constant to any AIR type.
-#[inline(always)]
-fn add_kb<A: 'static>(a: &mut A, value: F) {
-    macro_rules! dispatch {
-        ($t:ty) => {
-            if TypeId::of::<A>() == TypeId::of::<$t>() {
-                *unsafe { &mut *(a as *mut A as *mut $t) } += value;
-                return;
-            }
-        };
+pub const fn num_cols_total_poseidon_8() -> usize {
+    // +2 for non-committed columns: POSEIDON_8_COL_NU_A, POSEIDON_8_COL_DOMAINSEP
+    num_cols_poseidon_8() + 2
+}
+
+const AUX_COLS_PER_ROW: usize = num_cols_poseidon_8() - POSEIDON_8_COL_ROUND_START;
+
+// ---------- Witness computation ----------
+//
+// Replay the Poseidon1-8 permutation on `input`, emitting every committed
+// column value in trace order. The partial phase uses the sparse
+// decomposition so only 1 col/round (`post_sbox`) is emitted.
+
+fn mds_vec_mul(state: &[F; WIDTH]) -> [F; WIDTH] {
+    let mut out = [F::ZERO; WIDTH];
+    for i in 0..WIDTH {
+        let mut acc = state[0] * F::from_u64(MDS8_ROW[(WIDTH - i) % WIDTH] as u64);
+        for j in 1..WIDTH {
+            acc += state[j] * F::from_u64(MDS8_ROW[(j + WIDTH - i) % WIDTH] as u64);
+        }
+        out[i] = acc;
     }
-    dispatch!(F);
-    dispatch!(EF);
-    dispatch!(FPacking<F>);
-    dispatch!(EFPacking<EF>);
-    dispatch!(SymbolicExpression<KoalaBear>);
-    unreachable!()
+    out
+}
+
+fn sbox7(x: F) -> F {
+    let x2 = x * x;
+    let x4 = x2 * x2;
+    x4 * x2 * x
 }
 
-/// Multiply any AIR type by a `KoalaBear` constant.
-#[inline(always)]
-fn mul_kb<A: PrimeCharacteristicRing + 'static>(a: A, value: F) -> A {
-    macro_rules! dispatch {
-        ($t:ty) => {
-            if TypeId::of::<A>() == TypeId::of::<$t>() {
-                let r = unsafe { std::ptr::read(&a as *const A as *const $t) } * value;
-                return unsafe { std::ptr::read(&r as *const $t as *const A) };
+/// Returns `(aux, perm_state)`: the per-round witness columns and the raw
+/// WIDTH-element permutation output (before any Davies-Meyer feed-forward).
+pub fn compute_poseidon8_witness(input: [F; WIDTH]) -> (Vec<F>, [F; WIDTH]) {
+    let c = get_partial_constants();
+    let mut state = input;
+    let mut aux = Vec::with_capacity(AUX_COLS_PER_ROW);
+
+    // Initial full rounds.
+    for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+        for (i, s) in state.iter_mut().enumerate() {
+            *s = sbox7(*s + rc[i]);
+        }
+        let post = mds_vec_mul(&state);
+        for v in &post {
+            aux.push(*v);
+        }
+        state = post;
+    }
+
+    // Partial phase: absorb first_round_constants, apply m_i, then sparse rounds.
+    for (i, s) in state.iter_mut().enumerate() {
+        *s += c.first_round_constants[i];
+    }
+    {
+        let mut after = [F::ZERO; WIDTH];
+        for (i, dst) in after.iter_mut().enumerate() {
+            let mut acc = F::ZERO;
+            for (j, sj) in state.iter().enumerate() {
+                acc += c.m_i[i][j] * *sj;
             }
-        };
+            *dst = acc;
+        }
+        state = after;
     }
-    dispatch!(F);
-    dispatch!(EF);
-    dispatch!(FPacking<F>);
-    dispatch!(EFPacking<EF>);
-    dispatch!(SymbolicExpression<KoalaBear>);
-    unreachable!()
-}
 
-mod trace_gen;
-pub use trace_gen::fill_trace_poseidon_16;
+    for r in 0..SPARSE_PARTIAL_ROUNDS {
+        let post_sbox = sbox7(state[0]);
+        aux.push(post_sbox);
 
-pub(super) const WIDTH: usize = 16;
-const HALF_INITIAL_FULL_ROUNDS: usize = POSEIDON1_HALF_FULL_ROUNDS / 2;
-const PARTIAL_ROUNDS: usize = POSEIDON1_PARTIAL_ROUNDS;
-const HALF_FINAL_FULL_ROUNDS: usize = POSEIDON1_HALF_FULL_ROUNDS / 2;
+        state[0] = if r < SPARSE_PARTIAL_ROUNDS - 1 {
+            post_sbox + c.round_constants[r]
+        } else {
+            post_sbox
+        };
 
-// domainsep encoding: see `tables/mod.rs`.
-pub const POSEIDON_DOMAINSEP_BASE: usize = 3;
-pub const POSEIDON_FLAG_PERMUTE_SHIFT: usize = 1 << 1;
-pub const POSEIDON_FLAG_OUT8_SHIFT: usize = 1 << 2;
-pub const POSEIDON_FLAG_LEFT_SHIFT: usize = 1 << 3;
-pub const POSEIDON_OFFSET_LEFT_SHIFT: usize = 1 << 4;
+        // cheap_matmul:
+        //   new_state[0] = Σ_j sparse_first_row[r][j] · state[j]
+        //   new_state[i] = state[i] + v[r][i-1] · old_state[0]    (for i ≥ 1)
+        let old_s0 = state[0];
+        let mut new_s0 = F::ZERO;
+        for (j, sj) in state.iter().enumerate() {
+            new_s0 += c.sparse_first_row[r][j] * *sj;
+        }
+        state[0] = new_s0;
+        for (i, s) in state.iter_mut().enumerate().skip(1) {
+            *s += c.v[r][i - 1] * old_s0;
+        }
+    }
 
-pub const POSEIDON_COL_MULTIPLICITY: ColIndex = 0;
-pub const POSEIDON_COL_NU_B: ColIndex = 1;
-pub const POSEIDON_COL_NU_C: ColIndex = 2;
-pub const POSEIDON_COL_FLAG_OUT4: ColIndex = 3;
-pub const POSEIDON_COL_FLAG_OUT8: ColIndex = 4;
-pub const POSEIDON_COL_FLAG_LEFT: ColIndex = 5;
-pub const POSEIDON_COL_OFFSET_LEFT: ColIndex = 6;
-pub const POSEIDON_COL_ADDR_LEFT_LO: ColIndex = 7;
-pub const POSEIDON_COL_ADDR_LEFT_HI: ColIndex = 8;
-pub const POSEIDON_COL_FLAG_PERMUTE: ColIndex = 9;
-pub const POSEIDON_COL_INPUT_START: ColIndex = 10;
-pub const POSEIDON_COL_OUT_LO: ColIndex = num_cols_poseidon_16() - 16;
-pub const POSEIDON_COL_OUT_HI: ColIndex = num_cols_poseidon_16() - 8;
-/// Non-committed columns ("virtual"):
-pub const POSEIDON_COL_NU_A: ColIndex = num_cols_poseidon_16();
-pub const POSEIDON_COL_DOMAINSEP: ColIndex = num_cols_poseidon_16() + 1;
-
-pub const POSEIDON16_NAME: &str = "poseidon16_compress_half";
-pub const POSEIDON16_QUARTER_NAME: &str = "poseidon16_compress_quarter";
-pub const POSEIDON16_HARDCODED_LEFT_NAME: &str = "poseidon16_compress_half_hardcoded_left";
-pub const POSEIDON16_QUARTER_HARDCODED_LEFT_NAME: &str = "poseidon16_compress_quarter_hardcoded_left";
-pub const POSEIDON16_PERMUTE_NAME: &str = "poseidon16_permute";
-pub const POSEIDON16_PERMUTE_HALF_NAME: &str = "poseidon16_permute_half";
-pub const POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME: &str = "poseidon16_permute_half_hardcoded_left";
-pub const ALL_POSEIDON16_NAMES: [&str; 7] = [
-    POSEIDON16_NAME,
-    POSEIDON16_QUARTER_NAME,
-    POSEIDON16_HARDCODED_LEFT_NAME,
-    POSEIDON16_QUARTER_HARDCODED_LEFT_NAME,
-    POSEIDON16_PERMUTE_NAME,
-    POSEIDON16_PERMUTE_HALF_NAME,
-    POSEIDON16_PERMUTE_HALF_HARDCODED_LEFT_NAME,
-];
-pub const HALF_DIGEST_LEN: usize = DIGEST_LEN / 2;
+    // Terminal full rounds.
+    for round in 0..POSEIDON1_HALF_FULL_ROUNDS {
+        let abs = POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS + round;
+        for i in 0..WIDTH {
+            state[i] = sbox7(state[i] + GOLDILOCKS_POSEIDON1_RC_8[abs][i]);
+        }
+        let post = mds_vec_mul(&state);
+        for v in &post {
+            aux.push(*v);
+        }
+        state = post;
+    }
+
+    // `state` now holds the raw permutation output. Compression (Davies-Meyer
+    // feed-forward `state[i] + input[i]`) is applied by the caller.
+    debug_assert_eq!(aux.len(), AUX_COLS_PER_ROW);
+    (aux, state)
+}
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Poseidon16Precompile<const BUS: bool>;
+pub struct Poseidon8Precompile<const BUS: bool>;
 
-impl<const BUS: bool> TableT for Poseidon16Precompile<BUS> {
+impl<const BUS: bool> TableT for Poseidon8Precompile<BUS> {
     fn name(&self) -> &'static str {
-        POSEIDON16_NAME
+        POSEIDON8_NAME
     }
 
     fn table(&self) -> Table {
-        Table::poseidon16()
+        Table::poseidon8()
     }
 
     fn n_columns_total(&self) -> usize {
-        num_cols_total_poseidon_16()
+        num_cols_total_poseidon_8()
     }
 
     fn bus_interactions(&self) -> Vec<BusInteraction> {
         let mut buses = vec![BusInteraction {
             direction: BusDirection::Pull,
-            multiplicity: BusMultiplicity::Column(POSEIDON_COL_MULTIPLICITY),
-            domainsep: BusData::Column(POSEIDON_COL_DOMAINSEP),
+            multiplicity: BusMultiplicity::Column(POSEIDON_8_COL_MULTIPLICITY),
+            domainsep: BusData::Column(POSEIDON_8_COL_DOMAINSEP),
             data: vec![
-                BusData::Column(POSEIDON_COL_NU_A),
-                BusData::Column(POSEIDON_COL_NU_B),
-                BusData::Column(POSEIDON_COL_NU_C),
+                BusData::Column(POSEIDON_8_COL_NU_A),
+                BusData::Column(POSEIDON_8_COL_NU_B),
+                BusData::Column(POSEIDON_8_COL_NU_C),
             ],
         }];
         buses.extend(memory_lookups_consecutive(
-            POSEIDON_COL_ADDR_LEFT_LO,
-            POSEIDON_COL_INPUT_START,
+            POSEIDON_8_COL_ADDR_LEFT_LO,
+            POSEIDON_8_COL_INPUT_START,
             HALF_DIGEST_LEN,
         ));
         buses.extend(memory_lookups_consecutive(
-            POSEIDON_COL_ADDR_LEFT_HI,
-            POSEIDON_COL_INPUT_START + HALF_DIGEST_LEN,
+            POSEIDON_8_COL_ADDR_LEFT_HI,
+            POSEIDON_8_COL_INPUT_START + HALF_DIGEST_LEN,
             HALF_DIGEST_LEN,
         ));
         buses.extend(memory_lookups_consecutive(
-            POSEIDON_COL_NU_B,
-            POSEIDON_COL_INPUT_START + DIGEST_LEN,
-            DIGEST_LEN,
+            POSEIDON_8_COL_NU_B,
+            POSEIDON_8_COL_INPUT_START + DIGEST,
+            DIGEST,
         ));
         buses.extend(memory_lookups_consecutive(
-            POSEIDON_COL_NU_C,
-            POSEIDON_COL_OUT_LO,
-            DIGEST_LEN * 2,
+            POSEIDON_8_COL_NU_C,
+            POSEIDON_8_COL_OUT_LO,
+            DIGEST * 2,
         ));
         buses
     }
 
     fn padding_row(&self, zero_vec_ptr: usize, null_hash_ptr: usize, _ending_pc: usize) -> Vec<F> {
-        let mut row = vec![F::ZERO; num_cols_total_poseidon_16()];
-        let ptrs: Vec<*mut F> = (0..num_cols_poseidon_16())
-            .map(|i| unsafe { row.as_mut_ptr().add(i) })
-            .collect();
-
-        let perm: &mut Poseidon1Cols16<&mut F> = unsafe { &mut *(ptrs.as_ptr() as *mut Poseidon1Cols16<&mut F>) };
-        perm.inputs.iter_mut().for_each(|x| **x = F::ZERO);
-        *perm.multiplicity = F::ZERO;
-        *perm.nu_b = F::from_usize(zero_vec_ptr);
-        *perm.nu_c = F::from_usize(null_hash_ptr);
-        *perm.flag_out4 = F::ZERO;
-        *perm.flag_out8 = F::ONE;
-        *perm.flag_left = F::ZERO;
-        *perm.offset_left = F::ZERO;
-        *perm.addr_left_lo = F::from_usize(zero_vec_ptr);
-        *perm.addr_left_hi = F::from_usize(zero_vec_ptr + HALF_DIGEST_LEN);
-        *perm.flag_permute = F::ZERO;
-        perm.out_hi.iter_mut().for_each(|x| **x = F::ZERO);
-        row[POSEIDON_COL_NU_A] = F::from_usize(zero_vec_ptr);
-        row[POSEIDON_COL_DOMAINSEP] = F::from_usize(POSEIDON_DOMAINSEP_BASE + POSEIDON_FLAG_OUT8_SHIFT);
-
-        generate_trace_rows_for_perm(perm);
+        let mut row = vec![F::ZERO; num_cols_total_poseidon_8()];
+        row[POSEIDON_8_COL_MULTIPLICITY] = F::ZERO;
+        row[POSEIDON_8_COL_NU_B] = F::from_usize(zero_vec_ptr);
+        row[POSEIDON_8_COL_NU_C] = F::from_usize(null_hash_ptr);
+        // Padding rows are full-digest compression rows (out4).
+        row[POSEIDON_8_COL_FLAG_OUT2] = F::ZERO;
+        row[POSEIDON_8_COL_FLAG_OUT4] = F::ONE;
+        row[POSEIDON_8_COL_FLAG_LEFT] = F::ZERO;
+        row[POSEIDON_8_COL_OFFSET_LEFT] = F::ZERO;
+        row[POSEIDON_8_COL_ADDR_LEFT_LO] = F::from_usize(zero_vec_ptr);
+        row[POSEIDON_8_COL_ADDR_LEFT_HI] = F::from_usize(zero_vec_ptr + HALF_DIGEST_LEN);
+        row[POSEIDON_8_COL_FLAG_PERMUTE] = F::ZERO;
+        // Inputs stay zero; compute and fill the matching witness + output.
+        // Padding rows are compression rows: `out_lo` holds the Davies-Meyer
+        // output (= perm_state, since the input is zero), `out_hi` stays zero.
+        let (aux, perm_state) = compute_poseidon8_witness([F::ZERO; WIDTH]);
+        row[POSEIDON_8_COL_OUT_LO..POSEIDON_8_COL_OUT_LO + WIDTH / 2].copy_from_slice(&perm_state[..WIDTH / 2]);
+        for (i, v) in aux.iter().enumerate() {
+            row[POSEIDON_8_COL_ROUND_START + i] = *v;
+        }
+        // Non-committed columns
+        row[POSEIDON_8_COL_NU_A] = F::from_usize(zero_vec_ptr);
+        row[POSEIDON_8_COL_DOMAINSEP] = F::from_usize(POSEIDON_DOMAINSEP_BASE + POSEIDON_FLAG_OUT4_SHIFT);
+        // Sanity: Davies-Meyer witness must agree with the direct primitive.
+        debug_assert_eq!(&perm_state[..DIGEST], &poseidon8_compress([F::ZERO; WIDTH])[..]);
         row
     }
 
@@ -216,24 +302,27 @@ impl<const BUS: bool> TableT for Poseidon16Precompile<BUS> {
         args: PrecompileCompTimeArgs<usize>,
         ctx: &mut InstructionContext<'_, M>,
     ) -> Result<(), RunnerError> {
-        let PrecompileCompTimeArgs::Poseidon16 {
+        let PrecompileCompTimeArgs::Poseidon8 {
             half_output,
             hardcoded_offset_left,
             permute,
         } = args
         else {
-            unreachable!("Poseidon16 table called with non-Poseidon16 args");
+            unreachable!("Poseidon8 table called with non-Poseidon8 args");
         };
-        let out4 = half_output && !permute;
-        let out8 = (!half_output && !permute) || (half_output && permute);
+        // out2: half-width compression output (2 elements), compression only.
+        // out4: full digest compression output (4 elements) or permute_half (low 4).
+        // neither: full 8-element permutation.
+        let out2 = half_output && !permute;
+        let out4 = (!half_output && !permute) || (half_output && permute);
         let trace = ctx.traces.get_mut(&self.table()).unwrap();
 
         let arg_a_usize = arg_a.to_usize();
         let flag_hardcoded = hardcoded_offset_left.is_some();
         // Convention:
-        //   flag_hardcoded = 0: left input = m[arg_a..arg_a+8] (split as [arg_a..+4], [arg_a+4..+8])
-        //   flag_hardcoded = 1: left input = m[offset..offset+4] | m[arg_a..arg_a+4]
-        //                   (i.e. arg_a now points to a 4-element data digest, and the first 4
+        //   flag_hardcoded = 0: left input = m[arg_a..arg_a+4] (split as [arg_a..+2], [arg_a+2..+4])
+        //   flag_hardcoded = 1: left input = m[offset..offset+2] | m[arg_a..arg_a+2]
+        //                   (i.e. arg_a now points to a 2-element data digest, and the first 2
         //                    elements come from the hardcoded prefix at `offset`)
         let left_first_addr = hardcoded_offset_left.unwrap_or(arg_a_usize);
         let left_second_addr = if flag_hardcoded {
@@ -243,312 +332,312 @@ impl<const BUS: bool> TableT for Poseidon16Precompile<BUS> {
         };
         let arg0_first = ctx.memory.get_slice(left_first_addr, HALF_DIGEST_LEN)?;
         let arg0_second = ctx.memory.get_slice(left_second_addr, HALF_DIGEST_LEN)?;
-        let arg1 = ctx.memory.get_slice(arg_b.to_usize(), DIGEST_LEN)?;
+        let arg1 = ctx.memory.get_slice(arg_b.to_usize(), DIGEST)?;
 
-        let mut input = [F::ZERO; DIGEST_LEN * 2];
+        let mut input = [F::ZERO; WIDTH];
         input[..HALF_DIGEST_LEN].copy_from_slice(&arg0_first);
-        input[HALF_DIGEST_LEN..DIGEST_LEN].copy_from_slice(&arg0_second);
-        input[DIGEST_LEN..].copy_from_slice(&arg1);
+        input[HALF_DIGEST_LEN..DIGEST].copy_from_slice(&arg0_second);
+        input[DIGEST..].copy_from_slice(&arg1);
 
+        let (aux, perm_state) = compute_poseidon8_witness(input);
+
+        // `output_cols` are the WIDTH output trace columns. For permute rows they
+        // hold the raw permutation state; for compression rows `out_lo`
+        // holds the Davies-Meyer output (`perm + input`) and `out_hi` is
+        // left zero (overwritten from memory by the trace post-pass).
         let res_addr = index_res_a.to_usize();
+        let mut output_cols = [F::ZERO; WIDTH];
         if permute {
-            let permuted = poseidon16_permute(input);
-            let out_len = if half_output { DIGEST_LEN } else { DIGEST_LEN * 2 };
-            ctx.memory.set_slice(res_addr, &permuted[..out_len])?;
+            output_cols = perm_state;
+            // permute_half (half_output) writes the low DIGEST elements only.
+            let out_len = if half_output { DIGEST } else { WIDTH };
+            ctx.memory.set_slice(res_addr, &perm_state[..out_len])?;
         } else {
-            let output = poseidon16_compress(input);
-            let out_len = if half_output { HALF_DIGEST_LEN } else { DIGEST_LEN };
-            ctx.memory.set_slice(res_addr, &output[..out_len])?;
+            for i in 0..DIGEST {
+                output_cols[i] = perm_state[i] + input[i];
+            }
+            if half_output {
+                ctx.memory.set_slice(res_addr, &output_cols[..HALF_DIGEST_LEN])?;
+            } else {
+                ctx.memory.set_slice(res_addr, &output_cols[..DIGEST])?;
+            }
         }
 
         let hardcoded_offset_left_val = hardcoded_offset_left.unwrap_or(0);
 
-        trace.columns[POSEIDON_COL_MULTIPLICITY].push(F::ONE);
-        trace.columns[POSEIDON_COL_NU_B].push(arg_b);
-        trace.columns[POSEIDON_COL_NU_C].push(index_res_a);
-        trace.columns[POSEIDON_COL_FLAG_OUT4].push(F::from_bool(out4));
-        trace.columns[POSEIDON_COL_FLAG_OUT8].push(F::from_bool(out8));
-        trace.columns[POSEIDON_COL_FLAG_LEFT].push(F::from_bool(flag_hardcoded));
-        trace.columns[POSEIDON_COL_OFFSET_LEFT].push(F::from_usize(hardcoded_offset_left_val));
-        trace.columns[POSEIDON_COL_ADDR_LEFT_LO].push(F::from_usize(left_first_addr));
-        trace.columns[POSEIDON_COL_ADDR_LEFT_HI].push(F::from_usize(left_second_addr));
-        trace.columns[POSEIDON_COL_FLAG_PERMUTE].push(F::from_bool(permute));
+        trace.columns[POSEIDON_8_COL_MULTIPLICITY].push(F::ONE);
+        trace.columns[POSEIDON_8_COL_NU_B].push(arg_b);
+        trace.columns[POSEIDON_8_COL_NU_C].push(index_res_a);
+        trace.columns[POSEIDON_8_COL_FLAG_OUT2].push(F::from_bool(out2));
+        trace.columns[POSEIDON_8_COL_FLAG_OUT4].push(F::from_bool(out4));
+        trace.columns[POSEIDON_8_COL_FLAG_LEFT].push(F::from_bool(flag_hardcoded));
+        trace.columns[POSEIDON_8_COL_OFFSET_LEFT].push(F::from_usize(hardcoded_offset_left_val));
+        trace.columns[POSEIDON_8_COL_ADDR_LEFT_LO].push(F::from_usize(left_first_addr));
+        trace.columns[POSEIDON_8_COL_ADDR_LEFT_HI].push(F::from_usize(left_second_addr));
+        trace.columns[POSEIDON_8_COL_FLAG_PERMUTE].push(F::from_bool(permute));
         for (i, value) in input.iter().enumerate() {
-            trace.columns[POSEIDON_COL_INPUT_START + i].push(*value);
+            trace.columns[POSEIDON_8_COL_INPUT_START + i].push(*value);
+        }
+        // Output columns. The AIR constrains out_lo[..HALF_DIGEST_LEN] always, the
+        // rest of `out_lo` unless out2 is set, and `out_hi` only on full permutation
+        // rows; columns left unconstrained for a given mode are overwritten from
+        // memory by `fill_trace_poseidon_8`'s post-pass so the lookup still matches.
+        for (i, value) in output_cols.iter().enumerate() {
+            trace.columns[POSEIDON_8_COL_OUT_LO + i].push(*value);
+        }
+        for (i, value) in aux.iter().enumerate() {
+            trace.columns[POSEIDON_8_COL_ROUND_START + i].push(*value);
         }
         // Non-committed columns
-        trace.columns[POSEIDON_COL_NU_A].push(arg_a);
+        trace.columns[POSEIDON_8_COL_NU_A].push(arg_a);
         let domainsep = POSEIDON_DOMAINSEP_BASE
             + POSEIDON_FLAG_PERMUTE_SHIFT * (permute as usize)
-            + POSEIDON_FLAG_OUT8_SHIFT * (out8 as usize)
+            + POSEIDON_FLAG_OUT4_SHIFT * (out4 as usize)
             + POSEIDON_FLAG_LEFT_SHIFT * (flag_hardcoded as usize)
             + POSEIDON_OFFSET_LEFT_SHIFT * hardcoded_offset_left_val;
-        trace.columns[POSEIDON_COL_DOMAINSEP].push(F::from_usize(domainsep));
-
-        // the rest of the trace is filled at the end of the execution (to get parallelism + SIMD)
+        trace.columns[POSEIDON_8_COL_DOMAINSEP].push(F::from_usize(domainsep));
 
         Ok(())
     }
 }
 
-impl<const BUS: bool> Air for Poseidon16Precompile<BUS> {
+/// Constraint count for the Poseidon8 AIR. Must equal the number of `assert_*`
+/// gates issued in `eval()`, plus 2 bus gates when `bus` is set (the non-bus
+/// `declare_values` calls are not counted); used for pre-allocation.
+const fn poseidon8_n_constraints(bus: bool) -> usize {
+    // 1 boolean constraint on `multiplicity` (the row-active flag).
+    // 4 boolean flags (out2, out4, hardcoded_left, permute).
+    // 3 mutex constraints: permute excludes out2; out4 excludes out2; some output mode set.
+    // 2 effective_index constraints (linking addr_left_lo/hi to flag_hardcoded).
+    // Initial + terminal full rounds: 8 MDS equality gates per round (deg 7).
+    // Partial rounds: 1 post_sbox gate per round (deg 7).
+    // Output: 2 gates per WIDTH/2 lane (out_lo + out_hi).
+    // + 2 bus gates (multiplicity + fingerprint) if enabled.
+    let full_gates = 2 * POSEIDON1_HALF_FULL_ROUNDS * WIDTH;
+    let partial_gates = POSEIDON1_PARTIAL_ROUNDS;
+    let bus_gates = if bus { 2 } else { 0 };
+    1 + 4 + 3 + 2 + full_gates + partial_gates + 2 * (WIDTH / 2) + bus_gates
+}
+
+impl<const BUS: bool> Air for Poseidon8Precompile<BUS> {
     type ExtraData = ExtraDataForBuses<EF>;
     fn n_columns(&self) -> usize {
-        num_cols_poseidon_16()
+        num_cols_poseidon_8()
     }
     fn degree_air(&self) -> usize {
-        // The output constraints gate the degree-9 permutation expression by a single linear
-        // factor (`1 - flag_out4` for out_lo[4..8], `1 - flag_out8 - flag_out4` for out_hi),
-        // keeping them at degree 10.
-        10
-    }
-    fn low_degree_air(&self) -> Option<(usize, usize)> {
-        // Each partial round contributes one `assert_eq_low` per round (1 S-box / round), of degree 3 (= the "low" degree part)
-        Some((3, PARTIAL_ROUNDS))
+        // S-box is x⁷, so the round gates are degree 7; the output gates (a linear
+        // expression times a single linear flag factor) are at most degree 2.
+        // NOTE(review): the bound returned below (8) is one above that — confirm intent.
+        8
     }
     fn n_shift_columns(&self) -> usize {
         0
     }
     fn n_constraints(&self) -> usize {
-        2 * BUS as usize + 94
+        poseidon8_n_constraints(BUS)
     }
     fn eval<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData) {
-        let cols: Poseidon1Cols16<AB::IF> = {
+        let c = get_partial_constants();
+
+        // Phase 1 — snapshot every `flat[…]` column read into owned locals so we
+        // can then use `builder` mutably without fighting the borrow checker.
+        let multiplicity;
+        let nu_b;
+        let nu_c;
+        let flag_out2;
+        let flag_out4;
+        let flag_left;
+        let flag_permute;
+        let offset_left;
+        let addr_left_lo;
+        let addr_left_hi;
+        let inputs: [AB::IF; WIDTH];
+        let out_lo: [AB::IF; WIDTH / 2];
+        let out_hi: [AB::IF; WIDTH / 2];
+        // Per full round: `post[0..W]`. Per partial round: `post_sbox`.
+        let mut full_posts: Vec<[AB::IF; WIDTH]> = Vec::with_capacity(2 * POSEIDON1_HALF_FULL_ROUNDS);
+        let mut partial_post_sboxes: Vec<AB::IF> = Vec::with_capacity(SPARSE_PARTIAL_ROUNDS);
+        {
             let flat = builder.flat();
-            let (prefix, shorts, suffix) = unsafe { flat.align_to::<Poseidon1Cols16<AB::IF>>() };
-            debug_assert!(prefix.is_empty(), "Alignment should match");
-            debug_assert!(suffix.is_empty(), "Alignment should match");
-            debug_assert_eq!(shorts.len(), 1);
-            unsafe { std::ptr::read(&shorts[0]) }
-        };
+            multiplicity = flat[POSEIDON_8_COL_MULTIPLICITY];
+            nu_b = flat[POSEIDON_8_COL_NU_B];
+            nu_c = flat[POSEIDON_8_COL_NU_C];
+            flag_out2 = flat[POSEIDON_8_COL_FLAG_OUT2];
+            flag_out4 = flat[POSEIDON_8_COL_FLAG_OUT4];
+            flag_left = flat[POSEIDON_8_COL_FLAG_LEFT];
+            flag_permute = flat[POSEIDON_8_COL_FLAG_PERMUTE];
+            offset_left = flat[POSEIDON_8_COL_OFFSET_LEFT];
+            addr_left_lo = flat[POSEIDON_8_COL_ADDR_LEFT_LO];
+            addr_left_hi = flat[POSEIDON_8_COL_ADDR_LEFT_HI];
+            inputs = std::array::from_fn(|i| flat[POSEIDON_8_COL_INPUT_START + i]);
+            out_lo = std::array::from_fn(|i| flat[POSEIDON_8_COL_OUT_LO + i]);
+            out_hi = std::array::from_fn(|i| flat[POSEIDON_8_COL_OUT_HI + i]);
+
+            for round in 0..POSEIDON1_N_ROUNDS {
+                let off = round_data_offset(round);
+                if is_full_round(round) {
+                    let post: [AB::IF; WIDTH] = std::array::from_fn(|i| flat[off + i]);
+                    full_posts.push(post);
+                } else {
+                    partial_post_sboxes.push(flat[off]);
+                }
+            }
+        }
 
+        // Reconstruct the virtual columns domainsep and nu_a from committed columns.
         let domainsep_reconstructed = AB::IF::from_usize(POSEIDON_DOMAINSEP_BASE)
-            + cols.flag_permute * AB::F::from_usize(POSEIDON_FLAG_PERMUTE_SHIFT)
-            + cols.flag_out8 * AB::F::from_usize(POSEIDON_FLAG_OUT8_SHIFT)
-            + cols.flag_left * AB::F::from_usize(POSEIDON_FLAG_LEFT_SHIFT)
-            + cols.flag_left * cols.offset_left * AB::F::from_usize(POSEIDON_OFFSET_LEFT_SHIFT);
+            + flag_permute * AB::F::from_usize(POSEIDON_FLAG_PERMUTE_SHIFT)
+            + flag_out4 * AB::F::from_usize(POSEIDON_FLAG_OUT4_SHIFT)
+            + flag_left * AB::F::from_usize(POSEIDON_FLAG_LEFT_SHIFT)
+            + flag_left * offset_left * AB::F::from_usize(POSEIDON_OFFSET_LEFT_SHIFT);
 
         // addr_left_lo = nu_a * (1 - flag_left) + offset_left * flag_left
-        let one_minus_flag_left = AB::IF::ONE - cols.flag_left;
-        let nu_a = cols.addr_left_hi - one_minus_flag_left * AB::F::from_usize(HALF_DIGEST_LEN);
-
-        // Bus: data = [nu_a, nu_b, nu_c], domainsep
+        //   ⇒ when flag_left = 0: addr_left_lo = nu_a
+        //                         addr_left_hi = nu_a + HALF_DIGEST_LEN
+        //   ⇒ when flag_left = 1: addr_left_lo = offset_left
+        //                         addr_left_hi = nu_a
+        // We define nu_a (virtual) via addr_left_hi:
+        //   nu_a = addr_left_hi - (1 - flag_left) * HALF_DIGEST_LEN
+        let one_minus_flag_left = AB::IF::ONE - flag_left;
+        let nu_a = addr_left_hi - one_minus_flag_left * AB::F::from_usize(HALF_DIGEST_LEN);
+
+        // Phase 2 — bus / declare.
         if BUS {
             eval_bus_virtual::<AB, EF>(
                 builder,
                 extra_data,
-                cols.multiplicity,
+                multiplicity,
                 domainsep_reconstructed,
-                &[nu_a, cols.nu_b, cols.nu_c],
+                &[nu_a, nu_b, nu_c],
             );
         } else {
-            builder.declare_values(std::slice::from_ref(&cols.multiplicity));
-            builder.declare_values(&[nu_a, cols.nu_b, cols.nu_c, domainsep_reconstructed]);
+            builder.declare_values(std::slice::from_ref(&multiplicity));
+            builder.declare_values(&[nu_a, nu_b, nu_c, domainsep_reconstructed]);
         }
 
-        builder.assert_bool(cols.multiplicity);
-        builder.assert_bool(cols.flag_out4);
-        builder.assert_bool(cols.flag_out8);
-        builder.assert_bool(cols.flag_left);
-        builder.assert_bool(cols.flag_permute);
-        builder.assert_zero(cols.flag_permute * cols.flag_out4);
-        builder.assert_zero(cols.flag_out8 * cols.flag_out4);
-        builder.assert_zero(
-            (AB::IF::ONE - cols.flag_permute) * (AB::IF::ONE - cols.flag_out8) * (AB::IF::ONE - cols.flag_out4),
-        );
-
-        builder.assert_zero(cols.flag_left * (cols.offset_left - cols.addr_left_lo));
-        builder.assert_zero(one_minus_flag_left * (nu_a - cols.addr_left_lo));
-
-        eval_poseidon1_16(builder, &cols)
-    }
-}
-
-#[repr(C)]
-#[derive(Debug)]
-pub(super) struct Poseidon1Cols16<T> {
-    pub multiplicity: T, // 0 = padding, 1 = active
-    pub nu_b: T,
-    pub nu_c: T,
-    pub flag_out4: T, // output is 4 elements (compression only)
-    pub flag_out8: T, // output is 8 elements; neither out4 nor out8 set => 16 elements (permutation only)
-    pub flag_left: T,
-    pub offset_left: T,
-    pub addr_left_lo: T,
-    pub addr_left_hi: T,
-    pub flag_permute: T,
-
-    pub inputs: [T; WIDTH],
-    pub beginning_full_rounds: [[T; WIDTH]; HALF_INITIAL_FULL_ROUNDS],
-    pub partial_rounds: [T; PARTIAL_ROUNDS],
-    pub ending_full_rounds: [[T; WIDTH]; HALF_FINAL_FULL_ROUNDS - 1],
-    pub out_lo: [T; WIDTH / 2],
-    pub out_hi: [T; WIDTH / 2],
-}
-
-fn eval_poseidon1_16<AB: AirBuilder>(builder: &mut AB, local: &Poseidon1Cols16<AB::IF>) {
-    let mut state: [_; WIDTH] = local.inputs;
-
-    let initial_constants = poseidon1_initial_constants();
-    for round in 0..HALF_INITIAL_FULL_ROUNDS {
-        eval_2_full_rounds_16(
-            &mut state,
-            &local.beginning_full_rounds[round],
-            &initial_constants[2 * round],
-            &initial_constants[2 * round + 1],
-            builder,
-        );
-    }
-
-    // --- Sparse partial rounds ---
-    // Transition: add first-round constants, multiply by m_i
-    builder.low_degree_block(&mut state, |b, state| {
-        let state: &mut [AB::IF; WIDTH] = state.try_into().unwrap();
-
-        let frc = poseidon1_sparse_first_round_constants();
-        for (s, &c) in state.iter_mut().zip(frc.iter()) {
-            add_kb(s, c);
-        }
-        dense_mat_vec_air_16(poseidon1_sparse_m_i(), state);
-
-        let first_rows = poseidon1_sparse_first_row();
-        let v_vecs = poseidon1_sparse_v();
-        let scalar_rc = poseidon1_sparse_scalar_round_constants();
-        for round in 0..PARTIAL_ROUNDS {
-            // S-box on state[0]
-            state[0] = state[0].cube();
-            b.assert_eq_low(state[0], local.partial_rounds[round]);
-            state[0] = local.partial_rounds[round];
-            // Scalar round constant (not on last round)
-            if round < PARTIAL_ROUNDS - 1 {
-                add_kb(&mut state[0], scalar_rc[round]);
+        builder.assert_bool(multiplicity);
+        builder.assert_bool(flag_out2);
+        builder.assert_bool(flag_out4);
+        builder.assert_bool(flag_left);
+        builder.assert_bool(flag_permute);
+        // permute is mutually exclusive with the half-width compression output.
+        builder.assert_zero(flag_permute * flag_out2);
+        // out2 / out4 are mutually exclusive.
+        builder.assert_zero(flag_out4 * flag_out2);
+        // A non-permutation row must specify a compression output width.
+        builder.assert_zero((AB::IF::ONE - flag_permute) * (AB::IF::ONE - flag_out4) * (AB::IF::ONE - flag_out2));
+
+        // Constrain addr_left_lo to match its semantics.
+        builder.assert_zero(flag_left * (offset_left - addr_left_lo));
+        builder.assert_zero(one_minus_flag_left * (nu_a - addr_left_lo));
+
+        // Phase 3 — Poseidon1-8 permutation constraints with Davies-Meyer feed-forward.
+        let mut state: [AB::IF; WIDTH] = inputs;
+
+        // ---- Initial full rounds ----
+        for round in 0..POSEIDON1_HALF_FULL_ROUNDS {
+            let sbox_out: [AB::IF; WIDTH] = std::array::from_fn(|i| {
+                let x = state[i] + AB::F::from_u64(GOLDILOCKS_POSEIDON1_RC_8[round][i].as_canonical_u64());
+                // x⁷ = x · (x²)² · x² — 4 Mul nodes in the symbolic DAG.
+                let x2 = x * x;
+                let x4 = x2 * x2;
+                x4 * x2 * x
+            });
+            let post = full_posts[round];
+            for i in 0..WIDTH {
+                let mut acc = sbox_out[0] * AB::F::from_u64(MDS8_ROW[(WIDTH - i) % WIDTH] as u64);
+                for (j, sj) in sbox_out.iter().enumerate().skip(1) {
+                    let coeff = AB::F::from_u64(MDS8_ROW[(j + WIDTH - i) % WIDTH] as u64);
+                    acc += *sj * coeff;
+                }
+                builder.assert_zero(post[i] - acc);
             }
-            // Sparse matrix: new_s0 = dot(first_row, state), state[i] += old_s0 * v[i-1]
-            sparse_mat_air_16(state, &first_rows[round], &v_vecs[round]);
+            state = post;
         }
-    });
-
-    let final_constants = poseidon1_final_constants();
-    for round in 0..HALF_FINAL_FULL_ROUNDS - 1 {
-        eval_2_full_rounds_16(
-            &mut state,
-            &local.ending_full_rounds[round],
-            &final_constants[2 * round],
-            &final_constants[2 * round + 1],
-            builder,
-        );
-    }
 
-    eval_last_2_full_rounds_16(
-        &local.inputs,
-        &mut state,
-        &local.out_lo,
-        &local.out_hi,
-        &final_constants[2 * (HALF_FINAL_FULL_ROUNDS - 1)],
-        &final_constants[2 * (HALF_FINAL_FULL_ROUNDS - 1) + 1],
-        local.flag_out8,
-        local.flag_out4,
-        local.flag_permute,
-        builder,
-    );
-}
-
-pub const fn num_cols_poseidon_16() -> usize {
-    size_of::<Poseidon1Cols16<u8>>()
-}
-
-pub const fn num_cols_total_poseidon_16() -> usize {
-    // +2 for non-committed columns: POSEIDON_COL_INDEX_INPUT_LEFT, POSEIDON_COL_DOMAINSEP
-    num_cols_poseidon_16() + 2
-}
-
-#[inline]
-fn eval_2_full_rounds_16<AB: AirBuilder>(
-    state: &mut [AB::IF; WIDTH],
-    post_full_round: &[AB::IF; WIDTH],
-    round_constants_1: &[F; WIDTH],
-    round_constants_2: &[F; WIDTH],
-    builder: &mut AB,
-) {
-    for (s, r) in state.iter_mut().zip(round_constants_1.iter()) {
-        add_kb(s, *r);
-        *s = s.cube();
-    }
-    mds_air_16(state);
-    for (s, r) in state.iter_mut().zip(round_constants_2.iter()) {
-        add_kb(s, *r);
-        *s = s.cube();
-    }
-    mds_air_16(state);
-    for (state_i, post_i) in state.iter_mut().zip(post_full_round) {
-        builder.assert_eq(*state_i, *post_i);
-        *state_i = *post_i;
-    }
-}
+        // ---- Partial phase: first_round_constants, m_i, sparse-matmul loop ----
+        for (i, s) in state.iter_mut().enumerate() {
+            *s += AB::F::from_u64(c.first_round_constants[i].as_canonical_u64());
+        }
+        {
+            let mut after: [AB::IF; WIDTH] = std::array::from_fn(|i| {
+                let mut acc = state[0] * AB::F::from_u64(c.m_i[i][0].as_canonical_u64());
+                for (j, sj) in state.iter().enumerate().skip(1) {
+                    acc += *sj * AB::F::from_u64(c.m_i[i][j].as_canonical_u64());
+                }
+                acc
+            });
+            std::mem::swap(&mut state, &mut after);
+        }
 
-#[inline]
-#[allow(clippy::too_many_arguments)]
-fn eval_last_2_full_rounds_16<AB: AirBuilder>(
-    initial_state: &[AB::IF; WIDTH],
-    state: &mut [AB::IF; WIDTH],
-    out_lo: &[AB::IF; WIDTH / 2],
-    out_hi: &[AB::IF; WIDTH / 2],
-    round_constants_1: &[F; WIDTH],
-    round_constants_2: &[F; WIDTH],
-    flag_out8: AB::IF,
-    flag_out4: AB::IF,
-    flag_permute: AB::IF,
-    builder: &mut AB,
-) {
-    for (s, r) in state.iter_mut().zip(round_constants_1.iter()) {
-        add_kb(s, *r);
-        *s = s.cube();
-    }
-    mds_air_16(state);
-    for (s, r) in state.iter_mut().zip(round_constants_2.iter()) {
-        add_kb(s, *r);
-        *s = s.cube();
-    }
-    mds_air_16(state);
-    let feedforward = AB::IF::ONE - flag_permute;
-    let gate_lo_8 = AB::IF::ONE - flag_out4;
-    let gate_hi = AB::IF::ONE - flag_out8 - flag_out4;
-    for i in 0..(WIDTH / 2) {
-        let value = state[i] + feedforward * initial_state[i];
-        if i < HALF_DIGEST_LEN {
-            builder.assert_zero(value - out_lo[i]);
-        } else {
-            builder.assert_zero(gate_lo_8 * (value - out_lo[i]));
+        for (r, post_sbox) in partial_post_sboxes.iter().enumerate().take(SPARSE_PARTIAL_ROUNDS) {
+            let x = state[0];
+            let post_sbox = *post_sbox;
+
+            // post_sbox = x⁷ (deg 7).
+            let x2 = x * x;
+            let x4 = x2 * x2;
+            builder.assert_zero(post_sbox - x4 * x2 * x);
+
+            // state[0] becomes post_sbox (+ scalar RC, except last round).
+            state[0] = if r < SPARSE_PARTIAL_ROUNDS - 1 {
+                post_sbox + AB::F::from_u64(c.round_constants[r].as_canonical_u64())
+            } else {
+                post_sbox
+            };
+
+            // cheap_matmul.
+            let old_s0 = state[0];
+            let mut new_s0 = state[0] * AB::F::from_u64(c.sparse_first_row[r][0].as_canonical_u64());
+            for (j, sj) in state.iter().enumerate().skip(1) {
+                new_s0 += *sj * AB::F::from_u64(c.sparse_first_row[r][j].as_canonical_u64());
+            }
+            state[0] = new_s0;
+            for (i, s) in state.iter_mut().enumerate().skip(1) {
+                *s += old_s0 * AB::F::from_u64(c.v[r][i - 1].as_canonical_u64());
+            }
         }
-        builder.assert_zero(gate_hi * (state[i + WIDTH / 2] - out_hi[i])); // always permutation on the right-half
-    }
-}
 
-#[inline]
-fn dense_mat_vec_air_16<A: PrimeCharacteristicRing + 'static>(mat: &[[F; 16]; 16], state: &mut [A; WIDTH]) {
-    let input = *state;
-    for i in 0..WIDTH {
-        let mut acc = A::ZERO;
-        for j in 0..WIDTH {
-            acc += mul_kb(input[j], mat[i][j]);
+        // ---- Terminal full rounds ----
+        for round in 0..POSEIDON1_HALF_FULL_ROUNDS {
+            let abs = POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS + round;
+            let sbox_out: [AB::IF; WIDTH] = std::array::from_fn(|i| {
+                let x = state[i] + AB::F::from_u64(GOLDILOCKS_POSEIDON1_RC_8[abs][i].as_canonical_u64());
+                let x2 = x * x;
+                let x4 = x2 * x2;
+                x4 * x2 * x
+            });
+            let post = full_posts[POSEIDON1_HALF_FULL_ROUNDS + round];
+            for i in 0..WIDTH {
+                let mut acc = sbox_out[0] * AB::F::from_u64(MDS8_ROW[(WIDTH - i) % WIDTH] as u64);
+                for (j, sj) in sbox_out.iter().enumerate().skip(1) {
+                    let coeff = AB::F::from_u64(MDS8_ROW[(j + WIDTH - i) % WIDTH] as u64);
+                    acc += *sj * coeff;
+                }
+                builder.assert_zero(post[i] - acc);
+            }
+            state = post;
         }
-        state[i] = acc;
-    }
-}
 
-#[inline]
-fn sparse_mat_air_16<A: PrimeCharacteristicRing + 'static>(
-    state: &mut [A; WIDTH],
-    first_row: &[F; WIDTH],
-    v: &[F; WIDTH],
-) {
-    let old_s0 = state[0];
-    let mut new_s0 = A::ZERO;
-    for j in 0..WIDTH {
-        new_s0 += mul_kb(state[j], first_row[j]);
-    }
-    state[0] = new_s0;
-    for i in 1..WIDTH {
-        state[i] += mul_kb(old_s0, v[i - 1]);
+        // Output gates (WIDTH/2 lanes, 2 gates each):
+        //   value = state[i] + feedforward * inputs[i]   (feedforward = 1 - permute)
+        //  - out_lo[i] = value, always for i < HALF_DIGEST_LEN; for the rest only
+        //    when the output is not the half-width compression (gate `1 - out2`).
+        //  - out_hi[i] = state[i + WIDTH/2], only on the full permutation
+        //    (gate `1 - out4 - out2`).
+        // For compression rows feedforward = 1 (Davies-Meyer); for permutation
+        // rows feedforward = 0 (raw permutation output).
+        let feedforward = AB::IF::ONE - flag_permute;
+        let gate_lo_full = AB::IF::ONE - flag_out2;
+        let gate_hi = AB::IF::ONE - flag_out4 - flag_out2;
+        for i in 0..WIDTH / 2 {
+            let value = state[i] + feedforward * inputs[i];
+            if i < HALF_DIGEST_LEN {
+                builder.assert_zero(value - out_lo[i]);
+            } else {
+                builder.assert_zero(gate_lo_full * (value - out_lo[i]));
+            }
+            builder.assert_zero(gate_hi * (state[i + WIDTH / 2] - out_hi[i]));
+        }
     }
 }
diff --git a/crates/lean_vm/src/tables/poseidon/sparse.rs b/crates/lean_vm/src/tables/poseidon/sparse.rs
new file mode 100644
index 000000000..b1f897c50
--- /dev/null
+++ b/crates/lean_vm/src/tables/poseidon/sparse.rs
@@ -0,0 +1,430 @@
+//! Sparse matrix decomposition for Poseidon1-8 partial rounds.
+//!
+//! Port of `plonky3/poseidon1/src/utils.rs` specialised to the Goldilocks width-8
+//! configuration. Produces the transition matrix `m_i`, the per-round sparse
+//! matrices (`sparse_first_row[r]`, `v[r]`), and the compressed round constants
+//! (`first_round_constants` + per-round scalar `round_constants`), so that all
+//! 22 partial rounds can be constrained with 2 committed columns each instead
+//! of the naive 9.
+//!
+//! References:
+//! - Grassi et al., "Poseidon: A New Hash Function for Zero-Knowledge Proof
+//!   Systems", USENIX Security 2021, Appendix B.
+//! - `plonky3/poseidon1/src/{utils.rs, internal.rs}`.
+
+use std::sync::OnceLock;
+
+use backend::{
+    Field, GOLDILOCKS_POSEIDON1_RC_8, MDS8_ROW, POSEIDON1_HALF_FULL_ROUNDS, POSEIDON1_PARTIAL_ROUNDS,
+    PrimeCharacteristicRing,
+};
+
+use crate::F;
+
+pub(super) const WIDTH: usize = 8;
+pub(super) const PARTIAL_ROUNDS: usize = POSEIDON1_PARTIAL_ROUNDS;
+
+/// Precomputed constants for the sparse partial-round layer.
+#[derive(Debug, Clone)]
+pub(super) struct PartialConstants {
+    /// Absorbs the original partial-round 0 vector plus backward-substituted
+    /// remainders from rounds 1..RP. Added once before the m_i multiply.
+    pub first_round_constants: [F; WIDTH],
+    /// Dense transition matrix applied once after adding
+    /// `first_round_constants` and before the sparse-round loop.
+    pub m_i: [[F; WIDTH]; WIDTH],
+    /// Per-round pre-assembled first row of the sparse matrix:
+    /// `[mds[0][0], w_hat[0], …, w_hat[WIDTH-2]]`.
+    pub sparse_first_row: [[F; WIDTH]; PARTIAL_ROUNDS],
+    /// Per-round first-column coefficients (WIDTH-1 entries; we use `[F; WIDTH-1]`).
+    pub v: [[F; WIDTH - 1]; PARTIAL_ROUNDS],
+    /// Scalar round constants for partial rounds 0..RP-1 (the last round uses
+    /// no additive constant — it was absorbed by the backward substitution).
+    pub round_constants: [F; PARTIAL_ROUNDS - 1],
+}
+
+static PARTIAL_CONSTANTS: OnceLock<PartialConstants> = OnceLock::new();
+
+pub(super) fn get_partial_constants() -> &'static PartialConstants {
+    PARTIAL_CONSTANTS.get_or_init(compute_partial_constants)
+}
+
+/// Build the dense WxW circulant MDS matrix from `MDS8_ROW`.
+///
+/// `M[i][j] = MDS8_ROW[(j - i) mod W]`, stored as `F`.
+pub(super) fn mds_dense() -> [[F; WIDTH]; WIDTH] {
+    let mut m = [[F::ZERO; WIDTH]; WIDTH];
+    for i in 0..WIDTH {
+        for j in 0..WIDTH {
+            m[i][j] = F::from_u64(MDS8_ROW[(j + WIDTH - i) % WIDTH] as u64);
+        }
+    }
+    m
+}
+
+fn matrix_transpose(m: &[[F; WIDTH]; WIDTH]) -> [[F; WIDTH]; WIDTH] {
+    let mut r = [[F::ZERO; WIDTH]; WIDTH];
+    for i in 0..WIDTH {
+        for j in 0..WIDTH {
+            r[i][j] = m[j][i];
+        }
+    }
+    r
+}
+
+fn matrix_mul(a: &[[F; WIDTH]; WIDTH], b: &[[F; WIDTH]; WIDTH]) -> [[F; WIDTH]; WIDTH] {
+    let mut c = [[F::ZERO; WIDTH]; WIDTH];
+    for i in 0..WIDTH {
+        for j in 0..WIDTH {
+            let mut acc = F::ZERO;
+            for k in 0..WIDTH {
+                acc += a[i][k] * b[k][j];
+            }
+            c[i][j] = acc;
+        }
+    }
+    c
+}
+
+fn matrix_vec_mul(m: &[[F; WIDTH]; WIDTH], v: &[F; WIDTH]) -> [F; WIDTH] {
+    let mut r = [F::ZERO; WIDTH];
+    for i in 0..WIDTH {
+        let mut acc = F::ZERO;
+        for j in 0..WIDTH {
+            acc += m[i][j] * v[j];
+        }
+        r[i] = acc;
+    }
+    r
+}
+
+fn matrix_inverse(m: &[[F; WIDTH]; WIDTH]) -> [[F; WIDTH]; WIDTH] {
+    let mut aug = *m;
+    let mut inv = [[F::ZERO; WIDTH]; WIDTH];
+    for (i, row) in inv.iter_mut().enumerate().take(WIDTH) {
+        row[i] = F::ONE;
+    }
+    for col in 0..WIDTH {
+        let pivot = (col..WIDTH)
+            .find(|&r| aug[r][col] != F::ZERO)
+            .expect("mds matrix is singular");
+        if pivot != col {
+            aug.swap(col, pivot);
+            inv.swap(col, pivot);
+        }
+        let pivot_inv = aug[col][col].inverse();
+        for j in 0..WIDTH {
+            aug[col][j] *= pivot_inv;
+            inv[col][j] *= pivot_inv;
+        }
+        for i in 0..WIDTH {
+            if i == col {
+                continue;
+            }
+            let factor = aug[i][col];
+            if factor == F::ZERO {
+                continue;
+            }
+            let aug_row = aug[col];
+            let inv_row = inv[col];
+            for j in 0..WIDTH {
+                aug[i][j] -= factor * aug_row[j];
+                inv[i][j] -= factor * inv_row[j];
+            }
+        }
+    }
+    inv
+}
+
+/// Inverse of the bottom-right (W-1)x(W-1) submatrix `m[1..][1..]`.
+fn submatrix_inverse(m: &[[F; WIDTH]; WIDTH]) -> [[F; WIDTH - 1]; WIDTH - 1] {
+    const N: usize = WIDTH - 1;
+    let mut sub = [[F::ZERO; N]; N];
+    for i in 0..N {
+        for j in 0..N {
+            sub[i][j] = m[i + 1][j + 1];
+        }
+    }
+    let mut inv = [[F::ZERO; N]; N];
+    for (i, row) in inv.iter_mut().enumerate().take(N) {
+        row[i] = F::ONE;
+    }
+    for col in 0..N {
+        let pivot = (col..N)
+            .find(|&r| sub[r][col] != F::ZERO)
+            .expect("mds submatrix is singular");
+        if pivot != col {
+            sub.swap(col, pivot);
+            inv.swap(col, pivot);
+        }
+        let pivot_inv = sub[col][col].inverse();
+        for j in 0..N {
+            sub[col][j] *= pivot_inv;
+            inv[col][j] *= pivot_inv;
+        }
+        for i in 0..N {
+            if i == col {
+                continue;
+            }
+            let factor = sub[i][col];
+            if factor == F::ZERO {
+                continue;
+            }
+            let sub_row = sub[col];
+            let inv_row = inv[col];
+            for j in 0..N {
+                sub[i][j] -= factor * sub_row[j];
+                inv[i][j] -= factor * inv_row[j];
+            }
+        }
+    }
+    inv
+}
+
+type EquivalentMatrices = ([[F; WIDTH]; WIDTH], Vec<[F; WIDTH]>, Vec<[F; WIDTH]>);
+
+/// Factor the dense MDS matrix `mds` into `rounds_p` sparse per-round factors.
+///
+/// Returns `(m_i, v_collection, w_hat_collection)` all in forward application
+/// order; `v_collection[r]` and `w_hat_collection[r]` have `WIDTH-1` meaningful
+/// entries (the last slot is zero padding for fixed-size arrays).
+fn compute_equivalent_matrices(mds: &[[F; WIDTH]; WIDTH], rounds_p: usize) -> EquivalentMatrices {
+    let mut v_collection: Vec<[F; WIDTH]> = Vec::with_capacity(rounds_p);
+    let mut w_hat_collection: Vec<[F; WIDTH]> = Vec::with_capacity(rounds_p);
+
+    let mds_t = matrix_transpose(mds);
+    let mut m_mul = mds_t;
+    let mut m_i = [[F::ZERO; WIDTH]; WIDTH];
+
+    for _ in 0..rounds_p {
+        // v = first row of m_mul without its [0,0] entry, zero-padded to WIDTH.
+        // m_mul is built from M^T, so this row is a column in the non-transposed view.
+        let v_arr: [F; WIDTH] = std::array::from_fn(|j| if j < WIDTH - 1 { m_mul[0][j + 1] } else { F::ZERO });
+
+        // w = first column of m_mul without its [0,0] entry.
+        let mut w = [F::ZERO; WIDTH - 1];
+        for i in 0..WIDTH - 1 {
+            w[i] = m_mul[i + 1][0];
+        }
+        // w_hat = M_hat^{-1} * w, with M_hat = m_mul[1..][1..]; zero-padded to WIDTH.
+        let m_hat_inv = submatrix_inverse(&m_mul);
+        let w_hat_arr: [F; WIDTH] = std::array::from_fn(|i| {
+            if i < WIDTH - 1 {
+                let mut acc = F::ZERO;
+                for k in 0..WIDTH - 1 {
+                    acc += m_hat_inv[i][k] * w[k];
+                }
+                acc
+            } else {
+                F::ZERO
+            }
+        });
+
+        v_collection.push(v_arr);
+        w_hat_collection.push(w_hat_arr);
+
+        // m_i = m_mul with row 0 and column 0 replaced by those of the identity:
+        // [0][0] = 1 and every other entry of the first row/column is zeroed.
+        m_i = m_mul;
+        m_i[0][0] = F::ONE;
+        for row in m_i.iter_mut().take(WIDTH).skip(1) {
+            row[0] = F::ZERO;
+        }
+        for entry in m_i[0].iter_mut().take(WIDTH).skip(1) {
+            *entry = F::ZERO;
+        }
+
+        // Accumulate for the next iteration: m_mul = M^T * m_i.
+        m_mul = matrix_mul(&mds_t, &m_i);
+    }
+
+    // Transpose m_i back (HorizenLabs works in the transposed domain).
+    let m_i_returned = matrix_transpose(&m_i);
+
+    v_collection.reverse();
+    w_hat_collection.reverse();
+
+    (m_i_returned, v_collection, w_hat_collection)
+}
+
+/// Fold the partial-round constants backwards through `mds_inv`: returns the full vector added
+/// before the first partial round plus one lane-0 scalar for each of the remaining rounds.
+fn equivalent_round_constants(partial_rc: &[[F; WIDTH]], mds_inv: &[[F; WIDTH]; WIDTH]) -> ([F; WIDTH], Vec<F>) {
+    // Stated precondition instead of an opaque `rounds_p - 1` underflow on an empty table.
+    let (&last_rc, earlier_rc) = partial_rc.split_last().expect("partial_rc must contain at least one round");
+    // Each scalar is written straight to its final index, so no throw-away slot 0 and no re-copy.
+    let mut opt_partial_rc = vec![F::ZERO; earlier_rc.len()];
+    let mut tmp = last_rc;
+    for (i, rc) in earlier_rc.iter().enumerate().rev() {
+        let inv_cip = matrix_vec_mul(mds_inv, &tmp);
+        // Lane 0 stays behind as round i+1's scalar; lanes 1.. merge into round i's vector.
+        opt_partial_rc[i] = inv_cip[0];
+        tmp = *rc;
+        for j in 1..WIDTH {
+            tmp[j] += inv_cip[j];
+        }
+    }
+    (tmp, opt_partial_rc)
+}
+
+/// Assemble the constants of the sparse partial-round phase.
+///
+/// Inputs are `mds_dense()` and the partial-round rows of `GOLDILOCKS_POSEIDON1_RC_8`;
+/// the derivation itself lives in `equivalent_round_constants` and `compute_equivalent_matrices`.
+fn compute_partial_constants() -> PartialConstants {
+    let mds = mds_dense();
+    let mds_inv = matrix_inverse(&mds);
+
+    // Partial-round RCs follow the first POSEIDON1_HALF_FULL_ROUNDS rows of the monolithic RC table.
+    let first_partial = POSEIDON1_HALF_FULL_ROUNDS;
+    let partial_rc: Vec<[F; WIDTH]> = (first_partial..first_partial + PARTIAL_ROUNDS)
+        .map(|abs| GOLDILOCKS_POSEIDON1_RC_8[abs])
+        .collect();
+
+    let (first_round_constants, scalar_offsets) = equivalent_round_constants(&partial_rc, &mds_inv);
+    let (m_i, v_padded, w_hat_padded) = compute_equivalent_matrices(&mds, PARTIAL_ROUNDS);
+
+    // sparse_first_row[r] = [mds[0][0], w_hat[r][0], …, w_hat[r][W-2]].
+    let sparse_first_row: [[F; WIDTH]; PARTIAL_ROUNDS] = std::array::from_fn(|r| {
+        std::array::from_fn(|i| match i {
+            0 => mds[0][0],
+            _ => w_hat_padded[r][i - 1],
+        })
+    });
+
+    // v[r] keeps the first WIDTH-1 entries of the padded vector;
+    // the zero tail slot is dropped.
+    let v: [[F; WIDTH - 1]; PARTIAL_ROUNDS] = std::array::from_fn(|r| std::array::from_fn(|i| v_padded[r][i]));
+
+    // Exactly PARTIAL_ROUNDS-1 scalar offsets are copied; a shorter vector panics here.
+    let mut round_constants = [F::ZERO; PARTIAL_ROUNDS - 1];
+    round_constants.copy_from_slice(&scalar_offsets[..PARTIAL_ROUNDS - 1]);
+
+    PartialConstants {
+        first_round_constants,
+        m_i,
+        sparse_first_row,
+        v,
+        round_constants,
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::needless_range_loop, clippy::assign_op_pattern)]
+mod tests {
+    use super::*;
+    use backend::{POSEIDON1_HALF_FULL_ROUNDS, PrimeField64};
+    use utils::poseidon8_compress;
+
+    /// Degree-7 S-box, evaluated as x^4 * x^2 * x.
+    fn sbox7(x: F) -> F {
+        let sq = x * x;
+        let quad = sq * sq;
+        quad * sq * x
+    }
+
+    /// Adds row `abs` of the round-constant table to `state`, lane by lane.
+    fn add_round_constants(state: &mut [F; WIDTH], abs: usize) {
+        for (lane, rc) in state.iter_mut().zip(GOLDILOCKS_POSEIDON1_RC_8[abs].iter()) {
+            *lane += *rc;
+        }
+    }
+
+    /// One textbook full round: round constants, S-box on every lane, dense MDS.
+    fn full_round(state: &mut [F; WIDTH], mds: &[[F; WIDTH]; WIDTH], abs: usize) {
+        add_round_constants(state, abs);
+        for lane in state.iter_mut() {
+            *lane = sbox7(*lane);
+        }
+        *state = matrix_vec_mul(mds, &*state);
+    }
+
+    /// Textbook (non-sparse) partial-round phase, used as a reference.
+    ///
+    /// Per round: add the full constant row, S-box lane 0, multiply by the dense MDS.
+    fn textbook_partial_phase(mut state: [F; WIDTH]) -> [F; WIDTH] {
+        let mds = mds_dense();
+        let first = POSEIDON1_HALF_FULL_ROUNDS;
+        for abs in first..first + PARTIAL_ROUNDS {
+            add_round_constants(&mut state, abs);
+            // Only lane 0 goes through the S-box in a partial round.
+            state[0] = sbox7(state[0]);
+            state = matrix_vec_mul(&mds, &state);
+        }
+        state
+    }
+
+    /// Sparse partial-round phase — what the AIR witness computes.
+    ///
+    /// The last round is the only one without a lane-0 scalar constant.
+    fn sparse_partial_phase(mut state: [F; WIDTH]) -> [F; WIDTH] {
+        let c = get_partial_constants();
+        // Entry into the sparse form: full first-round constants, then the dense m_i.
+        for (lane, rc) in state.iter_mut().zip(c.first_round_constants.iter()) {
+            *lane += *rc;
+        }
+        state = matrix_vec_mul(&c.m_i, &state);
+
+        for r in 0..PARTIAL_ROUNDS {
+            let mut s0 = sbox7(state[0]);
+            // Every round except the last adds a lane-0 scalar constant after the S-box.
+            if r + 1 < PARTIAL_ROUNDS {
+                s0 = s0 + c.round_constants[r];
+            }
+            state[0] = s0;
+            // new_state[0] = dot(sparse_first_row[r], state), taken before lanes 1.. change.
+            let mut dot = F::ZERO;
+            for j in 0..WIDTH {
+                dot = dot + c.sparse_first_row[r][j] * state[j];
+            }
+            state[0] = dot;
+            // Lanes 1.. each pick up v[r][i-1] times the pre-matrix lane 0.
+            for i in 1..WIDTH {
+                state[i] = state[i] + c.v[r][i - 1] * s0;
+            }
+        }
+        state
+    }
+
+    /// The sparse decomposition must reproduce the textbook phase bit-for-bit.
+    #[test]
+    fn sparse_matches_textbook() {
+        const STEP: u64 = 0x9E37_79B9_7F4A_7C15;
+
+        for trial in 0u64..4 {
+            // Same value a running `seed = seed.wrapping_add(STEP)` counter holds on this trial.
+            let seed = STEP.wrapping_mul(trial + 1);
+            let input: [F; WIDTH] = std::array::from_fn(|i| F::from_u64(seed.wrapping_mul(i as u64 + 1 + trial)));
+            let expected = textbook_partial_phase(input);
+            let actual = sparse_partial_phase(input);
+            for (i, (a, b)) in expected.iter().zip(actual.iter()).enumerate() {
+                assert_eq!(a.as_canonical_u64(), b.as_canonical_u64(), "trial {trial} lane {i}");
+            }
+        }
+    }
+
+    /// End-to-end: a full permutation via the sparse decomposition must match
+    /// `poseidon8_compress` (Davies-Meyer around the Goldilocks Poseidon1-8).
+    #[test]
+    fn full_permutation_matches_poseidon8_compress() {
+        let input: [F; WIDTH] = std::array::from_fn(|i| F::from_u64(i as u64 * 37 + 1));
+        let mds = mds_dense();
+        let mut state = input;
+
+        // Initial full rounds.
+        for round in 0..POSEIDON1_HALF_FULL_ROUNDS {
+            full_round(&mut state, &mds, round);
+        }
+        state = sparse_partial_phase(state);
+        // Final full rounds use the constant rows after the partial-round ones.
+        let tail = POSEIDON1_HALF_FULL_ROUNDS + PARTIAL_ROUNDS;
+        for abs in tail..tail + POSEIDON1_HALF_FULL_ROUNDS {
+            full_round(&mut state, &mds, abs);
+        }
+
+        // Davies-Meyer.
+        let output: [F; 4] = std::array::from_fn(|i| state[i] + input[i]);
+        let expected = poseidon8_compress(input);
+        assert_eq!(output, expected);
+    }
+}
diff --git a/crates/lean_vm/src/tables/poseidon/trace_gen.rs b/crates/lean_vm/src/tables/poseidon/trace_gen.rs
index 9022f6c33..38e6a168c 100644
--- a/crates/lean_vm/src/tables/poseidon/trace_gen.rs
+++ b/crates/lean_vm/src/tables/poseidon/trace_gen.rs
@@ -1,164 +1,18 @@
 use tracing::instrument;
 
-use crate::{
-    F,
-    tables::{Poseidon1Cols16, WIDTH},
-};
-use backend::*;
-
-#[instrument(name = "generate Poseidon16 AIR trace", skip_all)]
-pub fn fill_trace_poseidon_16(trace: &mut [Vec<F>]) {
-    let n = trace.iter().map(|col| col.len()).max().unwrap();
+use crate::F;
+use backend::PrimeCharacteristicRing;
+
+// `execute()` writes every column for each active row (I/O + all per-round
+// witness cols), and `padding_row()` supplies the zero-input witness for
+// trailing padding. This pass just equalises column lengths in case any got
+// out of sync during trace construction.
+#[instrument(name = "generate Poseidon8 AIR trace", skip_all)]
+pub fn fill_trace_poseidon_8(trace: &mut [Vec<F>]) {
+    let n = trace.iter().map(|col| col.len()).max().unwrap_or(0);
     for col in trace.iter_mut() {
         if col.len() != n {
             col.resize(n, F::ZERO);
         }
     }
-
-    let m = n - (n % packing_width::<F>());
-    let trace_packed: Vec<_> = trace.iter().map(|col| FPacking::<F>::pack_slice(&col[..m])).collect();
-
-    const N_COLS: usize = super::num_cols_poseidon_16();
-
-    // fill the packed rows
-    let cols: &[&[FPacking<F>]; N_COLS] = (&trace_packed[..N_COLS]).try_into().unwrap();
-    (0..m / packing_width::<F>()).into_par_iter().for_each(|i| {
-        let ptrs: [*mut FPacking<F>; N_COLS] =
-            std::array::from_fn(|c| unsafe { (cols[c].as_ptr() as *mut FPacking<F>).add(i) });
-        let perm: &mut Poseidon1Cols16<&mut FPacking<F>> =
-            unsafe { &mut *(ptrs.as_ptr() as *mut Poseidon1Cols16<&mut FPacking<F>>) };
-
-        generate_trace_rows_for_perm(perm);
-    });
-
-    // fill the remaining rows (non packed)
-    let cols: &[Vec<F>; N_COLS] = (&trace[..N_COLS]).try_into().unwrap();
-    for i in m..n {
-        let ptrs: [*mut F; N_COLS] = std::array::from_fn(|c| unsafe { (cols[c].as_ptr() as *mut F).add(i) });
-        let perm: &mut Poseidon1Cols16<&mut F> = unsafe { &mut *(ptrs.as_ptr() as *mut Poseidon1Cols16<&mut F>) };
-        generate_trace_rows_for_perm(perm);
-    }
-}
-
-pub(super) fn generate_trace_rows_for_perm<F: Algebra<KoalaBear> + Copy>(perm: &mut Poseidon1Cols16<&mut F>) {
-    let inputs: [F; WIDTH] = std::array::from_fn(|i| *perm.inputs[i]);
-    let mut state = inputs;
-
-    // No initial linear layer for Poseidon1 (unlike Poseidon2)
-
-    for (full_round, constants) in perm
-        .beginning_full_rounds
-        .iter_mut()
-        .zip(poseidon1_initial_constants().chunks_exact(2))
-    {
-        generate_2_full_round(&mut state, full_round, &constants[0], &constants[1]);
-    }
-
-    // --- Sparse partial rounds ---
-    // Transition: add first-round constants, multiply by m_i
-    let frc = poseidon1_sparse_first_round_constants();
-    for (s, &c) in state.iter_mut().zip(frc.iter()) {
-        *s += c;
-    }
-    let m_i = poseidon1_sparse_m_i();
-    let input_for_mi = state;
-    for i in 0..WIDTH {
-        let row: [F; WIDTH] = m_i[i].map(F::from);
-        state[i] = F::dot_product(&input_for_mi, &row);
-    }
-
-    let first_rows = poseidon1_sparse_first_row();
-    let v_vecs = poseidon1_sparse_v();
-    let scalar_rc = poseidon1_sparse_scalar_round_constants();
-    let n_partial = perm.partial_rounds.len();
-    for round in 0..n_partial {
-        // S-box on state[0]
-        state[0] = state[0].cube();
-        *perm.partial_rounds[round] = state[0];
-        // Scalar round constant (not on last round)
-        if round < n_partial - 1 {
-            state[0] += scalar_rc[round];
-        }
-        // Sparse matrix
-        let old_s0 = state[0];
-        let row: [F; WIDTH] = first_rows[round].map(F::from);
-        let new_s0 = F::dot_product(&state, &row);
-        state[0] = new_s0;
-        for i in 1..WIDTH {
-            state[i] += old_s0 * v_vecs[round][i - 1];
-        }
-    }
-
-    let n_ending_full_rounds = perm.ending_full_rounds.len();
-    for (full_round, constants) in perm
-        .ending_full_rounds
-        .iter_mut()
-        .zip(poseidon1_final_constants().chunks_exact(2))
-    {
-        generate_2_full_round(&mut state, full_round, &constants[0], &constants[1]);
-    }
-
-    let flag_permute = *perm.flag_permute;
-    generate_last_2_full_rounds(
-        &mut state,
-        &inputs,
-        &mut perm.out_lo,
-        &mut perm.out_hi,
-        flag_permute,
-        &poseidon1_final_constants()[2 * n_ending_full_rounds],
-        &poseidon1_final_constants()[2 * n_ending_full_rounds + 1],
-    );
-}
-
-#[inline]
-fn generate_2_full_round<F: Algebra<KoalaBear> + Copy>(
-    state: &mut [F; WIDTH],
-    post_full_round: &mut [&mut F; WIDTH],
-    round_constants_1: &[KoalaBear; WIDTH],
-    round_constants_2: &[KoalaBear; WIDTH],
-) {
-    for (state_i, const_i) in state.iter_mut().zip(round_constants_1) {
-        *state_i += *const_i;
-        *state_i = state_i.cube();
-    }
-    mds_circ_16(state);
-
-    for (state_i, const_i) in state.iter_mut().zip(round_constants_2.iter()) {
-        *state_i += *const_i;
-        *state_i = state_i.cube();
-    }
-    mds_circ_16(state);
-
-    post_full_round.iter_mut().zip(*state).for_each(|(post, x)| {
-        **post = x;
-    });
-}
-
-#[inline]
-fn generate_last_2_full_rounds<F: Algebra<KoalaBear> + Copy>(
-    state: &mut [F; WIDTH],
-    inputs: &[F; WIDTH],
-    out_lo: &mut [&mut F; WIDTH / 2],
-    out_hi: &mut [&mut F; WIDTH / 2],
-    flag_permute: F,
-    round_constants_1: &[KoalaBear; WIDTH],
-    round_constants_2: &[KoalaBear; WIDTH],
-) {
-    for (state_i, const_i) in state.iter_mut().zip(round_constants_1) {
-        *state_i += *const_i;
-        *state_i = state_i.cube();
-    }
-    mds_circ_16(state);
-
-    for (state_i, const_i) in state.iter_mut().zip(round_constants_2.iter()) {
-        *state_i += *const_i;
-        *state_i = state_i.cube();
-    }
-    mds_circ_16(state);
-
-    for i in 0..(WIDTH / 2) {
-        let compression_value = state[i] + inputs[i];
-        *out_lo[i] = (F::ONE - flag_permute) * compression_value + flag_permute * state[i];
-        *out_hi[i] = flag_permute * state[i + WIDTH / 2];
-    }
 }
diff --git a/crates/lean_vm/src/tables/table_enum.rs b/crates/lean_vm/src/tables/table_enum.rs
index 1a21d6066..75013f552 100644
--- a/crates/lean_vm/src/tables/table_enum.rs
+++ b/crates/lean_vm/src/tables/table_enum.rs
@@ -4,7 +4,7 @@ use crate::execution::memory::MemoryAccess;
 use crate::*;
 
 pub const N_TABLES: usize = 3;
-pub const ALL_TABLES: [Table; N_TABLES] = [Table::execution(), Table::extension_op(), Table::poseidon16()];
+pub const ALL_TABLES: [Table; N_TABLES] = [Table::execution(), Table::extension_op(), Table::poseidon8()];
 pub const MAX_BUS_WIDTH: usize = N_INSTRUCTION_COLUMNS + 2; // + 1 for PC, + 1 for domainsep
 pub const LOG_MAX_BUS_WIDTH: usize = log2_ceil_usize(MAX_BUS_WIDTH);
 
@@ -13,7 +13,7 @@ pub const LOG_MAX_BUS_WIDTH: usize = log2_ceil_usize(MAX_BUS_WIDTH);
 pub enum Table {
     Execution(ExecutionTable<true>),
     ExtensionOp(ExtensionOpPrecompile<true>),
-    Poseidon16(Poseidon16Precompile<true>),
+    Poseidon8(Poseidon8Precompile<true>),
 }
 
 #[macro_export]
@@ -22,7 +22,7 @@ macro_rules! delegate_to_inner {
     ($self:expr, $method:ident $(, $($arg:expr),*)?) => {
         match $self {
             Self::ExtensionOp(p) => p.$method($($($arg),*)?),
-            Self::Poseidon16(p) => p.$method($($($arg),*)?),
+            Self::Poseidon8(p) => p.$method($($($arg),*)?),
             Self::Execution(p) => p.$method($($($arg),*)?),
         }
     };
@@ -30,7 +30,7 @@ macro_rules! delegate_to_inner {
     ($self:expr => $macro_name:ident) => {
         match $self {
             Table::ExtensionOp(p) => $macro_name!(p),
-            Table::Poseidon16(p) => $macro_name!(p),
+            Table::Poseidon8(p) => $macro_name!(p),
             Table::Execution(p) => $macro_name!(p),
         }
     };
@@ -43,8 +43,8 @@ impl Table {
     pub const fn extension_op() -> Self {
         Self::ExtensionOp(ExtensionOpPrecompile)
     }
-    pub const fn poseidon16() -> Self {
-        Self::Poseidon16(Poseidon16Precompile)
+    pub const fn poseidon8() -> Self {
+        Self::Poseidon8(Poseidon8Precompile)
     }
     pub fn embed<PF: PrimeCharacteristicRing>(&self) -> PF {
         PF::from_usize(self.index())
diff --git a/crates/rec_aggregation/src/bytecode_claims.rs b/crates/rec_aggregation/src/bytecode_claims.rs
index 91c44b369..e1286daeb 100644
--- a/crates/rec_aggregation/src/bytecode_claims.rs
+++ b/crates/rec_aggregation/src/bytecode_claims.rs
@@ -1,7 +1,7 @@
 use backend::*;
 use lean_prover::fiat_shamir_domain_sep;
 use lean_vm::*;
-use utils::get_poseidon16;
+use utils::get_poseidon8;
 
 use crate::compilation::BYTECODE_CLAIM_OFFSET;
 use crate::{InnerVerified, get_aggregation_bytecode};
@@ -59,7 +59,7 @@ pub(crate) fn reduce_bytecode_claims(verified: &[InnerVerified]) -> ReducedBytec
 
     let mut reduction_capacity = fiat_shamir_domain_sep(bytecode);
     reduction_capacity[0] += F::ONE; // Domain-separate this sub-protocol's Fiat-Shamir from the main snark
-    let mut reduction_prover = ProverState::new(get_poseidon16().clone(), reduction_capacity);
+    let mut reduction_prover = ProverState::new(*get_poseidon8(), reduction_capacity);
     reduction_prover.observe_scalars(&bytecode_claims_fs_input);
     let alpha: EF = reduction_prover.sample();
     let alpha_powers: Vec<EF> = alpha.powers().take(n_claims).collect();
@@ -90,12 +90,8 @@ pub(crate) fn reduce_bytecode_claims(verified: &[InnerVerified]) -> ReducedBytec
     assert_eq!(bytecode_claim_output.len(), bytecode.bytecode_claim_size());
 
     let sumcheck_transcript = {
-        let mut vs = VerifierState::<EF, _>::new(
-            reduction_prover.into_proof(),
-            get_poseidon16().clone(),
-            reduction_capacity,
-        )
-        .unwrap();
+        let mut vs =
+            VerifierState::<EF, _>::new(reduction_prover.into_proof(), *get_poseidon8(), reduction_capacity).unwrap();
         vs.observe_scalars(&bytecode_claims_fs_input);
         let _: EF = vs.sample();
         sumcheck_verify(&mut vs, bytecode.cumulated_n_vars(), 2, claimed_sum, None).unwrap();
diff --git a/crates/rec_aggregation/src/compilation.rs b/crates/rec_aggregation/src/compilation.rs
index f3752bf0b..953dc0a9a 100644
--- a/crates/rec_aggregation/src/compilation.rs
+++ b/crates/rec_aggregation/src/compilation.rs
@@ -1,7 +1,7 @@
 use backend::*;
 use lean_compiler::{CompilationFlags, ProgramSource, compile_program_with_flags};
 use lean_prover::{
-    GRINDING_BITS, MAX_NUM_VARIABLES_TO_SEND_COEFFS, RS_DOMAIN_INITIAL_REDUCTION_FACTOR, WHIR_INITIAL_FOLDING_FACTOR,
+    MAX_NUM_VARIABLES_TO_SEND_COEFFS, RS_DOMAIN_INITIAL_REDUCTION_FACTOR, WHIR_INITIAL_FOLDING_FACTOR,
     WHIR_SUBSEQUENT_FOLDING_FACTOR, default_whir_config,
 };
 use lean_vm::*;
@@ -83,7 +83,7 @@ fn compile_main_program(program_log_size: usize, bytecode_zero_eval: F) -> Bytec
 
 #[instrument(skip_all)]
 fn compile_main_program_self_referential() -> Bytecode {
-    let mut log_size_guess = 18;
+    let mut log_size_guess = 19;
     let bytecode_zero_eval = F::ZERO;
     for _ in 0..10 {
         let bytecode = compile_main_program(log_size_guess, bytecode_zero_eval);
@@ -110,31 +110,23 @@ fn build_replacements(log_inner_bytecode: usize, bytecode_zero_eval: F) -> BTree
     let mut all_potential_num_queries = vec![];
     let mut all_potential_query_grinding = vec![];
     let mut all_potential_num_oods = vec![];
-    let mut all_potential_folding_grinding = vec![];
-    let mut too_much_grinding = false;
     for log_inv_rate in MIN_WHIR_LOG_INV_RATE..=MAX_WHIR_LOG_INV_RATE {
-        let max_n_vars = F::TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate;
+        let max_n_vars = EFFECTIVE_TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate;
         let whir_config_builder = default_whir_config(log_inv_rate);
 
         let mut queries_for_rate = vec![];
         let mut query_grinding_for_rate = vec![];
         let mut oods_for_rate = vec![];
-        let mut folding_grinding_for_rate = vec![];
         for n_vars in min_stacked..=max_n_vars {
             let cfg = WhirConfig::<EF>::new(&whir_config_builder, n_vars);
-            if cfg.max_folding_pow_bits() > GRINDING_BITS {
-                too_much_grinding = true;
-            }
 
             let mut num_queries = vec![];
             let mut query_grinding_bits = vec![];
             let mut oods = vec![cfg.commitment_ood_samples];
-            let mut folding_grinding = vec![cfg.starting_folding_pow_bits];
             for round in &cfg.round_parameters {
                 num_queries.push(round.num_queries);
                 query_grinding_bits.push(round.query_pow_bits);
                 oods.push(round.ood_samples);
-                folding_grinding.push(round.folding_pow_bits);
             }
             num_queries.push(cfg.final_queries);
             query_grinding_bits.push(cfg.final_query_pow_bits);
@@ -155,22 +147,10 @@ fn build_replacements(log_inner_bytecode: usize, bytecode_zero_eval: F) -> BTree
                 "[{}]",
                 oods.iter().map(|o| o.to_string()).collect::<Vec<_>>().join(", ")
             ));
-            folding_grinding_for_rate.push(format!(
-                "[{}]",
-                folding_grinding
-                    .iter()
-                    .map(|g| g.to_string())
-                    .collect::<Vec<_>>()
-                    .join(", ")
-            ));
         }
         all_potential_num_queries.push(format!("[{}]", queries_for_rate.join(", ")));
         all_potential_query_grinding.push(format!("[{}]", query_grinding_for_rate.join(", ")));
         all_potential_num_oods.push(format!("[{}]", oods_for_rate.join(", ")));
-        all_potential_folding_grinding.push(format!("[{}]", folding_grinding_for_rate.join(", ")));
-    }
-    if too_much_grinding {
-        tracing::info!("Warning: Too much grinding for WHIR folding"); // TODO
     }
     replacements.insert(
         "WHIR_FIRST_RS_REDUCTION_FACTOR_PLACEHOLDER".to_string(),
@@ -188,10 +168,6 @@ fn build_replacements(log_inner_bytecode: usize, bytecode_zero_eval: F) -> BTree
         "WHIR_ALL_POTENTIAL_NUM_OODS_PLACEHOLDER".to_string(),
         format!("[{}]", all_potential_num_oods.join(", ")),
     );
-    replacements.insert(
-        "WHIR_ALL_POTENTIAL_FOLDING_GRINDING_PLACEHOLDER".to_string(),
-        format!("[{}]", all_potential_folding_grinding.join(", ")),
-    );
     replacements.insert("MIN_STACKED_N_VARS_PLACEHOLDER".to_string(), min_stacked.to_string());
 
     // VM recursion parameters (different from WHIR)
@@ -228,6 +204,10 @@ fn build_replacements(log_inner_bytecode: usize, bytecode_zero_eval: F) -> BTree
         "WHIR_SUBSEQUENT_FOLDING_FACTOR_PLACEHOLDER".to_string(),
         WHIR_SUBSEQUENT_FOLDING_FACTOR.to_string(),
     );
+    replacements.insert(
+        "EFFECTIVE_TWO_ADICITY_PLACEHOLDER".to_string(),
+        EFFECTIVE_TWO_ADICITY.to_string(),
+    );
     replacements.insert(
         "MAX_LOG_N_ROWS_PER_TABLE_PLACEHOLDER".to_string(),
         format!(
@@ -476,7 +456,7 @@ fn all_air_evals_in_zk_dsl() -> String {
     let mut res = String::new();
     res += &air_eval_in_zk_dsl(ExecutionTable::<false> {});
     res += &air_eval_in_zk_dsl(ExtensionOpPrecompile::<false> {});
-    res += &air_eval_in_zk_dsl(Poseidon16Precompile::<false> {});
+    res += &air_eval_in_zk_dsl(Poseidon8Precompile::<false> {});
     res
 }
 
@@ -484,8 +464,8 @@ const AIR_INNER_VALUES_VAR: &str = "inner_evals";
 
 struct AirCodegenCtx {
     expr_cache: HashMap<u32, String>,
-    consts_cache: HashMap<Vec<u32>, String>,
-    ef_const_cache: HashMap<u32, String>,
+    consts_cache: HashMap<Vec<u64>, String>,
+    ef_const_cache: HashMap<u64, String>,
     ctr: Counter,
 }
 
@@ -499,7 +479,7 @@ impl AirCodegenCtx {
         }
     }
 
-    fn write_base_constants(&mut self, values: &[u32], res: &mut String) -> String {
+    fn write_base_constants(&mut self, values: &[u64], res: &mut String) -> String {
         if let Some(name) = self.consts_cache.get(values) {
             return name.clone();
         }
@@ -512,7 +492,7 @@ impl AirCodegenCtx {
         name
     }
 
-    fn write_embedded_constant(&mut self, c: u32, res: &mut String) -> String {
+    fn write_embedded_constant(&mut self, c: u64, res: &mut String) -> String {
         if let Some(name) = self.ef_const_cache.get(&c) {
             return name.clone();
         }
@@ -550,7 +530,7 @@ where
     res += &format!("\n    buff = Array(DIM * {})", bus_real_data.len());
     for (i, data) in bus_real_data.iter().enumerate() {
         let data_str = eval_air_constraint(*data, None, &mut ctx, &mut res);
-        res += &format!("\n    copy_5({}, buff + DIM * {})", data_str, i);
+        res += &format!("\n    copy_ef({}, buff + DIM * {})", data_str, i);
     }
     let domainsep_str = eval_air_constraint(*bus_domainsep, None, &mut ctx, &mut res);
     // bus_res = sum(buff[i] * logup_alphas_eq_poly[i]) + disc * logup_alphas_eq_poly.last()
@@ -592,7 +572,7 @@ fn eval_air_constraint(
     res: &mut String,
 ) -> String {
     let v = match expr {
-        SymbolicExpression::Constant(c) => ctx.write_embedded_constant(c.as_canonical_u32(), res),
+        SymbolicExpression::Constant(c) => ctx.write_embedded_constant(c.as_canonical_u64(), res),
         SymbolicExpression::Variable(v) => format!("{} + DIM * {}", AIR_INNER_VALUES_VAR, v.index),
         SymbolicExpression::Operation(idx) => {
             if let Some(v) = ctx.expr_cache.get(&idx) {
@@ -619,7 +599,7 @@ fn eval_air_constraint(
     if let Some(d) = dest
         && v != d
     {
-        res.push_str(&format!("\n    copy_5({}, {})", v, d));
+        res.push_str(&format!("\n    copy_ef({}, {})", v, d));
     }
     v
 }
@@ -652,7 +632,7 @@ fn try_emit_dot_product_be(idx: u32, dest: Option<&str>, ctx: &mut AirCodegenCtx
                 }
                 let (c, expr) = match (mul.lhs, mul.rhs) {
                     (SymbolicExpression::Constant(c), o) | (o, SymbolicExpression::Constant(c)) => {
-                        (c.as_canonical_u32(), o)
+                        (c.as_canonical_u64(), o)
                     }
                     _ => return None,
                 };
@@ -723,11 +703,11 @@ fn eval_air_binary_op(
     res: &mut String,
 ) -> String {
     let c0 = match lhs {
-        SymbolicExpression::Constant(c) => Some(c.as_canonical_u32()),
+        SymbolicExpression::Constant(c) => Some(c.as_canonical_u64()),
         _ => None,
     };
     let c1 = match rhs {
-        SymbolicExpression::Constant(c) => Some(c.as_canonical_u32()),
+        SymbolicExpression::Constant(c) => Some(c.as_canonical_u64()),
         _ => None,
     };
 
@@ -815,5 +795,5 @@ fn display_all_air_evals_in_zk_dsl() {
 
 #[test]
 fn display_poseidon_air_in_zk_dsl() {
-    println!("{}", air_eval_in_zk_dsl(Poseidon16Precompile::<false> {}));
+    println!("{}", air_eval_in_zk_dsl(Poseidon8Precompile::<false> {}));
 }
diff --git a/crates/rec_aggregation/src/multi_message_aggregation.rs b/crates/rec_aggregation/src/multi_message_aggregation.rs
index 5eaf70193..c6f0b7bb9 100644
--- a/crates/rec_aggregation/src/multi_message_aggregation.rs
+++ b/crates/rec_aggregation/src/multi_message_aggregation.rs
@@ -73,7 +73,7 @@ impl MultiMessageAggregateSignature {
     }
 }
 
-/// Layout: [prefix(8) | bytecode_claim_padded | initial_fiat_shamir_cap(8) | n × digest(8)].
+/// Layout: [prefix(DIGEST_LEN) | bytecode_claim_padded | initial_fiat_shamir_cap(DIGEST_LEN) | n × digest(DIGEST_LEN)].
 fn build_multi_message_input_data(digests: &[[F; DIGEST_LEN]], bytecode_claim_flat: &[F]) -> Vec<F> {
     let n = digests.len();
     let claim_padded = bytecode_claim_flat.len().next_multiple_of(DIGEST_LEN);
@@ -83,7 +83,7 @@ fn build_multi_message_input_data(digests: &[[F; DIGEST_LEN]], bytecode_claim_fl
 
     data[0] = F::from_usize(MULTI_MESSAGE_FLAG);
     data[1] = F::from_usize(n);
-    // data[2..8] stays zero (prefix-chunk pad).
+    // data[2..DIGEST_LEN] stays zero (prefix-chunk pad).
 
     data[BYTECODE_CLAIM_OFFSET..][..bytecode_claim_flat.len()].copy_from_slice(bytecode_claim_flat);
     let domsep = fiat_shamir_domain_sep(get_aggregation_bytecode());
diff --git a/crates/rec_aggregation/src/single_message_aggregation.rs b/crates/rec_aggregation/src/single_message_aggregation.rs
index 2fd918eea..7ef78ade3 100644
--- a/crates/rec_aggregation/src/single_message_aggregation.rs
+++ b/crates/rec_aggregation/src/single_message_aggregation.rs
@@ -29,8 +29,8 @@ use crate::verify_inner;
 
 /// Number of tweaks in the table: 1 encoding + V*CHAIN_LENGTH chains + 1 wots_pk + LOG_LIFETIME merkle
 pub(crate) const N_TWEAKS: usize = 1 + V * CHAIN_LENGTH + 1 + LOG_LIFETIME;
-/// All tweaks are stored as a 4-FE slot [tw[0], tw[1], 0, 0].
-pub(crate) const TWEAK_SLOT_SIZE: usize = 4;
+/// Under Goldilocks each tweak is a single 64-bit FE, stored in a 2-FE slot `[tw[0], 0]`.
+pub(crate) const TWEAK_SLOT_SIZE: usize = 2;
 pub(crate) const TWEAK_TABLE_SIZE_FE_PADDED: usize = (N_TWEAKS * TWEAK_SLOT_SIZE).next_multiple_of(DIGEST_LEN);
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
@@ -132,13 +132,13 @@ pub(crate) fn hash_pubkeys(pub_keys: &[XmssPublicKey]) -> [F; DIGEST_LEN] {
     poseidon_hash_slice(&flat)
 }
 
-/// Tweak slots are 4-FE [tw[0], tw[1], 0, 0]
+/// Tweak slots are 2-FE `[tw[0], 0]` under Goldilocks (1 real FE + 1 zero pad).
 fn compute_tweak_table(slot: u32) -> Vec<F> {
     let mut table = Vec::new();
 
     let push_padded = |table: &mut Vec<F>, tweak_type: usize, sub_position: usize, index: u32| {
         table.extend(make_tweak(tweak_type, sub_position, index));
-        table.extend(std::iter::repeat_n(F::ZERO, 2));
+        table.push(F::ZERO);
     };
 
     // Encoding tweak
diff --git a/crates/rec_aggregation/zkdsl_implem/fiat_shamir.py b/crates/rec_aggregation/zkdsl_implem/fiat_shamir.py
index d23955294..9c56a6409 100644
--- a/crates/rec_aggregation/zkdsl_implem/fiat_shamir.py
+++ b/crates/rec_aggregation/zkdsl_implem/fiat_shamir.py
@@ -2,36 +2,38 @@
 from utils import *
 
 
-# fs layout (17 cells):
-#   fs[0..8]   = capacity
-#   fs[8..16]  = rate
-#   fs[16]     = transcript pointer
-# This matches the normal-ordering poseidon precompile output [cap | rate].
+# Duplex-sponge Fiat-Shamir over the Goldilocks width-8 Poseidon permutation.
+#
+# fs layout (9 cells):
+#   fs[0..4]  = capacity
+#   fs[4..8]  = rate
+#   fs[8]     = transcript pointer
+# This matches the natural-ordering poseidon permute precompile output [cap | rate].
 
 
 @inline
 def fs_new(transcript_ptr, initial_capacity):
-    fs = Array(17)
-    copy_8(fs, initial_capacity)
-    set_to_8_zeros(fs + 8)
-    fs[16] = transcript_ptr
+    fs = Array(9)
+    copy_digest(fs, initial_capacity)
+    zero_digest(fs + 4)
+    fs[8] = transcript_ptr
     return fs
 
 
 @inline
 def _absorb_chunks(fs, data, n_chunks, new_transcript_ptr):
     assert n_chunks != 0
-    chain = Array(n_chunks * 16 + 1)
-    poseidon16_permute(fs, data, chain)
+    chain = Array(n_chunks * 8 + 1)
+    poseidon8_permute(fs, data, chain)
     for i in unroll(1, n_chunks):
-        poseidon16_permute(chain + (i - 1) * 16, data + i * DIGEST_LEN, chain + i * 16)
-    chain[n_chunks * 16] = new_transcript_ptr
-    return chain + (n_chunks - 1) * 16
+        poseidon8_permute(chain + (i - 1) * 8, data + i * DIGEST_LEN, chain + i * 8)
+    chain[n_chunks * 8] = new_transcript_ptr
+    return chain + (n_chunks - 1) * 8
 
 
 @inline
 def fs_observe_chunks(fs, data, n_chunks):
-    return _absorb_chunks(fs, data, n_chunks, fs[16])
+    return _absorb_chunks(fs, data, n_chunks, fs[8])
 
 
 def fs_observe(fs, data, length: Const):
@@ -51,11 +53,11 @@ def fs_observe(fs, data, length: Const):
 def fs_grinding(fs, bits):
     if bits == 0:
         return fs  # no grinding
-    transcript_ptr = fs[16]
+    transcript_ptr = fs[8]
     new_fs = _absorb_chunks(fs, transcript_ptr, 1, transcript_ptr + DIGEST_LEN)
 
-    # Rate is at new_fs[8..16]; sample the first cell of it for the grinding check.
-    sampled = new_fs[8]
+    # Rate is at new_fs[4..8]; sample the first cell of it for the grinding check.
+    sampled = new_fs[4]
     debug_assert(bits <= 24)
     match_range(bits, range(0, 25), lambda b: assert_trailing_bits_are_zeros(sampled, b))
 
@@ -64,83 +66,84 @@ def fs_grinding(fs, bits):
 
 def assert_trailing_bits_are_zeros(value, bits: Const):
     debug_assert(bits != 0)
+    debug_assert(bits <= 24)
 
-    chunk_size = 12
-    num_chunks = 24 / chunk_size  # 2
+    chunk_size = 16
+    num_chunks = F_BITS / chunk_size  # 4
+    half_chunks = num_chunks / 2  # 2
 
     chunks = Array(num_chunks)
-    hint_decompose_bits_merkle_whir(chunks, value, chunk_size)
+    hint_decompose_bits_merkle_whir(chunks, value, num_chunks, chunk_size)
     for i in unroll(0, num_chunks):
         assert chunks[i] < 2**chunk_size
 
-    partial_sums = Array(num_chunks)
-    partial_sums[0] = chunks[0]
-    for i in unroll(1, num_chunks):
-        partial_sums[i] = partial_sums[i - 1] + chunks[i] * 2 ** (i * chunk_size)
-    # p = 2^31 - 2^24 + 1, so 2^24 * 127 = p - 1 ≡ -1 (mod p), hence inv(2^24) = -127.
-    # Deduce top7 from the identity partial_sum + top7 * 2^24 == a:
-    # top7 = (a - partial_sum) * inv(2^24) = (partial_sum - a) * 127
-    top7 = (partial_sums[num_chunks - 1] - value) * 127
-    assert top7 < 2**7
-    if top7 == 2**7 - 1:
-        assert partial_sums[num_chunks - 1] == 0
-
-    if bits < 12:
+    # Recompose into low/high 32-bit halves and enforce canonicality
+    # (p = 2^64 - 2^32 + 1): if the high half equals 2^32 - 1, the low half
+    # must be zero, because the only canonical such value is p - 1 = 2^64 - 2^32.
+    partial_sum_low: Mut = chunks[0]
+    for i in unroll(1, half_chunks):
+        partial_sum_low += chunks[i] * 2 ** (i * chunk_size)
+    partial_sum_high: Mut = chunks[half_chunks]
+    for i in unroll(1, half_chunks):
+        partial_sum_high += chunks[half_chunks + i] * 2 ** (i * chunk_size)
+
+    assert value == partial_sum_low + partial_sum_high * 2**HALF_BITS
+
+    if partial_sum_high == 2**HALF_BITS - 1:
+        assert partial_sum_low == 0
+
+    if bits < 16:
         assert chunks[0] / 2**bits < 2 ** (chunk_size - bits)
-    elif bits < 24:
-        assert chunks[0] == 0
-        assert chunks[1] / 2 ** (bits - 12) < 2 ** (chunk_size - (bits - 12))
     else:
-        debug_assert(bits == 24)
         assert chunks[0] == 0
-        assert chunks[1] == 0
+        assert chunks[1] / 2 ** (bits - 16) < 2 ** (chunk_size - (bits - 16))
 
     return
 
 
 @inline
 def fs_duplex(fs):
-    # (equivalent to absorbing 8 zeros)
+    # Equivalent to absorbing DIGEST_LEN zeros.
     # Refreshes the rate so a subsequent sample doesn't repeat the previous one.
-    new_fs = Array(17)
-    poseidon16_permute(fs, ZERO_VEC_PTR, new_fs)
-    new_fs[16] = fs[16]
+    new_fs = Array(9)
+    poseidon8_permute(fs, ZERO_VEC_PTR, new_fs)
+    new_fs[8] = fs[8]
     return new_fs
 
 
 def fs_sample_chunks(fs, n_chunks: Const):
     # Returns (new_fs, samples_ptr) where samples_ptr points to a contiguous
-    # n_chunks * 8-cell buffer holding the squeezed chunks. Assumes the rate at
-    # fs[8..16] is "fresh" (just-permuted, not yet emitted); caller must duplex
-    # (or observe) between independent sample sequences.
+    # n_chunks * DIGEST_LEN-cell buffer holding the squeezed chunks. Assumes the
+    # rate at fs[4..8] is "fresh" (just-permuted, not yet emitted); caller must
+    # duplex (or observe) between independent sample sequences.
     if n_chunks == 0:
         return fs, ZERO_VEC_PTR
     if n_chunks == 1:
-        # Chunk 0 is the current fs itself: its rate is fs[8..16], no permute needed.
-        return fs, fs + 8
-    samples = Array(n_chunks * 8)
-    copy_8(samples, fs + 8)
-    chain = Array((n_chunks - 1) * 16 + 1)
-    poseidon16_permute(fs, ZERO_VEC_PTR, chain)
-    copy_8(samples + 8, chain + 8)
+        # Chunk 0 is the current fs itself: its rate is fs[4..8], no permute needed.
+        return fs, fs + 4
+    samples = Array(n_chunks * DIGEST_LEN)
+    copy_digest(samples, fs + 4)
+    chain = Array((n_chunks - 1) * 8 + 1)
+    poseidon8_permute(fs, ZERO_VEC_PTR, chain)
+    copy_digest(samples + DIGEST_LEN, chain + 4)
     for i in unroll(2, n_chunks):
-        poseidon16_permute(chain + (i - 2) * 16, ZERO_VEC_PTR, chain + (i - 1) * 16)
-        copy_8(samples + i * 8, chain + (i - 1) * 16 + 8)
-    chain[(n_chunks - 1) * 16] = fs[16]
-    new_fs = chain + (n_chunks - 2) * 16
+        poseidon8_permute(chain + (i - 2) * 8, ZERO_VEC_PTR, chain + (i - 1) * 8)
+        copy_digest(samples + i * DIGEST_LEN, chain + (i - 1) * 8 + 4)
+    chain[(n_chunks - 1) * 8] = fs[8]
+    new_fs = chain + (n_chunks - 2) * 8
     return new_fs, samples
 
 
 @inline
 def fs_sample_ef(fs):
-    # Single-chunk sample: read the fresh rate at fs[8..16]; the new fs is unchanged.
-    return fs, fs + 8
+    # Single-chunk sample: read the fresh rate at fs[4..8]; the new fs is unchanged.
+    return fs, fs + 4
 
 
 @inline
 def fs_sample_many_ef(fs, n):
     # return the updated fiat-shamir, and a pointer to n (continuous) extension field elements
-    n_chunks = div_ceil(n * DIM, 8)
+    n_chunks = div_ceil(n * DIM, DIGEST_LEN)
     debug_assert(n_chunks <= 31)
     debug_assert(1 <= n_chunks)
     new_fs, sampled = fs_sample_chunks(fs, n_chunks)
@@ -151,25 +154,24 @@ def fs_sample_many_ef(fs, n):
 def fs_hint(fs, n):
     # Hint = read `n` cells from the transcript without absorbing them. Just advance the
     # transcript pointer; the sponge state is unchanged.
-    new_fs = Array(17)
+    new_fs = Array(9)
     copy_8(new_fs, fs)
-    copy_8(new_fs + 8, fs + 8)
-    new_fs[16] = fs[16] + n
-    return new_fs, fs[16]
+    new_fs[8] = fs[8] + n
+    return new_fs, fs[8]
 
 
 def fs_receive_chunks(fs, n_chunks: Const):
-    # Read n_chunks * 8 cells from the transcript and absorb them. Returns the new fs
-    # and a pointer to the just-consumed transcript region.
-    transcript_ptr = fs[16]
+    # Read n_chunks * DIGEST_LEN cells from the transcript and absorb them. Returns the
+    # new fs and a pointer to the just-consumed transcript region.
+    transcript_ptr = fs[8]
     new_fs = _absorb_chunks(fs, transcript_ptr, n_chunks, transcript_ptr + n_chunks * DIGEST_LEN)
     return new_fs, transcript_ptr
 
 
 @inline
 def fs_receive_ef_inlined(fs, n):
-    new_fs, ef_ptr = fs_receive_chunks(fs, div_ceil(n * DIM, 8))
-    for i in unroll(n * DIM, next_multiple_of(n * DIM, 8)):
+    new_fs, ef_ptr = fs_receive_chunks(fs, div_ceil(n * DIM, DIGEST_LEN))
+    for i in unroll(n * DIM, next_multiple_of(n * DIM, DIGEST_LEN)):
         assert ef_ptr[i] == 0
     return new_fs, ef_ptr
 
@@ -184,28 +186,30 @@ def fs_receive_ef_by_log_dynamic(fs, log_n, min_value: Const, max_value: Const):
 
 
 def fs_receive_ef(fs, n: Const):
-    new_fs, ef_ptr = fs_receive_chunks(fs, div_ceil(n * DIM, 8))
-    for i in unroll(n * DIM, next_multiple_of(n * DIM, 8)):
+    new_fs, ef_ptr = fs_receive_chunks(fs, div_ceil(n * DIM, DIGEST_LEN))
+    for i in unroll(n * DIM, next_multiple_of(n * DIM, DIGEST_LEN)):
         assert ef_ptr[i] == 0
     return new_fs, ef_ptr
 
 
 def fs_print_state(fs_state):
-    for i in unroll(0, 17):
+    for i in unroll(0, 9):
         print(i, fs_state[i])
     return
 
 
 @inline
 def fs_sample_queries(fs, n_samples):
-    # Sample `n_samples` query bit-strings. Each chunk yields 8 base field elements that
-    # can be downsampled to query indices. We squeeze `ceil(n_samples / 8)` chunks.
+    # Sample `n_samples` query bit-strings. Each chunk yields DIGEST_LEN base field
+    # elements that can be downsampled to query indices. We squeeze
+    # `ceil(n_samples / DIGEST_LEN)` chunks.
     debug_assert(n_samples < 512)
-    # Compute total_chunks = ceil(n_samples / 8) via bit decomposition.
-    # Big-endian: nb[0]=bit8 (MSB), nb[8]=bit0 (LSB).
+    # total_chunks = ceil(n_samples / DIGEST_LEN). With DIGEST_LEN=4 we shift right
+    # by 2 and check whether the low 2 bits are nonzero. BE decomposition:
+    # nb[0] = bit 8 (MSB), nb[8] = bit 0 (LSB).
     nb = checked_decompose_bits_small_value_const(n_samples, 9)
-    floor_div = nb[0] * 32 + nb[1] * 16 + nb[2] * 8 + nb[3] * 4 + nb[4] * 2 + nb[5]
-    has_remainder = 1 - (1 - nb[6]) * (1 - nb[7]) * (1 - nb[8])
+    floor_div = nb[0] * 64 + nb[1] * 32 + nb[2] * 16 + nb[3] * 8 + nb[4] * 4 + nb[5] * 2 + nb[6]
+    has_remainder = 1 - (1 - nb[7]) * (1 - nb[8])
     total_chunks = floor_div + has_remainder
-    new_fs, sampled = match_range(total_chunks, range(0, 65), lambda nc: fs_sample_chunks(fs, nc))
+    new_fs, sampled = match_range(total_chunks, range(0, 129), lambda nc: fs_sample_chunks(fs, nc))
     return sampled, new_fs
diff --git a/crates/rec_aggregation/zkdsl_implem/hashing.py b/crates/rec_aggregation/zkdsl_implem/hashing.py
index fb6e7ebf3..b8747f865 100644
--- a/crates/rec_aggregation/zkdsl_implem/hashing.py
+++ b/crates/rec_aggregation/zkdsl_implem/hashing.py
@@ -1,7 +1,7 @@
 from snark_lib import *
 
-DIM = 5  # extension degree
-DIGEST_LEN = 8
+DIM = 3  # extension degree (Goldilocks cubic extension)
+DIGEST_LEN = 4
 
 # memory layout: [public_input (PUBLIC_INPUT_LEN)] [preamble_memory (PREAMBLE_MEMORY_LEN)] [runtime ...]
 # `preamble_memory` is a region that is filled by the guest program, with usefull constants [0000...][1000...]...
@@ -65,7 +65,7 @@ def build_iv(length):
 @inline
 def sponge_finalize(carried_capacity, last_chunk):
     full = Array(2 * DIGEST_LEN)
-    poseidon16_permute(carried_capacity, last_chunk, full)
+    poseidon8_permute(carried_capacity, last_chunk, full)
     return full + DIGEST_LEN
 
 
@@ -74,15 +74,15 @@ def slice_hash_rtl(data, num_chunks, iv):
     debug_assert(1 <= num_chunks)
     result = Array(2 * DIGEST_LEN)
     if num_chunks == 1:
-        poseidon16_permute(iv, data, result)
+        poseidon8_permute(iv, data, result)
     else:
         states = Array((num_chunks - 1) * DIGEST_LEN)
-        poseidon16_permute_half(iv, data + (num_chunks - 1) * DIGEST_LEN, states)
+        poseidon8_permute_half(iv, data + (num_chunks - 1) * DIGEST_LEN, states)
         for j in unroll(1, num_chunks - 1):
-            poseidon16_permute_half(
+            poseidon8_permute_half(
                 states + (j - 1) * DIGEST_LEN, data + (num_chunks - 1 - j) * DIGEST_LEN, states + j * DIGEST_LEN
             )
-        poseidon16_permute(states + (num_chunks - 2) * DIGEST_LEN, data, result)
+        poseidon8_permute(states + (num_chunks - 2) * DIGEST_LEN, data, result)
     return result + DIGEST_LEN
 
 
@@ -98,11 +98,11 @@ def slice_hash_range(data, num_chunks, dest):
     debug_assert(2 < num_chunks)
     iv = build_iv(num_chunks * DIGEST_LEN)
     states = Array((num_chunks - 1) * DIGEST_LEN)
-    poseidon16_permute_half(iv, data, states)
+    poseidon8_permute_half(iv, data, states)
     for j in range(1, num_chunks - 1):
-        poseidon16_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
+        poseidon8_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
     rate = sponge_finalize(states + (num_chunks - 2) * DIGEST_LEN, data + (num_chunks - 1) * DIGEST_LEN)
-    copy_8(rate, dest)
+    copy_digest(rate, dest)
     return
 
 
@@ -111,20 +111,20 @@ def slice_hash(data, num_chunks, dest):
     debug_assert(2 <= num_chunks)
     iv = build_iv(num_chunks * DIGEST_LEN)
     states = Array((num_chunks - 1) * DIGEST_LEN)
-    poseidon16_permute_half(iv, data, states)
+    poseidon8_permute_half(iv, data, states)
     for j in unroll(1, num_chunks - 1):
-        poseidon16_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
+        poseidon8_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
     rate = sponge_finalize(states + (num_chunks - 2) * DIGEST_LEN, data + (num_chunks - 1) * DIGEST_LEN)
-    copy_8(rate, dest)
+    copy_digest(rate, dest)
     return
 
 
 @inline
 def slice_hash_continue(running, data, num_chunks):
     states = Array(num_chunks * DIGEST_LEN)
-    poseidon16_permute_half(running, data, states)
+    poseidon8_permute_half(running, data, states)
     for j in unroll(1, num_chunks):
-        poseidon16_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
+        poseidon8_permute_half(states + (j - 1) * DIGEST_LEN, data + j * DIGEST_LEN, states + j * DIGEST_LEN)
     return states + (num_chunks - 1) * DIGEST_LEN
 
 
@@ -148,7 +148,7 @@ def absorb_n_hashes_const(n: Const, sp_in, dp_in):
     dp: Mut = dp_in
     for _ in unroll(0, n):
         new_state = sp + DIGEST_LEN
-        poseidon16_permute_half(sp, dp, new_state)
+        poseidon8_permute_half(sp, dp, new_state)
         sp = new_state
         dp += DIGEST_LEN
     return sp
@@ -163,7 +163,7 @@ def slice_hash_runtime(data, num_chunks):
         return sponge_finalize(iv, data)
 
     states = Array((num_chunks - 1) * DIGEST_LEN)
-    poseidon16_permute_half(iv, data, states)
+    poseidon8_permute_half(iv, data, states)
     n_iters = num_chunks - 2
     state_ptr: Mut = states
     data_ptr: Mut = data + DIGEST_LEN
@@ -172,7 +172,7 @@ def slice_hash_runtime(data, num_chunks):
     for _ in range(0, n_chunks_outer):
         for _ in unroll(0, PARTIAL_UNROLL_BATCH):
             new_state = state_ptr + DIGEST_LEN
-            poseidon16_permute_half(state_ptr, data_ptr, new_state)
+            poseidon8_permute_half(state_ptr, data_ptr, new_state)
             state_ptr = new_state
             data_ptr += DIGEST_LEN
 
@@ -197,24 +197,24 @@ def whir_do_4_merkle_levels(b, state_in, path_chunk, state_out):
     temps = Array(3 * DIGEST_LEN)
 
     if b0 == 0:
-        poseidon16_compress_half(state_in, path_chunk, temps)
+        poseidon8_compress_half(state_in, path_chunk, temps)
     else:
-        poseidon16_compress_half(path_chunk, state_in, temps)
+        poseidon8_compress_half(path_chunk, state_in, temps)
 
     if b1 == 0:
-        poseidon16_compress_half(temps, path_chunk + DIGEST_LEN, temps + DIGEST_LEN)
+        poseidon8_compress_half(temps, path_chunk + DIGEST_LEN, temps + DIGEST_LEN)
     else:
-        poseidon16_compress_half(path_chunk + DIGEST_LEN, temps, temps + DIGEST_LEN)
+        poseidon8_compress_half(path_chunk + DIGEST_LEN, temps, temps + DIGEST_LEN)
 
     if b2 == 0:
-        poseidon16_compress_half(temps + DIGEST_LEN, path_chunk + 2 * DIGEST_LEN, temps + 2 * DIGEST_LEN)
+        poseidon8_compress_half(temps + DIGEST_LEN, path_chunk + 2 * DIGEST_LEN, temps + 2 * DIGEST_LEN)
     else:
-        poseidon16_compress_half(path_chunk + 2 * DIGEST_LEN, temps + DIGEST_LEN, temps + 2 * DIGEST_LEN)
+        poseidon8_compress_half(path_chunk + 2 * DIGEST_LEN, temps + DIGEST_LEN, temps + 2 * DIGEST_LEN)
 
     if b3 == 0:
-        poseidon16_compress_half(temps + 2 * DIGEST_LEN, path_chunk + 3 * DIGEST_LEN, state_out)
+        poseidon8_compress_half(temps + 2 * DIGEST_LEN, path_chunk + 3 * DIGEST_LEN, state_out)
     else:
-        poseidon16_compress_half(path_chunk + 3 * DIGEST_LEN, temps + 2 * DIGEST_LEN, state_out)
+        poseidon8_compress_half(path_chunk + 3 * DIGEST_LEN, temps + 2 * DIGEST_LEN, state_out)
     return
 
 
@@ -229,19 +229,19 @@ def whir_do_3_merkle_levels(b, state_in, path_chunk, state_out):
     temps = Array(2 * DIGEST_LEN)
 
     if b0 == 0:
-        poseidon16_compress_half(state_in, path_chunk, temps)
+        poseidon8_compress_half(state_in, path_chunk, temps)
     else:
-        poseidon16_compress_half(path_chunk, state_in, temps)
+        poseidon8_compress_half(path_chunk, state_in, temps)
 
     if b1 == 0:
-        poseidon16_compress_half(temps, path_chunk + DIGEST_LEN, temps + DIGEST_LEN)
+        poseidon8_compress_half(temps, path_chunk + DIGEST_LEN, temps + DIGEST_LEN)
     else:
-        poseidon16_compress_half(path_chunk + DIGEST_LEN, temps, temps + DIGEST_LEN)
+        poseidon8_compress_half(path_chunk + DIGEST_LEN, temps, temps + DIGEST_LEN)
 
     if b2 == 0:
-        poseidon16_compress_half(temps + DIGEST_LEN, path_chunk + 2 * DIGEST_LEN, state_out)
+        poseidon8_compress_half(temps + DIGEST_LEN, path_chunk + 2 * DIGEST_LEN, state_out)
     else:
-        poseidon16_compress_half(path_chunk + 2 * DIGEST_LEN, temps + DIGEST_LEN, state_out)
+        poseidon8_compress_half(path_chunk + 2 * DIGEST_LEN, temps + DIGEST_LEN, state_out)
     return
 
 
@@ -254,14 +254,14 @@ def whir_do_2_merkle_levels(b, state_in, path_chunk, state_out):
     temp = Array(DIGEST_LEN)
 
     if b0 == 0:
-        poseidon16_compress_half(state_in, path_chunk, temp)
+        poseidon8_compress_half(state_in, path_chunk, temp)
     else:
-        poseidon16_compress_half(path_chunk, state_in, temp)
+        poseidon8_compress_half(path_chunk, state_in, temp)
 
     if b1 == 0:
-        poseidon16_compress_half(temp, path_chunk + DIGEST_LEN, state_out)
+        poseidon8_compress_half(temp, path_chunk + DIGEST_LEN, state_out)
     else:
-        poseidon16_compress_half(path_chunk + DIGEST_LEN, temp, state_out)
+        poseidon8_compress_half(path_chunk + DIGEST_LEN, temp, state_out)
     return
 
 
@@ -270,9 +270,9 @@ def whir_do_1_merkle_level(b, state_in, path_chunk, state_out):
     b0 = b % 2
 
     if b0 == 0:
-        poseidon16_compress_half(state_in, path_chunk, state_out)
+        poseidon8_compress_half(state_in, path_chunk, state_out)
     else:
-        poseidon16_compress_half(path_chunk, state_in, state_out)
+        poseidon8_compress_half(path_chunk, state_in, state_out)
     return
 
 
@@ -294,9 +294,9 @@ def merkle_verif_batch(merkle_paths, leaves_digests, leave_positions, root, heig
 
 def merkle_verif_batch_const(n_paths, merkle_paths, leaves_digests, leave_positions, root, height: Const):
     # n_paths: F
-    # leaves_digests: pointer to a slice of n_paths pointers, each pointing to 1 chunk of 8 field elements
+    # leaves_digests: pointer to a slice of n_paths pointers, each pointing to 1 chunk of DIGEST_LEN (= 4) field elements
     # leave_positions: pointer to a slice of n_paths field elements (each < 2^height)
+    # root: pointer to 1 chunk of DIGEST_LEN (= 4) field elements
+    # root: pointer to 1 chunk of 4 field elements
     # height: F
 
     for i in range(0, n_paths):
@@ -317,25 +317,25 @@ def merkle_verify(leaf_digest, merkle_path, leaf_position_bits, root, height: Co
     # First merkle round
     match leaf_position_bits[0]:
         case 0:
-            poseidon16_compress_half(leaf_digest, merkle_path, states)
+            poseidon8_compress_half(leaf_digest, merkle_path, states)
         case 1:
-            poseidon16_compress_half(merkle_path, leaf_digest, states)
+            poseidon8_compress_half(merkle_path, leaf_digest, states)
 
     # Remaining merkle rounds
     for j in unroll(1, height):
         # Warning: this works only if leaf_position_bits[i] is known to be boolean:
         match leaf_position_bits[j]:
             case 0:
-                poseidon16_compress_half(
+                poseidon8_compress_half(
                     states + (j - 1) * DIGEST_LEN,
                     merkle_path + j * DIGEST_LEN,
                     states + j * DIGEST_LEN,
                 )
             case 1:
-                poseidon16_compress_half(
+                poseidon8_compress_half(
                     merkle_path + j * DIGEST_LEN,
                     states + (j - 1) * DIGEST_LEN,
                     states + j * DIGEST_LEN,
                 )
-    copy_8(states + (height - 1) * DIGEST_LEN, root)
+    copy_digest(states + (height - 1) * DIGEST_LEN, root)
     return
diff --git a/crates/rec_aggregation/zkdsl_implem/main.py b/crates/rec_aggregation/zkdsl_implem/main.py
index 2fa65d17f..9bbb10cb1 100644
--- a/crates/rec_aggregation/zkdsl_implem/main.py
+++ b/crates/rec_aggregation/zkdsl_implem/main.py
@@ -5,13 +5,13 @@
 MAX_N_SIGS = MAX_XMSS_AGGREGATED_PLACEHOLDER
 MAX_N_DUPS = MAX_XMSS_DUPLICATES_PLACEHOLDER
 
-# data_buf[0..8] = [flag, count, 0×6] (count = n_sigs for single-message, n_components for multi-message).
+# data_buf[0..DIGEST_LEN] = [flag, count, 0×(DIGEST_LEN-2)], with DIGEST_LEN = 4 under Goldilocks (count = n_sigs for single-message, n_components for multi-message).
 SINGLE_MESSAGE_FLAG = SINGLE_MESSAGE_FLAG_PLACEHOLDER
 MULTI_MESSAGE_FLAG = MULTI_MESSAGE_FLAG_PLACEHOLDER
 
 BYTECODE_SUMCHECK_PROOF_SIZE = BYTECODE_SUMCHECK_PROOF_SIZE_PLACEHOLDER
 
-# layout: [flag, count, 0×6 (8)] [bytecode_claim_padded] [initial_fiat_shamir_cap(8)] [single-message/multi-message mode-specific data]
+# layout: [flag, count, 0×(DIGEST_LEN-2) (DIGEST_LEN)] [bytecode_claim_padded] [initial_fiat_shamir_cap(DIGEST_LEN)] [single-message/multi-message mode-specific data]
 BYTECODE_CLAIM_OFFSET = DIGEST_LEN  # (right after the prefix chunk)
 INITIAL_FIAT_SHAMIR_CAP_OFFSET = BYTECODE_CLAIM_OFFSET + BYTECODE_CLAIM_SIZE_PADDED
 COMPONENT_DATA_OFFSET = INITIAL_FIAT_SHAMIR_CAP_OFFSET + DIGEST_LEN
@@ -24,7 +24,13 @@
 SINGLE_MESSAGE_INPUT_DATA_SIZE_PADDED = SINGLE_MESSAGE_TWEAKS_HASH_OFFSET + DIGEST_LEN
 SINGLE_MESSAGE_INPUT_DATA_NUM_CHUNKS = SINGLE_MESSAGE_INPUT_DATA_SIZE_PADDED / DIGEST_LEN
 
-# Multi-message mode-specific data (variable): n_components × digest(8).
+# Number of DIGEST_LEN-sized chunks in the single-message component-data block
+# (pubkeys_hash | message | merkle_chunks | tweaks_hash).
+COMPONENT_DATA_NUM_CHUNKS = (SINGLE_MESSAGE_INPUT_DATA_SIZE_PADDED - COMPONENT_DATA_OFFSET) / DIGEST_LEN
+# Number of DIGEST_LEN-sized chunks needed to hold the merkle chunks for one slot.
+MERKLE_CHUNKS_NUM_CHUNKS = N_MERKLE_CHUNKS / DIGEST_LEN
+
+# Multi-message mode-specific data (variable): n_components × digest(DIGEST_LEN).
 MULTI_MESSAGE_DIGESTS_OFFSET = COMPONENT_DATA_OFFSET
 
 BYTECODE_CLAIM_NUM_CHUNKS = BYTECODE_CLAIM_SIZE_PADDED / DIGEST_LEN
@@ -41,7 +47,8 @@ def main():
     input_data_num_chunks = input_data_num_chunks_buf[0]
     data_buf = Array(input_data_num_chunks * DIGEST_LEN)
     hint_witness("input_data", data_buf)
-    set_to_6_zeros(data_buf + 2)
+    for k in unroll(2, DIGEST_LEN):
+        data_buf[k] = 0
 
     bytecode_claim_output = data_buf + BYTECODE_CLAIM_OFFSET
     initial_fiat_shamir_cap = data_buf + INITIAL_FIAT_SHAMIR_CAP_OFFSET
@@ -93,8 +100,12 @@ def main():
 
         kept_single_message_buff = Array(SINGLE_MESSAGE_INPUT_DATA_SIZE_PADDED)
         hint_witness("kept_single_message_buff", kept_single_message_buff)
-        copy_8(data_buf, kept_single_message_buff)  # single-message flag | n_signatures | 0×6
-        copy_32(data_buf + COMPONENT_DATA_OFFSET, kept_single_message_buff + COMPONENT_DATA_OFFSET)
+        copy_digest(data_buf, kept_single_message_buff)  # single-message flag | n_signatures | 0×(DIGEST_LEN-2)
+        for k in unroll(0, COMPONENT_DATA_NUM_CHUNKS):
+            copy_digest(
+                data_buf + COMPONENT_DATA_OFFSET + k * DIGEST_LEN,
+                kept_single_message_buff + COMPONENT_DATA_OFFSET + k * DIGEST_LEN,
+            )
         ensure_well_formed_input_data(kept_single_message_buff, initial_fiat_shamir_cap, SINGLE_MESSAGE_FLAG)
         digest_kept = multi_message_digests + multi_message_kept_index * DIGEST_LEN
         slice_hash(kept_single_message_buff, SINGLE_MESSAGE_INPUT_DATA_NUM_CHUNKS, digest_kept)
@@ -140,15 +151,19 @@ def main():
     hint_witness("aggregate_sizes", aggregate_sizes)
 
     computed_tweaks_hash = slice_hash_ret(tweak_table, TWEAK_TABLE_SIZE_FE_PADDED / DIGEST_LEN)
-    copy_8(computed_tweaks_hash, tweaks_hash_expected)
+    copy_digest(computed_tweaks_hash, tweaks_hash_expected)
 
     # 1->1 optimization: a single recursive single-message child, no raw signatures, no duplicates.
     if n_recursions == 1:
         assert n_dup == 0
         if n_raw_xmss == 0:
             single_message_data_buf = Array(SINGLE_MESSAGE_INPUT_DATA_SIZE_PADDED)
-            copy_8(data_buf, single_message_data_buf)  # prefix
-            copy_32(data_buf + COMPONENT_DATA_OFFSET, single_message_data_buf + COMPONENT_DATA_OFFSET)
+            copy_digest(data_buf, single_message_data_buf)  # prefix
+            for k in unroll(0, COMPONENT_DATA_NUM_CHUNKS):
+                copy_digest(
+                    data_buf + COMPONENT_DATA_OFFSET + k * DIGEST_LEN,
+                    single_message_data_buf + COMPONENT_DATA_OFFSET + k * DIGEST_LEN,
+                )
             hint_witness("inner_bytecode_claim", single_message_data_buf + BYTECODE_CLAIM_OFFSET)
             ensure_well_formed_input_data(single_message_data_buf, initial_fiat_shamir_cap, SINGLE_MESSAGE_FLAG)
             inner_pub_mem = Array(INNER_PUB_MEM_SIZE)
@@ -162,7 +177,7 @@ def main():
 
     # General path
     computed_pubkeys_hash = slice_hash_runtime(all_pubkeys, n_sigs)
-    copy_8(computed_pubkeys_hash, pubkeys_hash_expected)
+    copy_digest(computed_pubkeys_hash, pubkeys_hash_expected)
 
     # Buffer for partition verification
     n_total = n_sigs + n_dup
@@ -218,10 +233,14 @@ def main():
         for k in unroll(2, DIGEST_LEN):
             single_message_data_buf[k] = 0
 
-        copy_8(running_hash, single_message_data_buf + SINGLE_MESSAGE_PUBKEYS_HASH_OFFSET)
-        copy_8(message, single_message_data_buf + SINGLE_MESSAGE_PUBKEYS_HASH_OFFSET + DIGEST_LEN)
-        copy_8(merkle_chunks_for_slot, single_message_data_buf + SINGLE_MESSAGE_PUBKEYS_HASH_OFFSET + DIGEST_LEN + MESSAGE_LEN)
-        copy_8(tweaks_hash_expected, single_message_data_buf + SINGLE_MESSAGE_TWEAKS_HASH_OFFSET)
+        copy_digest(running_hash, single_message_data_buf + SINGLE_MESSAGE_PUBKEYS_HASH_OFFSET)
+        copy_digest(message, single_message_data_buf + SINGLE_MESSAGE_MSG_HASH_OFFSET)
+        for k in unroll(0, MERKLE_CHUNKS_NUM_CHUNKS):
+            copy_digest(
+                merkle_chunks_for_slot + k * DIGEST_LEN,
+                single_message_data_buf + SINGLE_MESSAGE_MERKLE_CHUNKS_OFFSET + k * DIGEST_LEN,
+            )
+        copy_digest(tweaks_hash_expected, single_message_data_buf + SINGLE_MESSAGE_TWEAKS_HASH_OFFSET)
         hint_witness("inner_bytecode_claim", single_message_data_buf + BYTECODE_CLAIM_OFFSET)
         ensure_well_formed_input_data(single_message_data_buf, initial_fiat_shamir_cap, SINGLE_MESSAGE_FLAG)
         inner_pub_mem = Array(INNER_PUB_MEM_SIZE)
@@ -234,7 +253,7 @@ def main():
 
     if n_recursions == 0:
         for k in unroll(0, BYTECODE_POINT_N_VARS):
-            set_to_5_zeros(bytecode_claim_output + k * DIM)
+            zero_ef(bytecode_claim_output + k * DIM)
         bytecode_claim_output[BYTECODE_POINT_N_VARS * DIM] = BYTECODE_ZERO_EVAL
         for k in unroll(1, DIM):
             bytecode_claim_output[BYTECODE_POINT_N_VARS * DIM + k] = 0
@@ -278,7 +297,7 @@ def reduce_bytecode_claims(bytecode_claims, n_bytecode_claims, bytecode_claim_ou
     all_values = Array(n_bytecode_claims * DIM)
     for i in range(0, n_bytecode_claims):
         claim_ptr = bytecode_claims[i]
-        copy_5(claim_ptr + BYTECODE_POINT_N_VARS * DIM, all_values + i * DIM)
+        copy_ef(claim_ptr + BYTECODE_POINT_N_VARS * DIM, all_values + i * DIM)
 
     claimed_sum = Array(DIM)
     dot_product_ee_dynamic(all_values, alpha_powers, claimed_sum, n_bytecode_claims)
@@ -295,7 +314,7 @@ def reduce_bytecode_claims(bytecode_claims, n_bytecode_claims, bytecode_claim_ou
     bytecode_value_at_r = div_extension_ret(final_eval, w_r)
 
     copy_many_ef(challenges, bytecode_claim_output, BYTECODE_POINT_N_VARS)
-    copy_5(bytecode_value_at_r, bytecode_claim_output + BYTECODE_POINT_N_VARS * DIM)
+    copy_ef(bytecode_value_at_r, bytecode_claim_output + BYTECODE_POINT_N_VARS * DIM)
     return
 
 
@@ -303,10 +322,11 @@ def reduce_bytecode_claims(bytecode_claims, n_bytecode_claims, bytecode_claim_ou
 def ensure_well_formed_input_data(data_buf, initial_fiat_shamir_cap, flag):
     data_buf[0] = flag
     # data_buf[1]: count
-    set_to_6_zeros(data_buf + 2)
+    for k in unroll(2, DIGEST_LEN):
+        data_buf[k] = 0
     for k in unroll(BYTECODE_CLAIM_OFFSET + BYTECODE_CLAIM_SIZE, INITIAL_FIAT_SHAMIR_CAP_OFFSET):
         data_buf[k] = 0
-    copy_8(initial_fiat_shamir_cap, data_buf + INITIAL_FIAT_SHAMIR_CAP_OFFSET)
+    copy_digest(initial_fiat_shamir_cap, data_buf + INITIAL_FIAT_SHAMIR_CAP_OFFSET)
     return
 
 
@@ -322,7 +342,7 @@ def _pubkey_absorb_prep(j, sub_indices_arr, n_total, all_pubkeys, buffer, counte
 def absorb_recursive_pubkey(j, sub_indices_arr, n_total, all_pubkeys, buffer, counter_in, running_hash_in):
     new_counter, pk = _pubkey_absorb_prep(j, sub_indices_arr, n_total, all_pubkeys, buffer, counter_in)
     new_hash = Array(DIGEST_LEN)
-    poseidon16_permute_half(running_hash_in, pk, new_hash)
+    poseidon8_permute_half(running_hash_in, pk, new_hash)
     return new_counter, new_hash
 
 
diff --git a/crates/rec_aggregation/zkdsl_implem/recursion.py b/crates/rec_aggregation/zkdsl_implem/recursion.py
index 2578f8a86..6cb17dcd6 100644
--- a/crates/rec_aggregation/zkdsl_implem/recursion.py
+++ b/crates/rec_aggregation/zkdsl_implem/recursion.py
@@ -34,6 +34,7 @@
 N_AIR_SHIFT_COLUMNS = N_AIR_SHIFT_COLUMNS_PLACEHOLDER  # [_; N_TABLES] — by convention, shift column j of table t is column j
 AIR_ALPHA_OFFSETS = AIR_ALPHA_OFFSETS_PLACEHOLDER  # [_; N_TABLES], # AIR_ALPHA_OFFSETS[t] = sum(N_AIR_CONSTRAINTS[k] for k in range(t))
 
+
 N_INSTRUCTION_COLUMNS = N_INSTRUCTION_COLUMNS_PLACEHOLDER
 N_COMMITTED_EXEC_COLUMNS = N_COMMITTED_EXEC_COLUMNS_PLACEHOLDER
 
@@ -46,7 +47,7 @@
 BYTECODE_ZERO_EVAL = BYTECODE_ZERO_EVAL_PLACEHOLDER
 BYTECODE_CLAIM_SIZE = (BYTECODE_POINT_N_VARS + 1) * DIM
 BYTECODE_CLAIM_SIZE_PADDED = next_multiple_of(BYTECODE_CLAIM_SIZE, DIGEST_LEN)
-INNER_PUBLIC_MEMORY_LOG_SIZE = 3  # public input = 1 hash digest = 8 field elements
+INNER_PUBLIC_MEMORY_LOG_SIZE = 2  # Goldilocks: public input = 1 hash digest = 4 field elements
 PUB_INPUT_SIZE = DIGEST_LEN  # the public input is a single digest
 
 
@@ -59,10 +60,11 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
 
     fs = fs_observe(fs, inner_public_memory, PUB_INPUT_SIZE)  # observe public input (the data digest)
 
-    # table dims
-    debug_assert(N_TABLES + 1 < DIGEST_LEN)
-    fs, dims = fs_receive_chunks(fs, 1)
-    for i in unroll(N_TABLES + 2, 8):
+    # table dims — 2 leading slots (whir_log_inv_rate, log_memory)
+    # + N_TABLES per-table heights. Under Goldilocks DIGEST_LEN=4 so one chunk
+    # is not enough; we pull two (8 slots). Surplus slots must be zero.
+    fs, dims = fs_receive_chunks(fs, 2)
+    for i in unroll(N_TABLES + 2, 2 * DIGEST_LEN):
         assert dims[i] == 0
     whir_log_inv_rate = dims[0]
     log_memory = dims[1]
@@ -105,7 +107,7 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
     log_bytecode_padded = maximum(LOG_GUEST_BYTECODE_LEN, log_max_table_height)
 
     stacked_n_vars = compute_stacked_n_vars(log_memory, log_bytecode_padded, table_heights)
-    assert stacked_n_vars <= TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - whir_log_inv_rate
+    assert stacked_n_vars <= EFFECTIVE_TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - whir_log_inv_rate
 
     n_vars_logup_gkr = compute_total_gkr_n_vars(log_memory, log_bytecode_padded, table_heights)
 
@@ -141,7 +143,7 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
     # LOGUP
 
     fs, quotient_gkr, point_gkr, numerators_value, denominators_value = verify_gkr_quotient(fs, n_vars_logup_gkr)
-    set_to_5_zeros(quotient_gkr)
+    zero_ef(quotient_gkr)
 
     memory_and_acc_prefix = multilinear_location_prefix(0, n_vars_logup_gkr - log_memory, point_gkr)
 
@@ -248,8 +250,8 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
             retrieved_denominators_value, mul_extension_ret(prefix, eval_on_data)
         )
 
-        copy_5(eval_on_selector, bus_numerators_values + table_index * DIM)
-        copy_5(eval_on_data, bus_denominators_values + table_index * DIM)
+        copy_ef(eval_on_selector, bus_numerators_values + table_index * DIM)
+        copy_ef(eval_on_data, bus_denominators_values + table_index * DIM)
 
         offset += n_rows
 
@@ -271,9 +273,9 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
                 data_ofs = ONE_BUSES_DATA_OFFSETS[table_index][one_bus_idx][i]
                 src = pcs_vals_logup[table_index * MAX_NUM_COLS_AIR + data_col]
                 if data_ofs == 0:
-                    copy_5(src, data_evals + i * DIM)
+                    copy_ef(src, data_evals + i * DIM)
                 if data_ofs != 0:
-                    copy_5(add_base_extension_ret(data_ofs, src), data_evals + i * DIM)
+                    copy_ef(add_base_extension_ret(data_ofs, src), data_evals + i * DIM)
 
             pref = multilinear_location_prefix(offset / n_rows, n_vars_logup_gkr - log_n_rows, point_gkr)
             retrieved_numerators_value = add_extension_ret(retrieved_numerators_value, pref)
@@ -290,8 +292,8 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
         mle_of_zeros_then_ones(point_gkr, gkr_cumul, n_vars_logup_gkr),
     )
 
-    copy_5(retrieved_numerators_value, numerators_value)
-    copy_5(retrieved_denominators_value, denominators_value)
+    copy_ef(retrieved_numerators_value, numerators_value)
+    copy_ef(retrieved_denominators_value, denominators_value)
 
     memory_and_acc_point = point_gkr + (n_vars_logup_gkr - log_memory) * DIM
 
@@ -355,7 +357,7 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
                 pcs_shifts_air[table_index * MAX_NUM_COLS_AIR + i] = evals_shift + i * DIM
 
     # verify that the AIR-batched sumcheck is valid
-    copy_5(check_sum, batched_air_final_value)
+    copy_ef(check_sum, batched_air_final_value)
 
     fs, public_memory_random_point = fs_sample_many_ef(fs, INNER_PUBLIC_MEMORY_LOG_SIZE)
     poly_eq_public_mem = compute_eq_mle_extension(public_memory_random_point, INNER_PUBLIC_MEMORY_LOG_SIZE)
@@ -536,7 +538,7 @@ def recursion(inner_public_memory, initial_fiat_shamir_cap):
         s = add_extension_ret(s, mul_extension_ret(air_sum, eq_factor_air))
         curr_randomness += N_AIR_COLUMNS[table_index] * DIM
 
-    copy_5(mul_extension_ret(s, final_value), end_sum)
+    copy_ef(mul_extension_ret(s, final_value), end_sum)
 
     return bytecode_claim
 
@@ -581,8 +583,8 @@ def compute_column_prefixes(first_col_offset, n_vars, point, n_cols: Const):
 
 def fingerprint_2(table_index, data_1, data_2, logup_alphas_eq_poly):
     buff = Array(DIM * 2)
-    copy_5(data_1, buff)
-    copy_5(data_2, buff + DIM)
+    copy_ef(data_1, buff)
+    copy_ef(data_2, buff + DIM)
     res: Mut = dot_product_ee_ret(buff, logup_alphas_eq_poly, 2)
     res = add_extension_ret(
         res, mul_base_extension_ret(table_index, logup_alphas_eq_poly + (2 ** log2_ceil(MAX_BUS_WIDTH) - 1) * DIM)
@@ -671,7 +673,7 @@ def verify_gkr_quotient_step(prev_fs, n_vars, point, claim_num, claim_den):
     new_claim_num = dot_product_ee_ret(inner_evals, point_poly_eq, 2)
     new_claim_den = dot_product_ee_ret(inner_evals + 2 * DIM, point_poly_eq, 2)
 
-    copy_5(beta, postponed_point + n_vars * DIM)
+    copy_ef(beta, postponed_point + n_vars * DIM)
 
     return fs, postponed_point, new_claim_num, new_claim_den
 
diff --git a/crates/rec_aggregation/zkdsl_implem/utils.py b/crates/rec_aggregation/zkdsl_implem/utils.py
index d4b2e983c..908d22938 100644
--- a/crates/rec_aggregation/zkdsl_implem/utils.py
+++ b/crates/rec_aggregation/zkdsl_implem/utils.py
@@ -1,10 +1,13 @@
 from snark_lib import *
 from hashing import *
 
-F_BITS = 31  # koala-bear = 31 bits
+F_BITS = 64  # Goldilocks (P = 2^64 - 2^32 + 1, values fit in u64)
+HALF_BITS = 32  # Goldilocks splits cleanly at 32:32 for canonical-form checks.
 
-TWO_ADICITY = 24
-ROOT = 1791270792  # of order 2^TWO_ADICITY
+TWO_ADICITY = 32
+ROOT = 1753635133440165772  # = 0x185629dcda58878c, of order 2^TWO_ADICITY
+
+EFFECTIVE_TWO_ADICITY = EFFECTIVE_TWO_ADICITY_PLACEHOLDER
 
 
 @inline
@@ -55,7 +58,7 @@ def powers_const(alpha, n: Const):
     set_to_one(res)
     if n == 1:
         return res
-    copy_5(alpha, res + DIM)
+    copy_ef(alpha, res + DIM)
     for i in unroll(1, n - 1):
         mul_extension(res + i * DIM, res + DIM, res + (i + 1) * DIM)
     return res
@@ -92,7 +95,7 @@ def compute_eq_mle_extension(point, n: Const):
 
     for s in unroll(0, n):
         p = Array(DIM)
-        copy_5(point + (n - 1 - s) * DIM, p)
+        copy_ef(point + (n - 1 - s) * DIM, p)
         for i in unroll(0, 2**s):
             mul_extension(p, res + (2**s - 1 + i) * DIM, res + (2 ** (s + 1) - 1 + 2**s + i) * DIM)
             sub_extension(
@@ -174,7 +177,7 @@ def expand_from_univariate_ext(alpha, n):
 
 def expand_from_univariate_ext_const(alpha, n: Const):
     res = Array(n * DIM)
-    copy_5(alpha, res)
+    copy_ef(alpha, res)
     for i in unroll(0, n - 1):
         mul_extension(res + i * DIM, res + i * DIM, res + (i + 1) * DIM)
     return res
@@ -199,7 +202,7 @@ def eval_multilinear_coeffs_rev(coeffs, point, n: Const):
     set_to_one(basis)
     for k in unroll(0, n):
         p = Array(DIM)
-        copy_5(point + k * DIM, p)
+        copy_ef(point + k * DIM, p)
         for j in unroll(0, 2**k):
             mul_extension(basis + j * DIM, p, basis + (j + 2**k) * DIM)
     result = Array(DIM)
@@ -373,87 +376,89 @@ def sub_extension_ret(a, b):
     return c
 
 
+# Semantic copy / zero helpers. Sized to Goldilocks (DIM=3, DIGEST_LEN=4,
+# MESSAGE_LEN=4). Each helper is a thin wrapper over `dot_product_ee(_, ONE_EF_PTR, _)`
+# which copies DIM elements via the extension-op precompile.
+
+
 @inline
-def copy_5(a, b):
+def copy_ef(a, b):
+    # Copy one extension-field element = DIM entries.
     dot_product_ee(a, ONE_EF_PTR, b)
     return
 
 
 @inline
-def set_to_5_zeros(a):
+def zero_ef(a):
+    # Zero one extension-field element = DIM entries.
     zero_ptr = ZERO_VEC_PTR
     dot_product_ee(a, ONE_EF_PTR, zero_ptr)
     return
 
 
 @inline
-def set_to_6_zeros(a):
+def zero_digest_tail(a):
+    # Zero DIGEST_LEN-1 entries — typically called on `ptr + 1` after writing a
+    # domain-sep byte into slot 0 of a digest-sized buffer. Under Goldilocks
+    # DIGEST_LEN-1 == DIM so one dot_product_ee suffices.
     zero_ptr = ZERO_VEC_PTR
     dot_product_ee(a, ONE_EF_PTR, zero_ptr)
-    a[5] = 0
     return
 
 
 @inline
-def copy_6(a, b):
-    dot_product_ee(a, ONE_EF_PTR, b)
-    a[5] = b[5]
+def zero_digest(a):
+    # Zero one digest = DIGEST_LEN entries via two overlapping DIM clears.
+    zero_ptr = ZERO_VEC_PTR
+    dot_product_ee(a, ONE_EF_PTR, zero_ptr)
+    dot_product_ee(a + (DIGEST_LEN - DIM), ONE_EF_PTR, zero_ptr)
     return
 
 
 @inline
-def set_to_7_zeros(a):
-    zero_ptr = ZERO_VEC_PTR
-    dot_product_ee(a, ONE_EF_PTR, zero_ptr)
-    a[5] = 0
-    a[6] = 0
+def copy_digest(a, b):
+    # Copy one digest = DIGEST_LEN entries via two overlapping DIM copies.
+    dot_product_ee(a, ONE_EF_PTR, b)
+    dot_product_ee(a + (DIGEST_LEN - DIM), ONE_EF_PTR, b + (DIGEST_LEN - DIM))
     return
 
 
 @inline
-def set_to_8_zeros(a):
-    zero_ptr = ZERO_VEC_PTR
-    dot_product_ee(a, ONE_EF_PTR, zero_ptr)
-    dot_product_ee(a + (8 - DIM), ONE_EF_PTR, zero_ptr)
+def copy_message(a, b):
+    # Copy one message = MESSAGE_LEN entries. Under Goldilocks MESSAGE_LEN ==
+    # DIGEST_LEN, so this is structurally identical to `copy_digest`.
+    dot_product_ee(a, ONE_EF_PTR, b)
+    dot_product_ee(a + (DIGEST_LEN - DIM), ONE_EF_PTR, b + (DIGEST_LEN - DIM))
     return
 
 
 @inline
-def set_to_16_zeros(a):
+def set_to_8_zeros(a):
+    # Zero 8 entries (the duplex-sponge state) via three overlapping DIM clears.
     zero_ptr = ZERO_VEC_PTR
     dot_product_ee(a, ONE_EF_PTR, zero_ptr)
-    dot_product_ee(a + 5, ONE_EF_PTR, zero_ptr)
-    dot_product_ee(a + 10, ONE_EF_PTR, zero_ptr)
-    a[15] = 0
+    dot_product_ee(a + DIM, ONE_EF_PTR, zero_ptr)
+    dot_product_ee(a + (8 - DIM), ONE_EF_PTR, zero_ptr)
     return
 
 
 @inline
-def copy_16(a, b):
-    dot_product_ee(a, ONE_EF_PTR, b)
-    dot_product_ee(a + 5, ONE_EF_PTR, b + 5)
-    dot_product_ee(a + 10, ONE_EF_PTR, b + 10)
-    a[15] = b[15]
+def copy_poseidon_input(a, b):
+    # Copy a full Poseidon8 input block = 2 × DIGEST_LEN entries.
+    copy_digest(a, b)
+    copy_digest(a + DIGEST_LEN, b + DIGEST_LEN)
     return
 
 
 @inline
 def copy_8(a, b):
+    # Copy 8 entries (the duplex-sponge state) via three overlapping DIM copies.
     dot_product_ee(a, ONE_EF_PTR, b)
+    dot_product_ee(a + DIM, ONE_EF_PTR, b + DIM)
     dot_product_ee(a + (8 - DIM), ONE_EF_PTR, b + (8 - DIM))
     return
 
 
-@inline
-def copy_32(a, b):
-    chunks = div_floor(32, DIM)
-    for i in unroll(0, chunks):
-        copy_5(a + i * DIM, b + i * DIM)
-    if DIM * chunks != 32:
-        copy_5(a + (32 - DIM), b + (32 - DIM))
-    return
-
-
 @inline
 def copy_many_ef(a, b, n):
     for i in unroll(0, n):
@@ -502,36 +507,35 @@ def sum_2_ef_fractions(a_num, a_den, b_num, b_den):
     return sum_num, common_den
 
 
-# p = 2^31 - 2^24 + 1
-# in binary: p = 1111111000000000000000000000001
-#        p - 1 = 1111111000000000000000000000000
-#        p - 2 = 1111110111111111111111111111111
-#        p - 3 = 1111110111111111111111111111110
-#        ...
+# Goldilocks: p = 2^64 - 2^32 + 1 = 0xFFFFFFFF_00000001
+#   p - 1 = 0xFFFFFFFF_00000000
+#   p - 2 = 0xFFFFFFFE_FFFFFFFF
+#   ...
 # Any field element (< p) is either:
-# -   1111111    | 00...00
-# - not(1111111) | xx...xx
+#   - high 32 bits = 0xFFFFFFFF and low 32 bits = 0
+#   - high 32 bits < 0xFFFFFFFF and low 32 bits arbitrary
 def checked_decompose_bits(a):
-    # return a pointer to the 31 bits of a (big-endian: bits[0] = MSB, bits[F_BITS-1] = LSB)
-    # .. and the first 24 partial sums of these bits, where partial_sums_24[k] is the
-    # value of the lowest k+1 bits of a.
+    # Return a pointer to the F_BITS=64 big-endian bits of `a` (bits[0] is MSB,
+    # bits[F_BITS - 1] is LSB), plus the partial sums over the low HALF_BITS=32
+    # bits. Enforces canonicality.
     bits = Array(F_BITS)
     hint_decompose_bits(a, bits, F_BITS)
 
     for i in unroll(0, F_BITS):
         assert bits[i] * (1 - bits[i]) == 0
-    partial_sums_24 = Array(24)
-    partial_sums_24[0] = bits[F_BITS - 1]
-    for i in unroll(1, 24):
-        partial_sums_24[i] = partial_sums_24[i - 1] + bits[F_BITS - 1 - i] * 2**i
-    sum_7: Mut = bits[F_BITS - 1 - 24]
-    for i in unroll(1, 7):
-        sum_7 += bits[F_BITS - 1 - (24 + i)] * 2**i
-    if sum_7 == 127:
-        assert partial_sums_24[23] == 0
+    partial_sums_low = Array(HALF_BITS)
+    partial_sums_low[0] = bits[F_BITS - 1]
+    for i in unroll(1, HALF_BITS):
+        partial_sums_low[i] = partial_sums_low[i - 1] + bits[F_BITS - 1 - i] * 2**i
+    sum_high: Mut = bits[F_BITS - 1 - HALF_BITS]
+    for i in unroll(1, F_BITS - HALF_BITS):
+        sum_high += bits[F_BITS - 1 - HALF_BITS - i] * 2**i
+    # If the high 32 bits are all set, the low 32 bits must be zero (only p-1).
+    if sum_high == 2 ** (F_BITS - HALF_BITS) - 1:
+        assert partial_sums_low[HALF_BITS - 1] == 0
 
-    assert a == partial_sums_24[23] + sum_7 * 2**24
-    return bits, partial_sums_24
+    assert a == partial_sums_low[HALF_BITS - 1] + sum_high * 2**HALF_BITS
+    return bits, partial_sums_low
 
 
 @inline
@@ -560,23 +564,42 @@ def whir_1_merkle_step_and_pow(v, state_in, path_chunk, state_out, power_shift):
 
 @inline
 def decompose_and_verify_merkle_query(a, domain_size, prev_root, num_chunks, leaf_iv):
-    nibbles = Array(6)
-    hint_decompose_bits_merkle_whir(nibbles, a, 4)
+    # Only the low `n_nibbles = ceil(domain_size/4)` nibbles of `a` encode the Merkle
+    # query index; they are dispatched through `match_range` below so each needs an
+    # individual [0,16) check. The high bits only need a well-formed canonical 64-bit
+    # decomposition, so we bound them with coarser 16-bit chunk checks.
+    n_nibbles = div_ceil(domain_size, 4)
+    # A 16-bit chunk holds 4 nibbles, so the index nibbles span the first `n_index_16bit_chunks` chunks.
+    n_index_16bit_chunks = div_ceil(n_nibbles, 4)
+    n_idx_nibbles = n_index_16bit_chunks * 4  # nibbles we decompose & individually bound (>= n_nibbles)
 
-    for i in unroll(0, 6):
+    nibbles = Array(n_idx_nibbles)
+    hint_decompose_bits_merkle_whir(nibbles, a, n_idx_nibbles, 4)
+    for i in unroll(0, n_idx_nibbles):
         assert nibbles[i] < 16
 
-    partial_sum: Mut = nibbles[0]
-    for i in unroll(1, 6):
-        partial_sum += nibbles[i] * 16**i
-
-    # p = 2^31 - 2^24 + 1, so 2^24 * 127 = p - 1 ≡ -1 (mod p), hence inv(2^24) = -127.
-    # Deduce top7 from the identity partial_sum + top7 * 2^24 == a:
-    # top7 = (a - partial_sum) * inv(2^24) = (partial_sum - a) * 127
-    top7 = (partial_sum - a) * 127
-    assert top7 < 2**7
-    if top7 == 2**7 - 1:
-        assert partial_sum == 0
+    # Low 16*n_index_16bit_chunks bits, reconstructed from the bounded nibbles.
+    partial_sum_low: Mut = nibbles[0]
+    for i in unroll(1, n_idx_nibbles):
+        partial_sum_low += nibbles[i] * 16**i
+
+    # Rest of `a` as 16-bit chunks (chunk c covers bits [16*c, 16*c+16)), bounded directly.
+    chunks16 = Array(4)
+    hint_decompose_bits_merkle_whir(chunks16, a, 4, 16)
+    for c in unroll(n_index_16bit_chunks, 4):
+        assert chunks16[c] < 2**16
+    # Add the low-half chunk(s) the nibbles didn't cover (chunk 1 when n_index_16bit_chunks == 1).
+    for c in unroll(n_index_16bit_chunks, 2):
+        partial_sum_low += chunks16[c] * 2 ** (16 * c)
+
+    partial_sum_high = chunks16[2] + chunks16[3] * 2**16
+
+    # Canonicality: a == low + high * 2^32 with both halves < 2^32. The edge check below
+    # rejects the alternate decomposition of integers in [p, 2^64) (a soundness break):
+    # the only canonical element with top half = 2^32 - 1 is p - 1 = 2^64 - 2^32.
+    assert a == partial_sum_low + partial_sum_high * 2**HALF_BITS
+    if partial_sum_high == 2**HALF_BITS - 1:
+        assert partial_sum_low == 0
 
     leaf_data = Array(num_chunks * DIGEST_LEN)
     hint_witness("merkle_leaf", leaf_data)
@@ -585,7 +608,6 @@ def decompose_and_verify_merkle_query(a, domain_size, prev_root, num_chunks, lea
     merkle_path = Array(domain_size * DIGEST_LEN)
     hint_witness("merkle_path", merkle_path)
 
-    n_nibbles = div_ceil(domain_size, 4)
     states = Array((n_nibbles - 1) * DIGEST_LEN)
 
     prod: Mut = 1
@@ -808,18 +830,13 @@ def _verify_log2_large(n, log2: Const):
 
 
 def log2_ceil_runtime(n):
-    # requires: 2 < n <= 2^30
+    # requires: 2 < n <= 2^30 (still inside HALF_BITS=32, so `_verify_log2_small`
+    # is always chosen under Goldilocks).
     log2: Imm
     hint_log2_ceil(n, log2)
     assert log2 < 31
     if two_exp(log2) != n:
-        _, partial_sums_24 = checked_decompose_bits(n)
-        match_range(
-            log2,
-            range(2, 24),
-            lambda i: _verify_log2_small(n, partial_sums_24, i),
-            range(24, 31),
-            lambda i: _verify_log2_large(n, i),
-        )
+        _, partial_sums_low = checked_decompose_bits(n)
+        match_range(log2, range(2, 31), lambda i: _verify_log2_small(n, partial_sums_low, i))
     return log2
 
diff --git a/crates/rec_aggregation/zkdsl_implem/whir.py b/crates/rec_aggregation/zkdsl_implem/whir.py
index d14a10efb..e6ac3aad6 100644
--- a/crates/rec_aggregation/zkdsl_implem/whir.py
+++ b/crates/rec_aggregation/zkdsl_implem/whir.py
@@ -11,7 +11,6 @@
 WHIR_ALL_POTENTIAL_NUM_QUERIES = WHIR_ALL_POTENTIAL_NUM_QUERIES_PLACEHOLDER
 WHIR_ALL_POTENTIAL_QUERY_GRINDING = WHIR_ALL_POTENTIAL_QUERY_GRINDING_PLACEHOLDER
 WHIR_ALL_POTENTIAL_NUM_OODS = WHIR_ALL_POTENTIAL_NUM_OODS_PLACEHOLDER
-WHIR_ALL_POTENTIAL_FOLDING_GRINDING = WHIR_ALL_POTENTIAL_FOLDING_GRINDING_PLACEHOLDER
 MIN_STACKED_N_VARS = MIN_STACKED_N_VARS_PLACEHOLDER
 
 
@@ -27,9 +26,7 @@ def whir_open(
     fs: Mut = prev_fs
     root: Mut = prev_root
     claimed_sum: Mut = prev_claimed_sum
-    n_rounds, n_final_vars, num_queries, num_oods, query_grinding_bits, folding_grinding = get_whir_params(
-        n_vars, initial_log_inv_rate
-    )
+    n_rounds, n_final_vars, num_queries, num_oods, query_grinding_bits = get_whir_params(n_vars, initial_log_inv_rate)
     folding_factors = Array(n_rounds + 1)
     folding_factors[0] = WHIR_INITIAL_FOLDING_FACTOR
     for i in range(1, n_rounds + 1):
@@ -66,15 +63,14 @@ def whir_open(
             claimed_sum,
             query_grinding_bits[r],
             num_oods[r + 1],
-            folding_grinding[r],
         )
         if r == 0:
             domain_sz -= WHIR_FIRST_RS_REDUCTION_FACTOR
         else:
             domain_sz -= 1
 
-    fs, all_folding_randomness[n_rounds], claimed_sum = sumcheck_verify_with_grinding(
-        fs, WHIR_SUBSEQUENT_FOLDING_FACTOR, claimed_sum, 2, folding_grinding[n_rounds]
+    fs, all_folding_randomness[n_rounds], claimed_sum = sumcheck_verify(
+        fs, WHIR_SUBSEQUENT_FOLDING_FACTOR, claimed_sum, 2
     )
 
     fs, final_coeffcients = fs_receive_ef_by_log_dynamic(
@@ -106,7 +102,7 @@ def whir_open(
             ),
             lambda n: univariate_eval_on_base(final_coeffcients, alpha, n),
         )
-        copy_5(final_pol_evaluated_on_circle, final_folds + i * DIM)
+        copy_ef(final_pol_evaluated_on_circle, final_folds + i * DIM)
 
     fs, all_folding_randomness[n_rounds + 1], end_sum = sumcheck_verify(fs, n_final_vars, claimed_sum, 2)
 
@@ -115,10 +111,10 @@ def whir_open(
     start: Mut = folding_randomness_global
     for i in range(0, n_rounds + 1):
         for j in range(0, folding_factors[i]):
-            copy_5(all_folding_randomness[i] + j * DIM, start + j * DIM)
+            copy_ef(all_folding_randomness[i] + j * DIM, start + j * DIM)
         start += folding_factors[i] * DIM
     for j in range(0, n_final_vars):
-        copy_5(all_folding_randomness[n_rounds + 1] + j * DIM, start + j * DIM)
+        copy_ef(all_folding_randomness[n_rounds + 1] + j * DIM, start + j * DIM)
 
     all_ood_recovered_evals = Array(num_oods[0] * DIM)
     for i in range(0, num_oods[0]):
@@ -173,7 +169,7 @@ def whir_open(
         range(MAX_NUM_VARIABLES_TO_SEND_COEFFS - WHIR_SUBSEQUENT_FOLDING_FACTOR, MAX_NUM_VARIABLES_TO_SEND_COEFFS + 1),
         lambda n: eval_multilinear_coeffs_rev(final_coeffcients, all_folding_randomness[n_rounds + 1], n),
     )
-    # copy_5(mul_extension_ret(s, final_value), end_sum);
+    # copy_ef(mul_extension_ret(s, final_value), end_sum);
 
     return fs, folding_randomness_global, s, final_value, end_sum
 
@@ -192,7 +188,7 @@ def sumcheck_verify_helper(prev_fs, n_steps, prev_claimed_sum, degree: Const, ch
         polynomial_sum_at_0_and_1(poly, degree, claimed_sum)
         fs, rand = fs_sample_ef(fs)
         claimed_sum = univariate_polynomial_eval(poly, rand, degree)
-        copy_5(rand, challenges + sc_round * DIM)
+        copy_ef(rand, challenges + sc_round * DIM)
 
     return fs, claimed_sum
 
@@ -221,26 +217,11 @@ def sumcheck_verify_reversed_helper_const(prev_fs, n_steps: Const, prev_claimed_
         polynomial_sum_at_0_and_1(poly, degree, claimed_sum)
         fs, rand = fs_sample_ef(fs)
         claimed_sum = univariate_polynomial_eval(poly, rand, degree)
-        copy_5(rand, challenges + (n_steps - 1 - sc_round) * DIM)
+        copy_ef(rand, challenges + (n_steps - 1 - sc_round) * DIM)
 
     return fs, claimed_sum
 
 
-def sumcheck_verify_with_grinding(prev_fs, n_steps, prev_claimed_sum, degree: Const, folding_grinding_bits):
-    fs: Mut = prev_fs
-    claimed_sum: Mut = prev_claimed_sum
-    challenges = Array(n_steps * DIM)
-    for sc_round in range(0, n_steps):
-        fs, poly = fs_receive_ef_inlined(fs, degree + 1)
-        polynomial_sum_at_0_and_1(poly, degree, claimed_sum)
-        fs = fs_grinding(fs, folding_grinding_bits)
-        fs, rand = fs_sample_ef(fs)
-        claimed_sum = univariate_polynomial_eval(poly, rand, degree)
-        copy_5(rand, challenges + sc_round * DIM)
-
-    return fs, challenges, claimed_sum
-
-
 @inline
 def decompose_and_verify_merkle_batch(num_queries, sampled, root, height, num_chunks, circle_values, answers):
     debug_assert(height < 25)
@@ -257,20 +238,32 @@ def decompose_and_verify_merkle_batch(num_queries, sampled, root, height, num_ch
 def decompose_and_verify_merkle_batch_with_height(
     num_queries, sampled, root, height: Const, num_chunks, circle_values, answers
 ):
+    # Under Goldilocks (DIGEST_LEN=4, DIM=3) the value `num_chunks = two_pow_folding_factor * {1,DIM} / DIGEST_LEN`
+    # roughly doubles vs KoalaBear (DIGEST_LEN=8). We dispatch the union of both
+    # configurations so the same file compiles for either field.
     if num_chunks == DIM * 2:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, DIM * 2, circle_values, answers)
         return
     if num_chunks == 16:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 16, circle_values, answers)
         return
+    if num_chunks == 32:
+        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 32, circle_values, answers)
+        return
     if num_chunks == 8:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 8, circle_values, answers)
         return
+    if num_chunks == 12:
+        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 12, circle_values, answers)
+        return
     if num_chunks == 20:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 20, circle_values, answers)
         return
-    if num_chunks == 1:
-        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 1, circle_values, answers)
+    if num_chunks == 24:
+        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 24, circle_values, answers)
+        return
+    if num_chunks == 2:
+        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 2, circle_values, answers)
         return
     if num_chunks == 4:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 4, circle_values, answers)
@@ -355,12 +348,9 @@ def whir_round(
     claimed_sum,
     query_grinding_bits,
     num_ood,
-    folding_grinding_bits,
 ):
     fs: Mut = prev_fs
-    fs, folding_randomness, new_claimed_sum_a = sumcheck_verify_with_grinding(
-        fs, folding_factor, claimed_sum, 2, folding_grinding_bits
-    )
+    fs, folding_randomness, new_claimed_sum_a = sumcheck_verify(fs, folding_factor, claimed_sum, 2)
 
     fs, root, ood_points, ood_evals = parse_commitment(fs, num_ood)
 
@@ -449,10 +439,7 @@ def get_whir_params(n_vars, log_inv_rate):
 
     num_oods = get_num_oods(log_inv_rate, n_vars)
 
-    folding_grinding: Imm
-    folding_grinding = get_folding_grinding(log_inv_rate, n_vars)
-
-    return n_rounds, final_vars, num_queries, num_oods, query_grinding_bits, folding_grinding
+    return n_rounds, final_vars, num_queries, num_oods, query_grinding_bits
 
 
 @inline
@@ -468,7 +455,7 @@ def get_num_queries(log_inv_rate, n_vars):
 def get_num_queries_const_rate(log_inv_rate: Const, n_vars):
     res = match_range(
         n_vars,
-        range(MIN_STACKED_N_VARS, TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
+        range(MIN_STACKED_N_VARS, EFFECTIVE_TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
         lambda nv: get_num_queries_const(log_inv_rate, nv),
     )
     return res
@@ -497,7 +484,7 @@ def get_query_grinding_bits(log_inv_rate, n_vars):
 def get_query_grinding_bits_const_rate(log_inv_rate: Const, n_vars):
     res = match_range(
         n_vars,
-        range(MIN_STACKED_N_VARS, TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
+        range(MIN_STACKED_N_VARS, EFFECTIVE_TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
         lambda nv: get_query_grinding_bits_const(log_inv_rate, nv),
     )
     return res
@@ -513,35 +500,6 @@ def get_query_grinding_bits_const(log_inv_rate: Const, n_vars: Const):
     return query_grinding_bits
 
 
-@inline
-def get_folding_grinding(log_inv_rate, n_vars):
-    res = match_range(
-        log_inv_rate,
-        range(MIN_WHIR_LOG_INV_RATE, MAX_WHIR_LOG_INV_RATE + 1),
-        lambda r: get_folding_grinding_const_rate(r, n_vars),
-    )
-    return res
-
-
-def get_folding_grinding_const_rate(log_inv_rate: Const, n_vars):
-    res = match_range(
-        n_vars,
-        range(MIN_STACKED_N_VARS, TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
-        lambda nv: get_folding_grinding_const(log_inv_rate, nv),
-    )
-    return res
-
-
-def get_folding_grinding_const(log_inv_rate: Const, n_vars: Const):
-    max = len(WHIR_ALL_POTENTIAL_FOLDING_GRINDING[log_inv_rate - MIN_WHIR_LOG_INV_RATE][n_vars - MIN_STACKED_N_VARS])
-    folding_grinding = Array(max)
-    for i in unroll(0, max):
-        folding_grinding[i] = WHIR_ALL_POTENTIAL_FOLDING_GRINDING[log_inv_rate - MIN_WHIR_LOG_INV_RATE][
-            n_vars - MIN_STACKED_N_VARS
-        ][i]
-    return folding_grinding
-
-
 def get_num_oods(log_inv_rate, n_vars):
     res = match_range(
         log_inv_rate,
@@ -554,7 +512,7 @@ def get_num_oods(log_inv_rate, n_vars):
 def get_num_oods_const_rate(log_inv_rate: Const, n_vars):
     res = match_range(
         n_vars,
-        range(MIN_STACKED_N_VARS, TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
+        range(MIN_STACKED_N_VARS, EFFECTIVE_TWO_ADICITY + WHIR_INITIAL_FOLDING_FACTOR - log_inv_rate + 1),
         lambda nv: get_num_oods_const(log_inv_rate, nv),
     )
     return res
diff --git a/crates/rec_aggregation/zkdsl_implem/xmss_aggregate.py b/crates/rec_aggregation/zkdsl_implem/xmss_aggregate.py
index b5fc01493..0653967e5 100644
--- a/crates/rec_aggregation/zkdsl_implem/xmss_aggregate.py
+++ b/crates/rec_aggregation/zkdsl_implem/xmss_aggregate.py
@@ -11,18 +11,22 @@
 PUBLIC_PARAM_LEN_FE = PUBLIC_PARAM_LEN_FE_PLACEHOLDER
 XMSS_DIGEST_LEN = XMSS_DIGEST_LEN_PLACEHOLDER
 PUB_KEY_SIZE = XMSS_DIGEST_LEN + PUBLIC_PARAM_LEN_FE
-PP_IN_LEFT = DIGEST_LEN - XMSS_DIGEST_LEN
+PP_IN_LEFT = DIGEST_LEN - XMSS_DIGEST_LEN  # = 2 (Goldilocks: pp(2)|zeros(2))
 WOTS_SIG_SIZE = RANDOMNESS_LEN + V * XMSS_DIGEST_LEN
-# wots_public_key pair stride: each pair occupies 10 cells `[leading_0 | tip_a(4) | tip_b(4) | trailing_0]`. In order to be able to use copy_5 on both sides.
+# wots_public_key pair stride: each pair occupies (1 + 2*XMSS_DIGEST_LEN + 1) cells.
+# `[leading_0 | tip_a(2) | tip_b(2) | trailing_0]` so that copy_ef can be used on
+# both halves under Goldilocks (DIM = 3 = 1 + XMSS_DIGEST_LEN).
 WOTS_PK_PAIR_STRIDE = 2 + 2 * XMSS_DIGEST_LEN
-NUM_ENCODING_FE = div_ceil(V, (24 / W))
+NUM_ENCODING_FE = 4
+LOW_BITS_PER_ENCODING_FE = 32
 MERKLE_LEVELS_PER_CHUNK = MERKLE_LEVELS_PER_CHUNK_PLACEHOLDER
 N_MERKLE_CHUNKS = LOG_LIFETIME / MERKLE_LEVELS_PER_CHUNK
 INNER_PUB_MEM_SIZE = 2**INNER_PUBLIC_MEMORY_LOG_SIZE  # = DIGEST_LEN
 TWEAK_TABLE_ADDR = PREAMBLE_MEMORY_END
 
-# Tweak table layout: all tweaks are stored as a 4-FE slot [tw[0], tw[1], 0, 0]
-TWEAK_LEN = 4  # stride / slot size for non-encoding tweaks
+# Tweak table layout: each tweak is stored in a 2-FE slot [tw[0], 0]. Goldilocks
+# tweaks are 1 FE + 1 zero pad so the slot stride is 2 (= XMSS_DIGEST_LEN).
+TWEAK_LEN = 2  # stride / slot size for tweaks (1 actual + 1 zero)
 N_TWEAKS = 1 + V * CHAIN_LENGTH + 1 + LOG_LIFETIME
 TWEAK_TABLE_SIZE_FE_PADDED = next_multiple_of(N_TWEAKS * TWEAK_LEN, DIGEST_LEN)
 TWEAK_ENCODING_OFFSET = 0
@@ -40,47 +44,33 @@ def xmss_verify(pub_key, message, merkle_chunks):
     randomness = wots
     chain_starts = wots + RANDOMNESS_LEN
 
-    # 1) Encode: poseidon16_compress_half(message[0:8], [randomness(6) | tweak_encoding(2))
-    #            poseidon16_compress_half(pre_compressed, [pp(4) | zeros(4)])
+    # 1) Encode: poseidon8_compress_half(message[0:4], [randomness(3) | tweak_encoding(1)])
+    #            poseidon8_compress_half(pre_compressed, [pp(2) | zeros(2)])
     encoding_tweak = TWEAK_TABLE_ADDR + TWEAK_ENCODING_OFFSET
     a_input_right = Array(DIGEST_LEN)
-    copy_6(randomness, a_input_right)
-    a_input_right[6] = encoding_tweak[0]
-    a_input_right[7] = encoding_tweak[1]
+    copy_ef(randomness, a_input_right)
+    a_input_right[3] = encoding_tweak[0]
     pre_compressed = Array(DIGEST_LEN)
-    poseidon16_compress_half(message, a_input_right, pre_compressed)
+    poseidon8_compress_half(message, a_input_right, pre_compressed)
 
-    public_params_paded_buff = Array(DIGEST_LEN + 2)  # 0 [public_param(4) | zeros(4)] 0
-    copy_5(public_param - 1, public_params_paded_buff)
-    set_to_5_zeros(public_params_paded_buff + 5)
+    # `[0 | pp(2) | zeros(2) | 0]` so poseidon8_compress_hardcoded_left could be applied later
+    # (here we just need the right operand layout `[pp(2) | zeros(2)]`).
+    public_params_paded_buff = Array(DIGEST_LEN + 2)
+    copy_ef(public_param - 1, public_params_paded_buff)
+    zero_ef(public_params_paded_buff + 3)
     public_params_paded = public_params_paded_buff + 1
     encoding_fe = Array(DIGEST_LEN)
-    poseidon16_compress_half(pre_compressed, public_params_paded, encoding_fe)
+    poseidon8_compress_half(pre_compressed, public_params_paded, encoding_fe)
 
-    # Decompose the encoding into chunks of 2*W bits. Each chunk packs the chain step
-    # counts of two consecutive WOTS chains: chunk i = step_{2i} + CHAIN_LENGTH * step_{2i+1}.
-    encoding = Array(NUM_ENCODING_FE * 24 / (2 * W))
-
-    hint_decompose_bits_xmss(encoding, encoding_fe, NUM_ENCODING_FE, 2 * W)
-
-    # check that the decomposition is correct
+    debug_assert(V % 2 == 0)
+    encoding = Array(V / 2)
     for i in unroll(0, NUM_ENCODING_FE):
-        for j in unroll(0, 24 / (2 * W)):
-            assert encoding[i * (24 / (2 * W)) + j] < CHAIN_LENGTH**2
-
-        partial_sum: Mut = encoding[i * (24 / (2 * W))]
-        for j in unroll(1, 24 / (2 * W)):
-            partial_sum += encoding[i * (24 / (2 * W)) + j] * (CHAIN_LENGTH**2) ** j
-
-        # p = 2^31 - 2^24 + 1 = 127.2^24 + 1, so inv(2^24) = -127 (mod p).
-        # Deduce remaining_i from partial_sum + remaining_i * 2^24 == encoding_fe[i]:
-        # remaining_i = (encoding_fe[i] - partial_sum) * inv(2^24) = (partial_sum - encoding_fe[i]) * 127
-        remaining_i = (partial_sum - encoding_fe[i]) * 127
-        assert remaining_i < 127  # ensures uniformity + prevent overflow
+        decompose_encoding_fe(encoding_fe[i], encoding + i * ((V / 2) / NUM_ENCODING_FE))
 
-    debug_assert(V % 2 == 0)
     wots_public_key = Array((V / 2) * WOTS_PK_PAIR_STRIDE)
     target_sum: Mut = 0
+    # Pair structure: `[leading_0 | tip_a(XMSS_DIGEST_LEN) | tip_b(XMSS_DIGEST_LEN) | trailing_0]`
+    # so a length-DIM block straddles each tip with one zero-pad cell.
     for i in unroll(0, V / 2):
         chain_start_a = chain_starts + (2 * i) * XMSS_DIGEST_LEN
         chain_start_b = chain_starts + (2 * i + 1) * XMSS_DIGEST_LEN
@@ -117,31 +107,38 @@ def xmss_verify(pub_key, message, merkle_chunks):
 
 
 @inline
-def chain_hash_pa(input, n, output, chain_i_tweaks, chain_right):
-    starting_step = CHAIN_LENGTH - 1 - n
-    if n == 1:
+def chain_hash_inner(input, n, output, chain_i_tweaks, chain_right):
+    # Iterate the WOTS chain hash `num_hashes = (CHAIN_LENGTH-1) - n` times,
+    # starting from `input`, writing the final chain tip to `output`.
+    # Caller is responsible for the num_hashes == 0 case, i.e. n == CHAIN_LENGTH - 1 (a copy from input).
+    num_hashes = (CHAIN_LENGTH - 1) - n
+    starting_step = CHAIN_LENGTH - 1 - num_hashes
+
+    if num_hashes == 1:
         first_tweak = chain_i_tweaks + starting_step * TWEAK_LEN
-        poseidon16_compress_quarter_hardcoded_left(input, chain_right, output, first_tweak)
+        poseidon8_compress_quarter_hardcoded_left(input, chain_right, output, first_tweak)
     else:
-        digests = Array(n * XMSS_DIGEST_LEN)
+        digests = Array(num_hashes * XMSS_DIGEST_LEN)
 
-        # Hash 0: input → digests[0..4]
+        # Hash 0: input → digests[0..XMSS_DIGEST_LEN]
         first_tweak = chain_i_tweaks + starting_step * TWEAK_LEN
-        poseidon16_compress_quarter_hardcoded_left(input, chain_right, digests, first_tweak)
+        poseidon8_compress_quarter_hardcoded_left(input, chain_right, digests, first_tweak)
 
-        # Hashes 1..n-2: digests[(j-1)*4..j*4] → digests[j*4..(j+1)*4]
-        for j in unroll(1, n - 1):
+        # Hashes 1..num_hashes-2
+        for j in unroll(1, num_hashes - 1):
             cur_tweak = chain_i_tweaks + (starting_step + j) * TWEAK_LEN
-            poseidon16_compress_quarter_hardcoded_left(
+            poseidon8_compress_quarter_hardcoded_left(
                 digests + (j - 1) * XMSS_DIGEST_LEN,
                 chain_right,
                 digests + j * XMSS_DIGEST_LEN,
                 cur_tweak,
             )
 
-        # Final hash: digests[(n-2)*4..(n-1)*4] → output
-        last_tweak = chain_i_tweaks + (starting_step + n - 1) * TWEAK_LEN
-        poseidon16_compress_quarter_hardcoded_left(digests + (n - 2) * XMSS_DIGEST_LEN, chain_right, output, last_tweak)
+        # Final hash → output
+        last_tweak = chain_i_tweaks + (starting_step + num_hashes - 1) * TWEAK_LEN
+        poseidon8_compress_quarter_hardcoded_left(
+            digests + (num_hashes - 2) * XMSS_DIGEST_LEN, chain_right, output, last_tweak
+        )
     return
 
 
@@ -157,33 +154,55 @@ def chain_hash_pair(
     chain_right,
     pair_sum_ptr,
 ):
-    # Pair-encoded chain hash. `n` is a compile-time constant in [0, CHAIN_LENGTH^2)
     raw_a = n % CHAIN_LENGTH
     raw_b = (n - raw_a) / CHAIN_LENGTH
     num_hashes_a = (CHAIN_LENGTH - 1) - raw_a
     num_hashes_b = (CHAIN_LENGTH - 1) - raw_b
 
     if num_hashes_a == 0:
-        copy_5(input_a - 1, output_a - 1)
+        copy_ef(input_a - 1, output_a - 1)
     else:
-        chain_hash_pa(input_a, num_hashes_a, output_a, tweaks_a, chain_right)
+        chain_hash_inner(input_a, raw_a, output_a, tweaks_a, chain_right)
 
     if num_hashes_b == 0:
-        copy_5(input_b, output_b)
+        copy_ef(input_b, output_b)
     else:
-        chain_hash_pa(input_b, num_hashes_b, output_b, tweaks_b, chain_right)
+        chain_hash_inner(input_b, raw_b, output_b, tweaks_b, chain_right)
 
     pair_sum_ptr[0] = raw_a + raw_b
     return
 
 
+@inline
+def decompose_encoding_fe(fe_value, chunks_ptr):
+    limbs = Array(2)
+    hint_decompose_bits_xmss(chunks_ptr, limbs, fe_value)
+
+    for k in unroll(0, 5):
+        assert chunks_ptr[k] < CHAIN_LENGTH**2
+    assert limbs[0] < 2**16
+    assert limbs[1] < 2**16
+
+    low: Mut = chunks_ptr[0]
+    for k in unroll(1, 5):
+        low += chunks_ptr[k] * (2 ** (2 * W * k))
+
+    high = limbs[0] + limbs[1] * (2**16)
+    assert fe_value == low + (2**32) * high
+    assert high != 2**32 - 1  # ensures uniformity + prevents overflow
+
+    return
+
+
 @inline
 def wots_pk_hash(wots_public_key, public_param):
+    # T-Sponge with replacement: IV = poseidon8([tweak(1)|0|pp(2)], zeros)
+    # then absorb pairs of WOTS chain tips.
     N_CHUNKS = V / 2
     states = Array(N_CHUNKS * DIGEST_LEN)
-    poseidon16_permute_half_hardcoded_left(public_param, ZERO_VEC_PTR, states, TWEAK_TABLE_ADDR + TWEAK_WOTS_PK_OFFSET)
+    poseidon8_permute_half_hardcoded_left(public_param, ZERO_VEC_PTR, states, TWEAK_TABLE_ADDR + TWEAK_WOTS_PK_OFFSET)
     for i in unroll(0, N_CHUNKS - 1):
-        poseidon16_permute_half(
+        poseidon8_permute_half(
             states + i * DIGEST_LEN,
             wots_public_key + i * WOTS_PK_PAIR_STRIDE + 1,
             states + (i + 1) * DIGEST_LEN,
@@ -193,14 +212,6 @@ def wots_pk_hash(wots_public_key, public_param):
     )
 
 
-@inline
-def set_buf_prefix_right(buf, public_param):
-    # Writes [pp(4)] to buf[0..4] — the RIGHT-input prefix.
-    for k in unroll(0, PP_IN_LEFT):
-        buf[k] = public_param[k]
-    return
-
-
 @inline
 def do_4_merkle_levels(b, state_in, state_out, public_param, merkle_tweaks_chunk):
     b0 = b % 2
@@ -211,33 +222,34 @@ def do_4_merkle_levels(b, state_in, state_out, public_param, merkle_tweaks_chunk
     r3 = (r2 - b2) / 2
     b3 = r3 % 2
 
+    # buf0 layout: [0 | left_child(2) | right_child(2) | 0] (stride DIM=3 on each side)
     buf0_alloc = Array(XMSS_DIGEST_LEN * 2 + 2)
     buf0 = buf0_alloc + 1
     if b0 == 1:
-        # state_in is the LEFT child → state_in[0..4] lands at buf0[0..4].
-        copy_5(state_in - 1, buf0 - 1)
+        # state_in is the LEFT child → buf0[0..XMSS_DIGEST_LEN].
+        copy_ef(state_in - 1, buf0 - 1)
         hint_witness("xmss_merkle_node", buf0 + XMSS_DIGEST_LEN)
     else:
-        # state_in is the RIGHT child → state_in[0..4] lands at buf0[4..8].
+        # state_in is the RIGHT child → buf0[XMSS_DIGEST_LEN..2*XMSS_DIGEST_LEN].
         hint_witness("xmss_merkle_node", buf0)
-        copy_5(state_in, buf0 + XMSS_DIGEST_LEN)
+        copy_ef(state_in, buf0 + XMSS_DIGEST_LEN)
 
-    # Level 0 hash
+    # Level 0 hash → buf1
     buf1 = Array(XMSS_DIGEST_LEN * 2)
     if b1 == 1:
-        poseidon16_compress_quarter_hardcoded_left(public_param, buf0, buf1, merkle_tweaks_chunk)
+        poseidon8_compress_quarter_hardcoded_left(public_param, buf0, buf1, merkle_tweaks_chunk)
         hint_witness("xmss_merkle_node", buf1 + XMSS_DIGEST_LEN)
     else:
-        poseidon16_compress_quarter_hardcoded_left(public_param, buf0, buf1 + XMSS_DIGEST_LEN, merkle_tweaks_chunk)
+        poseidon8_compress_quarter_hardcoded_left(public_param, buf0, buf1 + XMSS_DIGEST_LEN, merkle_tweaks_chunk)
         hint_witness("xmss_merkle_node", buf1)
 
     # Level 1 hash → buf2
     buf2 = Array(XMSS_DIGEST_LEN * 2)
     if b2 == 1:
-        poseidon16_compress_quarter_hardcoded_left(public_param, buf1, buf2, merkle_tweaks_chunk + 1 * TWEAK_LEN)
+        poseidon8_compress_quarter_hardcoded_left(public_param, buf1, buf2, merkle_tweaks_chunk + 1 * TWEAK_LEN)
         hint_witness("xmss_merkle_node", buf2 + XMSS_DIGEST_LEN)
     else:
-        poseidon16_compress_quarter_hardcoded_left(
+        poseidon8_compress_quarter_hardcoded_left(
             public_param, buf1, buf2 + XMSS_DIGEST_LEN, merkle_tweaks_chunk + 1 * TWEAK_LEN
         )
         hint_witness("xmss_merkle_node", buf2)
@@ -245,15 +257,15 @@ def do_4_merkle_levels(b, state_in, state_out, public_param, merkle_tweaks_chunk
     # Level 2 hash → buf3
     buf3 = Array(XMSS_DIGEST_LEN * 2)
     if b3 == 1:
-        poseidon16_compress_quarter_hardcoded_left(public_param, buf2, buf3, merkle_tweaks_chunk + 2 * TWEAK_LEN)
+        poseidon8_compress_quarter_hardcoded_left(public_param, buf2, buf3, merkle_tweaks_chunk + 2 * TWEAK_LEN)
         hint_witness("xmss_merkle_node", buf3 + XMSS_DIGEST_LEN)
     else:
-        poseidon16_compress_quarter_hardcoded_left(
+        poseidon8_compress_quarter_hardcoded_left(
             public_param, buf2, buf3 + XMSS_DIGEST_LEN, merkle_tweaks_chunk + 2 * TWEAK_LEN
         )
         hint_witness("xmss_merkle_node", buf3)
 
-    poseidon16_compress_quarter_hardcoded_left(public_param, buf3, state_out, merkle_tweaks_chunk + 3 * TWEAK_LEN)
+    poseidon8_compress_quarter_hardcoded_left(public_param, buf3, state_out, merkle_tweaks_chunk + 3 * TWEAK_LEN)
     return
 
 
diff --git a/crates/sub_protocols/src/air_sumcheck.rs b/crates/sub_protocols/src/air_sumcheck.rs
index 0f536d7fa..3fb3403f1 100644
--- a/crates/sub_protocols/src/air_sumcheck.rs
+++ b/crates/sub_protocols/src/air_sumcheck.rs
@@ -4,7 +4,7 @@ use std::ops::{Add, AddAssign, Mul, Sub};
 
 use backend::*;
 use lean_vm::ColIndex;
-use tracing::info_span;
+use tracing::{info_span, instrument};
 
 // Sumcheck to prove validity of AIR constraints
 //
@@ -323,38 +323,6 @@ where
 {
     let unpack_sum_packed = |s: EFPacking<EF>| -> EF { EFPacking::<EF>::to_ext_iter([s]).sum::<EF>() };
 
-    if let Some((low_degree, low_n_constraints)) = computation.low_degree_air() {
-        match multilinears {
-            MleGroupRef::BasePacked(cols) => {
-                return compute_raw_poly_degree_split::<EF, A, PFPacking<EF>, _, _>(
-                    cols,
-                    |j| split_eq.get_packed(j),
-                    computation,
-                    extra_data,
-                    fold_bit,
-                    active_count_pairs,
-                    low_degree,
-                    low_n_constraints,
-                    unpack_sum_packed,
-                );
-            }
-            MleGroupRef::ExtensionPacked(cols) => {
-                return compute_raw_poly_degree_split::<EF, A, EFPacking<EF>, _, _>(
-                    cols,
-                    |j| split_eq.get_packed(j),
-                    computation,
-                    extra_data,
-                    fold_bit,
-                    active_count_pairs,
-                    low_degree,
-                    low_n_constraints,
-                    unpack_sum_packed,
-                );
-            }
-            _ => {}
-        }
-    }
-
     match multilinears {
         MleGroupRef::BasePacked(cols) => compute_raw_poly_impl::<EF, A, PFPacking<EF>, EFPacking<EF>, _, _>(
             cols,
@@ -399,163 +367,6 @@ where
     }
 }
 
-#[allow(clippy::too_many_arguments)]
-fn compute_raw_poly_degree_split<EF, A, IF, GetEq, UnpackSum>(
-    cols: &[&[IF]],
-    get_split_eq: GetEq,
-    computation: &A,
-    extra_data: &A::ExtraData,
-    fold_bit: usize,
-    active_count_pairs: usize,
-    low_degree: usize,
-    low_n_constraints: usize,
-    unpack_sum: UnpackSum,
-) -> Vec<EF>
-where
-    EF: ExtensionField<PF<EF>>,
-    A: Air + 'static,
-    A::ExtraData: AlphaPowers<EF>,
-    IF: Algebra<PFPacking<EF>> + Copy + Send + Sync + Sub<Output = IF> + AddAssign + PrimeCharacteristicRing + 'static,
-    EFPacking<EF>: PrimeCharacteristicRing
-        + Mul<IF, Output = EFPacking<EF>>
-        + Add<IF, Output = EFPacking<EF>>
-        + Mul<PFPacking<EF>, Output = EFPacking<EF>>,
-    GetEq: Fn(usize) -> EFPacking<EF> + Sync + Send,
-    UnpackSum: Fn(EFPacking<EF>) -> EF + Sync + Send,
-{
-    let degree = computation.degree_air();
-    let n_cols = cols.len();
-    let n_flat = computation.n_columns();
-    let stride = 1usize << fold_bit;
-    let lo_mask = stride - 1;
-    let n_full = low_degree + 1;
-    let n_skip = degree - n_full;
-
-    // Points where we run the full AIR constraints = {0, 2, 3, …, d_low+1}
-    let low_zs: Vec<_> = (std::iter::once(0).chain(2..=(low_degree + 1)).map(PF::<EF>::from_usize)).collect();
-    // Points where we skip the low-degree constraints = target_z = {d_low+2, …, degree}
-    let hi_zs: Vec<_> = ((low_degree + 2)..=degree).map(PF::<EF>::from_usize).collect();
-    let hi_zs_halved: Vec<_> = hi_zs.iter().map(|&tz| tz.halve()).collect();
-    let lagrange_coeffs = lagrange_basis_evals(&low_zs, &hi_zs);
-
-    let acc = (0..active_count_pairs)
-        .into_par_iter()
-        .fold(
-            || {
-                (
-                    vec![EFPacking::<EF>::ZERO; degree],
-                    Vec::<IF>::with_capacity(n_cols),
-                    Vec::<IF>::with_capacity(n_cols),
-                    vec![EFPacking::<EF>::ZERO; n_full],
-                    Vec::<IF>::new(),
-                    Vec::<IF>::new(),
-                    Vec::<IF>::new(),
-                )
-            },
-            |(mut acc, mut point, mut diff, mut low_evals, mut state_0, mut state_2, mut cached_buf), new_j| {
-                let i_hi = new_j >> fold_bit;
-                let i_lo = new_j & lo_mask;
-                let i0 = (i_hi << (fold_bit + 1)) | i_lo;
-                let i1 = i0 | stride;
-                let partial_eq = get_split_eq(new_j);
-
-                // `point` holds column values at z=0; `diff[k] = col_k[i1] - col_k[i0]`.
-                // Invariant for the rest of this closure: `col_k(z) = point[k] + z · diff[k]`,
-                // so advancing z by 1 means `point[k] += diff[k]` for all k.
-                point.clear();
-                diff.clear();
-                for c in cols {
-                    let lo = c[i0];
-                    let hi = c[i1];
-                    point.push(lo);
-                    diff.push(hi - lo);
-                }
-
-                // Phase 1: full AIR constraints
-
-                // z = 0: full eval, capture post-block state.
-                {
-                    let mut folder = ConstraintFolderPacked::new(&point[..n_flat], &point[n_flat..], extra_data);
-                    folder.cached_state = Some(state_0);
-                    Air::eval(computation, &mut folder, extra_data);
-                    acc[0] += folder.accumulator * partial_eq;
-                    low_evals[0] = folder.accumulator_low;
-                    state_0 = folder.cached_state.unwrap();
-                }
-
-                // z = 2: advance `point` by 2·diff, full eval, capture post-block state.
-                // Together with `state_0` this pins down the linear `state(z)` (linear when we "omit" the low degree constraints of the block)
-                for k in 0..n_cols {
-                    point[k] += diff[k].double();
-                }
-                {
-                    let mut folder = ConstraintFolderPacked::new(&point[..n_flat], &point[n_flat..], extra_data);
-                    folder.cached_state = Some(state_2);
-                    Air::eval(computation, &mut folder, extra_data);
-                    acc[1] += folder.accumulator * partial_eq;
-                    low_evals[1] = folder.accumulator_low;
-                    state_2 = folder.cached_state.unwrap();
-                }
-
-                // z = 3, …, d_low+1: still doing full eval
-                for z_idx in 2..n_full {
-                    for k in 0..n_cols {
-                        point[k] += diff[k];
-                    }
-                    let mut folder = ConstraintFolderPacked::new(&point[..n_flat], &point[n_flat..], extra_data);
-                    Air::eval(computation, &mut folder, extra_data);
-                    acc[z_idx] += folder.accumulator * partial_eq;
-                    low_evals[z_idx] = folder.accumulator_low;
-                }
-
-                // Phase 2: skip the low degree constraints of the block
-                // For each skipped point, assemble Constraints(z) = high(z) + low(z):
-                //   -high(z): run folder with `skip_low = true`
-                //   -low(z): deduce it via Lagrange-interpolation from previous computations
-                for t in 0..n_skip {
-                    for k in 0..n_cols {
-                        point[k] += diff[k];
-                    }
-
-                    cached_buf.clear();
-                    for i in 0..state_0.len() {
-                        cached_buf
-                            .push(state_0[i] + (state_2[i] - state_0[i]) * PFPacking::<EF>::from(hi_zs_halved[t]));
-                    }
-
-                    let mut folder = ConstraintFolderPacked::new(&point[..n_flat], &point[n_flat..], extra_data);
-                    folder.skip_low = true;
-                    folder.cached_state = Some(cached_buf);
-                    folder.low_ci_count = low_n_constraints;
-                    Air::eval(computation, &mut folder, extra_data);
-                    cached_buf = folder.cached_state.unwrap();
-
-                    // low(hi_zs[t]) = Σ_i L_i(hi_zs[t]) · low(low_zs[i])
-                    let mut low_interpolated = EFPacking::<EF>::ZERO;
-                    for (i, lc) in lagrange_coeffs[t].iter().enumerate() {
-                        low_interpolated += low_evals[i] * PFPacking::<EF>::from(*lc);
-                    }
-
-                    acc[n_full + t] += (folder.accumulator + low_interpolated) * partial_eq;
-                }
-
-                (acc, point, diff, low_evals, state_0, state_2, cached_buf)
-            },
-        )
-        .map(|(acc, ..)| acc)
-        .reduce(
-            || vec![EFPacking::<EF>::ZERO; degree],
-            |mut a, b| {
-                for i in 0..degree {
-                    a[i] += b[i];
-                }
-                a
-            },
-        );
-
-    acc.into_iter().map(&unpack_sum).collect()
-}
-
 #[allow(clippy::too_many_arguments)]
 fn compute_raw_poly_impl<EF, A, IF, EFT, GetEq, UnpackSum>(
     cols: &[&[IF]],
@@ -633,6 +444,7 @@ where
     acc.into_iter().map(unpack_sum).collect()
 }
 
+#[instrument(skip_all)]
 pub fn prove_batched_air_sumcheck<'a, EF: ExtensionField<PF<EF>>>(
     prover_state: &mut impl FSProver<EF>,
     sessions: &mut [Box<dyn OuterSumcheckSession<EF> + 'a>],
diff --git a/crates/sub_protocols/src/quotient_gkr/mod.rs b/crates/sub_protocols/src/quotient_gkr/mod.rs
index 26fa25a65..cac959698 100644
--- a/crates/sub_protocols/src/quotient_gkr/mod.rs
+++ b/crates/sub_protocols/src/quotient_gkr/mod.rs
@@ -201,10 +201,10 @@ mod tests {
 
     use super::*;
     use rand::{RngExt, SeedableRng, rngs::StdRng};
-    use utils::{get_poseidon16, init_tracing};
+    use utils::{get_poseidon8, init_tracing};
 
-    type F = KoalaBear;
-    type EF = QuinticExtensionFieldKB;
+    type F = Goldilocks;
+    type EF = CubicExtensionFieldGL;
 
     fn sum_all_quotients(nums: &[F], den: &[EF]) -> EF {
         nums.par_iter().zip(den).map(|(&n, &d)| EF::from(n) / d).sum()
@@ -248,7 +248,7 @@ mod tests {
         denominators_raw.extend(std::iter::repeat_n(EF::ONE, n - active_len));
 
         let real_quotient = sum_all_quotients(&numerators_raw, &denominators_raw);
-        let mut prover_state = ProverState::new(get_poseidon16().clone(), Default::default());
+        let mut prover_state = ProverState::new(*get_poseidon8(), Default::default());
 
         // Keep natural-layout MLEs to check claims at `claim_point`.
         let numerators_nat = MleOwned::BasePacked(pack_extension(&numerators_raw));
@@ -273,8 +273,7 @@ mod tests {
         println!("Proving time: {:.3}s", time.elapsed().as_secs_f64());
 
         let mut verifier_state =
-            VerifierState::<EF, _>::new(prover_state.into_proof(), get_poseidon16().clone(), Default::default())
-                .unwrap();
+            VerifierState::<EF, _>::new(prover_state.into_proof(), *get_poseidon8(), Default::default()).unwrap();
         let verifier_statements = verify_gkr_quotient::<EF>(&mut verifier_state, log_n).unwrap();
         let (retrieved_quotient, claim_point, claim_num, claim_den) = verifier_statements;
         assert_eq!(claim_point_prover, claim_point);
diff --git a/crates/sub_protocols/tests/prove_poseidon.rs b/crates/sub_protocols/tests/prove_poseidon.rs
index cc234375d..b7e0d070b 100644
--- a/crates/sub_protocols/tests/prove_poseidon.rs
+++ b/crates/sub_protocols/tests/prove_poseidon.rs
@@ -2,38 +2,62 @@ use std::time::Instant;
 
 use backend::*;
 use lean_vm::{
-    EF, ExtraDataForBuses, F, POSEIDON_COL_ADDR_LEFT_HI, POSEIDON_COL_ADDR_LEFT_LO, POSEIDON_COL_FLAG_OUT8,
-    POSEIDON_COL_INPUT_START, POSEIDON_COL_MULTIPLICITY, Poseidon16Precompile, fill_trace_poseidon_16,
-    num_cols_poseidon_16,
+    EF, ExtraDataForBuses, F, HALF_DIGEST_LEN, POSEIDON_8_COL_ADDR_LEFT_HI, POSEIDON_8_COL_ADDR_LEFT_LO,
+    POSEIDON_8_COL_FLAG_OUT4, POSEIDON_8_COL_INPUT_START, POSEIDON_8_COL_MULTIPLICITY, POSEIDON_8_COL_OUT_LO,
+    POSEIDON_8_COL_ROUND_START, Poseidon8Precompile, compute_poseidon8_witness, fill_trace_poseidon_8,
+    num_cols_poseidon_8,
 };
 use rand::{RngExt, SeedableRng, rngs::StdRng};
 use sub_protocols::{
     AirSumcheckSession, OuterSumcheckSession, natural_ordering_point_for_session, prove_batched_air_sumcheck,
 };
-use utils::{get_poseidon16, padd_with_zero_to_next_power_of_two};
+use utils::{get_poseidon8, padd_with_zero_to_next_power_of_two};
 
-const WIDTH: usize = 16;
-const HALF_DIGEST_LEN: usize = 4;
+// Width of the Poseidon1 permutation under Goldilocks (compresses 8 → DIGEST=4).
+const WIDTH: usize = 8;
 
 #[test]
-#[allow(clippy::too_many_lines)]
-fn test_prove_poseidon() {
-    // LOG_N_ROWS=20 cargo test --release --package sub_protocols --test prove_poseidon -- test_prove_poseidon --exact --nocapture
+fn test_prove_poseidon_8() {
+    // LOG_N_ROWS=20 cargo test --release --package sub_protocols --test prove_poseidon -- test_prove_poseidon_8 --exact --nocapture
     let log_n_rows: usize = std::env::var("LOG_N_ROWS").unwrap_or("11".to_string()).parse().unwrap();
+    utils::init_tracing();
+    prove_air_poseidon_8(log_n_rows);
+}
+
+#[allow(clippy::too_many_lines)]
+fn prove_air_poseidon_8(log_n_rows: usize) {
     let n_rows = 1 << log_n_rows;
     let mut rng = StdRng::seed_from_u64(0);
-    let n_cols = num_cols_poseidon_16();
+    let n_cols = num_cols_poseidon_8();
     let mut trace = vec![vec![F::ZERO; n_rows]; n_cols];
-    for t in trace.iter_mut().skip(POSEIDON_COL_INPUT_START).take(WIDTH) {
+    for t in trace.iter_mut().skip(POSEIDON_8_COL_INPUT_START).take(WIDTH) {
         *t = (0..n_rows).map(|_| rng.random()).collect();
     }
-    trace[POSEIDON_COL_MULTIPLICITY] = vec![F::ONE; n_rows];
-    trace[POSEIDON_COL_FLAG_OUT8] = vec![F::ONE; n_rows];
-    trace[POSEIDON_COL_ADDR_LEFT_LO] = vec![F::ZERO; n_rows];
-    trace[POSEIDON_COL_ADDR_LEFT_HI] = vec![F::from_usize(HALF_DIGEST_LEN); n_rows];
-    fill_trace_poseidon_16(&mut trace);
+    trace[POSEIDON_8_COL_MULTIPLICITY] = vec![F::ONE; n_rows];
+    trace[POSEIDON_8_COL_FLAG_OUT4] = vec![F::ONE; n_rows];
+    trace[POSEIDON_8_COL_ADDR_LEFT_LO] = vec![F::ZERO; n_rows];
+    trace[POSEIDON_8_COL_ADDR_LEFT_HI] = vec![F::from_usize(HALF_DIGEST_LEN); n_rows];
+
+    // Fill the per-round witness columns + outputs from the inputs.
+    // The AIR's transition constraints require a consistent Poseidon1 permutation
+    // trace, otherwise sumcheck verification fails.
+    #[allow(clippy::needless_range_loop)]
+    for row in 0..n_rows {
+        let input: [F; WIDTH] = std::array::from_fn(|i| trace[POSEIDON_8_COL_INPUT_START + i][row]);
+        let (aux, perm_state) = compute_poseidon8_witness(input);
+        // Compression rows (flag_permute = 0): `out_lo` holds the Davies-Meyer
+        // output; `out_hi` is left zero (AIR-unconstrained for compression).
+        for i in 0..WIDTH / 2 {
+            trace[POSEIDON_8_COL_OUT_LO + i][row] = perm_state[i] + input[i];
+        }
+        for (i, v) in aux.iter().enumerate() {
+            trace[POSEIDON_8_COL_ROUND_START + i][row] = *v;
+        }
+    }
+
+    fill_trace_poseidon_8(&mut trace);
 
-    let air = Poseidon16Precompile::<false>;
+    let air = Poseidon8Precompile::<false>;
     let n_constraints = air.n_constraints();
     let air_degree = air.degree_air();
 
@@ -50,7 +74,7 @@ fn test_prove_poseidon() {
     let packed_n_vars = log2_ceil_usize(n_cols << log_n_rows);
     let whir_config = WhirConfig::new(&whir_config_builder, packed_n_vars);
 
-    let mut prover_state = ProverState::<EF, _>::new(get_poseidon16().clone(), Default::default());
+    let mut prover_state = ProverState::<EF, _>::new(*get_poseidon8(), Default::default());
 
     let time = Instant::now();
 
@@ -101,7 +125,7 @@ fn test_prove_poseidon() {
     );
 
     let mut verifier_state =
-        VerifierState::<EF, _>::new(prover_state.into_proof(), get_poseidon16().clone(), Default::default()).unwrap();
+        VerifierState::<EF, _>::new(prover_state.into_proof(), *get_poseidon8(), Default::default()).unwrap();
 
     let parsed_commitment = whir_config.parse_commitment::<F>(&mut verifier_state).unwrap();
 
@@ -119,7 +143,7 @@ fn test_prove_poseidon() {
 
     let col_evals_v: Vec<EF> = verifier_state.next_extension_scalars_vec(n_cols).unwrap();
     let constraint_eval =
-        <Poseidon16Precompile<false> as SumcheckComputation<EF>>::eval_extension(&air, &col_evals_v, &extra_data);
+        <Poseidon8Precompile<false> as SumcheckComputation<EF>>::eval_extension(&air, &col_evals_v, &extra_data);
 
     let natural_ordering_point_v = natural_ordering_point_for_session(&sumcheck_air_point_v.0, log_n_rows);
     let eq_val = MultilinearPoint(eq_factor_v).eq_poly_outside(&MultilinearPoint(natural_ordering_point_v.clone()));
diff --git a/crates/utils/src/multilinear.rs b/crates/utils/src/multilinear.rs
index 2030ca5f4..4c0ed89fb 100644
--- a/crates/utils/src/multilinear.rs
+++ b/crates/utils/src/multilinear.rs
@@ -103,8 +103,8 @@ mod tests {
 
     use super::*;
 
-    type F = KoalaBear;
-    type EF = QuinticExtensionFieldKB;
+    type F = Goldilocks;
+    type EF = CubicExtensionFieldGL;
 
     #[test]
     fn test_evaluate_as_larger_multilinear_pol() {
diff --git a/crates/utils/src/poseidon.rs b/crates/utils/src/poseidon.rs
index 5fa1358d9..999e6aa30 100644
--- a/crates/utils/src/poseidon.rs
+++ b/crates/utils/src/poseidon.rs
@@ -1,51 +1,52 @@
-use backend::symmetric::Permutation;
 use backend::*;
 use std::sync::OnceLock;
 
-pub type Poseidon16 = Poseidon1KoalaBear16;
+pub type Poseidon8 = Poseidon1Goldilocks8;
 
-pub const HALF_FULL_ROUNDS_16: usize = POSEIDON1_HALF_FULL_ROUNDS;
-pub const PARTIAL_ROUNDS_16: usize = POSEIDON1_PARTIAL_ROUNDS;
+pub const HALF_FULL_ROUNDS_8: usize = POSEIDON1_HALF_FULL_ROUNDS;
+pub const PARTIAL_ROUNDS_8: usize = POSEIDON1_PARTIAL_ROUNDS;
 
-static POSEIDON_16_INSTANCE: OnceLock<Poseidon16> = OnceLock::new();
-static POSEIDON_16_OF_ZERO: OnceLock<[KoalaBear; 8]> = OnceLock::new();
+static POSEIDON_8_INSTANCE: OnceLock<Poseidon8> = OnceLock::new();
+static POSEIDON_8_OF_ZERO: OnceLock<[Goldilocks; 4]> = OnceLock::new();
 
 #[inline(always)]
-pub fn get_poseidon16() -> &'static Poseidon16 {
-    POSEIDON_16_INSTANCE.get_or_init(default_koalabear_poseidon1_16)
+pub fn get_poseidon8() -> &'static Poseidon8 {
+    POSEIDON_8_INSTANCE.get_or_init(default_goldilocks_poseidon1_8)
 }
 
 #[inline(always)]
-pub fn get_poseidon_16_of_zero() -> &'static [KoalaBear; 8] {
-    POSEIDON_16_OF_ZERO.get_or_init(|| poseidon16_compress([KoalaBear::default(); 16]))
+pub fn get_poseidon_8_of_zero() -> &'static [Goldilocks; 4] {
+    POSEIDON_8_OF_ZERO.get_or_init(|| poseidon8_compress([Goldilocks::default(); 8]))
 }
 
 #[inline(always)]
-pub fn poseidon16_compress(input: [KoalaBear; 16]) -> [KoalaBear; 8] {
-    get_poseidon16().compress(input)[0..8].try_into().unwrap()
+pub fn poseidon8_compress(input: [Goldilocks; 8]) -> [Goldilocks; 4] {
+    let mut state = input;
+    get_poseidon8().compress_in_place(&mut state);
+    state[0..4].try_into().unwrap()
 }
 
 #[inline(always)]
-pub fn poseidon16_permute(input: [KoalaBear; 16]) -> [KoalaBear; 16] {
-    get_poseidon16().permute(input)
+pub fn poseidon8_permute(input: [Goldilocks; 8]) -> [Goldilocks; 8] {
+    get_poseidon8().permute(input)
 }
 
-pub fn poseidon16_compress_pair(left: &[KoalaBear; 8], right: &[KoalaBear; 8]) -> [KoalaBear; 8] {
-    let mut input = [KoalaBear::default(); 16];
-    input[..8].copy_from_slice(left);
-    input[8..].copy_from_slice(right);
-    poseidon16_compress(input)
+pub fn poseidon8_compress_pair(left: &[Goldilocks; 4], right: &[Goldilocks; 4]) -> [Goldilocks; 4] {
+    let mut input = [Goldilocks::default(); 8];
+    input[..4].copy_from_slice(left);
+    input[4..].copy_from_slice(right);
+    poseidon8_compress(input)
 }
 
 // Overwrite-sponge
-pub fn poseidon_hash_slice(data: &[KoalaBear]) -> [KoalaBear; DIGEST_ELEMS] {
+pub fn poseidon_hash_slice(data: &[Goldilocks]) -> [Goldilocks; DIGEST_ELEMS] {
     assert!(!data.is_empty());
     assert!(data.len().is_multiple_of(RATE));
-    let mut state = [KoalaBear::default(); WIDTH];
-    state[0] = KoalaBear::from_usize(data.len());
+    let mut state = [Goldilocks::default(); WIDTH];
+    state[0] = Goldilocks::from_usize(data.len());
     for chunk in data.chunks(RATE) {
         state[CAPACITY..].copy_from_slice(chunk);
-        state = poseidon16_permute(state);
+        state = poseidon8_permute(state);
     }
     state[CAPACITY..].try_into().unwrap()
 }
diff --git a/crates/whir/Cargo.toml b/crates/whir/Cargo.toml
index 1c2a2b0a7..5a7052814 100644
--- a/crates/whir/Cargo.toml
+++ b/crates/whir/Cargo.toml
@@ -5,7 +5,7 @@ edition.workspace = true
 
 [dependencies]
 field = { path = "../backend/field", package = "mt-field" }
-koala-bear = { path = "../backend/koala-bear", package = "mt-koala-bear" }
+goldilocks = { path = "../backend/goldilocks", package = "mt-goldilocks" }
 poly = { path = "../backend/poly", package = "mt-poly" }
 sumcheck = { path = "../backend/sumcheck", package = "mt-sumcheck" }
 fiat-shamir = { path = "../backend/fiat-shamir", package = "mt-fiat-shamir" }
diff --git a/crates/whir/src/config.rs b/crates/whir/src/config.rs
index 5f57b600a..02040398d 100644
--- a/crates/whir/src/config.rs
+++ b/crates/whir/src/config.rs
@@ -3,6 +3,11 @@
 use field::{Field, TwoAdicField};
 use poly::*;
 
+// Goldilocks has two-adicity 32; we use a smaller effective value to avoid PoW grinding at folding in WHIR.
+// TODO: we likely want a bit more than 24, so PoW grinding for folding should be reintroduced in the future,
+// but hopefully we will have better proximity-gap formulas by then.
+pub const EFFECTIVE_TWO_ADICITY: usize = 24;
+
 /// Defines the folding factor for polynomial commitments.
 #[derive(Debug, Clone, Copy)]
 pub struct FoldingFactor {
@@ -103,7 +108,6 @@ pub struct WhirConfigBuilder {
 #[derive(Debug, Clone)]
 pub struct RoundConfig<EF: Field> {
     pub query_pow_bits: usize,
-    pub folding_pow_bits: usize,
     pub num_queries: usize,
     pub ood_samples: usize,
     pub log_inv_rate: usize,
@@ -119,7 +123,6 @@ pub struct WhirConfig<EF: Field> {
 
     pub commitment_ood_samples: usize,
     pub starting_log_inv_rate: usize,
-    pub starting_folding_pow_bits: usize,
 
     pub folding_factor: FoldingFactor,
     pub rs_domain_initial_reduction_factor: usize,
@@ -137,40 +140,22 @@ where
     PF<EF>: TwoAdicField,
 {
     /// `log_c` controls the proximity parameter `η` (η = √ρ/c for JB, η = ρ/c for CB).
-    /// Increasing `log_c` shrinks `η`, which:
-    /// - reduces the number of queries, but
-    /// - grows the list size, which tightens the `prox_gaps_error` and `sumcheck_error` -> more PoW grinding
-    ///
-    /// Both errors are decreasing functions in `log_c`. Among feasible `m ∈ [3, 100]` (with `log_c = log2(2m)`,
-    /// and `folding_pow_bits ≤ pow_bits`) we pick the smallest `m` that achieves the minimum query count.
-    fn compute_optimal_log_c_for_rate(
-        whir_parameters: &WhirConfigBuilder,
-        field_size_bits: usize,
-        num_variables: usize,
-        log_inv_rate: usize,
-    ) -> f64 {
+    /// Increasing `log_c` shrinks `η`, which reduces the number of queries but grows the list size.
+    /// With the field we use (192-bit cubic extension over Goldilocks), the list size never threatens
+    /// `prox_gaps_error` / `sumcheck_error` enough to require folding PoW, so we pick the smallest
+    /// `m ∈ [3, 100]` (with `log_c = log2(2m)`) that achieves the minimum query count — keeping
+    /// `log_c` (and the dependent OOD sample count) as small as possible.
+    fn compute_optimal_log_c_for_rate(whir_parameters: &WhirConfigBuilder, log_inv_rate: usize) -> f64 {
         if matches!(whir_parameters.soundness_type, SecurityAssumption::UniqueDecoding) {
             return 0.0;
         }
 
-        let pow_budget = whir_parameters.pow_bits;
-        let query_security_level = whir_parameters.security_level.saturating_sub(pow_budget);
+        let query_security_level = whir_parameters.security_level.saturating_sub(whir_parameters.pow_bits);
 
         let mut best_m = 3;
         let mut best_queries = usize::MAX;
         for m in 3..=100 {
             let log_c = (2.0 * m as f64).log2();
-            let folding_pow = Self::folding_pow_bits(
-                whir_parameters.security_level,
-                whir_parameters.soundness_type,
-                field_size_bits,
-                num_variables,
-                log_inv_rate,
-                log_c,
-            );
-            if folding_pow.ceil() as usize > pow_budget {
-                break;
-            }
             let queries = whir_parameters
                 .soundness_type
                 .queries(query_security_level, log_inv_rate, log_c);
@@ -182,7 +167,6 @@ where
         (2.0 * best_m as f64).log2()
     }
 
-    #[allow(clippy::too_many_lines)]
     pub fn new(whir_parameters: &WhirConfigBuilder, num_variables: usize) -> Self {
         whir_parameters.folding_factor.check_validity(num_variables).unwrap();
 
@@ -200,16 +184,23 @@ where
 
         let log_folded_domain_size = log_domain_size - whir_parameters.folding_factor.at_round(0);
         assert!(
-            log_folded_domain_size <= PF::<EF>::TWO_ADICITY,
-            "Increase folding_factor_0"
+            log_folded_domain_size <= EFFECTIVE_TWO_ADICITY,
+            "num_variables + log_inv_rate must be ≤ EFFECTIVE_TWO_ADICITY ({}) + first_folding_factor ({}); \
+             got {}.",
+            EFFECTIVE_TWO_ADICITY,
+            whir_parameters.folding_factor.at_round(0),
+            log_domain_size,
+        );
+        debug_assert!(
+            EFFECTIVE_TWO_ADICITY <= PF::<EF>::TWO_ADICITY,
+            "EFFECTIVE_TWO_ADICITY exceeds the field's actual two-adicity",
         );
 
         let (num_rounds, final_sumcheck_rounds) = whir_parameters
             .folding_factor
             .compute_number_of_rounds(num_variables, whir_parameters.max_num_variables_to_send_coeffs);
 
-        let mut log_c_old =
-            Self::compute_optimal_log_c_for_rate(whir_parameters, field_size_bits, num_variables, log_inv_rate);
+        let mut log_c_old = Self::compute_optimal_log_c_for_rate(whir_parameters, log_inv_rate);
 
         let commitment_ood_samples = whir_parameters.soundness_type.determine_ood_samples(
             whir_parameters.security_level,
@@ -219,15 +210,6 @@ where
             log_c_old,
         );
 
-        let starting_folding_pow_bits = Self::folding_pow_bits(
-            whir_parameters.security_level,
-            whir_parameters.soundness_type,
-            field_size_bits,
-            num_variables,
-            log_inv_rate,
-            log_c_old,
-        );
-
         let mut round_parameters = Vec::with_capacity(num_rounds);
 
         let mut num_variables_moving = num_variables;
@@ -241,8 +223,7 @@ where
             };
             let next_rate = log_inv_rate + (whir_parameters.folding_factor.at_round(round) - rs_reduction_factor);
 
-            let log_c_new =
-                Self::compute_optimal_log_c_for_rate(whir_parameters, field_size_bits, num_variables_moving, next_rate);
+            let log_c_new = Self::compute_optimal_log_c_for_rate(whir_parameters, next_rate);
 
             let num_queries = whir_parameters
                 .soundness_type
@@ -272,21 +253,12 @@ where
             let query_pow_bits =
                 0_f64.max(whir_parameters.security_level as f64 - (query_error.min(combination_error)));
 
-            let folding_pow_bits = Self::folding_pow_bits(
-                whir_parameters.security_level,
-                whir_parameters.soundness_type,
-                field_size_bits,
-                num_variables_moving,
-                next_rate,
-                log_c_new,
-            );
             let folding_factor = whir_parameters.folding_factor.at_round(round);
             let next_folding_factor = whir_parameters.folding_factor.at_round(round + 1);
             let folded_domain_gen = PF::<EF>::two_adic_generator(domain_size.ilog2() as usize - folding_factor);
 
             round_parameters.push(RoundConfig {
                 query_pow_bits: query_pow_bits.ceil() as usize,
-                folding_pow_bits: folding_pow_bits.ceil() as usize,
                 num_queries,
                 ood_samples,
                 log_inv_rate,
@@ -322,7 +294,6 @@ where
             commitment_ood_samples,
             num_variables,
             starting_log_inv_rate: whir_parameters.starting_log_inv_rate,
-            starting_folding_pow_bits: starting_folding_pow_bits.ceil() as usize,
             folding_factor: whir_parameters.folding_factor,
             rs_domain_initial_reduction_factor: whir_parameters.rs_domain_initial_reduction_factor,
             round_parameters,
@@ -366,41 +337,6 @@ where
         self.num_variables - self.folding_factor.total_number(self.n_rounds())
     }
 
-    pub fn max_folding_pow_bits(&self) -> usize {
-        self.round_parameters.iter().map(|r| r.folding_pow_bits).max().unwrap()
-    }
-
-    #[must_use]
-    pub fn rbr_soundness_fold_sumcheck(
-        soundness_type: SecurityAssumption,
-        field_size_bits: usize,
-        num_variables: usize,
-        log_inv_rate: usize,
-        log_c: f64,
-    ) -> f64 {
-        let list_size = soundness_type.list_size_bits(num_variables, log_inv_rate, log_c);
-
-        field_size_bits as f64 - (list_size + 1.)
-    }
-
-    #[must_use]
-    pub fn folding_pow_bits(
-        security_level: usize,
-        soundness_type: SecurityAssumption,
-        field_size_bits: usize,
-        num_variables: usize,
-        log_inv_rate: usize,
-        log_c: f64,
-    ) -> f64 {
-        let prox_gaps_error = soundness_type.prox_gaps_error(num_variables, log_inv_rate, field_size_bits, 2, log_c);
-        let sumcheck_error =
-            Self::rbr_soundness_fold_sumcheck(soundness_type, field_size_bits, num_variables, log_inv_rate, log_c);
-
-        let error = prox_gaps_error.min(sumcheck_error);
-
-        0_f64.max(security_level as f64 - error)
-    }
-
     #[must_use]
     pub fn rbr_soundness_queries_combination(
         soundness_type: SecurityAssumption,
@@ -436,7 +372,6 @@ where
             domain_size,
             folded_domain_gen,
             ood_samples: last.ood_samples,
-            folding_pow_bits: 0,
             log_inv_rate: last.log_inv_rate,
         }
     }
@@ -462,7 +397,7 @@ impl SecurityAssumption {
     ///
     /// `log_c` is log2 of the divisor c, where η = √ρ/c (JB) or ρ/c (CB).
     /// It is computed per rate phase by `WhirConfig::compute_optimal_log_c_for_rate` to
-    /// balance folding PoW vs queries.
+    /// minimize the query count.
     #[must_use]
     pub fn log_eta(&self, log_inv_rate: usize, log_c: f64) -> f64 {
         match self {
diff --git a/crates/whir/src/dft.rs b/crates/whir/src/dft.rs
index 277597eb8..068a67c4d 100644
--- a/crates/whir/src/dft.rs
+++ b/crates/whir/src/dft.rs
@@ -570,14 +570,14 @@ impl<F: Field> Butterfly<F> for EvalsButterfly<F> {
 #[cfg(test)]
 mod tests {
     use field::{PrimeCharacteristicRing, TwoAdicField};
-    use koala_bear::{KoalaBear, QuinticExtensionFieldKB};
+    use goldilocks::{CubicExtensionFieldGL, Goldilocks};
     use poly::*;
     use rand::{RngExt, SeedableRng, rngs::StdRng};
 
     use crate::*;
 
-    type F = KoalaBear;
-    type EF = QuinticExtensionFieldKB;
+    type F = Goldilocks;
+    type EF = CubicExtensionFieldGL;
 
     #[test]
     fn test_eval_dft() {
diff --git a/crates/whir/src/merkle.rs b/crates/whir/src/merkle.rs
index eadf8212a..8a651dac7 100644
--- a/crates/whir/src/merkle.rs
+++ b/crates/whir/src/merkle.rs
@@ -9,7 +9,7 @@ use field::ExtensionField;
 use field::Field;
 use field::PackedValue;
 use field::PrimeCharacteristicRing;
-use koala_bear::{KoalaBear, QuinticExtensionFieldKB, default_koalabear_poseidon1_16};
+use goldilocks::{CubicExtensionFieldGL, Goldilocks, default_goldilocks_poseidon1_8};
 use poly::*;
 
 use rayon::prelude::*;
@@ -30,22 +30,22 @@ pub(crate) fn merkle_commit<F: Field, EF: ExtensionField<F>>(
     full_n_cols: usize,
     effective_n_cols: usize,
 ) -> ([F; DIGEST_ELEMS], RoundMerkleTree<F>) {
-    if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, QuinticExtensionFieldKB)>() {
-        let matrix = unsafe { std::mem::transmute::<_, DenseMatrix<QuinticExtensionFieldKB>>(matrix) };
-        let dim = <QuinticExtensionFieldKB as BasedVectorSpace<KoalaBear>>::DIMENSION;
+    if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, CubicExtensionFieldGL)>() {
+        let matrix = unsafe { std::mem::transmute::<_, DenseMatrix<CubicExtensionFieldGL>>(matrix) };
+        let dim = <CubicExtensionFieldGL as BasedVectorSpace<Goldilocks>>::DIMENSION;
         let dft_base_width = matrix.width * dim;
         let full_base_width = full_n_cols * dim;
         let effective_base_width = effective_n_cols * dim;
-        let base_values = QuinticExtensionFieldKB::flatten_to_base(matrix.values);
-        let base_matrix = DenseMatrix::<KoalaBear>::new(base_values, dft_base_width);
-        let tree = build_merkle_tree_koalabear(base_matrix, full_base_width, effective_base_width);
+        let base_values = CubicExtensionFieldGL::flatten_to_base(matrix.values);
+        let base_matrix = DenseMatrix::<Goldilocks>::new(base_values, dft_base_width);
+        let tree = build_merkle_tree_goldilocks(base_matrix, full_base_width, effective_base_width);
         let root: [_; DIGEST_ELEMS] = tree.root();
         let root = unsafe { std::mem::transmute_copy::<_, [F; DIGEST_ELEMS]>(&root) };
         let tree = unsafe { std::mem::transmute::<_, RoundMerkleTree<F>>(tree) };
         (root, tree)
-    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, KoalaBear)>() {
-        let matrix = unsafe { std::mem::transmute::<_, DenseMatrix<KoalaBear>>(matrix) };
-        let tree = build_merkle_tree_koalabear(matrix, full_n_cols, effective_n_cols);
+    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, Goldilocks)>() {
+        let matrix = unsafe { std::mem::transmute::<_, DenseMatrix<Goldilocks>>(matrix) };
+        let tree = build_merkle_tree_goldilocks(matrix, full_n_cols, effective_n_cols);
         let root: [_; DIGEST_ELEMS] = tree.root();
         let root = unsafe { std::mem::transmute_copy::<_, [F; DIGEST_ELEMS]>(&root) };
         let tree = unsafe { std::mem::transmute::<_, RoundMerkleTree<F>>(tree) };
@@ -56,30 +56,30 @@ pub(crate) fn merkle_commit<F: Field, EF: ExtensionField<F>>(
 }
 
 #[instrument(name = "build merkle tree", skip_all)]
-fn build_merkle_tree_koalabear(
-    leaf: DenseMatrix<KoalaBear>,
+fn build_merkle_tree_goldilocks(
+    leaf: DenseMatrix<Goldilocks>,
     full_base_width: usize,
     effective_base_width: usize,
-) -> RoundMerkleTree<KoalaBear> {
+) -> RoundMerkleTree<Goldilocks> {
     // Leaf hashing is an overwrite sponge (raw permutation); the 2->1 tree climb is compression.
-    // `perm` (Poseidon16) implements both `Permutation` and `Compression`, so it serves both roles.
-    let perm = default_koalabear_poseidon1_16();
-    let n_zero_suffix_rate_chunks = (full_base_width - effective_base_width) / 8;
-    let iv_first = KoalaBear::from_usize(full_base_width);
-    let scalar_state = symetric::precompute_zero_suffix_state::<KoalaBear, _, 16, 8, DIGEST_ELEMS>(
+    // `perm` (Poseidon8) implements both `Permutation` and `Compression`, so it serves both roles.
+    let perm = default_goldilocks_poseidon1_8();
+    let n_zero_suffix_rate_chunks = (full_base_width - effective_base_width) / 4;
+    let iv_first = Goldilocks::from_usize(full_base_width);
+    let scalar_state = symetric::precompute_zero_suffix_state::<Goldilocks, _, 8, 4, DIGEST_ELEMS>(
         &perm,
         iv_first,
         n_zero_suffix_rate_chunks,
     );
-    let packed_state: [PFPacking<KoalaBear>; 16] =
-        std::array::from_fn(|i| PFPacking::<KoalaBear>::from_fn(|_| scalar_state[i]));
-    let first_layer = first_digest_layer_with_initial_state::<PFPacking<KoalaBear>, _, _, DIGEST_ELEMS, 16, 8>(
+    let packed_state: [PFPacking<Goldilocks>; 8] =
+        std::array::from_fn(|i| PFPacking::<Goldilocks>::from_fn(|_| scalar_state[i]));
+    let first_layer = first_digest_layer_with_initial_state::<PFPacking<Goldilocks>, _, _, DIGEST_ELEMS, 8, 4>(
         &perm,
         &leaf,
         &packed_state,
         effective_base_width,
     );
-    let tree = symetric::merkle::MerkleTree::from_first_layer::<PFPacking<KoalaBear>, _, 16>(&perm, first_layer);
+    let tree = symetric::merkle::MerkleTree::from_first_layer::<PFPacking<Goldilocks>, _, 8>(&perm, first_layer);
     WhirMerkleTree {
         leaf,
         tree,
@@ -92,17 +92,17 @@ pub(crate) fn merkle_open<F: Field, EF: ExtensionField<F>>(
     merkle_tree: &RoundMerkleTree<F>,
     index: usize,
 ) -> (Vec<EF>, Vec<[F; DIGEST_ELEMS]>) {
-    if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, QuinticExtensionFieldKB)>() {
-        let merkle_tree = unsafe { std::mem::transmute::<_, &RoundMerkleTree<KoalaBear>>(merkle_tree) };
+    if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, CubicExtensionFieldGL)>() {
+        let merkle_tree = unsafe { std::mem::transmute::<_, &RoundMerkleTree<Goldilocks>>(merkle_tree) };
         let (inner_leaf, proof) = merkle_tree.open(index);
-        let leaf = QuinticExtensionFieldKB::reconstitute_from_base(inner_leaf);
+        let leaf = CubicExtensionFieldGL::reconstitute_from_base(inner_leaf);
         let leaf = unsafe { std::mem::transmute::<_, Vec<EF>>(leaf) };
         let proof = unsafe { std::mem::transmute::<_, Vec<[F; DIGEST_ELEMS]>>(proof) };
         (leaf, proof)
-    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, KoalaBear)>() {
-        let merkle_tree = unsafe { std::mem::transmute::<_, &RoundMerkleTree<KoalaBear>>(merkle_tree) };
+    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, Goldilocks)>() {
+        let merkle_tree = unsafe { std::mem::transmute::<_, &RoundMerkleTree<Goldilocks>>(merkle_tree) };
         let (inner_leaf, proof) = merkle_tree.open(index);
-        let leaf = KoalaBear::reconstitute_from_base(inner_leaf);
+        let leaf = Goldilocks::reconstitute_from_base(inner_leaf);
         let leaf = unsafe { std::mem::transmute::<_, Vec<EF>>(leaf) };
         let proof = unsafe { std::mem::transmute::<_, Vec<[F; DIGEST_ELEMS]>>(proof) };
         (leaf, proof)
@@ -119,14 +119,14 @@ pub(crate) fn merkle_verify<F: Field, EF: ExtensionField<F>>(
     data: Vec<EF>,
     proof: &Vec<[F; DIGEST_ELEMS]>,
 ) -> bool {
-    let perm = default_koalabear_poseidon1_16();
+    let perm = default_goldilocks_poseidon1_8();
     let log_max_height = utils::log2_strict_usize(dimension.height.next_power_of_two());
-    if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, QuinticExtensionFieldKB)>() {
-        let merkle_root = unsafe { std::mem::transmute_copy::<_, [KoalaBear; DIGEST_ELEMS]>(&merkle_root) };
-        let data = unsafe { std::mem::transmute::<_, Vec<QuinticExtensionFieldKB>>(data) };
-        let proof = unsafe { std::mem::transmute::<_, &Vec<[KoalaBear; DIGEST_ELEMS]>>(proof) };
-        let base_data = QuinticExtensionFieldKB::flatten_to_base(data);
-        symetric::merkle::merkle_verify::<_, _, _, DIGEST_ELEMS, 16, 8>(
+    if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, CubicExtensionFieldGL)>() {
+        let merkle_root = unsafe { std::mem::transmute_copy::<_, [Goldilocks; DIGEST_ELEMS]>(&merkle_root) };
+        let data = unsafe { std::mem::transmute::<_, Vec<CubicExtensionFieldGL>>(data) };
+        let proof = unsafe { std::mem::transmute::<_, &Vec<[Goldilocks; DIGEST_ELEMS]>>(proof) };
+        let base_data = CubicExtensionFieldGL::flatten_to_base(data);
+        symetric::merkle::merkle_verify::<_, _, _, DIGEST_ELEMS, 8, 4>(
             &perm,
             &perm,
             &merkle_root,
@@ -135,12 +135,12 @@ pub(crate) fn merkle_verify<F: Field, EF: ExtensionField<F>>(
             &base_data,
             proof,
         )
-    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(KoalaBear, KoalaBear)>() {
-        let merkle_root = unsafe { std::mem::transmute_copy::<_, [KoalaBear; DIGEST_ELEMS]>(&merkle_root) };
-        let data = unsafe { std::mem::transmute::<_, Vec<KoalaBear>>(data) };
-        let proof = unsafe { std::mem::transmute::<_, &Vec<[KoalaBear; DIGEST_ELEMS]>>(proof) };
-        let base_data = KoalaBear::flatten_to_base(data);
-        symetric::merkle::merkle_verify::<_, _, _, DIGEST_ELEMS, 16, 8>(
+    } else if TypeId::of::<(F, EF)>() == TypeId::of::<(Goldilocks, Goldilocks)>() {
+        let merkle_root = unsafe { std::mem::transmute_copy::<_, [Goldilocks; DIGEST_ELEMS]>(&merkle_root) };
+        let data = unsafe { std::mem::transmute::<_, Vec<Goldilocks>>(data) };
+        let proof = unsafe { std::mem::transmute::<_, &Vec<[Goldilocks; DIGEST_ELEMS]>>(proof) };
+        let base_data = Goldilocks::flatten_to_base(data);
+        symetric::merkle::merkle_verify::<_, _, _, DIGEST_ELEMS, 8, 4>(
             &perm,
             &perm,
             &merkle_root,
@@ -188,7 +188,7 @@ fn first_digest_layer_with_initial_state<P, Perm, M, const DIGEST_ELEMS: usize,
 where
     P: PackedValue + Default,
     P::Value: Default + Copy,
-    Perm: koala_bear::symmetric::Permutation<[P::Value; WIDTH]> + koala_bear::symmetric::Permutation<[P; WIDTH]>,
+    Perm: symetric::Permutation<[P::Value; WIDTH]> + symetric::Permutation<[P; WIDTH]>,
     M: Matrix<P::Value>,
 {
     let width = P::WIDTH;
diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs
index 6636b77c7..eeacecfa3 100644
--- a/crates/whir/src/open.rs
+++ b/crates/whir/src/open.rs
@@ -160,12 +160,10 @@ where
             &stir_combination_randomness,
         );
 
-        let next_folding_randomness = round_state.sumcheck_prover.run_sumcheck_many_rounds(
-            None,
-            prover_state,
-            folding_factor_next,
-            round_params.folding_pow_bits,
-        );
+        let next_folding_randomness =
+            round_state
+                .sumcheck_prover
+                .run_sumcheck_many_rounds(None, prover_state, folding_factor_next);
 
         round_state.randomness_vec.extend_from_slice(&next_folding_randomness.0);
 
@@ -239,7 +237,7 @@ where
             let final_folding_randomness =
                 round_state
                     .sumcheck_prover
-                    .run_sumcheck_many_rounds(None, prover_state, self.final_sumcheck_rounds, 0);
+                    .run_sumcheck_many_rounds(None, prover_state, self.final_sumcheck_rounds);
 
             round_state.randomness_vec.extend(final_folding_randomness.0);
         }
@@ -386,7 +384,6 @@ where
         prev_folding_scalar: Option<EF>,
         prover_state: &mut impl FSProver<EF>,
         n_rounds: usize,
-        pow_bits: usize,
     ) -> MultilinearPoint<EF> {
         let (challenges, folds, new_sum) = sumcheck_prove_many_rounds(
             MleGroupRef::merge(&[&self.evals.by_ref(), &self.weights.by_ref()]),
@@ -399,7 +396,7 @@ where
             None,
             n_rounds,
             false,
-            pow_bits,
+            0,
         );
 
         self.sum = new_sum;
@@ -415,7 +412,6 @@ where
         combination_randomness: EF,
         prover_state: &mut impl FSProver<EF>,
         folding_factor: usize,
-        pow_bits: usize,
     ) -> (Self, MultilinearPoint<EF>) {
         assert_ne!(folding_factor, 0);
 
@@ -423,14 +419,8 @@ where
 
         let mut evals = evals.pack();
         let mut weights = Mle::Owned(MleOwned::ExtensionPacked(weights));
-        let (challengess, new_sum, new_evals, new_weights) = run_product_sumcheck(
-            &evals.by_ref(),
-            &weights.by_ref(),
-            prover_state,
-            sum,
-            folding_factor,
-            pow_bits,
-        );
+        let (challengess, new_sum, new_evals, new_weights) =
+            run_product_sumcheck(&evals.by_ref(), &weights.by_ref(), prover_state, sum, folding_factor, 0);
 
         evals = new_evals.into();
         weights = new_weights.into();
@@ -494,7 +484,6 @@ where
             combination_randomness_gen,
             prover_state,
             prover.folding_factor.at_round(0),
-            prover.starting_folding_pow_bits,
         );
 
         Ok(Self {
diff --git a/crates/whir/src/verify.rs b/crates/whir/src/verify.rs
index 53ed173f7..ec41144d4 100644
--- a/crates/whir/src/verify.rs
+++ b/crates/whir/src/verify.rs
@@ -112,12 +112,8 @@ where
         round_constraints.push((combination_randomness, constraints));
 
         // Initial sumcheck
-        let folding_randomness = verify_sumcheck_rounds::<F, EF>(
-            verifier_state,
-            &mut claimed_sum,
-            self.folding_factor.at_round(0),
-            self.starting_folding_pow_bits,
-        )?;
+        let folding_randomness =
+            verify_sumcheck_rounds::<F, EF>(verifier_state, &mut claimed_sum, self.folding_factor.at_round(0))?;
         round_folding_randomness.push(folding_randomness);
 
         for round_index in 0..self.n_rounds() {
@@ -152,7 +148,6 @@ where
                 verifier_state,
                 &mut claimed_sum,
                 self.folding_factor.at_round(round_index + 1),
-                round_params.folding_pow_bits,
             )?;
 
             round_folding_randomness.push(folding_randomness);
@@ -182,7 +177,7 @@ where
             .ok_or(ProofError::InvalidProof)?;
 
         let final_sumcheck_randomness =
-            verify_sumcheck_rounds::<F, EF>(verifier_state, &mut claimed_sum, self.final_sumcheck_rounds, 0)?;
+            verify_sumcheck_rounds::<F, EF>(verifier_state, &mut claimed_sum, self.final_sumcheck_rounds)?;
         round_folding_randomness.push(final_sumcheck_randomness.clone());
 
         // Compute folding randomness across all rounds.
@@ -411,7 +406,6 @@ pub(crate) fn verify_sumcheck_rounds<F, EF>(
     verifier_state: &mut impl FSVerifier<EF>,
     claimed_sum: &mut EF,
     rounds: usize,
-    pow_bits: usize,
 ) -> ProofResult<SumcheckRandomness<EF>>
 where
     F: TwoAdicField,
@@ -424,8 +418,6 @@ where
         let coeffs = verifier_state.next_sumcheck_polynomial(3, *claimed_sum, None)?;
         let poly = DensePolynomial::new(coeffs);
 
-        verifier_state.check_pow_grinding(pow_bits)?;
-
         // Sample the next verifier folding randomness rᵢ
         let rand: EF = verifier_state.sample();
 
diff --git a/crates/whir/tests/run_whir.rs b/crates/whir/tests/run_whir.rs
index 217a19c5d..d71a7ae01 100644
--- a/crates/whir/tests/run_whir.rs
+++ b/crates/whir/tests/run_whir.rs
@@ -3,16 +3,16 @@
 use std::time::Instant;
 
 use fiat_shamir::{ProverState, VerifierState};
-use field::{Field, TwoAdicField};
-use koala_bear::{KoalaBear, QuinticExtensionFieldKB, default_koalabear_poseidon1_16};
+use field::Field;
+use goldilocks::{CubicExtensionFieldGL, Goldilocks, default_goldilocks_poseidon1_8};
 use mt_whir::*;
 use poly::*;
 use rand::{RngExt, SeedableRng, rngs::StdRng};
 use tracing_forest::{ForestLayer, util::LevelFilter};
 use tracing_subscriber::{EnvFilter, Registry, layer::SubscriberExt, util::SubscriberInitExt};
 
-type F = KoalaBear;
-type EF = QuinticExtensionFieldKB;
+type F = Goldilocks;
+type EF = CubicExtensionFieldGL;
 
 /*
 WHIR_NUM_VARIABLES=25 WHIR_LOG_INV_RATE=1 cargo test --release --package mt-whir --test run_whir -- test_run_whir --exact --nocapture
@@ -30,7 +30,7 @@ fn test_run_whir() {
             .with(ForestLayer::default())
             .try_init();
     }
-    let poseidon16 = default_koalabear_poseidon1_16();
+    let poseidon8 = default_goldilocks_poseidon1_8();
 
     let num_variables = std::env::var("WHIR_NUM_VARIABLES")
         .ok()
@@ -44,7 +44,7 @@ fn test_run_whir() {
     let num_coeffs = 1 << num_variables;
 
     let params = WhirConfigBuilder {
-        security_level: 124,
+        security_level: 128,
         max_num_variables_to_send_coeffs: 9,
         pow_bits: 18,
         folding_factor: FoldingFactor::new(7, 4),
@@ -100,9 +100,9 @@ fn test_run_whir() {
         ));
     }
 
-    let mut prover_state = ProverState::new(poseidon16.clone(), Default::default());
+    let mut prover_state = ProverState::new(poseidon8, Default::default());
 
-    precompute_dft_twiddles::<F>(1 << F::TWO_ADICITY);
+    precompute_dft_twiddles::<F>(1 << 24);
 
     let polynomial: MleOwned<EF> = MleOwned::Base(polynomial);
 
@@ -123,7 +123,7 @@ fn test_run_whir() {
 
     let proof_size_single = pruned_proof.proof_size_fe() as f64 * F::bits() as f64 / 8.0;
 
-    let mut verifier_state = VerifierState::<EF, _>::new(pruned_proof, poseidon16.clone(), Default::default()).unwrap();
+    let mut verifier_state = VerifierState::<EF, _>::new(pruned_proof, poseidon8, Default::default()).unwrap();
 
     let parsed_commitment = params.parse_commitment::<F>(&mut verifier_state).unwrap();
 
@@ -145,11 +145,11 @@ fn display_whir_round_info() {
     let first_folding_factor = 7;
     for n_vars in 20..31 {
         for log_inv_rate in 1..5 {
-            if n_vars + log_inv_rate - first_folding_factor > F::TWO_ADICITY {
+            if n_vars + log_inv_rate - first_folding_factor > EFFECTIVE_TWO_ADICITY {
                 continue;
             }
             let params = WhirConfigBuilder {
-                security_level: 124,
+                security_level: 128,
                 max_num_variables_to_send_coeffs: 8,
                 pow_bits: 16,
                 folding_factor: FoldingFactor::new(first_folding_factor, 5),
@@ -158,9 +158,6 @@ fn display_whir_round_info() {
                 rs_domain_initial_reduction_factor: 5,
             };
             let params = WhirConfig::<EF>::new(&params, n_vars);
-            let folding_pow_bits = std::iter::once(params.starting_folding_pow_bits)
-                .chain(params.round_parameters.iter().map(|r| r.folding_pow_bits))
-                .collect::<Vec<_>>();
             let query_pow_bits = params
                 .round_parameters
                 .iter()
@@ -168,7 +165,7 @@ fn display_whir_round_info() {
                 .chain(std::iter::once(params.final_query_pow_bits))
                 .collect::<Vec<_>>();
             println!(
-                "n_vars: {}, log_inv_rate: {}, num_queries: {:?}, folding_pow_bits: {:?}, query_pow_bits: {:?}",
+                "n_vars: {}, log_inv_rate: {}, num_queries: {:?}, query_pow_bits: {:?}",
                 n_vars,
                 log_inv_rate,
                 params
@@ -176,7 +173,6 @@ fn display_whir_round_info() {
                     .iter()
                     .map(|r| r.num_queries)
                     .collect::<Vec<_>>(),
-                folding_pow_bits,
                 query_pow_bits,
             );
         }
diff --git a/crates/xmss/src/lib.rs b/crates/xmss/src/lib.rs
index 941eff17c..d6366d30c 100644
--- a/crates/xmss/src/lib.rs
+++ b/crates/xmss/src/lib.rs
@@ -1,6 +1,6 @@
 #![cfg_attr(not(test), warn(unused_crate_dependencies))]
 use backend::PrimeCharacteristicRing;
-use backend::{DIGEST_LEN_FE, KoalaBear, POSEIDON1_WIDTH};
+use backend::{DIGEST_LEN_FE, Goldilocks, POSEIDON1_WIDTH};
 
 pub mod signers_cache;
 mod wots;
@@ -8,24 +8,27 @@ pub use wots::*;
 mod xmss;
 pub use xmss::*;
 
-pub const XMSS_DIGEST_LEN: usize = 4;
-pub(crate) const TWEAK_LEN: usize = 2;
+pub const XMSS_DIGEST_LEN: usize = 2;
+pub(crate) const TWEAK_LEN: usize = 1;
 
-type F = KoalaBear;
+type F = Goldilocks;
 type Digest = [F; XMSS_DIGEST_LEN];
 type PublicParam = [F; PUBLIC_PARAM_LEN_FE];
 type Randomness = [F; RANDOMNESS_LEN_FE];
 
 // WOTS
-pub const V: usize = 42;
+pub const V: usize = 40;
 pub const W: usize = 3;
 pub const CHAIN_LENGTH: usize = 1 << W;
 pub const NUM_CHAIN_HASHES: usize = 110;
 pub const TARGET_SUM: usize = V * (CHAIN_LENGTH - 1) - NUM_CHAIN_HASHES;
-pub const NUM_ENCODING_FE: usize = V.div_ceil(24 / W);
-pub const RANDOMNESS_LEN_FE: usize = 6;
-pub const MESSAGE_LEN_FE: usize = 8;
-pub const PUBLIC_PARAM_LEN_FE: usize = 4;
+pub const ENCODING_NUM_FINAL_ZEROS: usize = 8;
+const _: () = assert!(V * W + ENCODING_NUM_FINAL_ZEROS == DIGEST_LEN_FE * 32);
+const _: () = assert!(V.is_multiple_of(DIGEST_LEN_FE)); // V chunks split evenly across the 4 FEs
+const _: () = assert!(ENCODING_NUM_FINAL_ZEROS.is_multiple_of(DIGEST_LEN_FE)); // same for the zero bits
+pub const RANDOMNESS_LEN_FE: usize = 3;
+pub const MESSAGE_LEN_FE: usize = 4;
+pub const PUBLIC_PARAM_LEN_FE: usize = 2;
 pub const PUB_KEY_FLAT_SIZE: usize = XMSS_DIGEST_LEN + PUBLIC_PARAM_LEN_FE;
 pub const WOTS_SIG_SIZE_FE: usize = RANDOMNESS_LEN_FE + V * XMSS_DIGEST_LEN;
 
@@ -41,18 +44,20 @@ pub const TWEAK_TYPE_ENCODING: usize = 3;
 const _: () = assert!(V.is_multiple_of(2)); // For efficiency of the snark (we can batch chains in pairs)
 
 /// index = slot or node_index in Merkle tree
+///
+/// Goldilocks (64-bit field): the entire `(tweak_type, sub_position, index)` tuple
+/// fits comfortably in one field element. We pack:
+///   bits  0..32 : index (u32)
+///   bits 32..42 : sub_position (10 bits)
+///   bits 42..44 : tweak_type (2 bits)
 pub fn make_tweak(tweak_type: usize, sub_position: usize, index: u32) -> [F; TWEAK_LEN] {
     assert!(tweak_type < 4);
     assert!(sub_position < 1 << 10);
-    let index_lo = (index & 0xFFFF) as usize;
-    let index_hi = (index >> 16) as usize;
-    [
-        F::from_usize((tweak_type << 26) + (index_hi << 10) + sub_position),
-        F::from_usize(index_lo),
-    ]
+    let packed = ((tweak_type as u64) << 42) | ((sub_position as u64) << 32) | u64::from(index);
+    [F::from_u64(packed)]
 }
 
-/// [tweak(2) | zeros(2) | public_param(4) | left_child(4) | right_child(4)]
+/// `[tweak(1) | zeros(1) | public_param(2) | left_child(2) | right_child(2)]`
 pub(crate) fn build_merkle_data(
     tweak: [F; TWEAK_LEN],
     public_param: &PublicParam,
@@ -61,14 +66,14 @@ pub(crate) fn build_merkle_data(
 ) -> [F; POSEIDON1_WIDTH] {
     let mut data = [F::default(); POSEIDON1_WIDTH];
     data[..TWEAK_LEN].copy_from_slice(&tweak);
-    // data[2..4] = zeros (default)
+    // data[1..2] = zeros (default)
     data[DIGEST_LEN_FE - PUBLIC_PARAM_LEN_FE..][..PUBLIC_PARAM_LEN_FE].copy_from_slice(public_param);
     data[DIGEST_LEN_FE..][..XMSS_DIGEST_LEN].copy_from_slice(left_child);
     data[DIGEST_LEN_FE + XMSS_DIGEST_LEN..].copy_from_slice(right_child);
     data
 }
 
-/// [tweak(2) | zeros(2) | data(4)]
+/// `[tweak(1) | zeros(1) | data(2)]`
 pub(crate) fn build_left_chain_input(tweak: [F; TWEAK_LEN], data: &Digest) -> [F; DIGEST_LEN_FE] {
     let mut left = [F::default(); DIGEST_LEN_FE];
     left[..TWEAK_LEN].copy_from_slice(&tweak);
@@ -76,7 +81,7 @@ pub(crate) fn build_left_chain_input(tweak: [F; TWEAK_LEN], data: &Digest) -> [F
     left
 }
 
-/// [public_param(4) | zeros(4)]
+/// `[public_param(2) | zeros(2)]`
 pub(crate) fn build_right_chain_input(public_param: &PublicParam) -> [F; DIGEST_LEN_FE] {
     let mut right = [F::default(); DIGEST_LEN_FE];
     right[..PUBLIC_PARAM_LEN_FE].copy_from_slice(public_param);
diff --git a/crates/xmss/src/signers_cache.rs b/crates/xmss/src/signers_cache.rs
index 6e7a9956e..9b344d08e 100644
--- a/crates/xmss/src/signers_cache.rs
+++ b/crates/xmss/src/signers_cache.rs
@@ -37,10 +37,10 @@ fn cache_footprint(first_pubkey: &XmssPublicKey) -> u128 {
     hasher.update(NUM_BENCHMARK_SIGNERS.to_le_bytes());
     hasher.update(BENCHMARK_SLOT.to_le_bytes());
     for f in message_for_benchmark() {
-        hasher.update(f.as_canonical_u32().to_le_bytes());
+        hasher.update(f.as_canonical_u64().to_le_bytes());
     }
     for f in first_pubkey.merkle_root {
-        hasher.update(f.as_canonical_u32().to_le_bytes());
+        hasher.update(f.as_canonical_u64().to_le_bytes());
     }
     let hash = hasher.finalize();
     u128::from_le_bytes(hash[..16].try_into().unwrap())
diff --git a/crates/xmss/src/wots.rs b/crates/xmss/src/wots.rs
index 814c26ba0..bcf0ba3ce 100644
--- a/crates/xmss/src/wots.rs
+++ b/crates/xmss/src/wots.rs
@@ -1,7 +1,7 @@
 use backend::*;
 use rand::{CryptoRng, RngExt};
 use serde::{Deserialize, Serialize};
-use utils::{ToUsize, poseidon16_compress_pair, poseidon16_permute};
+use utils::{poseidon8_compress_pair, poseidon8_permute};
 
 use crate::*;
 
@@ -93,15 +93,15 @@ impl WotsSignature {
 impl WotsPublicKey {
     // Overwrite-sponge
     pub fn hash(&self, public_param: PublicParam, slot: u32) -> Digest {
-        // state[0..8] = IV [tweak(2) | 00 | pp(4)]; state[8..16] = 0.
+        // state[0..4] = IV [tweak(1) | 0 | pp(2)]; state[4..8] = 0.
         let mut state = [F::ZERO; WIDTH];
         state[..TWEAK_LEN].copy_from_slice(&make_tweak(TWEAK_TYPE_WOTS_PK, 0, slot));
-        state[4..4 + PUBLIC_PARAM_LEN_FE].copy_from_slice(&public_param);
-        state = poseidon16_permute(state);
+        state[2..2 + PUBLIC_PARAM_LEN_FE].copy_from_slice(&public_param);
+        state = poseidon8_permute(state);
         for i in (0..V).step_by(2) {
-            state[8..][..XMSS_DIGEST_LEN].copy_from_slice(&self.0[i]);
-            state[8 + XMSS_DIGEST_LEN..].copy_from_slice(&self.0[i + 1]);
-            state = poseidon16_permute(state);
+            state[CAPACITY..][..XMSS_DIGEST_LEN].copy_from_slice(&self.0[i]);
+            state[CAPACITY + XMSS_DIGEST_LEN..].copy_from_slice(&self.0[i + 1]);
+            state = poseidon8_permute(state);
         }
         state[CAPACITY..][..XMSS_DIGEST_LEN].try_into().unwrap()
     }
@@ -115,12 +115,12 @@ pub fn iterate_hash(
     chain_index: usize,
     start_step: usize,
 ) -> Digest {
-    // Chain hash layout: left = [tweak (2) | zeros (2) | data (4)], right = [public_param(4) | zeros(4)].
+    // Chain hash layout: left = [tweak (1) | zero (1) | data (2)], right = [public_param(2) | zeros(2)].
     let right = build_right_chain_input(&public_param);
     (0..n).fold(*a, |acc, j| {
         let tweak = make_tweak(TWEAK_TYPE_CHAIN, chain_index * CHAIN_LENGTH + start_step + j, slot);
         let left = build_left_chain_input(tweak, &acc);
-        poseidon16_compress_pair(&left, &right)[..XMSS_DIGEST_LEN]
+        poseidon8_compress_pair(&left, &right)[..XMSS_DIGEST_LEN]
             .try_into()
             .unwrap()
     })
@@ -152,30 +152,29 @@ pub fn wots_encode(
     let mut first_input_right = [F::default(); DIGEST_LEN_FE];
     first_input_right[..RANDOMNESS_LEN_FE].copy_from_slice(randomness);
     first_input_right[RANDOMNESS_LEN_FE..][..TWEAK_LEN].copy_from_slice(&make_tweak(TWEAK_TYPE_ENCODING, 0, slot));
-    let pre_compressed = poseidon16_compress_pair(first_input_left, &first_input_right);
+    let pre_compressed = poseidon8_compress_pair(first_input_left, &first_input_right);
 
     let mut second_input_right = [F::default(); DIGEST_LEN_FE];
     second_input_right[..PUBLIC_PARAM_LEN_FE].copy_from_slice(&xmss_pub_key.public_param);
-    let compressed = poseidon16_compress_pair(&pre_compressed, &second_input_right);
-
-    if compressed[..NUM_ENCODING_FE].iter().any(|&kb| kb == -F::ONE) {
-        // ensures uniformity of encoding
-        return None;
+    let compressed = poseidon8_compress_pair(&pre_compressed, &second_input_right);
+
+    // Per-FE decomposition: only the low 32 bits of each output FE are used. Their low
+    // 30 bits yield V / DIGEST_LEN_FE = 10 chunks of W bits each; their top 2 bits must
+    // be zero (ENCODING_NUM_FINAL_ZEROS = 8 bits in total, i.e. 2 per FE).
+    // NOTE(review): xmss.md also rejects `high == 2^32 - 1`; no such check here — confirm.
+    const CHUNKS_PER_FE: usize = V / DIGEST_LEN_FE;
+    const CHUNK_BITS_PER_FE: usize = CHUNKS_PER_FE * W;
+    let mut all_indices = [0u8; V];
+    for (i, fe) in compressed.iter().enumerate() {
+        let low = fe.as_canonical_u64() & ((1u64 << 32) - 1);
+        if (low >> CHUNK_BITS_PER_FE) != 0 {
+            return None;
+        }
+        for j in 0..CHUNKS_PER_FE {
+            all_indices[i * CHUNKS_PER_FE + j] = ((low >> (W * j)) & ((1u64 << W) - 1)) as u8;
+        }
     }
-    let all_indices: Vec<_> = compressed[..NUM_ENCODING_FE]
-        .iter()
-        .flat_map(|kb| to_little_endian_bits(kb.to_usize(), 24))
-        .collect::<Vec<_>>()
-        .chunks_exact(W)
-        .take(V)
-        .map(|chunk| {
-            chunk
-                .iter()
-                .enumerate()
-                .fold(0u8, |acc, (i, &bit)| acc | (u8::from(bit) << i))
-        })
-        .collect();
-    is_valid_encoding(&all_indices).then(|| all_indices[..V].try_into().unwrap())
+    is_valid_encoding(&all_indices).then_some(all_indices)
 }
 
 fn is_valid_encoding(encoding: &[u8]) -> bool {
diff --git a/crates/xmss/src/xmss.rs b/crates/xmss/src/xmss.rs
index d5f69f445..f28d50928 100644
--- a/crates/xmss/src/xmss.rs
+++ b/crates/xmss/src/xmss.rs
@@ -2,7 +2,7 @@ use backend::*;
 use rand::{CryptoRng, RngExt, SeedableRng, rngs::StdRng};
 use serde::{Deserialize, Serialize};
 use sha3::{Digest as Sha3Digest, Keccak256};
-use utils::poseidon16_compress;
+use utils::poseidon8_compress;
 
 use crate::*;
 
@@ -123,7 +123,7 @@ pub fn xmss_key_gen(
                         &left,
                         &right,
                     );
-                    poseidon16_compress(merkle_data)[..XMSS_DIGEST_LEN].try_into().unwrap()
+                    poseidon8_compress(merkle_data)[..XMSS_DIGEST_LEN].try_into().unwrap()
                 })
                 .collect()
         };
@@ -228,7 +228,7 @@ pub fn xmss_verify(
             &left_child,
             &right_child,
         );
-        current_hash = poseidon16_compress(merkle_data)[..XMSS_DIGEST_LEN].try_into().unwrap();
+        current_hash = poseidon8_compress(merkle_data)[..XMSS_DIGEST_LEN].try_into().unwrap();
     }
     if current_hash == pub_key.merkle_root {
         Ok(())
diff --git a/crates/xmss/tests/xmss_tests.rs b/crates/xmss/tests/xmss_tests.rs
index 0fb08e01d..74fe05d5a 100644
--- a/crates/xmss/tests/xmss_tests.rs
+++ b/crates/xmss/tests/xmss_tests.rs
@@ -2,7 +2,7 @@ use backend::*;
 use rand::{SeedableRng, rngs::StdRng};
 use xmss::*;
 
-type F = KoalaBear;
+type F = Goldilocks;
 
 #[test]
 fn test_xmss_serialize_deserialize() {
diff --git a/crates/xmss/xmss.md b/crates/xmss/xmss.md
index e1538e1ce..febb8e14f 100644
--- a/crates/xmss/xmss.md
+++ b/crates/xmss/xmss.md
@@ -2,25 +2,26 @@
 
 ## Field
 
-KoalaBear (p = 2^31 - 2^24 + 1).
+Goldilocks (p = 2^64 - 2^32 + 1).
 
 ## Hash function
 
-[Poseidon](https://eprint.iacr.org/2019/458), in compression mode (feedforward addition). Input: 16 field elements. Output: 8 field elements. We denote it `H`. Chain hashes, Merkle hashes, and the final WOTS-pubkey hash truncate the output to 4 field elements (`n`); the encoding step and the intermediate WOTS-pubkey sponge states keep the full 8 elements.
+[Poseidon](https://eprint.iacr.org/2019/458), in compression mode (feedforward addition). Input: 8 field elements. Output: 4 field elements. We denote it `H`. Chain hashes, Merkle hashes, and the final WOTS-pubkey hash truncate the output to 2 field elements (`n`); the encoding step and the intermediate WOTS-pubkey sponge states keep the full 4 elements.
 
 ## Sizes (in field elements)
 
-- `n = 4`: digest size
-- `|pp| = 4`: public parameter
-- `|randomness| = 6`: signature randomness
-- `|msg| = 8`: message size
-- `|tweak| = 2`: tweak (domain separation: `encoding`, `chain`, `wots_pk`, `merkle`)
+- `n = 2`: digest size
+- `|pp| = 2`: public parameter
+- `|randomness| = 3`: signature randomness
+- `|msg| = 4`: message size
+- `|tweak| = 1`: tweak (domain separation: `encoding`, `chain`, `wots_pk`, `merkle`)
 
 ## WOTS (Winternitz One Time Signature)
 
-- `v = 42`: number of hash chains
+- `v = 40`: number of hash chains
 - `w = 3`, `chain_length = 2^w = 8`
-- `target_sum = 184`: a WOTS encoding `(e_0, ..., e_{v-1})` is valid iff each `e_i < chain_length` and `sum(e_i) = target_sum`. The signer grinds `randomness` until the encoding is valid (avoids checksum chains).
+- `target_sum = 170`: a WOTS encoding `(e_0, ..., e_{v-1})` is valid iff each `e_i < chain_length` and `sum(e_i) = target_sum`. The signer grinds `randomness` until the encoding is valid (avoids checksum chains).
+- `encoding_num_final_zeros = 8`: the 2 high bits of the low 32-bit half of each of the 4 encoding-digest limbs must be zero (2 zero bits per limb × 4 limbs).
 
 ## XMSS
 
@@ -30,19 +31,19 @@ KoalaBear (p = 2^31 - 2^24 + 1).
 
 Inputs: public key `(merkle_root, pp)`, message `msg`, slot `s`, signature `(randomness, chain_tips, merkle_proof)`.
 
-1. **Encode**: compute the 8-limb digest `D = H(H(msg | randomness | tweak_encoding(s)) | pp | 0000)`. For each limb `D_i`, take the canonical representative `D_i = low + 2^24 · high` (with `low ∈ [0, 2^24)`, `high ∈ [0, 128)`) and reject if `high == 127` (equivalently `D_i == −1`). This guarantees an uniform encoding. Concatenate the 24-bit `low` parts of the 8 limbs in little-endian order to get 192 bits, then take the first `v · w = 126` bits split into `v = 42` little-endian chunks of `w = 3` bits → encoding `(e_0, ..., e_{v-1})` with each `e_i ∈ [0, chain_length)`. Reject if `sum(e_i) ≠ target_sum`.
-2. **Recover WOTS public key**: for each `i`, walk chain `i` from `chain_tips[i]` for `chain_length - 1 - e_i` steps, where each step is `H(tweak_chain(i, step, s) | 00 | previous_value | pp | 0000)` truncated to `n`.
-3. **Hash WOTS public key**: T-sponge with replacement over the `v` recovered chain ends, with IV `[tweak_wots_pk(s) | 00 | pp]`, ingesting two chain end digests at a time. Output is the Merkle leaf.
-4. **Walk Merkle path**: for `level = 0..log_lifetime`, combine the current node with `merkle_proof[level]` (left/right determined by bit `level` of `s`) via `H(tweak_merkle(level+1, parent_index) | 00 | pp | left | right)` truncated to `n`.
+1. **Encode**: compute the 4-limb digest `D = H(H(msg | randomness | tweak_encoding(s)) | pp | 00)`. For each limb `D_i`, take the canonical representative `D_i = low + 2^32 · high` (with `low, high < 2^32`) and reject if `high == 2^32 - 1` (needed for uniformity of the encoding). Reject if the 2 high bits of `low` are non-zero (`encoding_num_final_zeros / 4 = 2` bits per limb). Otherwise split the low 30 bits of `low` into `v / 4 = 10` little-endian chunks of `w = 3` bits each, giving the encoding `(e_0, ..., e_{v-1})` with each `e_i ∈ [0, chain_length)`. Reject if `sum(e_i) ≠ target_sum`.
+2. **Recover WOTS public key**: for each `i`, walk chain `i` from `chain_tips[i]` for `chain_length - 1 - e_i` steps, where each step is `H(tweak_chain(i, step, s) | 0 | previous_value | pp | 00)` truncated to `n`.
+3. **Hash WOTS public key**: T-sponge with replacement over the `v` recovered chain ends, with IV `[tweak_wots_pk(s) | 0 | pp]`, ingesting two chain end digests at a time. Output is the Merkle leaf.
+4. **Walk Merkle path**: for `level = 0..log_lifetime`, combine the current node with `merkle_proof[level]` (left/right determined by bit `level` of `s`) via `H(tweak_merkle(level+1, parent_index) | 0 | pp | left | right)` truncated to `n`.
 5. **Check root**: accept iff the final hash equals `merkle_root`.
 
 
 ## Security
 
-target = 123,9 ≈ 124 bits of classical security in the ROM, and ≈ 62 bits of quantum security in the QROM, with an analysis inspired by the section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103). TODO write the complete proof.
+target ≈ 128 bits of classical security in the ROM, and ≈ 64 bits of quantum security in the QROM, with an analysis inspired by Section 3.1 of [Tight adaptive reprogramming in the QROM](https://arxiv.org/pdf/2010.15103). TODO: write the complete proof.
 
 ## Signature size
 
-**1171 bytes** `log2(p).(|randomness| + n.(v + log_lifetime))`
+**1176 bytes** `log2(p).(|randomness| + n.(v + log_lifetime)) / 8`
 
 below IPv6 [MTU](https://fr.wikipedia.org/wiki/Maximum_transmission_unit) (1280 bytes)
diff --git a/src/lib.rs b/src/lib.rs
index 577853996..6720544ea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,7 +9,7 @@ pub use rec_aggregation::{
 };
 pub use xmss::{MESSAGE_LEN_FE, XmssPublicKey, XmssSecretKey, XmssSignature, xmss_key_gen, xmss_sign, xmss_verify};
 
-pub type F = KoalaBear;
+pub type F = Goldilocks;
 
 /// Call once before proving. Compiles the aggregation program and precomputes DFT twiddles.
 pub fn setup_prover() {
diff --git a/tests/test_multisignatures.rs b/tests/test_multisignatures.rs
index c5ba89d4c..36a1c08e4 100644
--- a/tests/test_multisignatures.rs
+++ b/tests/test_multisignatures.rs
@@ -11,6 +11,7 @@ use rec_aggregation::{
     split_multi_message_aggregate_by_msg,
 };
 use xmss::{
+    MESSAGE_LEN_FE,
     signers_cache::{BENCHMARK_SLOT, get_benchmark_signatures, message_for_benchmark},
     xmss_key_gen, xmss_sign, xmss_verify,
 };
@@ -85,7 +86,7 @@ fn test_multi_message_aggregation() {
 
     let slot_b = BENCHMARK_SLOT + 1;
     let mut rng_b: StdRng = StdRng::seed_from_u64(17);
-    let message_b: [_; 8] = std::array::from_fn(|_| rng_b.random());
+    let message_b: [_; MESSAGE_LEN_FE] = std::array::from_fn(|_| rng_b.random());
 
     assert!(message_b != message_a && slot_b != slot_a);