(&self, rng: &mut R) -> Goldilocks {
+ loop {
+ let next_u64 = rng.next_u64();
+ if next_u64 < Goldilocks::ORDER_U64 {
+ return Goldilocks::new(next_u64);
+ }
+ }
+ }
+}
+
+impl PrimeCharacteristicRing for Goldilocks {
+ type PrimeSubfield = Self;
+
+ const ZERO: Self = Self::new(0);
+ const ONE: Self = Self::new(1);
+ const TWO: Self = Self::new(2);
+ const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
+
+ #[inline]
+ fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
+ f
+ }
+
+ #[inline]
+ fn from_bool(b: bool) -> Self {
+ Self::new(b.into())
+ }
+
+ #[inline]
+ fn halve(&self) -> Self {
+ Self::new(crate::helpers::halve_u64::(self.value))
+ }
+
+ #[inline]
+ fn mul_2exp_u64(&self, exp: u64) -> Self {
+ // 2^96 = -1 mod P, 2^192 = 1 mod P.
+ match exp {
+ 0 => *self,
+ 1 => *self + *self,
+ _ => {
+ if exp < 96 {
+ *self * Self::POWERS_OF_TWO[exp as usize]
+ } else if exp < 192 {
+ -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
+ } else {
+ self.mul_2exp_u64(exp % 192)
+ }
+ }
+ }
+ }
+
+ #[inline]
+ fn div_2exp_u64(&self, mut exp: u64) -> Self {
+ // 2^{-n} = 2^{192 - n} mod P.
+ exp %= 192;
+ match exp {
+ 0 => *self,
+ 1 => self.halve(),
+ _ => self.mul_2exp_u64(192 - exp),
+ }
+ }
+
+ #[inline]
+ fn sum_array(input: &[Self]) -> Self {
+ assert_eq!(N, input.len());
+ match N {
+ 0 => Self::ZERO,
+ 1 => input[0],
+ 2 => input[0] + input[1],
+ 3 => input[0] + input[1] + input[2],
+ _ => input.iter().copied().sum(),
+ }
+ }
+
+ #[inline]
+ fn dot_product(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
+ // OFFSET has two key properties:
+ // 1. it's a multiple of P,
+ // 2. it exceeds the maximum sum of two u64 products.
+ const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
+ const {
+ assert!((N as u32) <= (1 << 31));
+ }
+ match N {
+ 0 => Self::ZERO,
+ 1 => lhs[0] * rhs[0],
+ 2 => {
+ let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
+ let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
+ let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
+ let sum_corr = sum.wrapping_sub(OFFSET);
+ if over { reduce128(sum_corr) } else { reduce128(sum) }
+ }
+ _ => {
+ let (lo_plus_hi, hi) = lhs
+ .iter()
+ .zip(rhs)
+ .map(|(x, y)| (x.value as u128) * (y.value as u128))
+ .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
+ let val_hi = (val >> 96) as u64;
+ unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
+ });
+ let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
+ let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
+ reduce128(sum)
+ }
+ }
+ }
+
+ #[inline]
+ fn zero_vec(len: usize) -> Vec {
+ // SAFETY: `#[repr(transparent)]` means `Goldilocks` and `u64` share layout.
+ unsafe { flatten_to_base(vec![0u64; len]) }
+ }
+}
+
+/// `p - 1 = 2^32 * 3 * 5 * 17 * 257 * 65537`. The smallest `D` with `gcd(p - 1, D) = 1` is 7.
+impl InjectiveMonomial<7> for Goldilocks {}
+
+impl PermutationMonomial<7> for Goldilocks {
+ fn injective_exp_root_n(&self) -> Self {
+ exp_10540996611094048183(*self)
+ }
+}
+
+impl RawDataSerializable for Goldilocks {
+ impl_raw_serializable_primefield64!();
+}
+
+impl Field for Goldilocks {
+ #[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+ type Packing = crate::PackedGoldilocksAVX2;
+
+ #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+ type Packing = crate::PackedGoldilocksAVX512;
+
+ #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+ type Packing = crate::PackedGoldilocksNeon;
+
+ #[cfg(not(any(
+ all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+ all(target_arch = "x86_64", target_feature = "avx512f"),
+ all(target_arch = "aarch64", target_feature = "neon"),
+ )))]
+ type Packing = Self;
+
+ const GENERATOR: Self = Self::new(7);
+
+ #[inline]
+ fn is_zero(&self) -> bool {
+ self.value == 0 || self.value == Self::ORDER_U64
+ }
+
+ fn try_inverse(&self) -> Option {
+ if self.is_zero() {
+ return None;
+ }
+ Some(gcd_inversion(*self))
+ }
+
+ #[inline]
+ fn order() -> BigUint {
+ P.into()
+ }
+}
+
+quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
+quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
+quotient_map_large_uint!(
+ Goldilocks,
+ u64,
+ Goldilocks::ORDER_U64,
+ "`[0, 2^64 - 2^32]`",
+ "`[0, 2^64 - 1]`",
+ [u128]
+);
+quotient_map_large_iint!(
+ Goldilocks,
+ i64,
+ "`[-(2^63 - 2^31), 2^63 - 2^31]`",
+ "`[1 + 2^32 - 2^64, 2^64 - 1]`",
+ [(i128, u128)]
+);
+
+impl QuotientMap for Goldilocks {
+ #[inline]
+ fn from_int(int: u64) -> Self {
+ Self::new(int)
+ }
+
+ #[inline]
+ fn from_canonical_checked(int: u64) -> Option {
+ (int < Self::ORDER_U64).then(|| Self::new(int))
+ }
+
+ #[inline(always)]
+ unsafe fn from_canonical_unchecked(int: u64) -> Self {
+ Self::new(int)
+ }
+}
+
+impl QuotientMap for Goldilocks {
+ #[inline]
+ fn from_int(int: i64) -> Self {
+ if int >= 0 {
+ Self::new(int as u64)
+ } else {
+ Self::new(Self::ORDER_U64.wrapping_add_signed(int))
+ }
+ }
+
+ #[inline]
+ fn from_canonical_checked(int: i64) -> Option {
+ const POS_BOUND: i64 = (P >> 1) as i64;
+ const NEG_BOUND: i64 = -POS_BOUND;
+ match int {
+ 0..=POS_BOUND => Some(Self::new(int as u64)),
+ NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
+ _ => None,
+ }
+ }
+
+ #[inline(always)]
+ unsafe fn from_canonical_unchecked(int: i64) -> Self {
+ Self::from_int(int)
+ }
+}
+
+impl PrimeField for Goldilocks {
+ fn as_canonical_biguint(&self) -> BigUint {
+ self.as_canonical_u64().into()
+ }
+}
+
+impl PrimeField64 for Goldilocks {
+ const ORDER_U64: u64 = P;
+
+ #[inline]
+ fn as_canonical_u64(&self) -> u64 {
+ let mut c = self.value;
+ // Single conditional subtraction is sufficient since `2 * ORDER` would overflow u64.
+ if c >= Self::ORDER_U64 {
+ c -= Self::ORDER_U64;
+ }
+ c
+ }
+}
+
+impl TwoAdicField for Goldilocks {
+ const TWO_ADICITY: usize = 32;
+
+ fn two_adic_generator(bits: usize) -> Self {
+ assert!(bits <= Self::TWO_ADICITY);
+ Self::TWO_ADIC_GENERATORS[bits]
+ }
+}
+
+/// `const` version of addition — useful for building const tables.
+#[inline]
+const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
+ let (sum, over) = lhs.value.overflowing_add(rhs.value);
+ let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
+ if over {
+ sum += Goldilocks::NEG_ORDER;
+ }
+ Goldilocks::new(sum)
+}
+
+impl Add for Goldilocks {
+ type Output = Self;
+
+ #[inline]
+ fn add(self, rhs: Self) -> Self {
+ let (sum, over) = self.value.overflowing_add(rhs.value);
+ let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
+ if over {
+ unsafe {
+ assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
+ }
+ branch_hint();
+ sum += Self::NEG_ORDER;
+ }
+ Self::new(sum)
+ }
+}
+
+impl Sub for Goldilocks {
+ type Output = Self;
+
+ #[inline]
+ fn sub(self, rhs: Self) -> Self {
+ let (diff, under) = self.value.overflowing_sub(rhs.value);
+ let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
+ if under {
+ unsafe {
+ assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
+ }
+ branch_hint();
+ diff -= Self::NEG_ORDER;
+ }
+ Self::new(diff)
+ }
+}
+
+impl Neg for Goldilocks {
+ type Output = Self;
+
+ #[inline]
+ fn neg(self) -> Self::Output {
+ Self::new(Self::ORDER_U64 - self.as_canonical_u64())
+ }
+}
+
+impl Mul for Goldilocks {
+ type Output = Self;
+
+ #[inline]
+ fn mul(self, rhs: Self) -> Self {
+ reduce128(u128::from(self.value) * u128::from(rhs.value))
+ }
+}
+
+impl_add_assign!(Goldilocks);
+impl_sub_assign!(Goldilocks);
+impl_mul_methods!(Goldilocks);
+impl_div_methods!(Goldilocks, Goldilocks);
+
+impl Sum for Goldilocks {
+ fn sum>(iter: I) -> Self {
+ // Faster than `reduce` for iterators of length > 2; cannot overflow provided len < 2^64.
+ let sum = iter.map(|x| x.value as u128).sum::();
+ reduce128(sum)
+ }
+}
+
+/// Reduce to a 64-bit value. Output may be in `[0, 2^64)`, i.e. not necessarily canonical.
+#[inline]
+pub(crate) fn reduce128(x: u128) -> Goldilocks {
+ let (x_lo, x_hi) = split(x);
+ let x_hi_hi = x_hi >> 32;
+ let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
+
+ let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
+ if borrow {
+ branch_hint();
+ t0 -= Goldilocks::NEG_ORDER;
+ }
+ let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
+ let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
+ Goldilocks::new(t2)
+}
+
+#[inline]
+#[allow(clippy::cast_possible_truncation)]
+const fn split(x: u128) -> (u64, u64) {
+ (x as u64, (x >> 64) as u64)
+}
+
+/// Fast addition modulo `ORDER` on x86-64, using CF/SBB to pick the adjustment branchlessly.
+///
+/// # Safety
+/// - Only correct if `x + y < 2^64 + ORDER = 0x1_FFFF_FFFF_0000_0001`.
+/// - Overwrites both inputs in registers on x86; avoid reusing them.
+#[inline(always)]
+#[cfg(target_arch = "x86_64")]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+ unsafe {
+ let res_wrapped: u64;
+ let adjustment: u64;
+ core::arch::asm!(
+ "add {0}, {1}",
+ "sbb {1:e}, {1:e}",
+ inlateout(reg) x => res_wrapped,
+ inlateout(reg) y => adjustment,
+ options(pure, nomem, nostack),
+ );
+ assume(x != 0 || (res_wrapped == y && adjustment == 0));
+ assume(y != 0 || (res_wrapped == x && adjustment == 0));
+ res_wrapped + adjustment
+ }
+}
+
+#[inline(always)]
+#[cfg(not(target_arch = "x86_64"))]
+unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
+ let (res_wrapped, carry) = x.overflowing_add(y);
+ res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
+}
+
+/// Binary-GCD inversion for Goldilocks.
+///
+/// Uses the "update factor" variant from https://eprint.iacr.org/2020/972.pdf: compute
+/// factors off by a known power of two, then correct at the end via a linear combination.
+fn gcd_inversion(input: Goldilocks) -> Goldilocks {
+ let (mut a, mut b) = (input.value, P);
+
+ // `len(a) + len(b) <= 128` initially; 126 iterations suffice to drive it to <= 2.
+ // Split into 2 rounds of 63.
+ const ROUND_SIZE: usize = 63;
+
+ let (f00, _, f10, _) = gcd_inner::(&mut a, &mut b);
+ let (_, _, f11, g11) = gcd_inner::(&mut a, &mut b);
+
+ // The update factors are i64's, but we interpret `-2^63` as `2^63` because
+ // `gcd_inner` outputs sit in `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
+ let u = from_unusual_int(f00);
+ let v = from_unusual_int(f10);
+ let u_fac11 = from_unusual_int(f11);
+ let v_fac11 = from_unusual_int(g11);
+
+ // Each iteration introduced a factor of 2, so we need to divide by `2^126`.
+ // `2^192 = 1 mod P`, so multiply by `2^66` instead (192 - 126 = 66).
+ (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
+}
+
+/// Convert an `i64` to Goldilocks, interpreting `i64::MIN` as `2^63` (not `-2^63`).
+const fn from_unusual_int(int: i64) -> Goldilocks {
+ if (int >= 0) || (int == i64::MIN) {
+ Goldilocks::new(int as u64)
+ } else {
+ Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
+ }
+}
+
+// A few unused-variable suppression helpers that clippy might warn about
+#[allow(dead_code)]
+fn _unused_array_touch() {
+ let _ = array::from_fn::(|_| 0);
+}
diff --git a/crates/backend/goldilocks/src/helpers.rs b/crates/backend/goldilocks/src/helpers.rs
new file mode 100644
index 000000000..65b08b104
--- /dev/null
+++ b/crates/backend/goldilocks/src/helpers.rs
@@ -0,0 +1,73 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Helpers ported from `p3_util` and `p3_field::exponentiation`, scoped to what the
+//! Goldilocks field implementation needs.
+
+use field::PrimeCharacteristicRing;
+
+/// Given an element `x` from a 64-bit field `F_P`, compute `x / 2`.
+#[inline]
+#[must_use]
+pub const fn halve_u64(x: u64) -> u64 {
+ let shift = (P + 1) >> 1;
+ let half = x >> 1;
+ if x & 1 == 0 { half } else { half + shift }
+}
+
+/// Inner loop of the binary-GCD-based inversion algorithm used by Goldilocks.
+///
+/// See https://eprint.iacr.org/2020/972.pdf for background; this mini-GCD builds up
+/// a small transformation using u64 ops and bit shifts, which we then apply to the
+/// big-int values in the outer loop.
+#[inline]
+pub const fn gcd_inner(a: &mut u64, b: &mut u64) -> (i64, i64, i64, i64) {
+ let (mut f0, mut g0, mut f1, mut g1) = (1, 0, 0, 1);
+
+ let mut round = 0;
+ while round < NUM_ROUNDS {
+ if *a & 1 == 0 {
+ *a >>= 1;
+ } else {
+ if *a < *b {
+ core::mem::swap(a, b);
+ (f0, f1) = (f1, f0);
+ (g0, g1) = (g1, g0);
+ }
+ *a -= *b;
+ *a >>= 1;
+ f0 -= f1;
+ g0 -= g1;
+ }
+ f1 <<= 1;
+ g1 <<= 1;
+
+ round += 1;
+ }
+
+ (f0, g0, f1, g1)
+}
+
+/// Compute `x -> x^{10540996611094048183}` using a custom addition chain.
+///
+/// This map computes the seventh root of `x` if `x` is a member of the `Goldilocks` field.
+/// It follows from: `7 * 10540996611094048183 = 4*(2^64 - 2^32) + 1 = 1 mod (p - 1)`.
+#[must_use]
+pub fn exp_10540996611094048183(val: R) -> R {
+ let p1 = val;
+ let p10 = p1.square();
+ let p11 = p10 * p1;
+ let p100 = p10.square();
+ let p111 = p100 * p11;
+ let p1_30 = p100.exp_power_of_2(30);
+ let p1_30_11 = p1_30 * p11;
+ let p1_30_11_000 = p1_30_11.exp_power_of_2(3);
+ let p1_30_11_011 = p1_30_11_000 * p1_30_11;
+ let p1_30_11_011_000000 = p1_30_11_011.exp_power_of_2(6);
+ let p_chunk12 = p1_30_11_011_000000 * p1_30_11_011;
+ let p_chunk12_000000000000 = p_chunk12.exp_power_of_2(12);
+ let p_chunk24 = p_chunk12_000000000000 * p_chunk12;
+ let p_chunk24_000000 = p_chunk24.exp_power_of_2(6);
+ let p_chunk30 = p_chunk24_000000 * p1_30_11;
+ let p_chunk30_0000 = p_chunk30.exp_power_of_2(4);
+ p_chunk30_0000 * p111
+}
diff --git a/crates/backend/goldilocks/src/lib.rs b/crates/backend/goldilocks/src/lib.rs
new file mode 100644
index 000000000..f8e3e5781
--- /dev/null
+++ b/crates/backend/goldilocks/src/lib.rs
@@ -0,0 +1,37 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! The Goldilocks prime field `F_p` where `p = 2^64 - 2^32 + 1`, and a degree-3 extension.
+//!
+//! This is a port of `plonky3/goldilocks/` adapted to the in-tree `field` trait crate.
+
+extern crate alloc;
+
+mod cubic_extension;
+mod goldilocks;
+mod helpers;
+mod packed_cubic_extension;
+mod poseidon1;
+
+#[cfg(test)]
+mod benchmark_poseidons_goldilocks;
+
+pub use cubic_extension::*;
+pub use goldilocks::*;
+pub use helpers::*;
+pub use packed_cubic_extension::*;
+pub use poseidon1::*;
+
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+mod aarch64_neon;
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+pub use aarch64_neon::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+mod x86_64_avx2;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+pub use x86_64_avx2::*;
+
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+mod x86_64_avx512;
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+pub use x86_64_avx512::*;
diff --git a/crates/backend/goldilocks/src/packed_cubic_extension.rs b/crates/backend/goldilocks/src/packed_cubic_extension.rs
new file mode 100644
index 000000000..57827fd76
--- /dev/null
+++ b/crates/backend/goldilocks/src/packed_cubic_extension.rs
@@ -0,0 +1,377 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Packed (SIMD) version of the cubic extension `F_p[X] / (X^3 - X - 1)`.
+//!
+//! Mirrors `koala-bear`'s `PackedQuinticExtensionField` shape: a SoA array of
+//! `[PF; 3]` packed-base-field lanes, so each field operation is a SIMD
+//! multiply/add over `PF::WIDTH` extension elements at once.
+
+use alloc::vec::Vec;
+use core::array;
+use core::fmt::Debug;
+use core::iter::{Product, Sum};
+use core::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use field::{
+ Algebra, BasedVectorSpace, Field, PackedField, PackedFieldExtension, PackedValue, Powers, PrimeCharacteristicRing,
+ field_to_array,
+};
+use itertools::Itertools;
+use rand::distr::{Distribution, StandardUniform};
+use serde::{Deserialize, Serialize};
+use utils::{flatten_to_base, reconstitute_from_base};
+
+use crate::Goldilocks;
+use crate::cubic_extension::{CubicExtensionFieldGL, cubic_mul_generic, cubic_square_generic};
+
+const D: usize = 3;
+
+/// Packed cubic extension over `Goldilocks`, parameterized by base field packing `PF`.
+#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize, Deserialize, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct PackedCubicExtensionFieldGL> {
+ #[serde(
+ with = "utils::array_serialization",
+ bound(serialize = "PF: Serialize", deserialize = "PF: Deserialize<'de>")
+ )]
+ pub(crate) value: [PF; D],
+}
+
+impl> PackedCubicExtensionFieldGL {
+ const fn new(value: [PF; D]) -> Self {
+ Self { value }
+ }
+}
+
+impl> Default for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn default() -> Self {
+ Self {
+ value: array::from_fn(|_| PF::ZERO),
+ }
+ }
+}
+
+impl> From for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn from(x: CubicExtensionFieldGL) -> Self {
+ Self {
+ value: x.value.map(Into::into),
+ }
+ }
+}
+
+impl> From for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn from(x: PF) -> Self {
+ Self {
+ value: field_to_array(x),
+ }
+ }
+}
+
+impl> Distribution> for StandardUniform
+where
+ Self: Distribution,
+{
+ #[inline]
+ fn sample(&self, rng: &mut R) -> PackedCubicExtensionFieldGL {
+ PackedCubicExtensionFieldGL::new(array::from_fn(|_| self.sample(rng)))
+ }
+}
+
+impl> Algebra for PackedCubicExtensionFieldGL {}
+
+impl> Algebra for PackedCubicExtensionFieldGL {}
+
+impl> PrimeCharacteristicRing for PackedCubicExtensionFieldGL {
+ type PrimeSubfield = PF::PrimeSubfield;
+
+ const ZERO: Self = Self { value: [PF::ZERO; D] };
+
+ const ONE: Self = Self {
+ value: field_to_array(PF::ONE),
+ };
+
+ const TWO: Self = Self {
+ value: field_to_array(PF::TWO),
+ };
+
+ const NEG_ONE: Self = Self {
+ value: field_to_array(PF::NEG_ONE),
+ };
+
+ #[inline]
+ fn from_prime_subfield(val: Self::PrimeSubfield) -> Self {
+ PF::from_prime_subfield(val).into()
+ }
+
+ #[inline]
+ fn from_bool(b: bool) -> Self {
+ PF::from_bool(b).into()
+ }
+
+ #[inline(always)]
+ fn square(&self) -> Self {
+ let mut res = Self::default();
+ cubic_square_generic(&self.value, &mut res.value);
+ res
+ }
+
+ #[inline]
+ fn zero_vec(len: usize) -> Vec {
+ // SAFETY: this is a repr(transparent) wrapper around an array.
+ unsafe { reconstitute_from_base(PF::zero_vec(len * D)) }
+ }
+}
+
+impl> BasedVectorSpace for PackedCubicExtensionFieldGL {
+ const DIMENSION: usize = D;
+
+ #[inline]
+ fn as_basis_coefficients_slice(&self) -> &[PF] {
+ &self.value
+ }
+
+ #[inline]
+ fn from_basis_coefficients_fn PF>(f: Fn) -> Self {
+ Self {
+ value: array::from_fn(f),
+ }
+ }
+
+ #[inline]
+ fn from_basis_coefficients_iter>(mut iter: I) -> Option {
+ (iter.len() == D).then(|| Self::new(array::from_fn(|_| iter.next().unwrap())))
+ }
+
+ #[inline]
+ fn flatten_to_base(vec: Vec) -> Vec {
+ // SAFETY: `Self` is `repr(transparent)` over `[PF; D]`.
+ unsafe { flatten_to_base(vec) }
+ }
+
+ #[inline]
+ fn reconstitute_from_base(vec: Vec) -> Vec {
+ // SAFETY: `Self` is `repr(transparent)` over `[PF; D]`.
+ unsafe { reconstitute_from_base(vec) }
+ }
+}
+
+impl PackedFieldExtension
+ for PackedCubicExtensionFieldGL<::Packing>
+{
+ #[inline]
+ fn from_ext_slice(ext_slice: &[CubicExtensionFieldGL]) -> Self {
+ let width = ::Packing::WIDTH;
+ assert_eq!(ext_slice.len(), width);
+
+ let res = array::from_fn(|i| ::Packing::from_fn(|j| ext_slice[j].value[i]));
+ Self::new(res)
+ }
+
+ #[inline]
+ fn to_ext_iter(iter: impl IntoIterator- ) -> impl Iterator
- {
+ let width = ::Packing::WIDTH;
+ iter.into_iter().flat_map(move |x| {
+ (0..width).map(move |i| {
+ let values = array::from_fn(|j| x.value[j].as_slice()[i]);
+ CubicExtensionFieldGL::new(values)
+ })
+ })
+ }
+
+ #[inline]
+ fn packed_ext_powers(base: CubicExtensionFieldGL) -> Powers {
+ let width = ::Packing::WIDTH;
+ let powers = base.powers().take(width + 1).collect_vec();
+ let current = Self::from_ext_slice(&powers[..width]);
+ let multiplier = powers[width].into();
+
+ Powers {
+ base: multiplier,
+ current,
+ }
+ }
+}
+
+impl> Neg for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn neg(self) -> Self {
+ Self {
+ value: self.value.map(PF::neg),
+ }
+ }
+}
+
+impl> Add for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn add(self, rhs: Self) -> Self {
+ Self {
+ value: array::from_fn(|i| self.value[i] + rhs.value[i]),
+ }
+ }
+}
+
+impl> Add for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn add(self, rhs: CubicExtensionFieldGL) -> Self {
+ Self {
+ value: array::from_fn(|i| self.value[i] + rhs.value[i]),
+ }
+ }
+}
+
+impl> Add for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn add(mut self, rhs: PF) -> Self {
+ self.value[0] += rhs;
+ self
+ }
+}
+
+impl> AddAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn add_assign(&mut self, rhs: Self) {
+ for i in 0..D {
+ self.value[i] += rhs.value[i];
+ }
+ }
+}
+
+impl> AddAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn add_assign(&mut self, rhs: CubicExtensionFieldGL) {
+ for i in 0..D {
+ self.value[i] += rhs.value[i];
+ }
+ }
+}
+
+impl> AddAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn add_assign(&mut self, rhs: PF) {
+ self.value[0] += rhs;
+ }
+}
+
+impl> Sum for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn sum>(iter: I) -> Self {
+ iter.reduce(|acc, x| acc + x).unwrap_or(Self::ZERO)
+ }
+}
+
+impl> Sub for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn sub(self, rhs: Self) -> Self {
+ Self {
+ value: array::from_fn(|i| self.value[i] - rhs.value[i]),
+ }
+ }
+}
+
+impl> Sub for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn sub(self, rhs: CubicExtensionFieldGL) -> Self {
+ Self {
+ value: array::from_fn(|i| self.value[i] - rhs.value[i]),
+ }
+ }
+}
+
+impl> Sub for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn sub(self, rhs: PF) -> Self {
+ let mut res = self.value;
+ res[0] -= rhs;
+ Self { value: res }
+ }
+}
+
+impl> SubAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn sub_assign(&mut self, rhs: Self) {
+ *self = *self - rhs;
+ }
+}
+
+impl> SubAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn sub_assign(&mut self, rhs: CubicExtensionFieldGL) {
+ *self = *self - rhs;
+ }
+}
+
+impl> SubAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn sub_assign(&mut self, rhs: PF) {
+ *self = *self - rhs;
+ }
+}
+
+impl> Mul for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline(always)]
+ fn mul(self, rhs: Self) -> Self {
+ let mut res = Self::default();
+ cubic_mul_generic(&self.value, &rhs.value, &mut res.value);
+ res
+ }
+}
+
+impl> Mul for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline(always)]
+ fn mul(self, rhs: CubicExtensionFieldGL) -> Self {
+ let b: [PF; D] = rhs.value.map(|x| x.into());
+ let mut res = Self::default();
+ cubic_mul_generic(&self.value, &b, &mut res.value);
+ res
+ }
+}
+
+impl> Mul for PackedCubicExtensionFieldGL {
+ type Output = Self;
+ #[inline]
+ fn mul(self, rhs: PF) -> Self {
+ Self {
+ value: self.value.map(|x| x * rhs),
+ }
+ }
+}
+
+impl> Product for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn product>(iter: I) -> Self {
+ iter.reduce(|acc, x| acc * x).unwrap_or(Self::ONE)
+ }
+}
+
+impl> MulAssign for PackedCubicExtensionFieldGL {
+ #[inline(always)]
+ fn mul_assign(&mut self, rhs: Self) {
+ *self = *self * rhs;
+ }
+}
+
+impl> MulAssign for PackedCubicExtensionFieldGL {
+ #[inline(always)]
+ fn mul_assign(&mut self, rhs: CubicExtensionFieldGL) {
+ *self = *self * rhs;
+ }
+}
+
+impl> MulAssign for PackedCubicExtensionFieldGL {
+ #[inline]
+ fn mul_assign(&mut self, rhs: PF) {
+ *self = *self * rhs;
+ }
+}
diff --git a/crates/backend/goldilocks/src/poseidon1.rs b/crates/backend/goldilocks/src/poseidon1.rs
new file mode 100644
index 000000000..43b03cf77
--- /dev/null
+++ b/crates/backend/goldilocks/src/poseidon1.rs
@@ -0,0 +1,1008 @@
+// Credits: Plonky3 (https://github.com/Plonky3/Plonky3) (MIT and Apache-2.0 licenses).
+
+//! Scalar Poseidon1 permutation at width 8 for Goldilocks.
+//!
+//! Parameters:
+//! - S-box `x^7` (smallest `d` with `gcd(d, p - 1) = 1` for Goldilocks)
+//! - `R_F = 8` full rounds (4 initial + 4 terminal)
+//! - `R_P = 22` partial rounds in the middle
+//! - External MDS is the circulant matrix with first row `[7, 1, 3, 8, 8, 3, 4, 9]`
+//! (Plonky2/upstream-Plonky3 "small MDS" — same matrix the upstream
+//! `MdsMatrixGoldilocks` uses at width 8).
+//!
+//! The permutation is generic over any algebra `R` over `Goldilocks` that also
+//! implements `InjectiveMonomial<7>`, mirroring the koala-bear crate's
+//! Poseidon1 surface.
+
+#[cfg(any(
+ test,
+ not(any(
+ all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+ all(target_arch = "x86_64", target_feature = "avx512f"),
+ )),
+))]
+use field::PackedValue;
+use field::{Algebra, Field, InjectiveMonomial, PrimeCharacteristicRing};
+
+use crate::Goldilocks;
+
+pub const POSEIDON1_WIDTH: usize = 8;
+pub const POSEIDON1_HALF_FULL_ROUNDS: usize = 4;
+pub const POSEIDON1_PARTIAL_ROUNDS: usize = 22;
+pub const POSEIDON1_SBOX_DEGREE: u64 = 7;
+pub const POSEIDON1_DIGEST_LEN: usize = 4;
+
+pub const POSEIDON1_N_ROUNDS: usize = 2 * POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS;
+
+// =========================================================================
+// MDS matrix (circulant, width 8)
+// =========================================================================
+//
+// First row of the circulant MDS matrix. `MDS8_COL[i] = r_{(N - i) mod N}` is
+// the first column — more convenient for a row-major apply of a circulant
+// since `row_i = cyclic_shift(col, i)`, i.e. `M[i][j] = COL[(j - i + N) mod N]`
+// (equivalently `ROW[(j - i) mod N]`).
+pub const MDS8_ROW: [i64; 8] = [7, 1, 3, 8, 8, 3, 4, 9];
+
+/// Apply the width-8 circulant MDS matrix in place, generic over `R`.
+///
+/// The matrix has tiny integer entries (max 9), so even without any delayed
+/// reduction a plain algebra-over-Goldilocks multiply is fine.
+#[inline]
+fn mds_mul_generic>(state: &mut [R; 8]) {
+ // Precompute the constants as Goldilocks once — `From` for `R`
+ // gives us `R` conversions.
+ let coeffs: [Goldilocks; 8] = {
+ let mut arr = [Goldilocks::ZERO; 8];
+ for i in 0..8 {
+ arr[i] = Goldilocks::new(MDS8_ROW[i] as u64);
+ }
+ arr
+ };
+
+ let input = *state;
+ for i in 0..8 {
+ // `row_i · input = sum_j ROW[(j - i) mod 8] · input[j]`
+ let mut acc = input[0] * coeffs[(8 - i) % 8];
+ for j in 1..8 {
+ acc += input[j] * coeffs[(j + 8 - i) % 8];
+ }
+ state[i] = acc;
+ }
+}
+
+/// Specialized fast MDS for the concrete `Goldilocks` scalar.
+///
+/// Each output is a dot product `sum_j MDS_ROW[(j-i) mod 8] * state[j]` with
+/// MDS coefficients in `{1, 3, 4, 7, 8, 9}` (all fit in 4 bits). With the
+/// constants spelled out explicitly LLVM strength-reduces `c * s` to shifts
+/// and adds (e.g. `8*s = s<<3`, `7*s = (s<<3)-s`), eliminating the variable
+/// multiplications entirely. We accumulate into `u128` (8·9·2^64 ≈ 2^71 fits
+/// comfortably) and reduce once per output via `reduce128`. The explicit
+/// `1 *` factors keep the circulant structure readable column-by-column.
+#[inline(always)]
+#[allow(clippy::identity_op)]
+fn mds_mul_scalar(state: &mut [Goldilocks; 8]) {
+ let s0 = state[0].value as u128;
+ let s1 = state[1].value as u128;
+ let s2 = state[2].value as u128;
+ let s3 = state[3].value as u128;
+ let s4 = state[4].value as u128;
+ let s5 = state[5].value as u128;
+ let s6 = state[6].value as u128;
+ let s7 = state[7].value as u128;
+
+ // MDS_ROW = [7, 1, 3, 8, 8, 3, 4, 9]; row i is MDS_ROW rotated right by i.
+ let acc0 = 7 * s0 + 1 * s1 + 3 * s2 + 8 * s3 + 8 * s4 + 3 * s5 + 4 * s6 + 9 * s7;
+ let acc1 = 9 * s0 + 7 * s1 + 1 * s2 + 3 * s3 + 8 * s4 + 8 * s5 + 3 * s6 + 4 * s7;
+ let acc2 = 4 * s0 + 9 * s1 + 7 * s2 + 1 * s3 + 3 * s4 + 8 * s5 + 8 * s6 + 3 * s7;
+ let acc3 = 3 * s0 + 4 * s1 + 9 * s2 + 7 * s3 + 1 * s4 + 3 * s5 + 8 * s6 + 8 * s7;
+ let acc4 = 8 * s0 + 3 * s1 + 4 * s2 + 9 * s3 + 7 * s4 + 1 * s5 + 3 * s6 + 8 * s7;
+ let acc5 = 8 * s0 + 8 * s1 + 3 * s2 + 4 * s3 + 9 * s4 + 7 * s5 + 1 * s6 + 3 * s7;
+ let acc6 = 3 * s0 + 8 * s1 + 8 * s2 + 3 * s3 + 4 * s4 + 9 * s5 + 7 * s6 + 1 * s7;
+ let acc7 = 1 * s0 + 3 * s1 + 8 * s2 + 8 * s3 + 3 * s4 + 4 * s5 + 9 * s6 + 7 * s7;
+
+ state[0] = crate::goldilocks::reduce128(acc0);
+ state[1] = crate::goldilocks::reduce128(acc1);
+ state[2] = crate::goldilocks::reduce128(acc2);
+ state[3] = crate::goldilocks::reduce128(acc3);
+ state[4] = crate::goldilocks::reduce128(acc4);
+ state[5] = crate::goldilocks::reduce128(acc5);
+ state[6] = crate::goldilocks::reduce128(acc6);
+ state[7] = crate::goldilocks::reduce128(acc7);
+}
+
+// =========================================================================
+// Round constants (width 8)
+// =========================================================================
+//
+// Layout: [4 initial full][22 partial][4 terminal full].
+// Generated by the Grain LFSR (Poseidon1, Appendix E) with
+// `field_type = 1, alpha = 7, n = 64, t = 8, R_F = 8, R_P = 22`.
+// Values carried over verbatim from `plonky3/goldilocks/src/poseidon1.rs`.
+pub const GOLDILOCKS_POSEIDON1_RC_8: [[Goldilocks; POSEIDON1_WIDTH]; POSEIDON1_N_ROUNDS] = Goldilocks::new_2d_array([
+ // ---- Initial full rounds (4) ----
+ [
+ 0xdd5743e7f2a5a5d9,
+ 0xcb3a864e58ada44b,
+ 0xffa2449ed32f8cdc,
+ 0x42025f65d6bd13ee,
+ 0x7889175e25506323,
+ 0x34b98bb03d24b737,
+ 0xbdcc535ecc4faa2a,
+ 0x5b20ad869fc0d033,
+ ],
+ [
+ 0xf1dda5b9259dfcb4,
+ 0x27515210be112d59,
+ 0x4227d1718c766c3f,
+ 0x26d333161a5bd794,
+ 0x49b938957bf4b026,
+ 0x4a56b5938b213669,
+ 0x1120426b48c8353d,
+ 0x6b323c3f10a56cad,
+ ],
+ [
+ 0xce57d6245ddca6b2,
+ 0xb1fc8d402bba1eb1,
+ 0xb5c5096ca959bd04,
+ 0x6db55cd306d31f7f,
+ 0xc49d293a81cb9641,
+ 0x1ce55a4fe979719f,
+ 0xa92e60a9d178a4d1,
+ 0x002cc64973bcfd8c,
+ ],
+ [
+ 0xcea721cce82fb11b,
+ 0xe5b55eb8098ece81,
+ 0x4e30525c6f1ddd66,
+ 0x43c6702827070987,
+ 0xaca68430a7b5762a,
+ 0x3674238634df9c93,
+ 0x88cee1c825e33433,
+ 0xde99ae8d74b57176,
+ ],
+ // ---- Partial rounds (22) ----
+ [
+ 0x488897d85ff51f56,
+ 0x1140737ccb162218,
+ 0xa7eeb9215866ed35,
+ 0x9bd2976fee49fcc9,
+ 0xc0c8f0de580a3fcc,
+ 0x4fb2dae6ee8fc793,
+ 0x343a89f35f37395b,
+ 0x223b525a77ca72c8,
+ ],
+ [
+ 0x56ccb62574aaa918,
+ 0xc4d507d8027af9ed,
+ 0xa080673cf0b7e95c,
+ 0xf0184884eb70dcf8,
+ 0x044f10b0cb3d5c69,
+ 0xe9e3f7993938f186,
+ 0x1b761c80e772f459,
+ 0x606cec607a1b5fac,
+ ],
+ [
+ 0x14a0c2e1d45f03cd,
+ 0x4eace8855398574f,
+ 0xf905ca7103eff3e6,
+ 0xf8c8f8d20862c059,
+ 0xb524fe8bdd678e5a,
+ 0xfbb7865901a1ec41,
+ 0x014ef1197d341346,
+ 0x9725e20825d07394,
+ ],
+ [
+ 0xfdb25aef2c5bae3b,
+ 0xbe5402dc598c971e,
+ 0x93a5711f04cdca3d,
+ 0xc45a9a5b2f8fb97b,
+ 0xfe8946a924933545,
+ 0x2af997a27369091c,
+ 0xaa62c88e0b294011,
+ 0x058eb9d810ce9f74,
+ ],
+ [
+ 0xb3cb23eced349ae4,
+ 0xa3648177a77b4a84,
+ 0x43153d905992d95d,
+ 0xf4e2a97cda44aa4b,
+ 0x5baa2702b908682f,
+ 0x082923bdf4f750d1,
+ 0x98ae09a325893803,
+ 0xf8a6475077968838,
+ ],
+ [
+ 0xceb0735bf00b2c5f,
+ 0x0a1a5d953888e072,
+ 0x2fcb190489f94475,
+ 0xb5be06270dec69fc,
+ 0x739cb934b09acf8b,
+ 0x537750b75ec7f25b,
+ 0xe9dd318bae1f3961,
+ 0xf7462137299efe1a,
+ ],
+ [
+ 0xb1f6b8eee9adb940,
+ 0xbdebcc8a809dfe6b,
+ 0x40fc1f791b178113,
+ 0x3ac1c3362d014864,
+ 0x9a016184bdb8aeba,
+ 0x95f2394459fbc25e,
+ 0xe3f34a07a76a66c2,
+ 0x8df25f9ad98b1b96,
+ ],
+ [
+ 0x85ffc27171439d9d,
+ 0xddcb9a2dcfd26910,
+ 0x26b5ba4bf3afb94e,
+ 0xffff9cc7c7651e2f,
+ 0x8c88364698280b55,
+ 0xebc114167b910501,
+ 0x2d77b4d89ecfb516,
+ 0x332e0828eba151f2,
+ ],
+ [
+ 0x46fa6a6450dd4735,
+ 0xd00db7dd92384a33,
+ 0x5fd4fb751f3a5fc5,
+ 0x496fb90c0bb65ea2,
+ 0xf3baec0bb87cc5c7,
+ 0x862a3c0a7d4c7713,
+ 0xbf5f38336a3f47d8,
+ 0x41ad9dbc1394a20c,
+ ],
+ [
+ 0xcc535945b7dbf0f7,
+ 0x82af2bc93685bcec,
+ 0x8e4c8d0c8cebfccd,
+ 0x17cb39417e84597e,
+ 0xd4a965a8c749b232,
+ 0xa2cab040f33f3ee5,
+ 0xa98811a1fed4e3a6,
+ 0x1cc48b54f377e2a1,
+ ],
+ [
+ 0xe40cd4f6c5609a27,
+ 0x11de79ebca97a4a4,
+ 0x9177c73d8b7e929d,
+ 0x2a6fe8085797e792,
+ 0x3de6e93329f8d5ae,
+ 0x3f7af9125da962ff,
+ 0xd710682cfc77d3ac,
+ 0x48faf05f3b053cf4,
+ ],
+ [
+ 0x287db8630da89c8b,
+ 0x4d0de32053cb30e9,
+ 0x8b37a4f20c5ada7b,
+ 0xe7cc6ebe78c84ecf,
+ 0x240bdc0a66a2610d,
+ 0x8299e7f02caa1650,
+ 0x380a53fefb6e754e,
+ 0x684a1d8cf8eb6810,
+ ],
+ [
+ 0xe839452eb4b8a5e1,
+ 0xb03fa62e90626af4,
+ 0x11a688602fbc5efc,
+ 0x30dda75c355a2d62,
+ 0x0f712adcb73810de,
+ 0xffdc1102187f1ae1,
+ 0x40c34f398254b99c,
+ 0xede021b9dc289a4a,
+ ],
+ [
+ 0x8b7b05225c4e7dad,
+ 0x3bc794346f9d9ff9,
+ 0xfccb5a57f2ca86ff,
+ 0xbb1502015a7da9d4,
+ 0xd7e0a35d4352a015,
+ 0x27af7a44f8160931,
+ 0xc37442f6782f4615,
+ 0xbdf392a9bd095dcb,
+ ],
+ [
+ 0xc17f55037cf00de9,
+ 0xbcffedd34c71a874,
+ 0x5eb45d2a8133d1f2,
+ 0xbabe251e1612ebdf,
+ 0x3efeb9fbe438c536,
+ 0x2d7cef97b4afe1cf,
+ 0xe5de1b4660016c0b,
+ 0xcdcc26c332f5657c,
+ ],
+ [
+ 0xe01dd653daf15809,
+ 0xb0a6bdd4b41094b5,
+ 0x27eac858b0b03a05,
+ 0x51d43b5e93adbdc0,
+ 0x8b89a23b0fea5fc9,
+ 0xdc8ac3b14f7f2fc1,
+ 0xe793f82f1efec039,
+ 0x9f6f2cf8969e7b80,
+ ],
+ [
+ 0x49d45382e0f21d4a,
+ 0x5f4ad1797cd72786,
+ 0x4dc3dbebfd45f795,
+ 0x03a3ef84dba6e1bc,
+ 0x204bc9b3d3fc4c01,
+ 0x9ad706081e89b9ba,
+ 0x638bfb4d840e9f89,
+ 0x5ef2938cd095ae35,
+ ],
+ [
+ 0x42cca18ebeb265c8,
+ 0xb7b2ec5c29aecbf8,
+ 0x0d84f9535dc78f0f,
+ 0x04e64ad942e77b8c,
+ 0xb4880dffffc9da0b,
+ 0x16db16d9c29adeb1,
+ 0x09bbaf2a0590cd1e,
+ 0x76460e74961fcf8d,
+ ],
+ [
+ 0xed12a2276dfa1553,
+ 0x0b5acec5de0436fd,
+ 0x3c6cfea033a1f0a8,
+ 0x2b5ecefe546cac15,
+ 0x6e2d82884cd3bf6f,
+ 0xc134878d1add7b83,
+ 0x997963422eb7a280,
+ 0x5e834537ac648cf6,
+ ],
+ [
+ 0x89e779214737c0b7,
+ 0x1a8c05e8581ad95b,
+ 0x8d18b72796437cf7,
+ 0xe7252c949e04b106,
+ 0x53267c4fd174585a,
+ 0xa16ef5d9c81dad47,
+ 0xda65191937270a46,
+ 0xcb2a5b55f2df664c,
+ ],
+ [
+ 0x854aee2dc1924137,
+ 0xf37013c9d479ece6,
+ 0x0e163bc0630c4696,
+ 0x384ee64955048f76,
+ 0xf65d814e28ee4ec5,
+ 0xe57bc564fd82f1b1,
+ 0x4b338937b6876614,
+ 0x66ee0b04ed43cd8d,
+ ],
+ [
+ 0x49884bf25f4ef15d,
+ 0xeb51fe28de1c6f54,
+ 0x2cd64e84fce8dfcc,
+ 0x29164a96a541a013,
+ 0x173ce7558f4cacb8,
+ 0xeb5b1ce5877c89e9,
+ 0x5faff4b0f5217bf6,
+ 0xac42d0b1c20f205e,
+ ],
+ // ---- Terminal full rounds (4) ----
+ [
+ 0xfb1d6bf0ca43221b,
+ 0x97b0a1b01d6a2955,
+ 0x08c60bd622952b30,
+ 0x43f2be0f9e24147c,
+ 0xfa7268b7d3730f5d,
+ 0x43a6c419a23983bb,
+ 0xcd77c1f7b29b113c,
+ 0xcfa43c9db8eec29f,
+ ],
+ [
+ 0xcaaa95a6c7365dec,
+ 0x0a91193f798f3be0,
+ 0x1104497652735dc6,
+ 0x35aecb93663b515e,
+ 0x8dbc9916065aa858,
+ 0xada8f7a0266579ed,
+ 0x524dee7bec1ea789,
+ 0xa93aee9dd5af9521,
+ ],
+ [
+ 0x9d1f1b54750d707e,
+ 0x7c9feab87096d5dc,
+ 0xa2e1fb19f9d4261b,
+ 0xb714deb448de6346,
+ 0x225d1f0d011c5403,
+ 0x1549b7f1d28cedc0,
+ 0xaef3e46f97d43942,
+ 0x6dfc7ffe0b38bf08,
+ ],
+ [
+ 0x7de853fdc542b663,
+ 0xa68ecc96610657b2,
+ 0xe88bb5428af289b1,
+ 0xd7cfa1504c5569f5,
+ 0x78a9aad0d642d30a,
+ 0xd68315f2353dce52,
+ 0x46e56300f86fcfd5,
+ 0x323d95332b145fd6,
+ ],
+]);
+
+// =========================================================================
+// S-box helpers
+// =========================================================================
+
+#[inline(always)]
+fn sbox_full>(x: R) -> R {
+ x.injective_exp_n()
+}
+
+// =========================================================================
+// Permutation driver
+// =========================================================================
+
+/// Width-8 Poseidon1 permutation for Goldilocks.
+///
+/// Zero-sized — all state lives in the round-constant tables above. Mirrors
+/// `Poseidon1Goldilocks8`'s public surface: `permute{,_mut}`,
+/// `compress{,_in_place}`, plus a `default_goldilocks_poseidon1_8()` constructor.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Poseidon1Goldilocks8;
+
+impl Poseidon1Goldilocks8 {
+ /// Fast scalar permutation — direct `Goldilocks` arithmetic with a `u128`
+ /// MDS accumulator.
+ pub fn permute(&self, mut state: [Goldilocks; POSEIDON1_WIDTH]) -> [Goldilocks; POSEIDON1_WIDTH] {
+ self.permute_mut(&mut state);
+ state
+ }
+
+ #[inline]
+ pub fn permute_mut(&self, state: &mut [Goldilocks; POSEIDON1_WIDTH]) {
+ for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ for s in state.iter_mut() {
+ *s = sbox_full::(*s);
+ }
+ mds_mul_scalar(state);
+ }
+
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .skip(POSEIDON1_HALF_FULL_ROUNDS)
+ .take(POSEIDON1_PARTIAL_ROUNDS)
+ {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ state[0] = sbox_full::(state[0]);
+ mds_mul_scalar(state);
+ }
+
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .take(POSEIDON1_N_ROUNDS)
+ .skip(POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS)
+ {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ for s in state.iter_mut() {
+ *s = sbox_full::(*s);
+ }
+ mds_mul_scalar(state);
+ }
+ }
+
+ /// Generic permutation over any algebra `R` over `Goldilocks` with `x^7`
+ /// as an injective monomial. Used by the AIR / symbolic trace builders.
+ pub fn permute_generic(&self, state: &mut [R; POSEIDON1_WIDTH])
+ where
+ R: Algebra + InjectiveMonomial<7> + Copy,
+ {
+ for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ for s in state.iter_mut() {
+ *s = sbox_full::(*s);
+ }
+ mds_mul_generic(state);
+ }
+
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .skip(POSEIDON1_HALF_FULL_ROUNDS)
+ .take(POSEIDON1_PARTIAL_ROUNDS)
+ {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ state[0] = sbox_full::(state[0]);
+ mds_mul_generic(state);
+ }
+
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .take(POSEIDON1_N_ROUNDS)
+ .skip(POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS)
+ {
+ for (i, s) in state.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ for s in state.iter_mut() {
+ *s = sbox_full::(*s);
+ }
+ mds_mul_generic(state);
+ }
+ }
+
+ /// Compression-mode in-place permutation: `output = permute(input) + input`.
+ ///
+ /// When `R` matches the architecture's packed Goldilocks type, dispatches
+ /// to the SIMD-parallel path. When `R == Goldilocks`, uses the scalar fast
+ /// path (avoids the symbolic-friendly but slow `permute_generic`).
+ /// Otherwise falls back to the generic algebra path.
+ #[inline(always)]
+ pub fn compress_in_place(&self, state: &mut [R; POSEIDON1_WIDTH])
+ where
+ R: Algebra + InjectiveMonomial<7> + Copy + 'static,
+ {
+ use core::any::TypeId;
+
+ type Packing = ::Packing;
+
+ if TypeId::of::() == TypeId::of::() {
+ // SAFETY: TypeId equality guarantees R has the same layout as Packing,
+ // and the array is repr-transparent as a slice of W*8 Goldilocks.
+ let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Packing; POSEIDON1_WIDTH]) };
+ self.simd_core::(s);
+ return;
+ }
+ if TypeId::of::() == TypeId::of::() {
+ // SAFETY: TypeId equality.
+ let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Goldilocks; POSEIDON1_WIDTH]) };
+ let initial = *s;
+ self.permute_mut(s);
+ for (slot, init) in s.iter_mut().zip(initial) {
+ *slot += init;
+ }
+ return;
+ }
+
+ let initial = *state;
+ self.permute_generic(state);
+ for (s, init) in state.iter_mut().zip(initial) {
+ *s += init;
+ }
+ }
+
+ /// Permutation-mode in-place permutation (no feedforward), mirroring
+ /// [`Self::compress_in_place`]'s SIMD dispatch. Used by the overwrite sponge
+ /// for Merkle leaf/node hashing — without this the packed `Permutation` impl
+ /// would fall back to the slow `permute_generic` (fully-reducing packed MDS),
+ /// regressing all Merkle tree building ~4x.
+ #[inline(always)]
+ pub fn permute_in_place(&self, state: &mut [R; POSEIDON1_WIDTH])
+ where
+ R: Algebra + InjectiveMonomial<7> + Copy + 'static,
+ {
+ use core::any::TypeId;
+
+ type Packing = ::Packing;
+
+ if TypeId::of::() == TypeId::of::() {
+ // SAFETY: TypeId equality guarantees R has the same layout as Packing.
+ let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Packing; POSEIDON1_WIDTH]) };
+ self.simd_core::(s);
+ return;
+ }
+ if TypeId::of::() == TypeId::of::() {
+ // SAFETY: TypeId equality.
+ let s = unsafe { &mut *(state as *mut [R; POSEIDON1_WIDTH] as *mut [Goldilocks; POSEIDON1_WIDTH]) };
+ self.permute_mut(s);
+ return;
+ }
+
+ self.permute_generic(state);
+ }
+
+ /// SIMD-parallel compression over `::Packing`.
+ ///
+ /// On x86_64 (AVX2 or AVX512), keeps state in packed registers throughout
+ /// the rounds. RC adds and sboxes use the packed `Add`/`square`/`Mul`
+ /// (which fully reduce), and the MDS uses the dedicated `mds_mul_simd`
+ /// (delayed reduction via shift+add multiplication by tiny constants).
+ ///
+ /// On other architectures (e.g. aarch64+NEON, scalar fallback), we
+ /// deinterleave to per-lane scalar arrays and run the rounds in lockstep
+ /// across all W lanes. The MDS coefficients are tiny (max 9), so the
+ /// scalar `mds_mul_scalar` (u128 accumulator + single `reduce128` per
+ /// output) is far cheaper than the packed type's fully-reducing `Mul`.
+ ///
+ /// `FEEDFORWARD = true` adds back the original input (compression / Davies-Meyer);
+ /// `FEEDFORWARD = false` is the raw permutation (overwrite sponge).
+ #[inline(always)]
+ fn simd_core(&self, state: &mut [::Packing; POSEIDON1_WIDTH]) {
+ #[cfg(any(
+ all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+ all(target_arch = "x86_64", target_feature = "avx512f"),
+ ))]
+ {
+ type P = ::Packing;
+
+ #[cfg(all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")))]
+ use crate::x86_64_avx2::packing::{add_canonical_scalar, mds_mul_simd};
+ #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+ use crate::x86_64_avx512::packing::{add_canonical_scalar, mds_mul_simd};
+
+ // 8 named SSA scalars rather than an array — otherwise LLVM
+ // re-rolls the (identical-shape) per-slot sboxes back into a loop,
+ // serializing them through a memory-resident state. Naming each
+ // slot keeps each sbox a distinct value, enabling ILP across the
+ // 8 slots and keeping everything in zmm/ymm registers across all
+ // 30 rounds.
+ //
+ // `add_canonical_scalar` skips the `canonicalize` that the generic
+ // packed `Add` applies to its RHS — round constants are canonical
+ // by construction (all `< P`).
+ let initial = *state;
+ let [mut s0, mut s1, mut s2, mut s3, mut s4, mut s5, mut s6, mut s7] = initial;
+
+ // Initial full rounds.
+ for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+ s0 = add_canonical_scalar(s0, rc[0]);
+ s1 = add_canonical_scalar(s1, rc[1]);
+ s2 = add_canonical_scalar(s2, rc[2]);
+ s3 = add_canonical_scalar(s3, rc[3]);
+ s4 = add_canonical_scalar(s4, rc[4]);
+ s5 = add_canonical_scalar(s5, rc[5]);
+ s6 = add_canonical_scalar(s6, rc[6]);
+ s7 = add_canonical_scalar(s7, rc[7]);
+ s0 = sbox_full::
(s0);
+ s1 = sbox_full::
(s1);
+ s2 = sbox_full::
(s2);
+ s3 = sbox_full::
(s3);
+ s4 = sbox_full::
(s4);
+ s5 = sbox_full::
(s5);
+ s6 = sbox_full::
(s6);
+ s7 = sbox_full::
(s7);
+ [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+ }
+
+ // Partial rounds.
+ //
+ // NB: the Appendix-B sparse partial-round decomposition (one dense
+ // `m_i` multiply + per-round rank-1 updates, as used by the AIR and
+ // the KoalaBear-16 permutation) was implemented and measured here and
+ // is ~13% SLOWER for Goldilocks: this circulant MDS has tiny entries
+ // {1,3,4,7,8,9} that strength-reduce to shift/adds and batch 8 terms
+ // into a single `reduce128` per output, whereas the sparse form needs
+ // arbitrary-constant 64x64 multiplies (one `reduce128` each → 15 vs 8
+ // reductions per round). Kept the full circulant MDS.
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .skip(POSEIDON1_HALF_FULL_ROUNDS)
+ .take(POSEIDON1_PARTIAL_ROUNDS)
+ {
+ s0 = add_canonical_scalar(s0, rc[0]);
+ s1 = add_canonical_scalar(s1, rc[1]);
+ s2 = add_canonical_scalar(s2, rc[2]);
+ s3 = add_canonical_scalar(s3, rc[3]);
+ s4 = add_canonical_scalar(s4, rc[4]);
+ s5 = add_canonical_scalar(s5, rc[5]);
+ s6 = add_canonical_scalar(s6, rc[6]);
+ s7 = add_canonical_scalar(s7, rc[7]);
+ s0 = sbox_full::
(s0);
+ [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+ }
+
+ // Terminal full rounds.
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .take(POSEIDON1_N_ROUNDS)
+ .skip(POSEIDON1_HALF_FULL_ROUNDS + POSEIDON1_PARTIAL_ROUNDS)
+ {
+ s0 = add_canonical_scalar(s0, rc[0]);
+ s1 = add_canonical_scalar(s1, rc[1]);
+ s2 = add_canonical_scalar(s2, rc[2]);
+ s3 = add_canonical_scalar(s3, rc[3]);
+ s4 = add_canonical_scalar(s4, rc[4]);
+ s5 = add_canonical_scalar(s5, rc[5]);
+ s6 = add_canonical_scalar(s6, rc[6]);
+ s7 = add_canonical_scalar(s7, rc[7]);
+ s0 = sbox_full::
(s0);
+ s1 = sbox_full::
(s1);
+ s2 = sbox_full::
(s2);
+ s3 = sbox_full::
(s3);
+ s4 = sbox_full::
(s4);
+ s5 = sbox_full::
(s5);
+ s6 = sbox_full::
(s6);
+ s7 = sbox_full::
(s7);
+ [s0, s1, s2, s3, s4, s5, s6, s7] = mds_mul_simd([s0, s1, s2, s3, s4, s5, s6, s7]);
+ }
+
+ if FEEDFORWARD {
+ // Compression-mode add-back of the original input.
+ state[0] = s0 + initial[0];
+ state[1] = s1 + initial[1];
+ state[2] = s2 + initial[2];
+ state[3] = s3 + initial[3];
+ state[4] = s4 + initial[4];
+ state[5] = s5 + initial[5];
+ state[6] = s6 + initial[6];
+ state[7] = s7 + initial[7];
+ } else {
+ state[0] = s0;
+ state[1] = s1;
+ state[2] = s2;
+ state[3] = s3;
+ state[4] = s4;
+ state[5] = s5;
+ state[6] = s6;
+ state[7] = s7;
+ }
+ }
+
+ #[cfg(not(any(
+ all(target_arch = "x86_64", target_feature = "avx2", not(target_feature = "avx512f")),
+ all(target_arch = "x86_64", target_feature = "avx512f"),
+ )))]
+ {
+ type P = ::Packing;
+ const W: usize = ::WIDTH;
+
+ let mut lanes: [[Goldilocks; POSEIDON1_WIDTH]; W] = [[Goldilocks::ZERO; POSEIDON1_WIDTH]; W];
+ for i in 0..POSEIDON1_WIDTH {
+ let s = state[i].as_slice();
+ for (k, lane) in lanes.iter_mut().enumerate() {
+ lane[i] = s[k];
+ }
+ }
+ let initial = lanes;
+
+ // Initial full rounds.
+ for rc in GOLDILOCKS_POSEIDON1_RC_8.iter().take(POSEIDON1_HALF_FULL_ROUNDS) {
+ for lane in lanes.iter_mut() {
+ for (i, s) in lane.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ }
+ for lane in lanes.iter_mut() {
+ for s in lane.iter_mut() {
+ *s = sbox_full::(*s);
+ }
+ }
+ for lane in lanes.iter_mut() {
+ mds_mul_scalar(lane);
+ }
+ }
+
+ // Partial rounds.
+ for rc in GOLDILOCKS_POSEIDON1_RC_8
+ .iter()
+ .skip(POSEIDON1_HALF_FULL_ROUNDS)
+ .take(POSEIDON1_PARTIAL_ROUNDS)
+ {
+ for lane in lanes.iter_mut() {
+ for (i, s) in lane.iter_mut().enumerate() {
+ *s += rc[i];
+ }
+ }
+ for lane in lanes.iter_mut() {
+ lane[0] = sbox_full::