diff --git a/sha2/CHANGELOG.md b/sha2/CHANGELOG.md
index af12c8d60..6efda9374 100644
--- a/sha2/CHANGELOG.md
+++ b/sha2/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 0.11.1 (UNRELEASED)
+### Changed
+- Removed the workaround for unaligned loads in the `riscv-zknh` backend ([#879])
+- The `riscv-zknh` backend no longer requires the `zbkb` (or `zbb`) target feature ([#879])
+
+[#879]: https://github.com/RustCrypto/hashes/pull/879
+
 ## 0.11.0 (2026-03-25)
 ### Changed
 - Edition changed to 2024 and MSRV bumped to 1.85 ([#652])
diff --git a/sha2/README.md b/sha2/README.md
index c19ca9ee0..c85bb6273 100644
--- a/sha2/README.md
+++ b/sha2/README.md
@@ -63,8 +63,7 @@ SHA-256 and SHA-512 backends:
 - `soft`: portable software implementation
 - `loongarch64-asm`: `asm!`-based implementation for LoongArch64 targets
 - `riscv-zknh`: uses the RISC-V `Zknh` scalar crypto extension. Experimental,
-  requires Nightly compiler and to enable `Zknh` and `Zbkb` (or `Zbb`)
-  target features at compile time.
+  see the [section below](#about-riscv-zknh) for more information.
 - `wasm32-simd128`: uses the WASM `simd128` extension.
 
 SHA-256 only backends:
@@ -91,10 +90,10 @@ You can force backend selection using the following configuration flags:
 - `sha2_512_backend`: select SHA-512 backend. Supported values: `aarch64-sha3`, `soft`,
   `riscv-zknh`, `x86-avx2`.
 
-They can be enabled using either the `RUSTFLAGS` environment variable
+They can be enabled using either a `RUSTFLAGS` environment variable
 (e.g. `RUSTFLAGS='--cfg sha2_backend="soft"'`), or by modifying your `.cargo/config.toml` file.
 
-Note that `sha2_backend` has higher priority than `sha2_256_backend` and `sha2_512_backend`.
+Note that `sha2_backend` has a higher priority than `sha2_256_backend` and `sha2_512_backend`.
 In other words, using `--cfg sha2_backend="soft" --cfg sha2_256_backend="x86_sha"` will result
 in selection of the software backend for SHA-256.
 
@@ -103,6 +102,16 @@ performance at the cost of a bigger resulting binary. You can disable unrolling
 by using `sha2_backend_soft = "compact"` and `sha2_backend_riscv_zknh = "compact"` configuration
 flags respectively.
 
+### About `riscv-zknh`
+
+This is an experimental RISC-V-only backend. It requires a Nightly compiler and
+the `Zknh` target feature to be enabled at compile time. For more efficient code generation,
+it's recommended to also enable the `Zbkb` (or `Zbb`) and `unaligned-scalar-mem` target features
+(see [this LLVM issue][Zicclsm] for more information). For example, you can do so by setting
+`RUSTFLAGS="-C target-feature=+zknh,+zbkb,+unaligned-scalar-mem"`.
+
+[Zicclsm]: https://github.com/llvm/llvm-project/issues/110454
+
 ## License
 
 The crate is licensed under either of:
diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs
index 5239ca15a..494bc6c06 100644
--- a/sha2/src/sha256.rs
+++ b/sha2/src/sha256.rs
@@ -5,11 +5,8 @@ cfg_if::cfg_if! {
     } else if #[cfg(any(sha2_backend = "riscv-zknh", sha2_256_backend = "riscv-zknh"))] {
         mod riscv_zknh;
 
-        #[cfg(not(all(
-            target_feature = "zknh",
-            any(target_feature = "zbb", target_feature = "zbkb")
-        )))]
-        compile_error!("riscv-zknh backend requires zknh and zbkb (or zbb) target features");
+        #[cfg(not(target_feature = "zknh"))]
+        compile_error!("riscv-zknh backend requires `zknh` target feature");
 
         fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
             // SAFETY: we checked above that the required target features are enabled
@@ -22,7 +19,7 @@ cfg_if::cfg_if! {
             target_feature = "sha",
             target_feature = "sse4.1",
         )))]
-        compile_error!("x86-sha backend requires sha and sse4.1 target features");
+        compile_error!("x86-sha backend requires `sha` and `sse4.1` target features");
 
         fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
             // SAFETY: we checked above that the required target features are enabled
@@ -32,7 +29,7 @@ cfg_if::cfg_if! {
         mod aarch64_sha2;
 
         #[cfg(not(target_feature = "sha2"))]
-        compile_error!("aarch64-sha2 backend requires sha2 target feature");
+        compile_error!("aarch64-sha2 backend requires `sha2` target feature");
 
         fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
             // SAFETY: we checked above that the required target features are enabled
diff --git a/sha2/src/sha256/riscv_zknh.rs b/sha2/src/sha256/riscv_zknh.rs
index a596a2209..fd4115bde 100644
--- a/sha2/src/sha256/riscv_zknh.rs
+++ b/sha2/src/sha256/riscv_zknh.rs
@@ -1,8 +1,6 @@
 #[cfg(not(any(target_arch = "riscv32", target_arch = "riscv64")))]
 compile_error!("riscv-zknh backend can be used only on riscv32 and riscv64 target arches");
 
-mod utils;
-
 #[cfg(target_arch = "riscv32")]
 use core::arch::riscv32::{sha256sig0, sha256sig1, sha256sum0, sha256sum1};
 #[cfg(target_arch = "riscv64")]
@@ -11,9 +9,28 @@ use core::arch::riscv64::{sha256sig0, sha256sig1, sha256sum0, sha256sum1};
 cfg_if::cfg_if! {
     if #[cfg(sha2_backend_riscv_zknh = "compact")] {
         mod compact;
-        pub(super) use compact::compress;
+        use compact::compress_block;
     } else {
         mod unroll;
-        pub(super) use unroll::compress;
+        use unroll::compress_block;
+    }
+}
+
+/// Feeds every 64-byte block of `blocks` into `state`, in slice order.
+///
+/// Each block is decoded into sixteen big-endian `u32` words, which are then
+/// passed to the `compress_block` implementation selected by the `cfg_if!`
+/// above. The words are assembled from individual bytes, so nothing here
+/// depends on how `blocks` is aligned in memory.
+#[target_feature(enable = "zknh")]
+pub(super) fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    for block in blocks {
+        let block: [u32; 16] = core::array::from_fn(|i| {
+            // `i` is in `0..16`, so this is always a 4-byte window inside the
+            // 64-byte block and the conversion to `[u8; 4]` cannot fail.
+            let chunk = block[4 * i..][..4].try_into().unwrap();
+            u32::from_be_bytes(chunk)
+        });
+        compress_block(state, block);
     }
 }
diff --git a/sha2/src/sha256/riscv_zknh/compact.rs b/sha2/src/sha256/riscv_zknh/compact.rs
index 83840560b..7121128b7 100644
--- a/sha2/src/sha256/riscv_zknh/compact.rs
+++ b/sha2/src/sha256/riscv_zknh/compact.rs
@@ -2,14 +2,7 @@ use super::{sha256sig0, sha256sig1, sha256sum0, sha256sum1};
 use crate::consts::K32;
 
 #[target_feature(enable = "zknh")]
-pub(in super::super) fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
-    for block in blocks.iter().map(super::utils::load_block) {
-        compress_block(state, block);
-    }
-}
-
-#[target_feature(enable = "zknh")]
-fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
+pub(super) fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
     let mut s = *state;
 
     for r in 0..64 {
diff --git a/sha2/src/sha256/riscv_zknh/unroll.rs b/sha2/src/sha256/riscv_zknh/unroll.rs
index e8b702e58..d9cfc80e4 100644
--- a/sha2/src/sha256/riscv_zknh/unroll.rs
+++ b/sha2/src/sha256/riscv_zknh/unroll.rs
@@ -2,14 +2,7 @@ use super::{sha256sig0, sha256sig1, sha256sum0, sha256sum1};
 use crate::consts::K32;
 
 #[target_feature(enable = "zknh")]
-pub(in super::super) fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
-    for block in blocks.iter().map(super::utils::load_block) {
-        compress_block(state, block);
-    }
-}
-
-#[target_feature(enable = "zknh")]
-fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
+pub(super) fn compress_block(state: &mut [u32; 8], mut block: [u32; 16]) {
     let s = &mut state.clone();
     let b = &mut block;
 
@@ -82,7 +75,7 @@ fn round<const R: usize>(state: &mut [u32; 8], block: &[u32; 16], k: &[u32]) {
     state[h] = state[h]
         .wrapping_add(sha256sum1(state[e]))
         .wrapping_add(ch(state[e], state[f], state[g]))
-        .wrapping_add(super::utils::opaque_load::<R>(k))
+        .wrapping_add(opaque_load::<R>(k))
         .wrapping_add(block[R]);
     state[d] = state[d].wrapping_add(state[h]);
     state[h] = state[h]
@@ -99,3 +92,42 @@ fn ch(x: u32, y: u32, z: u32) -> u32 {
 fn maj(x: u32, y: u32, z: u32) -> u32 {
     (x & y) ^ (x & z) ^ (y & z)
 }
+
+/// This function returns `k[R]`, but prevents the compiler from inlining the indexed value
+///
+/// The value is read with an inline assembly load, which the optimizer cannot see through.
+/// Panics if `R` is out of bounds for `k`.
+fn opaque_load<const R: usize>(k: &[u32]) -> u32 {
+    // Bounds check which the `unsafe` loads below rely on.
+    assert!(R < k.len());
+    let dst;
+
+    // RV64: `lwu` is the 32-bit load which zero-extends into the 64-bit register.
+    // SAFETY: the assertion above guarantees that the 4 bytes at byte offset `4 * R`
+    // from `k.as_ptr()` lie inside the slice; the asm only reads them into `dst`.
+    #[cfg(target_arch = "riscv64")]
+    unsafe {
+        core::arch::asm!(
+            "lwu {dst}, 4*{R}({k})",
+            R = const R,
+            k = in(reg) k.as_ptr(),
+            dst = out(reg) dst,
+            options(pure, readonly, nostack, preserves_flags),
+        );
+    }
+
+    // RV32: a plain `lw` already fills the whole 32-bit register.
+    // SAFETY: same reasoning as for the `riscv64` block above.
+    #[cfg(target_arch = "riscv32")]
+    unsafe {
+        core::arch::asm!(
+            "lw {dst}, 4*{R}({k})",
+            R = const R,
+            k = in(reg) k.as_ptr(),
+            dst = out(reg) dst,
+            options(pure, readonly, nostack, preserves_flags),
+        );
+    }
+
+    dst
+}
diff --git a/sha2/src/sha256/riscv_zknh/utils.rs b/sha2/src/sha256/riscv_zknh/utils.rs
deleted file mode 100644
index 2ec54977e..000000000
--- a/sha2/src/sha256/riscv_zknh/utils.rs
+++ /dev/null
@@ -1,97 +0,0 @@
-use core::{arch::asm, ptr};
-
-#[inline(always)]
-pub(super) fn load_block(block: &[u8; 64]) -> [u32; 16] {
-    if block.as_ptr().cast::<u32>().is_aligned() {
-        load_aligned_block(block)
-    } else {
-        load_unaligned_block(block)
-    }
-}
-
-#[inline(always)]
-fn load_aligned_block(block: &[u8; 64]) -> [u32; 16] {
-    let p: *const u32 = block.as_ptr().cast();
-    debug_assert!(p.is_aligned());
-    let mut res = [0u32; 16];
-    for i in 0..16 {
-        let val = unsafe { ptr::read(p.add(i)) };
-        res[i] = val.to_be();
-    }
-    res
-}
-
-/// Use LW instruction on RV32 and LWU on RV64
-#[cfg(target_arch = "riscv32")]
-macro_rules! lw {
-    ($r:literal) => {
-        concat!("lw ", $r)
-    };
-}
-#[cfg(target_arch = "riscv64")]
-macro_rules! lw {
-    ($r:literal) => {
-        concat!("lwu ", $r)
-    };
-}
-
-#[inline(always)]
-fn load_unaligned_block(block: &[u8; 64]) -> [u32; 16] {
-    let offset = (block.as_ptr() as usize) % align_of::<u32>();
-    debug_assert_ne!(offset, 0);
-    let off1 = (8 * offset) % 32;
-    let off2 = (32 - off1) % 32;
-    let bp: *const u32 = block.as_ptr().wrapping_sub(offset).cast();
-
-    let mut left: u32;
-    let mut res = [0u32; 16];
-
-    unsafe {
-        asm!(
-            lw!("{left}, 0({bp})"),         // left = unsafe { ptr::read(bp) };
-            "srl {left}, {left}, {off1}",   // left >>= off1;
-            bp = in(reg) bp,
-            off1 = in(reg) off1,
-            left = out(reg) left,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-
-    for i in 0..15 {
-        let right = unsafe { ptr::read(bp.add(1 + i)) };
-        res[i] = (left | (right << off2)).to_be();
-        left = right >> off1;
-    }
-
-    let right: u32;
-    unsafe {
-        asm!(
-            lw!("{right}, 16 * 4({bp})"),   // right = ptr::read(bp.add(16));
-            "sll {right}, {right}, {off2}", // right <<= off2;
-            bp = in(reg) bp,
-            off2 = in(reg) off2,
-            right = out(reg) right,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-    res[15] = (left | right).to_be();
-
-    res
-}
-
-/// This function returns `k[R]`, but prevents compiler from inlining the indexed value
-#[cfg(not(sha2_backend_riscv_zknh = "compact"))]
-pub(super) fn opaque_load<const R: usize>(k: &[u32]) -> u32 {
-    assert!(R < k.len());
-    let dst;
-    unsafe {
-        core::arch::asm!(
-            lw!("{dst}, 4*{R}({k})"),
-            R = const R,
-            k = in(reg) k.as_ptr(),
-            dst = out(reg) dst,
-            options(pure, readonly, nostack, preserves_flags),
-        );
-    }
-    dst
-}
diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
index 03cd57619..1822243de 100644
--- a/sha2/src/sha512.rs
+++ b/sha2/src/sha512.rs
@@ -5,11 +5,8 @@ cfg_if::cfg_if! {
     } else if #[cfg(any(sha2_backend = "riscv-zknh", sha2_256_backend = "riscv-zknh"))] {
         mod riscv_zknh;
 
-        #[cfg(not(all(
-            target_feature = "zknh",
-            any(target_feature = "zbb", target_feature = "zbkb")
-        )))]
-        compile_error!("riscv-zknh backend requires zknh and zbkb (or zbb) target features");
+        #[cfg(not(target_feature = "zknh"))]
+        compile_error!("riscv-zknh backend requires `zknh` target feature");
 
         fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
             // SAFETY: we checked above that the required target features are enabled
@@ -19,7 +16,7 @@ cfg_if::cfg_if! {
         mod x86_avx2;
 
         #[cfg(not(target_feature = "avx2"))]
-        compile_error!("x86-avx2 backend requires avx2 target feature");
+        compile_error!("x86-avx2 backend requires `avx2` target feature");
 
         fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
             // SAFETY: we checked above that the required target features are enabled
@@ -29,7 +26,7 @@ cfg_if::cfg_if! {
         mod aarch64_sha3;
 
         #[cfg(not(target_feature = "sha3"))]
-        compile_error!("aarch64-sha3 backend requires sha3 target feature");
+        compile_error!("aarch64-sha3 backend requires `sha3` target feature");
 
         fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
             // SAFETY: we checked above that the required target features are enabled
diff --git a/sha2/src/sha512/riscv_zknh.rs b/sha2/src/sha512/riscv_zknh.rs
index e5dc79e11..4c6d2c5f6 100644
--- a/sha2/src/sha512/riscv_zknh.rs
+++ b/sha2/src/sha512/riscv_zknh.rs
@@ -1,52 +1,77 @@
 #[cfg(not(any(target_arch = "riscv32", target_arch = "riscv64")))]
 compile_error!("riscv-zknh backend can be used only on riscv32 and riscv64 target arches");
 
-mod utils;
-
 cfg_if::cfg_if! {
     if #[cfg(sha2_backend_riscv_zknh = "compact")] {
         mod compact;
-        pub(super) use compact::compress;
+        use compact::compress_block;
     } else {
         mod unroll;
-        pub(super) use unroll::compress;
+        use unroll::compress_block;
     }
 }
 
-#[cfg(target_arch = "riscv64")]
-use core::arch::riscv64::{sha512sig0, sha512sig1, sha512sum0, sha512sum1};
-
-#[cfg(target_arch = "riscv32")]
-use core::arch::riscv32::*;
-
-#[cfg(target_arch = "riscv32")]
-#[target_feature(enable = "zknh")]
-fn sha512sum0(x: u64) -> u64 {
-    let a = sha512sum0r((x >> 32) as u32, x as u32);
-    let b = sha512sum0r(x as u32, (x >> 32) as u32);
-    ((a as u64) << 32) | (b as u64)
-}
-
-#[cfg(target_arch = "riscv32")]
-#[target_feature(enable = "zknh")]
-fn sha512sum1(x: u64) -> u64 {
-    let a = sha512sum1r((x >> 32) as u32, x as u32);
-    let b = sha512sum1r(x as u32, (x >> 32) as u32);
-    ((a as u64) << 32) | (b as u64)
-}
-
-#[cfg(target_arch = "riscv32")]
+/// Feeds every 128-byte block of `blocks` into `state`, in slice order.
+///
+/// Each block is decoded into sixteen big-endian `u64` words, which are then
+/// passed to the `compress_block` implementation selected by the `cfg_if!`
+/// above. The words are assembled from individual bytes, so nothing here
+/// depends on how `blocks` is aligned in memory.
 #[target_feature(enable = "zknh")]
-fn sha512sig0(x: u64) -> u64 {
-    let a = sha512sig0h((x >> 32) as u32, x as u32);
-    let b = sha512sig0l(x as u32, (x >> 32) as u32);
-    ((a as u64) << 32) | (b as u64)
+pub(super) fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    for block in blocks {
+        let block: [u64; 16] = core::array::from_fn(|i| {
+            // `i` is in `0..16`, so this is always an 8-byte window inside the
+            // 128-byte block and the conversion to `[u8; 8]` cannot fail.
+            let chunk = block[8 * i..][..8].try_into().unwrap();
+            u64::from_be_bytes(chunk)
+        });
+        compress_block(state, block);
+    }
 }
 
-#[cfg(target_arch = "riscv32")]
-#[target_feature(enable = "zknh")]
-fn sha512sig1(x: u64) -> u64 {
-    let a = sha512sig1h((x >> 32) as u32, x as u32);
-    let b = sha512sig1l(x as u32, (x >> 32) as u32);
-    ((a as u64) << 32) | (b as u64)
+// Both branches provide `sha512sig0`, `sha512sig1`, `sha512sum0` and `sha512sum1`
+// under the same names, which the `compact`/`unroll` submodules import via `super`.
+cfg_if::cfg_if! {
+    if #[cfg(target_arch = "riscv64")] {
+        use core::arch::riscv64::{sha512sig0, sha512sig1, sha512sum0, sha512sum1};
+    } else {
+        use core::arch::riscv32::{
+            sha512sig0h, sha512sig0l, sha512sig1h, sha512sig1l, sha512sum0r, sha512sum1r,
+        };
+
+        // The `riscv32` intrinsics used here take the two 32-bit halves of the operand
+        // and return 32 bits; each wrapper below calls them twice and joins the results.
+        /// Both halves come from `sha512sum0r`, with the halves of `x` swapped between calls.
+        #[target_feature(enable = "zknh")]
+        fn sha512sum0(x: u64) -> u64 {
+            let a = sha512sum0r((x >> 32) as u32, x as u32);
+            let b = sha512sum0r(x as u32, (x >> 32) as u32);
+            ((a as u64) << 32) | (b as u64)
+        }
+
+        /// Both halves come from `sha512sum1r`, with the halves of `x` swapped between calls.
+        #[target_feature(enable = "zknh")]
+        fn sha512sum1(x: u64) -> u64 {
+            let a = sha512sum1r((x >> 32) as u32, x as u32);
+            let b = sha512sum1r(x as u32, (x >> 32) as u32);
+            ((a as u64) << 32) | (b as u64)
+        }
+
+        /// The high half comes from `sha512sig0h`, the low half from `sha512sig0l`.
+        #[target_feature(enable = "zknh")]
+        fn sha512sig0(x: u64) -> u64 {
+            let a = sha512sig0h((x >> 32) as u32, x as u32);
+            let b = sha512sig0l(x as u32, (x >> 32) as u32);
+            ((a as u64) << 32) | (b as u64)
+        }
+
+        /// The high half comes from `sha512sig1h`, the low half from `sha512sig1l`.
+        #[target_feature(enable = "zknh")]
+        fn sha512sig1(x: u64) -> u64 {
+            let a = sha512sig1h((x >> 32) as u32, x as u32);
+            let b = sha512sig1l(x as u32, (x >> 32) as u32);
+            ((a as u64) << 32) | (b as u64)
+        }
+    }
 }
diff --git a/sha2/src/sha512/riscv_zknh/compact.rs b/sha2/src/sha512/riscv_zknh/compact.rs
index 865288ee4..baf370b1d 100644
--- a/sha2/src/sha512/riscv_zknh/compact.rs
+++ b/sha2/src/sha512/riscv_zknh/compact.rs
@@ -2,14 +2,7 @@ use super::{sha512sig0, sha512sig1, sha512sum0, sha512sum1};
 use crate::consts::K64;
 
 #[target_feature(enable = "zknh")]
-pub(in super::super) fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
-    for block in blocks.iter().map(super::utils::load_block) {
-        compress_block(state, block);
-    }
-}
-
-#[target_feature(enable = "zknh")]
-fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
+pub(super) fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
     let mut s = *state;
 
     for r in 0..80 {
diff --git a/sha2/src/sha512/riscv_zknh/unroll.rs b/sha2/src/sha512/riscv_zknh/unroll.rs
index 9f21fa221..89d9e55c7 100644
--- a/sha2/src/sha512/riscv_zknh/unroll.rs
+++ b/sha2/src/sha512/riscv_zknh/unroll.rs
@@ -2,14 +2,7 @@ use super::{sha512sig0, sha512sig1, sha512sum0, sha512sum1};
 use crate::consts::K64;
 
 #[target_feature(enable = "zknh")]
-pub(in super::super) fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
-    for block in blocks.iter().map(super::utils::load_block) {
-        compress_block(state, block);
-    }
-}
-
-#[target_feature(enable = "zknh")]
-fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
+pub(super) fn compress_block(state: &mut [u64; 8], mut block: [u64; 16]) {
     let s = &mut state.clone();
     let b = &mut block;
 
@@ -82,7 +75,7 @@ fn round<const R: usize>(state: &mut [u64; 8], block: &[u64; 16], k: &[u64]) {
     state[h] = state[h]
         .wrapping_add(sha512sum1(state[e]))
         .wrapping_add(ch(state[e], state[f], state[g]))
-        .wrapping_add(super::utils::opaque_load::<R>(k))
+        .wrapping_add(opaque_load::<R>(k))
         .wrapping_add(block[R]);
     state[d] = state[d].wrapping_add(state[h]);
     state[h] = state[h]
@@ -99,3 +92,43 @@ fn ch(x: u64, y: u64, z: u64) -> u64 {
 fn maj(x: u64, y: u64, z: u64) -> u64 {
     (x & y) ^ (x & z) ^ (y & z)
 }
+
+/// This function returns `k[R]`, but prevents the compiler from inlining the indexed value
+///
+/// The value is read with inline assembly loads, which the optimizer cannot see through.
+/// Panics if `R` is out of bounds for `k`.
+fn opaque_load<const R: usize>(k: &[u64]) -> u64 {
+    // Bounds check which the `unsafe` loads below rely on.
+    assert!(R < k.len());
+    // SAFETY: the assertion above guarantees that the 8 bytes at byte offset `8 * R`
+    // from `k.as_ptr()` lie inside the slice; the asm only reads them into `dst`.
+    #[cfg(target_arch = "riscv64")]
+    unsafe {
+        let dst;
+        core::arch::asm!(
+            "ld {dst}, 8 * {R}({k})",
+            R = const R,
+            k = in(reg) k.as_ptr(),
+            dst = out(reg) dst,
+            options(pure, readonly, nostack, preserves_flags),
+        );
+        dst
+    }
+    // RV32 registers are 32 bits wide, so the value is read as two words: the word at
+    // byte offset `8 * R` becomes the low half and the following word the high half.
+    // SAFETY: both words lie inside the 8 bytes covered by the assertion above.
+    #[cfg(target_arch = "riscv32")]
+    unsafe {
+        let [hi, lo]: [u32; 2];
+        core::arch::asm!(
+            "lw {lo}, 8 * {R}({k})",
+            "lw {hi}, 8 * {R} + 4({k})",
+            R = const R,
+            k = in(reg) k.as_ptr(),
+            lo = out(reg) lo,
+            hi = out(reg) hi,
+            options(pure, readonly, nostack, preserves_flags),
+        );
+        ((hi as u64) << 32) | (lo as u64)
+    }
+}
diff --git a/sha2/src/sha512/riscv_zknh/utils.rs b/sha2/src/sha512/riscv_zknh/utils.rs
deleted file mode 100644
index 440682ea7..000000000
--- a/sha2/src/sha512/riscv_zknh/utils.rs
+++ /dev/null
@@ -1,163 +0,0 @@
-use core::{arch::asm, ptr};
-
-#[inline(always)]
-pub(super) fn load_block(block: &[u8; 128]) -> [u64; 16] {
-    if block.as_ptr().cast::<usize>().is_aligned() {
-        load_aligned_block(block)
-    } else {
-        load_unaligned_block(block)
-    }
-}
-
-#[cfg(target_arch = "riscv32")]
-fn load_aligned_block(block: &[u8; 128]) -> [u64; 16] {
-    let p: *const [u32; 32] = block.as_ptr().cast();
-    debug_assert!(p.is_aligned());
-    let block = unsafe { &*p };
-    let mut res = [0u64; 16];
-    for i in 0..16 {
-        let a = block[2 * i].to_be() as u64;
-        let b = block[2 * i + 1].to_be() as u64;
-        res[i] = (a << 32) | b;
-    }
-    res
-}
-
-#[cfg(target_arch = "riscv64")]
-fn load_aligned_block(block: &[u8; 128]) -> [u64; 16] {
-    let block_ptr: *const u64 = block.as_ptr().cast();
-    debug_assert!(block_ptr.is_aligned());
-    let mut res = [0u64; 16];
-    for i in 0..16 {
-        let val = unsafe { ptr::read(block_ptr.add(i)) };
-        res[i] = val.to_be();
-    }
-    res
-}
-
-#[cfg(target_arch = "riscv32")]
-fn load_unaligned_block(block: &[u8; 128]) -> [u64; 16] {
-    let offset = (block.as_ptr() as usize) % align_of::<u32>();
-    debug_assert_ne!(offset, 0);
-    let off1 = (8 * offset) % 32;
-    let off2 = (32 - off1) % 32;
-    let bp: *const u32 = block.as_ptr().wrapping_sub(offset).cast();
-
-    let mut left: u32;
-    let mut block32 = [0u32; 32];
-
-    unsafe {
-        asm!(
-            "lw {left}, 0({bp})",         // left = unsafe { ptr::read(bp) };
-            "srl {left}, {left}, {off1}", // left >>= off1;
-            bp = in(reg) bp,
-            off1 = in(reg) off1,
-            left = out(reg) left,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-
-    for i in 0..31 {
-        let right = unsafe { ptr::read(bp.add(1 + i)) };
-        block32[i] = left | (right << off2);
-        left = right >> off1;
-    }
-
-    let right: u32;
-    unsafe {
-        asm!(
-            "lw {right}, 32 * 4({bp})",     // right = ptr::read(bp.add(32));
-            "sll {right}, {right}, {off2}", // right <<= off2;
-            bp = in(reg) bp,
-            off2 = in(reg) off2,
-            right = out(reg) right,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-    block32[31] = left | right;
-
-    let mut block64 = [0u64; 16];
-    for i in 0..16 {
-        let a = block32[2 * i].to_be() as u64;
-        let b = block32[2 * i + 1].to_be() as u64;
-        block64[i] = (a << 32) | b;
-    }
-    block64
-}
-
-#[cfg(target_arch = "riscv64")]
-fn load_unaligned_block(block: &[u8; 128]) -> [u64; 16] {
-    let offset = (block.as_ptr() as usize) % align_of::<u64>();
-    debug_assert_ne!(offset, 0);
-    let off1 = (8 * offset) % 64;
-    let off2 = (64 - off1) % 64;
-    let bp: *const u64 = block.as_ptr().wrapping_sub(offset).cast();
-
-    let mut left: u64;
-    let mut res = [0u64; 16];
-
-    unsafe {
-        asm!(
-            "ld {left}, 0({bp})",           // left = unsafe { ptr::read(bp) };
-            "srl {left}, {left}, {off1}",   // left >>= off1;
-            bp = in(reg) bp,
-            off1 = in(reg) off1,
-            left = out(reg) left,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-    for i in 0..15 {
-        let right = unsafe { ptr::read(bp.add(1 + i)) };
-        res[i] = (left | (right << off2)).to_be();
-        left = right >> off1;
-    }
-
-    let right: u64;
-    unsafe {
-        asm!(
-            "ld {right}, 16 * 8({bp})",     // right = ptr::read(bp.add(16));
-            "sll {right}, {right}, {off2}", // right <<= off2;
-            bp = in(reg) bp,
-            off2 = in(reg) off2,
-            right = out(reg) right,
-            options(pure, nostack, readonly, preserves_flags),
-        );
-    }
-    res[15] = (left | right).to_be();
-
-    res
-}
-
-/// This function returns `k[R]`, but prevents compiler from inlining the indexed value
-#[cfg(not(sha2_backend_riscv_zknh = "compact"))]
-pub(super) fn opaque_load<const R: usize>(k: &[u64]) -> u64 {
-    use core::arch::asm;
-
-    assert!(R < k.len());
-    #[cfg(target_arch = "riscv64")]
-    unsafe {
-        let dst;
-        asm!(
-            "ld {dst}, 8 * {R}({k})",
-            R = const R,
-            k = in(reg) k.as_ptr(),
-            dst = out(reg) dst,
-            options(pure, readonly, nostack, preserves_flags),
-        );
-        dst
-    }
-    #[cfg(target_arch = "riscv32")]
-    unsafe {
-        let [hi, lo]: [u32; 2];
-        asm!(
-            "lw {lo}, 8 * {R}({k})",
-            "lw {hi}, 8 * {R} + 4({k})",
-            R = const R,
-            k = in(reg) k.as_ptr(),
-            lo = out(reg) lo,
-            hi = out(reg) hi,
-            options(pure, readonly, nostack, preserves_flags),
-        );
-        ((hi as u64) << 32) | (lo as u64)
-    }
-}