diff --git a/Cargo.lock b/Cargo.lock
index c93d8592..4cfc61e0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -41,33 +41,15 @@ version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 
-[[package]]
-name = "belt-block"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a1be3fcfd5950762b6ac3f929b7ac65f31b35afee94e17f6fcadd58ee47abee"
-
-[[package]]
-name = "belt-hash"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d8ee43afdfcc493dec1e51c8032546cc1c52b4d68549e6469d03ff2cf3c3d91"
-dependencies = [
- "belt-block",
- "digest",
-]
-
 [[package]]
 name = "bign256"
 version = "0.0.0"
 dependencies = [
- "belt-hash",
  "criterion",
- "ecdsa",
  "elliptic-curve",
  "hex-literal",
  "primeorder",
- "serdect",
+ "proptest",
 ]
 
 [[package]]
@@ -730,9 +712,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.17.2"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9670a07f94779e00908f3e686eab508878ebb390ba6e604d3a284c00e8d0487b"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 dependencies = [
  "atomic-polyfill",
  "critical-section",
diff --git a/k256/Cargo.toml b/k256/Cargo.toml
index d8e096ab..dd3c7335 100644
--- a/k256/Cargo.toml
+++ b/k256/Cargo.toml
@@ -22,7 +22,7 @@ cfg-if = "1.0"
 elliptic-curve = { version = "0.13", default-features = false, features = ["hazmat", "sec1"] }
 
 # optional dependencies
-once_cell = { version = "1.17", optional = true, default-features = false }
+once_cell = { version = "1.18", optional = true, default-features = false }
 ecdsa-core = { version = "0.16", package = "ecdsa", optional = true, default-features = false, features = ["der"] }
 hex-literal = { version = "0.4", optional = true }
 serdect = { version = "0.2", optional = true, default-features = false }
diff --git a/p256/src/arithmetic/field.rs b/p256/src/arithmetic/field.rs
index 735d09f1..ea153a88 100644
--- a/p256/src/arithmetic/field.rs
+++ b/p256/src/arithmetic/field.rs
@@ -2,58 +2,356 @@
 
 #![allow(clippy::assign_op_pattern, clippy::op_ref)]
 
-#[cfg_attr(target_pointer_width = "32", path = "field/field32.rs")]
-#[cfg_attr(target_pointer_width = "64", path = "field/field64.rs")]
-mod field_impl;
-
-use self::field_impl::*;
-use crate::{FieldBytes, NistP256};
+use crate::{
+    arithmetic::util::{adc, mac, sbb, u256_to_u64x4, u64x4_to_u256},
+    FieldBytes,
+};
 use core::{
-    fmt::{self, Debug},
     iter::{Product, Sum},
-    ops::{AddAssign, Mul, MulAssign, Neg, SubAssign},
+    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
 };
 use elliptic_curve::{
-    bigint::U256,
-    ff::PrimeField,
-    subtle::{Choice, ConstantTimeEq, CtOption},
+    bigint::{ArrayEncoding, U256},
+    ff::{Field, PrimeField},
+    rand_core::RngCore,
+    subtle::{Choice, ConditionallySelectable, ConstantTimeEq, ConstantTimeLess, CtOption},
+    zeroize::DefaultIsZeroes,
 };
 
-/// Field modulus serialized as hex.
-/// p = 2^{224}(2^{32} − 1) + 2^{192} + 2^{96} − 1
 const MODULUS_HEX: &str = "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff";
 
-/// Constant representing the modulus.
-pub const MODULUS: U256 = U256::from_be_hex(MODULUS_HEX);
+/// Constant representing the modulus
+/// p = 2^{224}(2^{32} − 1) + 2^{192} + 2^{96} − 1
+pub const MODULUS: FieldElement = FieldElement(U256::from_be_hex(MODULUS_HEX));
+
+/// R = 2^256 mod p
+const R: FieldElement = FieldElement(U256::from_be_hex(
+    "00000000fffffffeffffffffffffffffffffffff000000000000000000000001",
+));
 
 /// R^2 = 2^512 mod p
-const R_2: U256 =
-    U256::from_be_hex("00000004fffffffdfffffffffffffffefffffffbffffffff0000000000000003");
+const R2: FieldElement = FieldElement(U256::from_be_hex(
+    "00000004fffffffdfffffffffffffffefffffffbffffffff0000000000000003",
+));
 
 /// An element in the finite field modulo p = 2^{224}(2^{32} − 1) + 2^{192} + 2^{96} − 1.
 ///
 /// The internal representation is in little-endian order. Elements are always in
 /// Montgomery form; i.e., FieldElement(a) = aR mod p, with R = 2^256.
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug)]
 pub struct FieldElement(pub(crate) U256);
 
-primeorder::impl_mont_field_element!(
-    NistP256,
-    FieldElement,
-    FieldBytes,
-    U256,
-    MODULUS,
-    Fe,
-    fe_from_montgomery,
-    fe_to_montgomery,
-    fe_add,
-    fe_sub,
-    fe_mul,
-    fe_neg,
-    fe_square
-);
-
 impl FieldElement {
+    /// Zero element.
+    pub const ZERO: Self = FieldElement(U256::ZERO);
+
+    /// Multiplicative identity.
+    pub const ONE: Self = R;
+
+    /// Attempts to parse the given byte array as an SEC1-encoded field element.
+    ///
+    /// Returns None if the byte array does not contain a big-endian integer in the range
+    /// [0, p).
+    pub fn from_bytes(bytes: FieldBytes) -> CtOption<Self> {
+        Self::from_uint(U256::from_be_byte_array(bytes))
+    }
+
+    /// Returns the SEC1 encoding of this field element.
+    pub fn to_bytes(self) -> FieldBytes {
+        self.to_canonical().0.to_be_byte_array()
+    }
+
+    /// Decode [`FieldElement`] from [`U256`], converting it into Montgomery form:
+    ///
+    /// ```text
+    /// w * R^2 * R^-1 mod p = wR mod p
+    /// ```
+    pub fn from_uint(uint: U256) -> CtOption<Self> {
+        let is_some = uint.ct_lt(&MODULUS.0);
+        CtOption::new(Self::from_uint_unchecked(uint), is_some)
+    }
+
+    /// Convert a `u64` into a [`FieldElement`].
+    pub const fn from_u64(w: u64) -> Self {
+        Self::from_uint_unchecked(U256::from_u64(w))
+    }
+
+    /// Parse a [`FieldElement`] from big endian hex-encoded bytes.
+    ///
+    /// Does *not* perform a check that the field element does not overflow the order.
+    ///
+    /// This method is primarily intended for defining internal constants.
+    pub(crate) const fn from_hex(hex: &str) -> Self {
+        Self::from_uint_unchecked(U256::from_be_hex(hex))
+    }
+
+    /// Decode [`FieldElement`] from [`U256`] converting it into Montgomery form.
+    ///
+    /// Does *not* perform a check that the field element does not overflow the order.
+    ///
+    /// Used incorrectly this can lead to invalid results!
+    pub(crate) const fn from_uint_unchecked(w: U256) -> Self {
+        Self(w).to_montgomery()
+    }
+
+    /// Determine if this `FieldElement` is zero.
+    ///
+    /// # Returns
+    ///
+    /// If zero, return `Choice(1)`.  Otherwise, return `Choice(0)`.
+    pub fn is_zero(&self) -> Choice {
+        self.ct_eq(&FieldElement::ZERO)
+    }
+
+    /// Determine if this `FieldElement` is odd in the SEC1 sense: `self mod 2 == 1`.
+    ///
+    /// # Returns
+    ///
+    /// If odd, return `Choice(1)`.  Otherwise, return `Choice(0)`.
+    pub fn is_odd(&self) -> Choice {
+        let bytes = self.to_bytes();
+        (bytes[31] & 1).into()
+    }
+
+    /// Returns self + rhs mod p
+    pub const fn add(&self, rhs: &Self) -> Self {
+        let a = u256_to_u64x4(self.0);
+        let b = u256_to_u64x4(rhs.0);
+
+        // Bit 256 of p is set, so addition can result in five words.
+        let (w0, carry) = adc(a[0], b[0], 0);
+        let (w1, carry) = adc(a[1], b[1], carry);
+        let (w2, carry) = adc(a[2], b[2], carry);
+        let (w3, w4) = adc(a[3], b[3], carry);
+
+        // Attempt to subtract the modulus, to ensure the result is in the field.
+        let modulus = u256_to_u64x4(MODULUS.0);
+        let (result, _) = Self::sub_inner(
+            w0, w1, w2, w3, w4, modulus[0], modulus[1], modulus[2], modulus[3], 0,
+        );
+        result
+    }
+
+    /// Returns 2*self.
+    pub const fn double(&self) -> Self {
+        self.add(self)
+    }
+
+    /// Returns self - rhs mod p
+    pub const fn sub(&self, rhs: &Self) -> Self {
+        let a = u256_to_u64x4(self.0);
+        let b = u256_to_u64x4(rhs.0);
+        Self::sub_inner(a[0], a[1], a[2], a[3], 0, b[0], b[1], b[2], b[3], 0).0
+    }
+
+    /// Negate element.
+    pub const fn neg(&self) -> Self {
+        Self::sub(&Self::ZERO, self)
+    }
+
+    fn from_bytes_wide(bytes: [u8; 64]) -> Self {
+        #[allow(clippy::unwrap_used)]
+        FieldElement::montgomery_reduce(
+            u64::from_be_bytes(bytes[0..8].try_into().unwrap()),
+            u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
+            u64::from_be_bytes(bytes[16..24].try_into().unwrap()),
+            u64::from_be_bytes(bytes[24..32].try_into().unwrap()),
+            u64::from_be_bytes(bytes[32..40].try_into().unwrap()),
+            u64::from_be_bytes(bytes[40..48].try_into().unwrap()),
+            u64::from_be_bytes(bytes[48..56].try_into().unwrap()),
+            u64::from_be_bytes(bytes[56..64].try_into().unwrap()),
+        )
+    }
+
+    #[inline]
+    #[allow(clippy::too_many_arguments)]
+    const fn sub_inner(
+        l0: u64,
+        l1: u64,
+        l2: u64,
+        l3: u64,
+        l4: u64,
+        r0: u64,
+        r1: u64,
+        r2: u64,
+        r3: u64,
+        r4: u64,
+    ) -> (Self, u64) {
+        let (w0, borrow) = sbb(l0, r0, 0);
+        let (w1, borrow) = sbb(l1, r1, borrow);
+        let (w2, borrow) = sbb(l2, r2, borrow);
+        let (w3, borrow) = sbb(l3, r3, borrow);
+        let (_, borrow) = sbb(l4, r4, borrow);
+
+        // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
+        // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the
+        // modulus.
+        let modulus = u256_to_u64x4(MODULUS.0);
+        let (w0, carry) = adc(w0, modulus[0] & borrow, 0);
+        let (w1, carry) = adc(w1, modulus[1] & borrow, carry);
+        let (w2, carry) = adc(w2, modulus[2] & borrow, carry);
+        let (w3, _) = adc(w3, modulus[3] & borrow, carry);
+
+        (Self(u64x4_to_u256([w0, w1, w2, w3])), borrow)
+    }
+
+    /// Montgomery Reduction
+    ///
+    /// The general algorithm is:
+    /// ```text
+    /// A <- input (2n b-limbs)
+    /// for i in 0..n {
+    ///     k <- A[i] p' mod b
+    ///     A <- A + k p b^i
+    /// }
+    /// A <- A / b^n
+    /// if A >= p {
+    ///     A <- A - p
+    /// }
+    /// ```
+    ///
+    /// For secp256r1, we have the following simplifications:
+    ///
+    /// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A.
+    ///
+    /// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified
+    ///   to a shift and subtraction:
+    ///   ```text
+    ///       a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i
+    ///   ```
+    ///   However, because `p' = 1`, the first limb of p is multiplied by limb i of the
+    ///   intermediate A and then immediately added to that same limb, so we simply
+    ///   initialize the carry to limb i of the intermediate.
+    ///
+    /// - The third limb of p is zero, so we can ignore any multiplications by it and just
+    ///   add the carry.
+    ///
+    /// References:
+    /// - Handbook of Applied Cryptography, Chapter 14
+    ///   Algorithm 14.32
+    ///   http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+    ///
+    /// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256
+    ///   Algorithm 7) Montgomery Word-by-Word Reduction
+    ///   https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf
+    #[inline]
+    #[allow(clippy::too_many_arguments)]
+    const fn montgomery_reduce(
+        r0: u64,
+        r1: u64,
+        r2: u64,
+        r3: u64,
+        r4: u64,
+        r5: u64,
+        r6: u64,
+        r7: u64,
+    ) -> Self {
+        let modulus = u256_to_u64x4(MODULUS.0);
+
+        let (r1, carry) = mac(r1, r0, modulus[1], r0);
+        let (r2, carry) = adc(r2, 0, carry);
+        let (r3, carry) = mac(r3, r0, modulus[3], carry);
+        let (r4, carry2) = adc(r4, 0, carry);
+
+        let (r2, carry) = mac(r2, r1, modulus[1], r1);
+        let (r3, carry) = adc(r3, 0, carry);
+        let (r4, carry) = mac(r4, r1, modulus[3], carry);
+        let (r5, carry2) = adc(r5, carry2, carry);
+
+        let (r3, carry) = mac(r3, r2, modulus[1], r2);
+        let (r4, carry) = adc(r4, 0, carry);
+        let (r5, carry) = mac(r5, r2, modulus[3], carry);
+        let (r6, carry2) = adc(r6, carry2, carry);
+
+        let (r4, carry) = mac(r4, r3, modulus[1], r3);
+        let (r5, carry) = adc(r5, 0, carry);
+        let (r6, carry) = mac(r6, r3, modulus[3], carry);
+        let (r7, r8) = adc(r7, carry2, carry);
+
+        // Result may be within MODULUS of the correct value
+        let (result, _) = Self::sub_inner(
+            r4, r5, r6, r7, r8, modulus[0], modulus[1], modulus[2], modulus[3], 0,
+        );
+        result
+    }
+
+    /// Translate a field element out of the Montgomery domain.
+    #[inline]
+    pub(crate) const fn to_canonical(self) -> Self {
+        let w = u256_to_u64x4(self.0);
+        FieldElement::montgomery_reduce(w[0], w[1], w[2], w[3], 0, 0, 0, 0)
+    }
+
+    /// Translate a field element into the Montgomery domain.
+    #[inline]
+    pub(crate) const fn to_montgomery(self) -> Self {
+        Self::multiply(&self, &R2)
+    }
+
+    /// Returns self * rhs mod p
+    pub const fn multiply(&self, rhs: &Self) -> Self {
+        // Schoolbook multiplication.
+        let a = u256_to_u64x4(self.0);
+        let b = u256_to_u64x4(rhs.0);
+
+        let (w0, carry) = mac(0, a[0], b[0], 0);
+        let (w1, carry) = mac(0, a[0], b[1], carry);
+        let (w2, carry) = mac(0, a[0], b[2], carry);
+        let (w3, w4) = mac(0, a[0], b[3], carry);
+
+        let (w1, carry) = mac(w1, a[1], b[0], 0);
+        let (w2, carry) = mac(w2, a[1], b[1], carry);
+        let (w3, carry) = mac(w3, a[1], b[2], carry);
+        let (w4, w5) = mac(w4, a[1], b[3], carry);
+
+        let (w2, carry) = mac(w2, a[2], b[0], 0);
+        let (w3, carry) = mac(w3, a[2], b[1], carry);
+        let (w4, carry) = mac(w4, a[2], b[2], carry);
+        let (w5, w6) = mac(w5, a[2], b[3], carry);
+
+        let (w3, carry) = mac(w3, a[3], b[0], 0);
+        let (w4, carry) = mac(w4, a[3], b[1], carry);
+        let (w5, carry) = mac(w5, a[3], b[2], carry);
+        let (w6, w7) = mac(w6, a[3], b[3], carry);
+
+        FieldElement::montgomery_reduce(w0, w1, w2, w3, w4, w5, w6, w7)
+    }
+
+    /// Returns self * self mod p
+    pub const fn square(&self) -> Self {
+        // Schoolbook multiplication.
+        self.multiply(self)
+    }
+
+    /// Returns self^(2^n) mod p
+    const fn sqn(&self, n: usize) -> Self {
+        let mut x = *self;
+        let mut i = 0;
+        while i < n {
+            x = x.square();
+            i += 1;
+        }
+        x
+    }
+
+    /// Returns `self^by`, where `by` is a little-endian integer exponent.
+    ///
+    /// **This operation is variable time with respect to the exponent.** If the exponent
+    /// is fixed, this operation is effectively constant time.
+    pub fn pow_vartime(&self, by: &[u64; 4]) -> Self {
+        let mut res = Self::ONE;
+        for e in by.iter().rev() {
+            for i in (0..64).rev() {
+                res = res.square();
+
+                if ((*e >> i) & 1) == 1 {
+                    res = res * self;
+                }
+            }
+        }
+        res
+    }
+
     /// Returns the multiplicative inverse of self, if self is non-zero.
     pub fn invert(&self) -> CtOption<Self> {
         CtOption::new(self.invert_unchecked(), !self.is_zero())
@@ -71,7 +369,6 @@ impl FieldElement {
         //    a^(p-2) * a ≡ 1 mod p
         //
         // Thus inversion can be implemented with a single exponentiation.
-
         let t111 = self.multiply(&self.multiply(&self.square()).square());
         let t111111 = t111.multiply(&t111.sqn(3));
         let x15 = t111111.sqn(6).multiply(&t111111).sqn(3).multiply(&t111);
@@ -95,11 +392,11 @@ impl FieldElement {
 
         let t11 = self.mul(&self.square());
         let t1111 = t11.mul(&t11.sqn(2));
-        let t11111111 = t1111.mul(&t1111.sqn(4));
-        let x16 = t11111111.sqn(8).mul(&t11111111);
+        let t11111111 = t1111.mul(t1111.sqn(4));
+        let x16 = t11111111.sqn(8).mul(t11111111);
         let sqrt = x16
             .sqn(16)
-            .mul(&x16)
+            .mul(x16)
             .sqn(32)
             .mul(self)
             .sqn(96)
@@ -111,16 +408,40 @@ impl FieldElement {
             (&sqrt * &sqrt).ct_eq(self), // Only return Some if it's the square root.
         )
     }
+}
 
-    /// Returns self^(2^n) mod p
-    const fn sqn(&self, n: usize) -> Self {
-        let mut x = *self;
-        let mut i = 0;
-        while i < n {
-            x = x.square();
-            i += 1;
-        }
-        x
+impl Field for FieldElement {
+    const ZERO: Self = Self::ZERO;
+    const ONE: Self = Self::ONE;
+
+    fn random(mut rng: impl RngCore) -> Self {
+        // We reduce a random 512-bit value into a 256-bit field, which results in a
+        // negligible bias from the uniform distribution.
+        let mut buf = [0; 64];
+        rng.fill_bytes(&mut buf);
+        FieldElement::from_bytes_wide(buf)
+    }
+
+    #[must_use]
+    fn square(&self) -> Self {
+        self.square()
+    }
+
+    #[must_use]
+    fn double(&self) -> Self {
+        self.double()
+    }
+
+    fn invert(&self) -> CtOption<Self> {
+        self.invert()
+    }
+
+    fn sqrt(&self) -> CtOption<Self> {
+        self.sqrt()
+    }
+
+    fn sqrt_ratio(num: &Self, div: &Self) -> (Choice, Self) {
+        elliptic_curve::ff::helpers::sqrt_ratio_generic(num, div)
     }
 }
 
@@ -138,61 +459,231 @@ impl PrimeField for FieldElement {
     const ROOT_OF_UNITY_INV: Self = Self::ROOT_OF_UNITY.invert_unchecked();
     const DELTA: Self = Self::from_u64(36);
 
-    #[inline]
     fn from_repr(bytes: FieldBytes) -> CtOption<Self> {
-        Self::from_bytes(&bytes)
+        Self::from_bytes(bytes)
     }
 
-    #[inline]
     fn to_repr(&self) -> FieldBytes {
         self.to_bytes()
     }
 
-    #[inline]
     fn is_odd(&self) -> Choice {
         self.is_odd()
     }
 }
 
-impl Debug for FieldElement {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "FieldElement(0x{:X})", &self.0)
+impl ConditionallySelectable for FieldElement {
+    fn conditional_select(a: &Self, b: &Self, choice: Choice) -> Self {
+        Self(U256::conditional_select(&a.0, &b.0, choice))
+    }
+}
+
+impl ConstantTimeEq for FieldElement {
+    fn ct_eq(&self, other: &Self) -> Choice {
+        self.0.ct_eq(&other.0)
+    }
+}
+
+impl Default for FieldElement {
+    fn default() -> Self {
+        FieldElement::ZERO
+    }
+}
+
+impl DefaultIsZeroes for FieldElement {}
+
+impl Eq for FieldElement {}
+
+impl From<u64> for FieldElement {
+    fn from(n: u64) -> FieldElement {
+        Self::from_uint_unchecked(U256::from(n))
+    }
+}
+
+impl PartialEq for FieldElement {
+    fn eq(&self, other: &Self) -> bool {
+        self.ct_eq(other).into()
+    }
+}
+
+impl Add<FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn add(self, other: FieldElement) -> FieldElement {
+        FieldElement::add(&self, &other)
+    }
+}
+
+impl Add<&FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn add(self, other: &FieldElement) -> FieldElement {
+        FieldElement::add(&self, other)
+    }
+}
+
+impl Add<&FieldElement> for &FieldElement {
+    type Output = FieldElement;
+
+    fn add(self, other: &FieldElement) -> FieldElement {
+        FieldElement::add(self, other)
+    }
+}
+
+impl AddAssign<FieldElement> for FieldElement {
+    fn add_assign(&mut self, other: FieldElement) {
+        *self = FieldElement::add(self, &other);
+    }
+}
+
+impl AddAssign<&FieldElement> for FieldElement {
+    fn add_assign(&mut self, other: &FieldElement) {
+        *self = FieldElement::add(self, other);
+    }
+}
+
+impl Sub<FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn sub(self, other: FieldElement) -> FieldElement {
+        FieldElement::sub(&self, &other)
+    }
+}
+
+impl Sub<&FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn sub(self, other: &FieldElement) -> FieldElement {
+        FieldElement::sub(&self, other)
+    }
+}
+
+impl Sub<&FieldElement> for &FieldElement {
+    type Output = FieldElement;
+
+    fn sub(self, other: &FieldElement) -> FieldElement {
+        FieldElement::sub(self, other)
+    }
+}
+
+impl SubAssign<FieldElement> for FieldElement {
+    fn sub_assign(&mut self, other: FieldElement) {
+        *self = FieldElement::sub(self, &other);
+    }
+}
+
+impl SubAssign<&FieldElement> for FieldElement {
+    fn sub_assign(&mut self, other: &FieldElement) {
+        *self = FieldElement::sub(self, other);
+    }
+}
+
+impl Mul<FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn mul(self, other: FieldElement) -> FieldElement {
+        FieldElement::multiply(&self, &other)
+    }
+}
+
+impl Mul<&FieldElement> for FieldElement {
+    type Output = FieldElement;
+
+    fn mul(self, other: &FieldElement) -> FieldElement {
+        FieldElement::multiply(&self, other)
+    }
+}
+
+impl Mul<&FieldElement> for &FieldElement {
+    type Output = FieldElement;
+
+    fn mul(self, other: &FieldElement) -> FieldElement {
+        FieldElement::multiply(self, other)
+    }
+}
+
+impl MulAssign<FieldElement> for FieldElement {
+    fn mul_assign(&mut self, other: FieldElement) {
+        *self = FieldElement::multiply(self, &other);
+    }
+}
+
+impl MulAssign<&FieldElement> for FieldElement {
+    fn mul_assign(&mut self, other: &FieldElement) {
+        *self = FieldElement::multiply(self, other);
+    }
+}
+
+impl Neg for FieldElement {
+    type Output = FieldElement;
+
+    fn neg(self) -> FieldElement {
+        FieldElement::ZERO - &self
+    }
+}
+
+impl Neg for &FieldElement {
+    type Output = FieldElement;
+
+    fn neg(self) -> FieldElement {
+        FieldElement::ZERO - self
+    }
+}
+
+impl Sum for FieldElement {
+    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(core::ops::Add::add).unwrap_or(Self::ZERO)
+    }
+}
+
+impl<'a> Sum<&'a FieldElement> for FieldElement {
+    fn sum<I: Iterator<Item = &'a FieldElement>>(iter: I) -> Self {
+        iter.copied().sum()
+    }
+}
+
+impl Product for FieldElement {
+    fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
+        iter.reduce(core::ops::Mul::mul).unwrap_or(Self::ONE)
+    }
+}
+
+impl<'a> Product<&'a FieldElement> for FieldElement {
+    fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
+        iter.copied().product()
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use super::FieldElement;
+    use super::{u64x4_to_u256, FieldElement};
     use crate::{test_vectors::field::DBL_TEST_VECTORS, FieldBytes};
-    use elliptic_curve::{bigint::U256, ff::PrimeField};
-    use primeorder::{
-        impl_field_identity_tests, impl_field_invert_tests, impl_field_sqrt_tests,
-        impl_primefield_tests,
-    };
-    use proptest::{num, prelude::*};
-
-    /// t = (modulus - 1) >> S
-    const T: [u64; 4] = [
-        0xffffffffffffffff,
-        0x000000007fffffff,
-        0x8000000000000000,
-        0x7fffffff80000000,
-    ];
-
-    impl_field_identity_tests!(FieldElement);
-    impl_field_invert_tests!(FieldElement);
-    impl_field_sqrt_tests!(FieldElement);
-    impl_primefield_tests!(FieldElement, T);
+    use core::ops::Mul;
+    use proptest::{num::u64::ANY, prelude::*};
+
+    #[test]
+    fn zero_is_additive_identity() {
+        let zero = FieldElement::ZERO;
+        let one = FieldElement::ONE;
+        assert_eq!(zero.add(&zero), zero);
+        assert_eq!(one.add(&zero), one);
+    }
+
+    #[test]
+    fn one_is_multiplicative_identity() {
+        let one = FieldElement::ONE;
+        assert_eq!(one.mul(&one), one);
+    }
 
     #[test]
     fn from_bytes() {
         assert_eq!(
-            FieldElement::from_bytes(&FieldBytes::default()).unwrap(),
+            FieldElement::from_bytes(FieldBytes::default()).unwrap(),
             FieldElement::ZERO
         );
         assert_eq!(
             FieldElement::from_bytes(
-                &[
+                [
                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                     0, 0, 0, 0, 0, 1
                 ]
@@ -202,7 +693,7 @@ mod tests {
             FieldElement::ONE
         );
         assert!(bool::from(
-            FieldElement::from_bytes(&[0xff; 32].into()).is_none()
+            FieldElement::from_bytes([0xff; 32].into()).is_none()
         ));
     }
 
@@ -263,46 +754,40 @@ mod tests {
         assert_eq!(two.pow_vartime(&[2, 0, 0, 0]), four);
     }
 
-    proptest! {
-        /// This checks behaviour well within the field ranges, because it doesn't set the
-        /// highest limb.
-        #[cfg(target_pointer_width = "32")]
-        #[test]
-        fn add_then_sub(
-            a0 in num::u32::ANY,
-            a1 in num::u32::ANY,
-            a2 in num::u32::ANY,
-            a3 in num::u32::ANY,
-            a4 in num::u32::ANY,
-            a5 in num::u32::ANY,
-            a6 in num::u32::ANY,
-            b0 in num::u32::ANY,
-            b1 in num::u32::ANY,
-            b2 in num::u32::ANY,
-            b3 in num::u32::ANY,
-            b4 in num::u32::ANY,
-            b5 in num::u32::ANY,
-            b6 in num::u32::ANY,
-        ) {
-            let a = FieldElement(U256::from_words([a0, a1, a2, a3, a4, a5, a6, 0]));
-            let b = FieldElement(U256::from_words([b0, b1, b2, b3, b4, b5, b6, 0]));
-            assert_eq!(a.add(&b).sub(&a), b);
-        }
+    #[test]
+    fn invert() {
+        assert!(bool::from(FieldElement::ZERO.invert().is_none()));
+
+        let one = FieldElement::ONE;
+        assert_eq!(one.invert().unwrap(), one);
+
+        let two = one + &one;
+        let inv_two = two.invert().unwrap();
+        assert_eq!(two * &inv_two, one);
+    }
 
+    #[test]
+    fn sqrt() {
+        let one = FieldElement::ONE;
+        let two = one + &one;
+        let four = two.square();
+        assert_eq!(four.sqrt().unwrap(), two);
+    }
+
+    proptest! {
         /// This checks behaviour well within the field ranges, because it doesn't set the
         /// highest limb.
-        #[cfg(target_pointer_width = "64")]
         #[test]
         fn add_then_sub(
-            a0 in num::u64::ANY,
-            a1 in num::u64::ANY,
-            a2 in num::u64::ANY,
-            b0 in num::u64::ANY,
-            b1 in num::u64::ANY,
-            b2 in num::u64::ANY,
+            a0 in ANY,
+            a1 in ANY,
+            a2 in ANY,
+            b0 in ANY,
+            b1 in ANY,
+            b2 in ANY,
         ) {
-            let a = FieldElement(U256::from_words([a0, a1, a2, 0]));
-            let b = FieldElement(U256::from_words([b0, b1, b2, 0]));
+            let a = FieldElement(u64x4_to_u256([a0, a1, a2, 0]));
+            let b = FieldElement(u64x4_to_u256([b0, b1, b2, 0]));
             assert_eq!(a.add(&b).sub(&a), b);
         }
     }
diff --git a/p256/src/arithmetic/field/field32.rs b/p256/src/arithmetic/field/field32.rs
deleted file mode 100644
index bc39b76e..00000000
--- a/p256/src/arithmetic/field/field32.rs
+++ /dev/null
@@ -1,206 +0,0 @@
-//! 32-bit secp256r1 base field implementation
-
-// TODO(tarcieri): adapt 64-bit arithmetic to proper 32-bit arithmetic
-
-use super::{MODULUS, R_2};
-use crate::arithmetic::util::{adc, mac, sbb};
-
-/// Raw field element.
-pub type Fe = [u32; 8];
-
-/// Translate a field element out of the Montgomery domain.
-#[inline]
-pub const fn fe_from_montgomery(w: &Fe) -> Fe {
-    let w = fe32_to_fe64(w);
-    montgomery_reduce(&[w[0], w[1], w[2], w[3], 0, 0, 0, 0])
-}
-
-/// Translate a field element into the Montgomery domain.
-#[inline]
-pub const fn fe_to_montgomery(w: &Fe) -> Fe {
-    fe_mul(w, R_2.as_words())
-}
-
-/// Returns `a + b mod p`.
-pub const fn fe_add(a: &Fe, b: &Fe) -> Fe {
-    let a = fe32_to_fe64(a);
-    let b = fe32_to_fe64(b);
-
-    // Bit 256 of p is set, so addition can result in five words.
-    let (w0, carry) = adc(a[0], b[0], 0);
-    let (w1, carry) = adc(a[1], b[1], carry);
-    let (w2, carry) = adc(a[2], b[2], carry);
-    let (w3, w4) = adc(a[3], b[3], carry);
-
-    // Attempt to subtract the modulus, to ensure the result is in the field.
-    let modulus = fe32_to_fe64(MODULUS.as_words());
-    sub_inner(
-        &[w0, w1, w2, w3, w4],
-        &[modulus[0], modulus[1], modulus[2], modulus[3], 0],
-    )
-}
-
-/// Returns `a - b mod p`.
-pub const fn fe_sub(a: &Fe, b: &Fe) -> Fe {
-    let a = fe32_to_fe64(a);
-    let b = fe32_to_fe64(b);
-    sub_inner(&[a[0], a[1], a[2], a[3], 0], &[b[0], b[1], b[2], b[3], 0])
-}
-
-/// Returns `a * b mod p`.
-pub const fn fe_mul(a: &Fe, b: &Fe) -> Fe {
-    let a = fe32_to_fe64(a);
-    let b = fe32_to_fe64(b);
-
-    let (w0, carry) = mac(0, a[0], b[0], 0);
-    let (w1, carry) = mac(0, a[0], b[1], carry);
-    let (w2, carry) = mac(0, a[0], b[2], carry);
-    let (w3, w4) = mac(0, a[0], b[3], carry);
-
-    let (w1, carry) = mac(w1, a[1], b[0], 0);
-    let (w2, carry) = mac(w2, a[1], b[1], carry);
-    let (w3, carry) = mac(w3, a[1], b[2], carry);
-    let (w4, w5) = mac(w4, a[1], b[3], carry);
-
-    let (w2, carry) = mac(w2, a[2], b[0], 0);
-    let (w3, carry) = mac(w3, a[2], b[1], carry);
-    let (w4, carry) = mac(w4, a[2], b[2], carry);
-    let (w5, w6) = mac(w5, a[2], b[3], carry);
-
-    let (w3, carry) = mac(w3, a[3], b[0], 0);
-    let (w4, carry) = mac(w4, a[3], b[1], carry);
-    let (w5, carry) = mac(w5, a[3], b[2], carry);
-    let (w6, w7) = mac(w6, a[3], b[3], carry);
-
-    montgomery_reduce(&[w0, w1, w2, w3, w4, w5, w6, w7])
-}
-
-/// Returns `-w mod p`.
-pub const fn fe_neg(w: &Fe) -> Fe {
-    fe_sub(&[0; 8], w)
-}
-
-/// Returns `w * w mod p`.
-pub const fn fe_square(w: &Fe) -> Fe {
-    fe_mul(w, w)
-}
-
-/// Montgomery Reduction
-///
-/// The general algorithm is:
-/// ```text
-/// A <- input (2n b-limbs)
-/// for i in 0..n {
-///     k <- A[i] p' mod b
-///     A <- A + k p b^i
-/// }
-/// A <- A / b^n
-/// if A >= p {
-///     A <- A - p
-/// }
-/// ```
-///
-/// For secp256r1, we have the following simplifications:
-///
-/// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A.
-///
-/// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified
-///   to a shift and subtraction:
-///   ```text
-///       a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i
-///   ```
-///   However, because `p' = 1`, the first limb of p is multiplied by limb i of the
-///   intermediate A and then immediately added to that same limb, so we simply
-///   initialize the carry to limb i of the intermediate.
-///
-/// - The third limb of p is zero, so we can ignore any multiplications by it and just
-///   add the carry.
-///
-/// References:
-/// - Handbook of Applied Cryptography, Chapter 14
-///   Algorithm 14.32
-///   http://cacr.uwaterloo.ca/hac/about/chap14.pdf
-///
-/// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256
-///   Algorithm 7) Montgomery Word-by-Word Reduction
-///   https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf
-#[inline]
-#[allow(clippy::too_many_arguments)]
-const fn montgomery_reduce(r: &[u64; 8]) -> Fe {
-    let r0 = r[0];
-    let r1 = r[1];
-    let r2 = r[2];
-    let r3 = r[3];
-    let r4 = r[4];
-    let r5 = r[5];
-    let r6 = r[6];
-    let r7 = r[7];
-    let modulus = fe32_to_fe64(MODULUS.as_words());
-
-    let (r1, carry) = mac(r1, r0, modulus[1], r0);
-    let (r2, carry) = adc(r2, 0, carry);
-    let (r3, carry) = mac(r3, r0, modulus[3], carry);
-    let (r4, carry2) = adc(r4, 0, carry);
-
-    let (r2, carry) = mac(r2, r1, modulus[1], r1);
-    let (r3, carry) = adc(r3, 0, carry);
-    let (r4, carry) = mac(r4, r1, modulus[3], carry);
-    let (r5, carry2) = adc(r5, carry2, carry);
-
-    let (r3, carry) = mac(r3, r2, modulus[1], r2);
-    let (r4, carry) = adc(r4, 0, carry);
-    let (r5, carry) = mac(r5, r2, modulus[3], carry);
-    let (r6, carry2) = adc(r6, carry2, carry);
-
-    let (r4, carry) = mac(r4, r3, modulus[1], r3);
-    let (r5, carry) = adc(r5, 0, carry);
-    let (r6, carry) = mac(r6, r3, modulus[3], carry);
-    let (r7, r8) = adc(r7, carry2, carry);
-
-    // Result may be within MODULUS of the correct value
-    sub_inner(
-        &[r4, r5, r6, r7, r8],
-        &[modulus[0], modulus[1], modulus[2], modulus[3], 0],
-    )
-}
-
-#[inline]
-#[allow(clippy::too_many_arguments)]
-const fn sub_inner(l: &[u64; 5], r: &[u64; 5]) -> Fe {
-    let (w0, borrow) = sbb(l[0], r[0], 0);
-    let (w1, borrow) = sbb(l[1], r[1], borrow);
-    let (w2, borrow) = sbb(l[2], r[2], borrow);
-    let (w3, borrow) = sbb(l[3], r[3], borrow);
-    let (_, borrow) = sbb(l[4], r[4], borrow);
-
-    // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
-    // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the
-    // modulus.
-    let modulus = fe32_to_fe64(MODULUS.as_words());
-    let (w0, carry) = adc(w0, modulus[0] & borrow, 0);
-    let (w1, carry) = adc(w1, modulus[1] & borrow, carry);
-    let (w2, carry) = adc(w2, modulus[2] & borrow, carry);
-    let (w3, _) = adc(w3, modulus[3] & borrow, carry);
-
-    [
-        (w0 & 0xFFFFFFFF) as u32,
-        (w0 >> 32) as u32,
-        (w1 & 0xFFFFFFFF) as u32,
-        (w1 >> 32) as u32,
-        (w2 & 0xFFFFFFFF) as u32,
-        (w2 >> 32) as u32,
-        (w3 & 0xFFFFFFFF) as u32,
-        (w3 >> 32) as u32,
-    ]
-}
-
-// TODO(tarcieri): replace this with proper 32-bit arithmetic
-#[inline]
-const fn fe32_to_fe64(fe32: &Fe) -> [u64; 4] {
-    [
-        (fe32[0] as u64) | ((fe32[1] as u64) << 32),
-        (fe32[2] as u64) | ((fe32[3] as u64) << 32),
-        (fe32[4] as u64) | ((fe32[5] as u64) << 32),
-        (fe32[6] as u64) | ((fe32[7] as u64) << 32),
-    ]
-}
diff --git a/p256/src/arithmetic/field/field64.rs b/p256/src/arithmetic/field/field64.rs
deleted file mode 100644
index e549cddd..00000000
--- a/p256/src/arithmetic/field/field64.rs
+++ /dev/null
@@ -1,175 +0,0 @@
-//! 64-bit secp256r1 base field implementation
-
-use super::{MODULUS, R_2};
-use crate::arithmetic::util::{adc, mac, sbb};
-
-/// Raw field element.
-pub type Fe = [u64; 4];
-
-/// Translate a field element out of the Montgomery domain.
-#[inline]
-pub const fn fe_from_montgomery(w: &Fe) -> Fe {
-    montgomery_reduce(&[w[0], w[1], w[2], w[3], 0, 0, 0, 0])
-}
-
-/// Translate a field element into the Montgomery domain.
-#[inline]
-pub const fn fe_to_montgomery(w: &Fe) -> Fe {
-    fe_mul(w, R_2.as_words())
-}
-
-/// Returns `a + b mod p`.
-pub const fn fe_add(a: &Fe, b: &Fe) -> Fe {
-    // Bit 256 of p is set, so addition can result in five words.
-    let (w0, carry) = adc(a[0], b[0], 0);
-    let (w1, carry) = adc(a[1], b[1], carry);
-    let (w2, carry) = adc(a[2], b[2], carry);
-    let (w3, w4) = adc(a[3], b[3], carry);
-
-    // Attempt to subtract the modulus, to ensure the result is in the field.
-    let modulus = MODULUS.as_words();
-    sub_inner(
-        &[w0, w1, w2, w3, w4],
-        &[modulus[0], modulus[1], modulus[2], modulus[3], 0],
-    )
-}
-
-/// Returns `a - b mod p`.
-pub const fn fe_sub(a: &Fe, b: &Fe) -> Fe {
-    sub_inner(&[a[0], a[1], a[2], a[3], 0], &[b[0], b[1], b[2], b[3], 0])
-}
-
-/// Returns `a * b mod p`.
-pub const fn fe_mul(a: &Fe, b: &Fe) -> Fe {
-    let (w0, carry) = mac(0, a[0], b[0], 0);
-    let (w1, carry) = mac(0, a[0], b[1], carry);
-    let (w2, carry) = mac(0, a[0], b[2], carry);
-    let (w3, w4) = mac(0, a[0], b[3], carry);
-
-    let (w1, carry) = mac(w1, a[1], b[0], 0);
-    let (w2, carry) = mac(w2, a[1], b[1], carry);
-    let (w3, carry) = mac(w3, a[1], b[2], carry);
-    let (w4, w5) = mac(w4, a[1], b[3], carry);
-
-    let (w2, carry) = mac(w2, a[2], b[0], 0);
-    let (w3, carry) = mac(w3, a[2], b[1], carry);
-    let (w4, carry) = mac(w4, a[2], b[2], carry);
-    let (w5, w6) = mac(w5, a[2], b[3], carry);
-
-    let (w3, carry) = mac(w3, a[3], b[0], 0);
-    let (w4, carry) = mac(w4, a[3], b[1], carry);
-    let (w5, carry) = mac(w5, a[3], b[2], carry);
-    let (w6, w7) = mac(w6, a[3], b[3], carry);
-
-    montgomery_reduce(&[w0, w1, w2, w3, w4, w5, w6, w7])
-}
-
-/// Returns `-w mod p`.
-pub const fn fe_neg(w: &Fe) -> Fe {
-    fe_sub(&[0, 0, 0, 0], w)
-}
-
-/// Returns `w * w mod p`.
-pub const fn fe_square(w: &Fe) -> Fe {
-    fe_mul(w, w)
-}
-
-/// Montgomery Reduction
-///
-/// The general algorithm is:
-/// ```text
-/// A <- input (2n b-limbs)
-/// for i in 0..n {
-///     k <- A[i] p' mod b
-///     A <- A + k p b^i
-/// }
-/// A <- A / b^n
-/// if A >= p {
-///     A <- A - p
-/// }
-/// ```
-///
-/// For secp256r1, we have the following simplifications:
-///
-/// - `p'` is 1, so our multiplicand is simply the first limb of the intermediate A.
-///
-/// - The first limb of p is 2^64 - 1; multiplications by this limb can be simplified
-///   to a shift and subtraction:
-///   ```text
-///       a_i * (2^64 - 1) = a_i * 2^64 - a_i = (a_i << 64) - a_i
-///   ```
-///   However, because `p' = 1`, the first limb of p is multiplied by limb i of the
-///   intermediate A and then immediately added to that same limb, so we simply
-///   initialize the carry to limb i of the intermediate.
-///
-/// - The third limb of p is zero, so we can ignore any multiplications by it and just
-///   add the carry.
-///
-/// References:
-/// - Handbook of Applied Cryptography, Chapter 14
-///   Algorithm 14.32
-///   http://cacr.uwaterloo.ca/hac/about/chap14.pdf
-///
-/// - Efficient and Secure Elliptic Curve Cryptography Implementation of Curve P-256
-///   Algorithm 7) Montgomery Word-by-Word Reduction
-///   https://csrc.nist.gov/csrc/media/events/workshop-on-elliptic-curve-cryptography-standards/documents/papers/session6-adalier-mehmet.pdf
-#[inline]
-#[allow(clippy::too_many_arguments)]
-const fn montgomery_reduce(r: &[u64; 8]) -> Fe {
-    let r0 = r[0];
-    let r1 = r[1];
-    let r2 = r[2];
-    let r3 = r[3];
-    let r4 = r[4];
-    let r5 = r[5];
-    let r6 = r[6];
-    let r7 = r[7];
-    let modulus = MODULUS.as_words();
-
-    let (r1, carry) = mac(r1, r0, modulus[1], r0);
-    let (r2, carry) = adc(r2, 0, carry);
-    let (r3, carry) = mac(r3, r0, modulus[3], carry);
-    let (r4, carry2) = adc(r4, 0, carry);
-
-    let (r2, carry) = mac(r2, r1, modulus[1], r1);
-    let (r3, carry) = adc(r3, 0, carry);
-    let (r4, carry) = mac(r4, r1, modulus[3], carry);
-    let (r5, carry2) = adc(r5, carry2, carry);
-
-    let (r3, carry) = mac(r3, r2, modulus[1], r2);
-    let (r4, carry) = adc(r4, 0, carry);
-    let (r5, carry) = mac(r5, r2, modulus[3], carry);
-    let (r6, carry2) = adc(r6, carry2, carry);
-
-    let (r4, carry) = mac(r4, r3, modulus[1], r3);
-    let (r5, carry) = adc(r5, 0, carry);
-    let (r6, carry) = mac(r6, r3, modulus[3], carry);
-    let (r7, r8) = adc(r7, carry2, carry);
-
-    // Result may be within MODULUS of the correct value
-    sub_inner(
-        &[r4, r5, r6, r7, r8],
-        &[modulus[0], modulus[1], modulus[2], modulus[3], 0],
-    )
-}
-
-#[inline]
-#[allow(clippy::too_many_arguments)]
-const fn sub_inner(l: &[u64; 5], r: &[u64; 5]) -> Fe {
-    let (w0, borrow) = sbb(l[0], r[0], 0);
-    let (w1, borrow) = sbb(l[1], r[1], borrow);
-    let (w2, borrow) = sbb(l[2], r[2], borrow);
-    let (w3, borrow) = sbb(l[3], r[3], borrow);
-    let (_, borrow) = sbb(l[4], r[4], borrow);
-
-    // If underflow occurred on the final limb, borrow = 0xfff...fff, otherwise
-    // borrow = 0x000...000. Thus, we use it as a mask to conditionally add the
-    // modulus.
-    let modulus = MODULUS.as_words();
-    let (w0, carry) = adc(w0, modulus[0] & borrow, 0);
-    let (w1, carry) = adc(w1, modulus[1] & borrow, carry);
-    let (w2, carry) = adc(w2, modulus[2] & borrow, carry);
-    let (w3, _) = adc(w3, modulus[3] & borrow, carry);
-
-    [w0, w1, w2, w3]
-}
diff --git a/p256/src/arithmetic/util.rs b/p256/src/arithmetic/util.rs
index 9be7651d..8ce5a9db 100644
--- a/p256/src/arithmetic/util.rs
+++ b/p256/src/arithmetic/util.rs
@@ -1,23 +1,72 @@
 //! Helper functions.
 // TODO(tarcieri): replace these with `crypto-bigint`
 
+use elliptic_curve::bigint::U256;
+
 /// Computes `a + b + carry`, returning the result along with the new carry. 64-bit version.
 #[inline(always)]
-pub const fn adc(a: u64, b: u64, carry: u64) -> (u64, u64) {
+pub(crate) const fn adc(a: u64, b: u64, carry: u64) -> (u64, u64) {
     let ret = (a as u128) + (b as u128) + (carry as u128);
     (ret as u64, (ret >> 64) as u64)
 }
 
 /// Computes `a - (b + borrow)`, returning the result along with the new borrow. 64-bit version.
 #[inline(always)]
-pub const fn sbb(a: u64, b: u64, borrow: u64) -> (u64, u64) {
+pub(crate) const fn sbb(a: u64, b: u64, borrow: u64) -> (u64, u64) {
     let ret = (a as u128).wrapping_sub((b as u128) + ((borrow >> 63) as u128));
     (ret as u64, (ret >> 64) as u64)
 }
 
 /// Computes `a + (b * c) + carry`, returning the result along with the new carry.
 #[inline(always)]
-pub const fn mac(a: u64, b: u64, c: u64, carry: u64) -> (u64, u64) {
+pub(crate) const fn mac(a: u64, b: u64, c: u64, carry: u64) -> (u64, u64) {
     let ret = (a as u128) + ((b as u128) * (c as u128)) + (carry as u128);
     (ret as u64, (ret >> 64) as u64)
 }
+
+/// Array containing 4 x 64-bit unsigned integers.
+// TODO(tarcieri): replace this entirely with `U256`
+pub(crate) type U64x4 = [u64; 4];
+
+/// Convert to a [`U64x4`] array.
+// TODO(tarcieri): implement all algorithms in terms of `U256`?
+#[cfg(target_pointer_width = "32")]
+pub(crate) const fn u256_to_u64x4(u256: U256) -> U64x4 {
+    let limbs = u256.to_words();
+
+    [
+        (limbs[0] as u64) | ((limbs[1] as u64) << 32),
+        (limbs[2] as u64) | ((limbs[3] as u64) << 32),
+        (limbs[4] as u64) | ((limbs[5] as u64) << 32),
+        (limbs[6] as u64) | ((limbs[7] as u64) << 32),
+    ]
+}
+
+/// Convert to a [`U64x4`] array.
+// TODO(tarcieri): implement all algorithms in terms of `U256`?
+#[cfg(target_pointer_width = "64")]
+pub(crate) const fn u256_to_u64x4(u256: U256) -> U64x4 {
+    u256.to_words()
+}
+
+/// Convert from a [`U64x4`] array.
+#[cfg(target_pointer_width = "32")]
+pub(crate) const fn u64x4_to_u256(limbs: U64x4) -> U256 {
+    U256::from_words([
+        (limbs[0] & 0xFFFFFFFF) as u32,
+        (limbs[0] >> 32) as u32,
+        (limbs[1] & 0xFFFFFFFF) as u32,
+        (limbs[1] >> 32) as u32,
+        (limbs[2] & 0xFFFFFFFF) as u32,
+        (limbs[2] >> 32) as u32,
+        (limbs[3] & 0xFFFFFFFF) as u32,
+        (limbs[3] >> 32) as u32,
+    ])
+}
+
+/// Convert from a [`U64x4`] array.
+// TODO(tarcieri): implement all algorithms in terms of `U256`?
+#[cfg(target_pointer_width = "64")]
+pub(crate) const fn u64x4_to_u256(limbs: U64x4) -> U256 {
+    U256::from_words(limbs)
+}