diff --git a/doc/modules/ROOT/pages/bit.adoc b/doc/modules/ROOT/pages/bit.adoc
index 2bf481a..f321073 100644
--- a/doc/modules/ROOT/pages/bit.adoc
+++ b/doc/modules/ROOT/pages/bit.adoc
@@ -26,7 +26,7 @@ For `u128`, the functions delegate to the `boost::int128` implementations.
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto has_single_bit(UnsignedInt x) noexcept -> bool;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(UnsignedInt x) noexcept -> bool;
 ----
 
 Returns `true` if `x` is a power of two.
@@ -37,7 +37,7 @@ See https://en.cppreference.com/w/cpp/numeric/has_single_bit.html[`std::has_sing
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_ceil(UnsignedInt x) noexcept -> UnsignedInt;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(UnsignedInt x) noexcept -> UnsignedInt;
 ----
 
 Returns the smallest power of two not less than `x`.
@@ -49,7 +49,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_ceil.html[`std::bit_ceil`].
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_floor(UnsignedInt x) noexcept -> UnsignedInt;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(UnsignedInt x) noexcept -> UnsignedInt;
 ----
 
 Returns the largest power of two not greater than `x`.
@@ -61,7 +61,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_floor.html[`std::bit_floor`].
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_width(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of bits needed to represent `x` (i.e., 1 + floor(log2(x)) for x > 0, or 0 for x == 0).
@@ -74,7 +74,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_width.html[`std::bit_width`].
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto rotl(UnsignedInt x, int s) noexcept -> UnsignedInt;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(UnsignedInt x, int s) noexcept -> UnsignedInt;
 ----
 
 Computes the result of bitwise left-rotating `x` by `s` positions.
@@ -87,7 +87,7 @@ NOTE: `rotl` is not available for `bounded_uint` types. Bit rotation can produce
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto rotr(UnsignedInt x, int s) noexcept -> UnsignedInt;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(UnsignedInt x, int s) noexcept -> UnsignedInt;
 ----
 
 Computes the result of bitwise right-rotating `x` by `s` positions.
@@ -102,7 +102,7 @@ NOTE: `rotr` is not available for `bounded_uint` types. Bit rotation can produce
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countl_zero(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of consecutive 0-bits starting from the most significant bit.
@@ -113,7 +113,7 @@ See https://en.cppreference.com/w/cpp/numeric/countl_zero.html[`std::countl_zero
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countl_one(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of consecutive 1-bits starting from the most significant bit.
@@ -124,7 +124,7 @@ See https://en.cppreference.com/w/cpp/numeric/countl_one.html[`std::countl_one`]
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countr_zero(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of consecutive 0-bits starting from the least significant bit.
@@ -135,7 +135,7 @@ See https://en.cppreference.com/w/cpp/numeric/countr_zero.html[`std::countr_zero
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countr_one(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of consecutive 1-bits starting from the least significant bit.
@@ -146,7 +146,7 @@ See https://en.cppreference.com/w/cpp/numeric/countr_one.html[`std::countr_one`]
 [source,c++]
 ----
 template <unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto popcount(UnsignedInt x) noexcept -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(UnsignedInt x) noexcept -> int;
 ----
 
 Returns the number of 1-bits in `x`.
@@ -159,7 +159,7 @@ See https://en.cppreference.com/w/cpp/numeric/popcount.html[`std::popcount`].
 [source,c++]
 ----
 template <non_bounded_integral_library_type Int>
-[[nodiscard]] constexpr auto byteswap(Int x) noexcept -> Int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(Int x) noexcept -> Int;
 ----
 
 Reverses the bytes of `x`.
@@ -172,7 +172,7 @@ NOTE: `byteswap` is not available for `bounded_uint` types. Byte reversal can pr
 [source,c++]
 ----
 template <non_bounded_integral_library_type Int>
-[[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int;
 ----
 
 Reverses all bits of `x`.
diff --git a/doc/modules/ROOT/pages/byte_conversions.adoc b/doc/modules/ROOT/pages/byte_conversions.adoc
index 152aa3c..5696e41 100644
--- a/doc/modules/ROOT/pages/byte_conversions.adoc
+++ b/doc/modules/ROOT/pages/byte_conversions.adoc
@@ -170,7 +170,7 @@ The value is first converted to big-endian byte order using `to_be`, then reinte
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
 ----
 
 === Parameters
@@ -203,7 +203,7 @@ The bytes are reinterpreted as the underlying type and then converted from big-e
 [source,c++]
 ----
 template <non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_be_bytes(const std::span<const std::byte, N> bytes) -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be_bytes(const std::span<const std::byte, N> bytes) -> T;
 ----
 
 === Parameters
@@ -251,7 +251,7 @@ The value is first converted to little-endian byte order using `to_le`, then rei
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
 ----
 
 === Parameters
@@ -284,7 +284,7 @@ The bytes are reinterpreted as the underlying type and then converted from littl
 [source,c++]
 ----
 template <non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_le_bytes(const std::span<const std::byte, N> bytes) -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le_bytes(const std::span<const std::byte, N> bytes) -> T;
 ----
 
 === Parameters
@@ -334,7 +334,7 @@ The result is equivalent to `std::bit_cast<std::array<std::byte, sizeof(T)>>(val
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>;
 ----
 
 === Parameters
@@ -368,7 +368,7 @@ Delegates to `from_le_bytes` on little-endian platforms and `from_be_bytes` on b
 [source,c++]
 ----
 template <non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_ne_bytes(const std::span<const std::byte, N> bytes) -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_ne_bytes(const std::span<const std::byte, N> bytes) -> T;
 ----
 
 === Parameters
diff --git a/doc/modules/ROOT/pages/charconv.adoc b/doc/modules/ROOT/pages/charconv.adoc
index 599b073..6bc6af4 100644
--- a/doc/modules/ROOT/pages/charconv.adoc
+++ b/doc/modules/ROOT/pages/charconv.adoc
@@ -33,12 +33,14 @@ namespace boost::charconv {
 
 // Convert safe integer to character string
 template <safe_numbers::detail::library_type T>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto to_chars(char* first, char* last,
                         T value,
                         int base = 10) -> charconv::to_chars_result;
 
 // Convert character string to safe integer
 template <safe_numbers::detail::library_type T>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto from_chars(const char* first, const char* last,
                           T& value,
                           int base = 10) -> charconv::from_chars_result;
@@ -57,8 +59,10 @@ struct to_chars_result
     char* ptr;
     std::errc ec;
 
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     friend constexpr bool operator==(const to_chars_result& lhs,
                                      const to_chars_result& rhs) noexcept = default;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr explicit operator bool() const noexcept { return ec == std::errc{}; }
 };
 
@@ -83,8 +87,11 @@ struct from_chars_result
     const char* ptr;
     std::errc ec;
 
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     friend constexpr bool operator==(const from_chars_result& lhs,
                                      const from_chars_result& rhs) noexcept = default;
+
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr explicit operator bool() const noexcept { return ec == std::errc{}; }
 };
 
@@ -103,6 +110,7 @@ struct from_chars_result
 [source,c++]
 ----
 template <safe_numbers::detail::library_type T>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto to_chars(char* first, char* last,
                         T value,
                         int base = 10) -> charconv::to_chars_result;
@@ -133,6 +141,7 @@ Returns `boost::charconv::to_chars_result` with:
 [source,c++]
 ----
 template <safe_numbers::detail::library_type T>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto from_chars(const char* first, const char* last,
                           T& value,
                           int base = 10) -> charconv::from_chars_result;
diff --git a/doc/modules/ROOT/pages/cuda.adoc b/doc/modules/ROOT/pages/cuda.adoc
index e772070..dc7ff15 100644
--- a/doc/modules/ROOT/pages/cuda.adoc
+++ b/doc/modules/ROOT/pages/cuda.adoc
@@ -92,5 +92,5 @@ Device error on thread 256 at /home/runner/work/safe_numbers/boost-root/libs/saf
 
 The `device_error_context` will also attempt to `printf` the error into the terminal.
 This works when compiling with verbose mode `-V`.
-`printf` error messages will look the same as the message displayed by
+`printf` error messages will look the same as the message displayed by the thrown exception.
 
diff --git a/doc/modules/ROOT/pages/integer_utilities.adoc b/doc/modules/ROOT/pages/integer_utilities.adoc
index 8dc4a0e..d01f31b 100644
--- a/doc/modules/ROOT/pages/integer_utilities.adoc
+++ b/doc/modules/ROOT/pages/integer_utilities.adoc
@@ -23,7 +23,7 @@ These operate on the non-bounded unsigned types (`u8`, `u16`, `u32`, `u64`, `u12
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto isqrt(const T val) -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto isqrt(const T val) -> T;
 ----
 
 Returns the integer square root of `val`, i.e., the largest integer `r` such that `r * r \<= val`.
@@ -74,7 +74,7 @@ struct remove_trailing_zeros_return
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto remove_trailing_zeros(const T n);
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto remove_trailing_zeros(const T n);
 ----
 
 Removes all trailing decimal zeros from `n`.
@@ -141,7 +141,7 @@ Tests whether an unsigned integer value is an exact power of 10 (i.e., one of 1,
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto is_power_10(const T n) -> bool;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_10(const T n) -> bool;
 ----
 
 === Parameters
@@ -171,7 +171,7 @@ Returns the integer base-2 logarithm (floor of log~2~) of a value.
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog2(const T n) -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog2(const T n) -> int;
 ----
 
 Computes `floor(log~2~(n))` using `bit_width(n) - 1`.
@@ -213,7 +213,7 @@ Uses an O(1) algorithm based on the most significant bit position to approximate
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog10(const T n) -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog10(const T n) -> int;
 ----
 
 Computes `floor(log~10~(n))` using `num_digits(n) - 1`, where `num_digits` approximates the digit count via `log~10~(x) ~= log~2~(x) / log~2~(10)` and refines with at most two comparisons against a power-of-10 lookup table.
@@ -249,7 +249,7 @@ Returns the integer logarithm in an arbitrary base (floor of log~base~) of a val
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog(const T n, const T base) -> int;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog(const T n, const T base) -> int;
 ----
 
 Computes `floor(log~base~(n))` by repeated division.
@@ -294,7 +294,7 @@ Integer exponentiation using the exponentiation-by-squaring algorithm.
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ipow(const T a, const T b) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow(const T a, const T b) noexcept -> T;
 ----
 
 Computes `a` raised to the power `b` using exponentiation by squaring.
@@ -339,7 +339,7 @@ Tests whether an unsigned integer value is an exact power of 2 (i.e., has exactl
 [source,c++]
 ----
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool;
 ----
 
 === Parameters
@@ -370,7 +370,7 @@ For unsigned types, naive subtraction `a - b` when `b > a` would underflow; `abs
 [source,c++]
 ----
 template <integral_library_type T>
-[[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T;
 ----
 
 Returns `|a - b|`, computed as `a - b` if `a >= b`, or `b - a` otherwise.
@@ -411,7 +411,7 @@ For unsigned types, this is equivalent to `(a + b - 1) / b` but computed without
 [source,c++]
 ----
 template <integral_library_type T>
-[[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T;
 ----
 
 Returns the ceiling of `a / b`.
@@ -454,7 +454,7 @@ This is useful for alignment calculations (e.g., aligning a size to a page bound
 [source,c++]
 ----
 template <integral_library_type T>
-[[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T;
 ----
 
 Returns the smallest value `m` such that `m >= a` and `m % b == 0`.
diff --git a/doc/modules/ROOT/pages/limits.adoc b/doc/modules/ROOT/pages/limits.adoc
index fe9aa3a..f273a2b 100644
--- a/doc/modules/ROOT/pages/limits.adoc
+++ b/doc/modules/ROOT/pages/limits.adoc
@@ -79,15 +79,15 @@ struct numeric_limits<T>
     static constexpr bool tinyness_before = std::numeric_limits<basis_type>::tinyness_before;
 
     // Static member functions
-    static constexpr T min() noexcept;
-    static constexpr T max() noexcept;
-    static constexpr T lowest() noexcept;
-    static constexpr T epsilon() noexcept;
-    static constexpr T round_error() noexcept;
-    static constexpr T infinity() noexcept;
-    static constexpr T quiet_NaN() noexcept;
-    static constexpr T signaling_NaN() noexcept;
-    static constexpr T denorm_min() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T min() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T max() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T lowest() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T epsilon() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T round_error() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T infinity() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T quiet_NaN() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T signaling_NaN() noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T denorm_min() noexcept;
 };
 
 } // namespace std
@@ -138,63 +138,63 @@ For unsigned integer types, the following values are consistent across all speci
 
 [source,c++]
 ----
-static constexpr T min() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T min() noexcept;
 ----
 
 Returns the minimum finite value (always `Tpass:[{0}]` for unsigned types).
 
 [source,c++]
 ----
-static constexpr T max() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T max() noexcept;
 ----
 
 Returns the maximum finite value.
 
 [source,c++]
 ----
-static constexpr T lowest() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T lowest() noexcept;
 ----
 
 Returns the lowest finite value (same as `min()` for unsigned types).
 
 [source,c++]
 ----
-static constexpr T epsilon() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T epsilon() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (not meaningful for integer types).
 
 [source,c++]
 ----
-static constexpr T round_error() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T round_error() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (not meaningful for integer types).
 
 [source,c++]
 ----
-static constexpr T infinity() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T infinity() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (unsigned integers cannot represent infinity).
 
 [source,c++]
 ----
-static constexpr T quiet_NaN() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T quiet_NaN() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (unsigned integers cannot represent NaN).
 
 [source,c++]
 ----
-static constexpr T signaling_NaN() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T signaling_NaN() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (unsigned integers cannot represent NaN).
 
 [source,c++]
 ----
-static constexpr T denorm_min() noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T denorm_min() noexcept;
 ----
 
 Returns `Tpass:[{0}]` (not meaningful for integer types).
diff --git a/doc/modules/ROOT/pages/numeric.adoc b/doc/modules/ROOT/pages/numeric.adoc
index f058ebd..6899858 100644
--- a/doc/modules/ROOT/pages/numeric.adoc
+++ b/doc/modules/ROOT/pages/numeric.adoc
@@ -25,7 +25,7 @@ Computes the greatest common divisor of two integers using the Euclidean algorit
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T;
 ----
 
 Returns the greatest common divisor of `m` and `n`.
@@ -65,7 +65,7 @@ Computes the least common multiple of two integers.
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T;
 ----
 
 Returns the least common multiple of `m` and `n`.
@@ -106,7 +106,7 @@ The result is rounded towards the first argument `a`.
 [source,c++]
 ----
 template <non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T;
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T;
 ----
 
 Returns the midpoint of `a` and `b`, computed without overflow.
diff --git a/doc/modules/ROOT/pages/unsigned_integers.adoc b/doc/modules/ROOT/pages/unsigned_integers.adoc
index b0f12bc..842252f 100644
--- a/doc/modules/ROOT/pages/unsigned_integers.adoc
+++ b/doc/modules/ROOT/pages/unsigned_integers.adoc
@@ -44,103 +44,120 @@ public:
     using basis_type = BasisType;
 
     // Construction
-    constexpr unsigned_integer_basis() noexcept = default;
-    explicit constexpr unsigned_integer_basis(BasisType val) noexcept;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr unsigned_integer_basis() noexcept = default;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(BasisType val) noexcept;
 
     template <typename T>
         requires std::is_same_v<T, bool>
-    explicit constexpr unsigned_integer_basis(T) noexcept = delete; // bool prohibited
+    BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(T) noexcept = delete; // bool prohibited
 
     // Conversion to underlying types
     template <unsigned_integral OtherBasis>
-    explicit constexpr operator OtherBasis() const;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr operator OtherBasis() const;
 
     // Comparison operators
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     friend constexpr auto operator<=>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept
         -> std::strong_ordering = default;
 
     // Compound assignment operators (arithmetic)
     template <unsigned_integral OtherBasis>
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr auto operator+=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
     template <unsigned_integral OtherBasis>
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr auto operator-=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
     template <unsigned_integral OtherBasis>
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr auto operator*=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
     template <unsigned_integral OtherBasis>
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr auto operator/=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
     template <unsigned_integral OtherBasis>
+    BOOST_SAFE_NUMBERS_HOST_DEVICE
     constexpr auto operator%=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
     // Compound assignment operators (bitwise)
-    constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-    constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-    constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-    constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
-    constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
 
     // Increment and decrement operators
-    constexpr auto operator++() -> unsigned_integer_basis&;
-    constexpr auto operator++(int) -> unsigned_integer_basis;
-    constexpr auto operator--() -> unsigned_integer_basis&;
-    constexpr auto operator--(int) -> unsigned_integer_basis;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++() -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++(int) -> unsigned_integer_basis;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--() -> unsigned_integer_basis&;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--(int) -> unsigned_integer_basis;
 
     // Unary operators
-    constexpr auto operator+() const noexcept -> unsigned_integer_basis;
-    constexpr auto operator-() const noexcept; // compile-time error
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+() const noexcept -> unsigned_integer_basis;
+    BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-() const noexcept; // compile-time error
 
 }; // class unsigned_integer_basis
 
 // Arithmetic operators (throw on overflow/underflow)
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator+(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator-(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator*(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator/(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator%(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 // Bitwise operators
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator~(unsigned_integer_basis<BasisType> lhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator&(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator|(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator^(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator<<(unsigned_integer_basis<BasisType> lhs,
                           unsigned_integer_basis<BasisType> rhs)
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator>>(unsigned_integer_basis<BasisType> lhs,
                           unsigned_integer_basis<BasisType> rhs)
     -> unsigned_integer_basis<BasisType>;
@@ -269,7 +286,7 @@ constexpr auto shr(T lhs, T rhs);
 
 [source,c++]
 ----
-constexpr unsigned_integer_basis() noexcept = default;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr unsigned_integer_basis() noexcept = default;
 ----
 
 Values are default-initialized to zero.
@@ -278,7 +295,7 @@ Values are default-initialized to zero.
 
 [source,c++]
 ----
-explicit constexpr unsigned_integer_basis(BasisType val) noexcept;
+BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(BasisType val) noexcept;
 ----
 
 Construction from the underlying type is explicit to prevent accidental conversions.
@@ -289,7 +306,7 @@ Construction from the underlying type is explicit to prevent accidental conversi
 ----
 template <typename T>
     requires std::is_same_v<T, bool>
-explicit constexpr unsigned_integer_basis(T) noexcept = delete;
+BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(T) noexcept = delete;
 ----
 
 Constructing from `bool` is a compile-time error.
@@ -299,7 +316,7 @@ Constructing from `bool` is a compile-time error.
 [source,c++]
 ----
 template <unsigned_integral OtherBasis>
-explicit constexpr operator OtherBasis() const;
+BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr operator OtherBasis() const;
 ----
 
 Conversion to other unsigned integral types is explicit.
@@ -311,6 +328,7 @@ This allows safe narrowing when the value is known to fit at runtime.
 
 [source,c++]
 ----
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 friend constexpr auto operator<=>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept
     -> std::strong_ordering = default;
 ----
@@ -323,22 +341,27 @@ All comparison operators (`<`, `<=`, `>`, `>=`, `==`, `!=`) are available.
 [source,c++]
 ----
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator+(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator-(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator*(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator/(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator%(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) -> unsigned_integer_basis<BasisType>;
 ----
@@ -356,18 +379,23 @@ All arithmetic operators perform runtime checks and throw exceptions when undefi
 [source,c++]
 ----
 template <unsigned_integral OtherBasis>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator+=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
 template <unsigned_integral OtherBasis>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator-=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
 template <unsigned_integral OtherBasis>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator*=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
 template <unsigned_integral OtherBasis>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator/=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 
 template <unsigned_integral OtherBasis>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator%=(unsigned_integer_basis<OtherBasis> rhs) -> unsigned_integer_basis&;
 ----
 
@@ -378,30 +406,36 @@ Compound assignment operators follow the same exception behavior as their corres
 [source,c++]
 ----
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator~(unsigned_integer_basis<BasisType> lhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator&(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator|(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator^(unsigned_integer_basis<BasisType> lhs,
                          unsigned_integer_basis<BasisType> rhs) noexcept
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator<<(unsigned_integer_basis<BasisType> lhs,
                           unsigned_integer_basis<BasisType> rhs)
     -> unsigned_integer_basis<BasisType>;
 
 template <unsigned_integral BasisType>
+BOOST_SAFE_NUMBERS_HOST_DEVICE
 constexpr auto operator>>(unsigned_integer_basis<BasisType> lhs,
                           unsigned_integer_basis<BasisType> rhs)
     -> unsigned_integer_basis<BasisType>;
@@ -473,11 +507,11 @@ All shift policy functions are `noexcept`.
 
 [source,c++]
 ----
-constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
-constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
-constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&;
 ----
 
 Compound bitwise assignment operators delegate to the corresponding free-function bitwise operators and follow the same exception behavior.
@@ -488,10 +522,10 @@ Compound bitwise assignment operators delegate to the corresponding free-functio
 
 [source,c++]
 ----
-constexpr auto operator++() -> unsigned_integer_basis&;
-constexpr auto operator++(int) -> unsigned_integer_basis;
-constexpr auto operator--() -> unsigned_integer_basis&;
-constexpr auto operator--(int) -> unsigned_integer_basis;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++() -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++(int) -> unsigned_integer_basis;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--() -> unsigned_integer_basis&;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--(int) -> unsigned_integer_basis;
 ----
 
 - `++` (pre/post): Throws `std::overflow_error` if the value is already at the maximum
@@ -501,8 +535,8 @@ constexpr auto operator--(int) -> unsigned_integer_basis;
 
 [source,c++]
 ----
-constexpr auto operator+() const noexcept -> unsigned_integer_basis;
-constexpr auto operator-() const noexcept; // compile-time error
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+() const noexcept -> unsigned_integer_basis;
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-() const noexcept; // compile-time error
 ----
 
 - `+`: Returns a copy of the value (identity). This is consistent with built-in unsigned integer behavior.
diff --git a/examples/cuda.cu b/examples/cuda.cu
new file mode 100644
index 0000000..eb0a63c
--- /dev/null
+++ b/examples/cuda.cu
@@ -0,0 +1,224 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// All safe_numbers types and free functions are annotated with __host__ __device__,
+// so they work identically on both host and device.
+
+__global__ void arithmetic_kernel(const test_type* a, const test_type* b, test_type* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    // safe_numbers addition (with overflow detection) evaluated on the device
+    out[idx] = a[idx] + b[idx];
+}
+
+__global__ void bit_kernel(const test_type* in, int* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    // The <bit> free functions are usable on device; popcount is the one shown
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+__global__ void utility_kernel(const test_type* in, test_type* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    // Integer utilities are usable on device; isqrt is the one shown
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+__global__ void numeric_kernel(const test_type* a, const test_type* b, test_type* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    // gcd, lcm and midpoint are usable on device; gcd is the one shown
+    out[idx] = boost::safe_numbers::gcd(a[idx], b[idx]);
+}
+
+__global__ void charconv_kernel(const test_type* in, test_type* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    // Round-trip on device: format in[idx] as text, then parse that text back
+    char text[16] {};
+    const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx]);
+    test_type round_tripped {};
+    boost::charconv::from_chars(text, written.ptr, round_tripped);
+    out[idx] = round_tripped;
+}
+
+// Helper: allocate CUDA managed memory; throws std::runtime_error if the allocation or the sync fails
+void allocate(void** ptr, std::size_t bytes)
+{
+    cudaError_t err = cudaMallocManaged(ptr, bytes);
+    err = (err == cudaSuccess) ? cudaDeviceSynchronize() : err; // keep the sync status instead of dropping it
+    if (err != cudaSuccess)
+    {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+}
+
+template <typename T>
+void cleanup(T** ptr)
+{
+    // Free only live allocations; nulling the pointer makes repeat calls harmless
+    T*& slot = *ptr;
+    if (slot == nullptr) { return; }
+    cudaFree(slot);
+    slot = nullptr;
+}
+
+int main()
+{
+    const int n = 10000;
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // ceil-div so every element gets a thread
+
+    std::mt19937_64 rng {42}; // fixed seed keeps the example reproducible
+    std::uniform_int_distribution<basis_type> dist {basis_type{1}, (std::numeric_limits<basis_type>::max)() / basis_type{2}};
+
+    // --- Allocate managed arrays ---
+
+    test_type* a = nullptr;
+    test_type* b = nullptr;
+    test_type* out_tt = nullptr;
+    int* out_int = nullptr;
+
+    allocate(reinterpret_cast<void**>(&a), n * sizeof(test_type));
+    allocate(reinterpret_cast<void**>(&b), n * sizeof(test_type));
+    allocate(reinterpret_cast<void**>(&out_tt), n * sizeof(test_type));
+    allocate(reinterpret_cast<void**>(&out_int), n * sizeof(int));
+
+    for (int i = 0; i < n; ++i)
+    {
+        a[i] = test_type{dist(rng)};
+        b[i] = test_type{dist(rng)};
+    }
+
+    // The device_error_context captures any overflow/underflow errors
+    // reported from device code and rethrows them on the host.
+    boost::safe_numbers::device_error_context ctx;
+
+    // --- Test 1: Arithmetic (a + b, using half-range to avoid overflow) ---
+
+    arithmetic_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, b, out_tt, n);
+    ctx.synchronize();
+
+    bool add_ok = true;
+    for (int i = 0; i < n; ++i)
+    {
+        if (out_tt[i] != a[i] + b[i])
+        {
+            add_ok = false;
+            break;
+        }
+    }
+    std::cout << "Arithmetic (add):  " << (add_ok ? "PASSED" : "FAILED") << '\n';
+
+    // --- Test 2: Bit functions (popcount) ---
+
+    bit_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, out_int, n);
+    ctx.synchronize();
+
+    bool popcount_ok = true;
+    for (int i = 0; i < n; ++i)
+    {
+        if (out_int[i] != boost::safe_numbers::popcount(a[i]))
+        {
+            popcount_ok = false;
+            break;
+        }
+    }
+    std::cout << "Bit (popcount):    " << (popcount_ok ? "PASSED" : "FAILED") << '\n';
+
+    // --- Test 3: Integer utilities (isqrt) ---
+
+    utility_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, out_tt, n);
+    ctx.synchronize();
+
+    bool isqrt_ok = true;
+    for (int i = 0; i < n; ++i)
+    {
+        if (out_tt[i] != boost::safe_numbers::isqrt(a[i]))
+        {
+            isqrt_ok = false;
+            break;
+        }
+    }
+    std::cout << "Utility (isqrt):   " << (isqrt_ok ? "PASSED" : "FAILED") << '\n';
+
+    // --- Test 4: Numeric (gcd) ---
+
+    numeric_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, b, out_tt, n);
+    ctx.synchronize();
+
+    bool gcd_ok = true;
+    for (int i = 0; i < n; ++i)
+    {
+        if (out_tt[i] != boost::safe_numbers::gcd(a[i], b[i]))
+        {
+            gcd_ok = false;
+            break;
+        }
+    }
+    std::cout << "Numeric (gcd):     " << (gcd_ok ? "PASSED" : "FAILED") << '\n';
+
+    // --- Test 5: Charconv round-trip ---
+
+    charconv_kernel<<<blocksPerGrid, threadsPerBlock>>>(a, out_tt, n);
+    ctx.synchronize();
+
+    bool charconv_ok = true;
+    for (int i = 0; i < n; ++i)
+    {
+        if (out_tt[i] != a[i])
+        {
+            charconv_ok = false;
+            break;
+        }
+    }
+    std::cout << "Charconv (rt):     " << (charconv_ok ? "PASSED" : "FAILED") << '\n';
+
+    // --- Cleanup ---
+
+    cleanup(&a);
+    cleanup(&b);
+    cleanup(&out_tt);
+    cleanup(&out_int);
+    cudaDeviceReset();
+    // Non-zero exit status lets CI and scripts detect a FAILED check
+    return (add_ok && popcount_ok && isqrt_ok && gcd_ok && charconv_ok) ? 0 : 1;
+}
diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu
new file mode 100644
index 0000000..2e9efff
--- /dev/null
+++ b/examples/cuda_error_handling.cu
@@ -0,0 +1,129 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+// This example demonstrates how to catch arithmetic errors that occur
+// on a CUDA device using device_error_context. When a safe_numbers
+// operation overflows on the GPU, the error is captured in managed
+// memory and rethrown with BOOST_THROW_EXCEPTION on the host when
+// you call ctx.synchronize().
+//
+// The device_error_context manages a dynamically allocated managed
+// memory buffer. When an error is detected, synchronize() clears the
+// error state and throws. After catching the exception, the same
+// context can be reused immediately for new kernel launches.
+
+#include <iostream>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Deliberate overflow: global thread 0 adds 1 to the largest value the basis type can hold
+__global__ void overflow_kernel(test_type* out)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx != 0)
+    {
+        return; // Only one thread performs the overflowing addition
+    }
+    const test_type largest {(std::numeric_limits<basis_type>::max)()};
+    out[0] = largest + test_type{1};  // Overflow!
+}
+
+// Valid arithmetic only: each in-range thread writes in[idx] + 1 to out[idx]
+__global__ void safe_kernel(const test_type* in, test_type* out, int n)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= n)
+    {
+        return; // Thread index is outside [0, n)
+    }
+    out[idx] = in[idx] + test_type{1};
+}
+
+int main()
+{
+    // Create a single device_error_context for the lifetime of the program.
+    // The constructor allocates managed memory for error reporting and
+    // clears any stale state.
+    boost::safe_numbers::device_error_context ctx;
+
+    // ---------------------------------------------------------------
+    // Step 1: Launch a kernel that overflows and catch the error
+    // ---------------------------------------------------------------
+
+    test_type* result = nullptr;
+    if (cudaMallocManaged(&result, sizeof(test_type)) != cudaSuccess) { std::cerr << "cudaMallocManaged failed\n"; return 1; }
+    cudaDeviceSynchronize();
+
+    std::cout << "=== Launching kernel that overflows ===" << std::endl;
+
+    overflow_kernel<<<1, 1>>>(result);
+
+    // synchronize() waits for the kernel, reads the error state,
+    // and throws the appropriate std::exception if one was captured.
+    // On error it clears the error state before throwing, so the
+    // context is immediately reusable after catching the exception.
+    try
+    {
+        ctx.synchronize();
+        std::cout << "No error detected (unexpected)" << std::endl;
+    }
+    catch (const std::overflow_error& e)
+    {
+        std::cout << "Caught overflow_error: " << e.what() << std::endl;
+    }
+
+    // ---------------------------------------------------------------
+    // Step 2: After catching the error, the same ctx can be reused
+    //         immediately. synchronize() already cleared the error
+    //         state before throwing, so no recovery step is needed.
+    // ---------------------------------------------------------------
+
+    std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl;
+
+    test_type* data = nullptr;
+    test_type* out = nullptr;
+
+    if (cudaMallocManaged(&data, 4 * sizeof(test_type)) != cudaSuccess) { std::cerr << "cudaMallocManaged failed\n"; return 1; }
+    if (cudaMallocManaged(&out, 4 * sizeof(test_type)) != cudaSuccess) { std::cerr << "cudaMallocManaged failed\n"; return 1; }
+    cudaDeviceSynchronize();
+
+    data[0] = test_type{10}; // host writes are only safe because the allocations above were checked
+    data[1] = test_type{20};
+    data[2] = test_type{30};
+    data[3] = test_type{40};
+
+    safe_kernel<<<1, 4>>>(data, out, 4);
+
+    try
+    {
+        ctx.synchronize();
+        std::cout << "No error detected (expected)" << std::endl;
+    }
+    catch (const std::exception& e)
+    {
+        std::cout << "Unexpected error: " << e.what() << std::endl;
+    }
+
+    for (int i = 0; i < 4; ++i)
+    {
+        std::cout << "result[" << i << "] = "
+                  << static_cast<basis_type>(out[i]) << std::endl;
+    }
+
+    // ---------------------------------------------------------------
+    // Cleanup
+    // ---------------------------------------------------------------
+
+    cudaFree(result);
+    cudaFree(data);
+    cudaFree(out);
+
+    return 0;
+}
diff --git a/include/boost/safe_numbers/bit.hpp b/include/boost/safe_numbers/bit.hpp
index 77b7843..35e6c33 100644
--- a/include/boost/safe_numbers/bit.hpp
+++ b/include/boost/safe_numbers/bit.hpp
@@ -1,3 +1,4 @@
+// Copyright 2020 Peter Dimov
 // Copyright 2026 Matt Borland
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
@@ -11,119 +12,341 @@
 
 #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE
 
-#include <boost/core/bit.hpp>
 #include <array>
 
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+#include <cuda/std/bit>
+
+#else
+
+#include <boost/core/bit.hpp>
+#include <bit>
+
+#endif
+
 #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE
 
 namespace boost::safe_numbers {
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto has_single_bit(const UnsignedInt x) noexcept -> bool
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(const UnsignedInt x) noexcept -> bool
 {
-    using boost::core::has_single_bit;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::has_single_bit;
     return has_single_bit(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::has_single_bit(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::has_single_bit(static_cast<underlying_type>(x));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_ceil(const UnsignedInt x) noexcept -> UnsignedInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(const UnsignedInt x) noexcept -> UnsignedInt
 {
-    using boost::core::bit_ceil;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::bit_ceil;
     return UnsignedInt{bit_ceil(static_cast<underlying_type>(x))};
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return UnsignedInt{boost::int128::bit_ceil(static_cast<underlying_type>(x))};
+    }
+    else
+    {
+        return UnsignedInt{cuda::std::bit_ceil(static_cast<underlying_type>(x))};
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_floor(const UnsignedInt x) noexcept -> UnsignedInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(const UnsignedInt x) noexcept -> UnsignedInt
 {
-    using boost::core::bit_floor;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::bit_floor;
     return UnsignedInt{bit_floor(static_cast<underlying_type>(x))};
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return UnsignedInt{boost::int128::bit_floor(static_cast<underlying_type>(x))};
+    }
+    else
+    {
+        return UnsignedInt{cuda::std::bit_floor(static_cast<underlying_type>(x))};
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto bit_width(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::bit_width;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::bit_width;
     return static_cast<int>(bit_width(static_cast<underlying_type>(x)));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return static_cast<int>(boost::int128::bit_width(static_cast<underlying_type>(x)));
+    }
+    else
+    {
+        return static_cast<int>(cuda::std::bit_width(static_cast<underlying_type>(x)));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::non_bounded_unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto rotl(const UnsignedInt x, const int s) noexcept -> UnsignedInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(const UnsignedInt x, const int s) noexcept -> UnsignedInt
 {
-    using boost::core::rotl;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::rotl;
     return UnsignedInt{rotl(static_cast<underlying_type>(x), s)};
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return UnsignedInt{boost::int128::rotl(static_cast<underlying_type>(x), s)};
+    }
+    else
+    {
+        return UnsignedInt{cuda::std::rotl(static_cast<underlying_type>(x), s)};
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::non_bounded_unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto rotr(const UnsignedInt x, const int s) noexcept -> UnsignedInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(const UnsignedInt x, const int s) noexcept -> UnsignedInt
 {
-    using boost::core::rotr;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::rotr;
     return UnsignedInt{rotr(static_cast<underlying_type>(x), s)};
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return UnsignedInt{boost::int128::rotr(static_cast<underlying_type>(x), s)};
+    }
+    else
+    {
+        return UnsignedInt{cuda::std::rotr(static_cast<underlying_type>(x), s)};
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countl_zero(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::countl_zero;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::countl_zero;
     return countl_zero(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::countl_zero(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::countl_zero(static_cast<underlying_type>(x));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countl_one(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::countl_one;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::countl_one;
     return countl_one(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::countl_one(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::countl_one(static_cast<underlying_type>(x));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countr_zero(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::countr_zero;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::countr_zero;
     return countr_zero(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::countr_zero(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::countr_zero(static_cast<underlying_type>(x));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto countr_one(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::countr_one;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::countr_one;
     return countr_one(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::countr_one(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::countr_one(static_cast<underlying_type>(x));
+    }
+
+    #endif
 }
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::unsigned_library_type UnsignedInt>
-[[nodiscard]] constexpr auto popcount(const UnsignedInt x) noexcept -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(const UnsignedInt x) noexcept -> int
 {
-    using boost::core::popcount;
     using underlying_type = detail::underlying_type_t<UnsignedInt>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::popcount;
     return popcount(static_cast<underlying_type>(x));
+
+    #else
+
+    if constexpr (std::is_same_v<UnsignedInt, u128>)
+    {
+        return boost::int128::popcount(static_cast<underlying_type>(x));
+    }
+    else
+    {
+        return cuda::std::popcount(static_cast<underlying_type>(x));
+    }
+
+    #endif
+}
+
+// NVCC 12 does not have byteswap builtin, only 13+
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+namespace detail {
+
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto byteswap_impl(const std::uint8_t x) noexcept
+{
+    return x; // A single byte has nothing to swap
+}
+
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto byteswap_impl(const std::uint16_t x) noexcept
+{
+    return static_cast<std::uint16_t>( x << 8 | x >> 8 ); // Cast discards the bits promoted past bit 15
+}
+
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto byteswap_impl(const std::uint32_t x) noexcept
+{
+    const auto step16 = x << 16 | x >> 16; // Swap the two 16-bit halves
+    return ((step16 << 8) & 0xff00ff00) | ((step16 >> 8) & 0x00ff00ff); // Then the bytes inside each half
+}
+
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto byteswap_impl(const std::uint64_t x) noexcept
+{
+    const auto step32 = x << 32 | x >> 32; // Swap the two 32-bit halves
+    const auto step16 = (step32 & 0x0000FFFF0000FFFFULL) << 16 | (step32 & 0xFFFF0000FFFF0000ULL) >> 16; // Then 16-bit units
+    return (step16 & 0x00FF00FF00FF00FFULL) << 8 | (step16 & 0xFF00FF00FF00FF00ULL) >> 8; // Finally adjacent bytes
+}
 
+} // namespace detail
+
+#endif
+
 BOOST_SAFE_NUMBERS_EXPORT template <detail::non_bounded_integral_library_type Int>
-[[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int
 {
-    using boost::core::byteswap;
     using underlying_type = detail::underlying_type_t<Int>;
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    using boost::core::byteswap;
     return Int{byteswap(static_cast<underlying_type>(x))};
+
+    #else
+
+    if constexpr (std::is_same_v<Int, u128>)
+    {
+        return Int{boost::int128::byteswap(static_cast<underlying_type>(x))};
+    }
+    else
+    {
+        return Int{detail::byteswap_impl(static_cast<underlying_type>(x))};
+    }
+
+    #endif
 }
 
 namespace detail {
@@ -149,11 +372,21 @@ consteval auto make_byte_reverse_table() -> std::array<std::uint8_t, 256>
     return table;
 }
 
+#if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
 inline constexpr auto reverse_table {make_byte_reverse_table()};
 
+#endif
+
 template <fundamental_unsigned_integral UnsignedInt>
-[[nodiscard]] constexpr auto bitswap_impl(UnsignedInt x) noexcept -> UnsignedInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap_impl(UnsignedInt x) noexcept -> UnsignedInt
 {
+    #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    constexpr auto reverse_table {make_byte_reverse_table()};
+
+    #endif
+
     if constexpr (sizeof(UnsignedInt) == 1)
     {
         return static_cast<UnsignedInt>(reverse_table[static_cast<std::uint8_t>(x)]);
@@ -177,7 +410,7 @@ template <fundamental_unsigned_integral UnsignedInt>
 } // namespace detail
 
 BOOST_SAFE_NUMBERS_EXPORT template <detail::non_bounded_integral_library_type Int>
-[[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int
 {
     using underlying_type = detail::underlying_type_t<Int>;
     return static_cast<Int>(detail::bitswap_impl(static_cast<underlying_type>(x)));
diff --git a/include/boost/safe_numbers/byte_conversions.hpp b/include/boost/safe_numbers/byte_conversions.hpp
index cba4b15..f74fa96 100644
--- a/include/boost/safe_numbers/byte_conversions.hpp
+++ b/include/boost/safe_numbers/byte_conversions.hpp
@@ -21,7 +21,7 @@
 namespace boost::safe_numbers {
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_be(const T value) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be(const T value) noexcept -> T
 {
     if constexpr (std::endian::native == std::endian::big)
     {
@@ -34,14 +34,14 @@ template <detail::non_bounded_integral_library_type T>
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto from_be(const T value) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be(const T value) noexcept -> T
 {
     // Self-inverse
     return to_be(value);
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_le(const T value) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le(const T value) noexcept -> T
 {
     if constexpr (std::endian::native == std::endian::little)
     {
@@ -54,21 +54,21 @@ template <detail::non_bounded_integral_library_type T>
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto from_le(const T value) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le(const T value) noexcept -> T
 {
     // Self-inverse
     return to_le(value);
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
 {
     const auto be_value {to_be(value)};
     return std::bit_cast<std::array<std::byte, sizeof(T)>>(be_value);
 }
 
 template <detail::non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_be_bytes(const std::span<const std::byte, N> bytes) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be_bytes(const std::span<const std::byte, N> bytes) -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -103,14 +103,14 @@ template <detail::non_bounded_integral_library_type T, std::size_t N>
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
 {
     const auto le_value {to_le(value)};
     return std::bit_cast<std::array<std::byte, sizeof(T)>>(le_value);
 }
 
 template <detail::non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_le_bytes(const std::span<const std::byte, N> bytes) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le_bytes(const std::span<const std::byte, N> bytes) -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -145,7 +145,7 @@ template <detail::non_bounded_integral_library_type T, std::size_t N>
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array<std::byte, sizeof(T)>
 {
     if constexpr (std::endian::native == std::endian::little)
     {
@@ -158,7 +158,7 @@ template <detail::non_bounded_integral_library_type T>
 }
 
 template <detail::non_bounded_integral_library_type T, std::size_t N>
-[[nodiscard]] constexpr auto from_ne_bytes(const std::span<const std::byte, N> bytes) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_ne_bytes(const std::span<const std::byte, N> bytes) -> T
 {
     if constexpr (std::endian::native == std::endian::little)
     {
diff --git a/include/boost/safe_numbers/charconv.hpp b/include/boost/safe_numbers/charconv.hpp
index 9b43c39..bd8c4e5 100644
--- a/include/boost/safe_numbers/charconv.hpp
+++ b/include/boost/safe_numbers/charconv.hpp
@@ -18,7 +18,7 @@
 namespace boost::charconv {
 
 template <safe_numbers::detail::library_type T>
-constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10)
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10)
     -> charconv::from_chars_result
 {
     using underlying_type = safe_numbers::detail::underlying_type_t<T>;
@@ -31,7 +31,7 @@ constexpr auto from_chars(const char* first, const char* last, T& value, int bas
 }
 
 template <safe_numbers::detail::library_type T>
-constexpr auto to_chars(char* first, char* last, const T value, int base = 10)
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto to_chars(char* first, char* last, const T value, int base = 10)
     -> charconv::to_chars_result
 {
     using underlying_type = safe_numbers::detail::underlying_type_t<T>;
diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp
index c7e144e..669fe36 100644
--- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp
+++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp
@@ -87,9 +87,13 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr
 
 #ifdef __CUDACC__
 
-// __managed__ places this in unified memory so the host can read it directly
-// without cudaMemcpyFromSymbol, which fails after __trap() corrupts the device context
-__managed__ cuda_device_error g_device_error = {0, 0, 0, exception_type::unknown, {'\0'}, {'\0'}};
+// Error record in managed (unified) memory, so host and device code both access it directly.
+// Device code reports errors by returning instead of calling __trap(), so the CUDA context stays valid for __managed__ access.
+__managed__ cuda_device_error g_device_error {};
+
+// Tracks whether a device_error_context instance is alive.
+// Only one may exist at a time to prevent races on g_device_error.
+inline bool g_device_error_context_active = false;
 
 __host__ __device__ inline void report_device_error(
     exception_type exc,
@@ -108,19 +112,13 @@ __host__ __device__ inline void report_device_error(
         copy_to_buf(g_device_error.file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE);
         copy_to_buf(g_device_error.expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE);
         __threadfence_system();
-
-        printf("Device error on thread %d at %s:%d: %s\n",
-               blockIdx.x * blockDim.x + threadIdx.x,
-               file, line, expression);
-
-        __trap();
     }
 
-    // Other threads: spin until the trap terminates the kernel
-    while (true)
-    {
-        __nanosleep(1000000);
-    }
+    // Return instead of calling __trap(). This allows the kernel to
+    // complete normally without corrupting the CUDA context. Other
+    // threads may continue with incorrect values, but synchronize()
+    // will detect the error via the flag and throw on the host.
+    return;
     #else
 
     const auto msg = std::string(file) + ":" + std::to_string(line) + ": " + expression;
@@ -154,8 +152,27 @@ class device_error_context
 {
 public:
 
-    // Clears the global state
-    // The error context can be reused with multiple kernels if this is called
+    // Claims the single-instance slot and clears stale error state; raises std::logic_error if a context is already alive.
+    device_error_context()
+    {
+        if (detail::g_device_error_context_active)
+        {
+            BOOST_THROW_EXCEPTION(std::logic_error(
+                "Only one device_error_context may exist at a time"));
+        }
+        detail::g_device_error_context_active = true;  // cleared again by the destructor
+        reset();  // begin with an empty error record
+    }
+
+    ~device_error_context()
+    {
+        detail::g_device_error_context_active = false;  // frees the slot so another context may be constructed
+    }
+
+    device_error_context(const device_error_context&) = delete;
+    device_error_context& operator=(const device_error_context&) = delete;
+
+    // Clears the error fields so the context can be reused across kernel launches.
     void reset()
     {
         detail::g_device_error.flag = 0;
@@ -166,60 +183,54 @@ class device_error_context
         detail::g_device_error.expression[0] = '\0';
     }
 
-    // On construction, reset the global error state to ensure we have a good start
-    device_error_context()
-    {
-        reset();
-    }
-
-    // Allows the user to synchronize and check for errors as is typical of CUDA
-    // This allows an extra step in that it will throw on the host
-    // Much like cudaGetLastError, the call to synchronize will destroy the information in the global context
-    // This allows trivial reuse of all these facilities
+    // Synchronizes the device and checks for errors captured by device code.
+    // If an error was detected, the error state is cleared (so the context
+    // is immediately reusable), and the appropriate std::exception is thrown.
     void synchronize()
     {
         const auto status = cudaDeviceSynchronize();
 
-        // Read directly from managed memory — no cudaMemcpyFromSymbol needed
-        // This works even after __trap() corrupts the device context
         const auto flag = detail::g_device_error.flag;
-        const auto thread_id = detail::g_device_error.thread_id;
-        const auto line = detail::g_device_error.line;
 
         if (flag != 0)
         {
+            const auto thread_id = detail::g_device_error.thread_id;
+            const auto line = detail::g_device_error.line;
+            const auto exc = detail::g_device_error.exception;
+
             std::ostringstream oss;
             oss << "Device error on thread " << thread_id
                 << " at " << detail::g_device_error.file
                 << ":" << line
                 << ": " << detail::g_device_error.expression;
 
-            // Read exception type before reset clears it
-            const auto exc = detail::g_device_error.exception;
+            const auto msg = oss.str();
 
-            // Clear the sticky CUDA error and reset our state
-            cudaGetLastError();
+            // Clear the error state so the context can be reused
+            // immediately after catching the exception.
             reset();
 
             switch (exc)
             {
                 case detail::exception_type::domain_error:
-                    BOOST_THROW_EXCEPTION(std::domain_error(oss.str()));
+                    BOOST_THROW_EXCEPTION(std::domain_error(msg));
                     break;
                 case detail::exception_type::overflow:
-                    BOOST_THROW_EXCEPTION(std::overflow_error(oss.str()));
+                    BOOST_THROW_EXCEPTION(std::overflow_error(msg));
                     break;
                 case detail::exception_type::underflow:
-                    BOOST_THROW_EXCEPTION(std::underflow_error(oss.str()));
+                    BOOST_THROW_EXCEPTION(std::underflow_error(msg));
                     break;
                 case detail::exception_type::unknown:
                     [[fallthrough]];
                 default:
-                    BOOST_THROW_EXCEPTION(std::runtime_error(oss.str()));
+                    BOOST_THROW_EXCEPTION(std::runtime_error(msg));
             }
         }
-
-        reset();
+        else
+        {
+            reset();
+        }
 
         if (status != cudaSuccess)
         {
diff --git a/include/boost/safe_numbers/detail/num_digits.hpp b/include/boost/safe_numbers/detail/num_digits.hpp
index c35831d..6c6caae 100644
--- a/include/boost/safe_numbers/detail/num_digits.hpp
+++ b/include/boost/safe_numbers/detail/num_digits.hpp
@@ -13,11 +13,20 @@
 
 #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE
 
-#include <boost/core/bit.hpp>
 #include <array>
 #include <cstdint>
 #include <limits>
 
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+#include <cuda/std/bit>
+
+#else
+
+#include <boost/core/bit.hpp>
+
+#endif
+
 #endif
 
 namespace boost::safe_numbers::detail {
@@ -41,10 +50,14 @@ consteval auto make_powers_of_10() noexcept
     return table;
 }
 
+#if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
 inline constexpr auto powers_of_10_u32 {make_powers_of_10<std::uint32_t>()};
 inline constexpr auto powers_of_10_u64 {make_powers_of_10<std::uint64_t>()};
 inline constexpr auto powers_of_10_u128 {make_powers_of_10<int128::uint128_t>()};
 
+#endif
+
 // ============================================================================
 // num_digits: counts the number of decimal digits using MSB approximation
 //
@@ -58,6 +71,12 @@ template <typename T>
     requires (std::numeric_limits<T>::digits <= 32 && std::is_unsigned_v<T>)
 constexpr auto num_digits(const T init_x) noexcept -> int
 {
+    #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    constexpr auto powers_of_10_u32 {make_powers_of_10<std::uint32_t>()};
+
+    #endif
+
     const auto x {static_cast<std::uint32_t>(init_x)};
 
     if (x == 0)
@@ -65,7 +84,11 @@ constexpr auto num_digits(const T init_x) noexcept -> int
         return 1;
     }
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
     const auto msb {32 - boost::core::countl_zero(x)};
+    #else
+    const auto msb {32 - cuda::std::countl_zero(x)};
+    #endif
 
     // Approximate log10
     const auto estimated_digits {(msb * 1000) / 3322 + 1};
@@ -86,12 +109,22 @@ constexpr auto num_digits(const T init_x) noexcept -> int
 // Overload for uint64_t
 constexpr auto num_digits(const std::uint64_t x) noexcept -> int
 {
+    #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    constexpr auto powers_of_10_u64 {make_powers_of_10<std::uint64_t>()};
+
+    #endif
+
     if (x <= UINT32_MAX)
     {
         return num_digits(static_cast<std::uint32_t>(x));
     }
 
+    #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
     const auto msb {64 - boost::core::countl_zero(x)};
+    #else
+    const auto msb {64 - cuda::std::countl_zero(x)};
+    #endif
 
     // Approximate log10
     const auto estimated_digits {(msb * 1000) / 3322 + 1};
@@ -112,6 +145,12 @@ constexpr auto num_digits(const std::uint64_t x) noexcept -> int
 // Overload for uint128_t
 constexpr auto num_digits(const boost::int128::uint128_t& x) noexcept -> int
 {
+    #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+
+    constexpr auto powers_of_10_u128 {make_powers_of_10<int128::uint128_t>()};
+
+    #endif
+
     if (x.high == UINT64_C(0))
     {
         return num_digits(x.low);
diff --git a/include/boost/safe_numbers/detail/rtz.hpp b/include/boost/safe_numbers/detail/rtz.hpp
index d2018ee..2d12c98 100644
--- a/include/boost/safe_numbers/detail/rtz.hpp
+++ b/include/boost/safe_numbers/detail/rtz.hpp
@@ -21,7 +21,7 @@ namespace boost::safe_numbers::detail {
 
 // n is assumed to be at most of bit_width bits
 template <std::size_t bit_width, fundamental_unsigned_integral UInt>
-constexpr auto rotr(UInt n, unsigned int r) noexcept -> UInt
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto rotr(UInt n, unsigned int r) noexcept -> UInt
 {
     static_assert(bit_width >= std::numeric_limits<UInt>::digits);
 
@@ -38,7 +38,7 @@ struct remove_trailing_zeros_return
     std::size_t number_of_removed_zeros;
 };
 
-constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing_zeros_return<std::uint8_t>
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing_zeros_return<std::uint8_t>
 {
     std::size_t s {};
 
@@ -55,7 +55,7 @@ constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing
     return {n, s};
 }
 
-constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailing_zeros_return<std::uint16_t>
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailing_zeros_return<std::uint16_t>
 {
     std::size_t s {};
 
@@ -77,7 +77,7 @@ constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailin
     return {n, s};
 }
 
-constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailing_zeros_return<std::uint32_t>
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailing_zeros_return<std::uint32_t>
 {
     std::size_t s {};
 
@@ -104,7 +104,7 @@ constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailin
     return {n, s};
 }
 
-constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailing_zeros_return<std::uint64_t>
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailing_zeros_return<std::uint64_t>
 {
     std::size_t s {};
 
@@ -136,7 +136,7 @@ constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailin
     return {n, s};
 }
 
-constexpr auto remove_trailing_zeros(int128::uint128_t n) noexcept -> remove_trailing_zeros_return<int128::uint128_t>
+BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(int128::uint128_t n) noexcept -> remove_trailing_zeros_return<int128::uint128_t>
 {
     std::size_t s {};
 
diff --git a/include/boost/safe_numbers/integer_utilities.hpp b/include/boost/safe_numbers/integer_utilities.hpp
index 991b7a9..66d207b 100644
--- a/include/boost/safe_numbers/integer_utilities.hpp
+++ b/include/boost/safe_numbers/integer_utilities.hpp
@@ -15,7 +15,7 @@ namespace boost::safe_numbers {
 
 // Newton's method as it can't possibly overflow, and converges rapidly
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto isqrt(const T val) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto isqrt(const T val) -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -42,7 +42,7 @@ template <detail::non_bounded_unsigned_library_type T>
 }
 
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto remove_trailing_zeros(const T n)
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto remove_trailing_zeros(const T n)
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -55,7 +55,7 @@ template <detail::non_bounded_unsigned_library_type T>
 }
 
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto is_power_10(const T n) -> bool
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_10(const T n) -> bool
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -64,14 +64,14 @@ template <detail::non_bounded_unsigned_library_type T>
 }
 
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool
 {
     return has_single_bit(n);
 }
 
 // Integer log base 2: floor(log2(n)) == bit_width(n) - 1
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog2(const T n) -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog2(const T n) -> int
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -86,7 +86,7 @@ template <detail::non_bounded_unsigned_library_type T>
 // Integer log base 10: floor(ilog10(n)) == num_digits(n) - 1
 // Uses MSB-based approximation with power-of-10 table lookup (O(1))
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog10(const T n) -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog10(const T n) -> int
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -101,7 +101,7 @@ template <detail::non_bounded_unsigned_library_type T>
 // Integer log arbitrary base: floor(log_base(n))
 // Repeated division: O(log_base(n)) divisions
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ilog(const T n, const T base) -> int
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog(const T n, const T base) -> int
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -132,7 +132,7 @@ namespace detail {
 
 // Iterative exponentiation by squaring: O(log b) multiplications
 template <non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ipow_impl(T base, T exp) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow_impl(T base, T exp) -> T
 {
     using underlying_type = underlying_type_t<T>;
 
@@ -157,19 +157,19 @@ template <non_bounded_unsigned_library_type T>
 } // namespace detail
 
 template <detail::non_bounded_unsigned_library_type T>
-[[nodiscard]] constexpr auto ipow(const T a, const T b) -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow(const T a, const T b) -> T
 {
     return detail::ipow_impl(a, b);
 }
 
 template <detail::integral_library_type T>
-[[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T
 {
     return a > b ? a - b : b - a;
 }
 
 template <detail::integral_library_type T>
-[[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -187,7 +187,7 @@ template <detail::integral_library_type T>
 }
 
 template <detail::integral_library_type T>
-[[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T
 {
     return div_ceil(a, b) * b;
 }
diff --git a/include/boost/safe_numbers/numeric.hpp b/include/boost/safe_numbers/numeric.hpp
index dd5e1a0..cfb95d9 100644
--- a/include/boost/safe_numbers/numeric.hpp
+++ b/include/boost/safe_numbers/numeric.hpp
@@ -10,14 +10,18 @@
 
 #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE
 
+#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
+#include <cuda/std/numeric>
+#else
 #include <numeric>
+#endif
 
 #endif
 
 namespace boost::safe_numbers {
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -27,12 +31,16 @@ template <detail::non_bounded_integral_library_type T>
     }
     else
     {
+        #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
         return T{static_cast<underlying_type>(std::gcd(static_cast<underlying_type>(m), static_cast<underlying_type>(n)))};
+        #else
+        return T{static_cast<underlying_type>(cuda::std::gcd(static_cast<underlying_type>(m), static_cast<underlying_type>(n)))};
+        #endif
     }
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -42,12 +50,16 @@ template <detail::non_bounded_integral_library_type T>
     }
     else
     {
+        #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
         return T{static_cast<underlying_type>(std::lcm(static_cast<underlying_type>(m), static_cast<underlying_type>(n)))};
+        #else
+        return T{static_cast<underlying_type>(cuda::std::lcm(static_cast<underlying_type>(m), static_cast<underlying_type>(n)))};
+        #endif
     }
 }
 
 template <detail::non_bounded_integral_library_type T>
-[[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T
+BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T
 {
     using underlying_type = detail::underlying_type_t<T>;
 
@@ -57,7 +69,11 @@ template <detail::non_bounded_integral_library_type T>
     }
     else
     {
+        #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__))
         return T{static_cast<underlying_type>(std::midpoint(static_cast<underlying_type>(a), static_cast<underlying_type>(b)))};
+        #else
+        return T{static_cast<underlying_type>(cuda::std::midpoint(static_cast<underlying_type>(a), static_cast<underlying_type>(b)))};
+        #endif
     }
 }
 
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index d00564a..667f088 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -80,3 +80,232 @@ run test_cuda_u128_sub_error.cu ;
 run test_cuda_u128_mul_error.cu ;
 run test_cuda_u128_div_error.cu ;
 run test_cuda_u128_mod_error.cu ;
+
+# Bit function tests
+
+# u8 bit tests
+run test_cuda_u8_has_single_bit.cu ;
+run test_cuda_u8_bit_ceil.cu ;
+run test_cuda_u8_bit_floor.cu ;
+run test_cuda_u8_bit_width.cu ;
+run test_cuda_u8_rotl.cu ;
+run test_cuda_u8_rotr.cu ;
+run test_cuda_u8_countl_zero.cu ;
+run test_cuda_u8_countl_one.cu ;
+run test_cuda_u8_countr_zero.cu ;
+run test_cuda_u8_countr_one.cu ;
+run test_cuda_u8_popcount.cu ;
+run test_cuda_u8_byteswap.cu ;
+run test_cuda_u8_bitswap.cu ;
+
+# u16 bit tests
+run test_cuda_u16_has_single_bit.cu ;
+run test_cuda_u16_bit_ceil.cu ;
+run test_cuda_u16_bit_floor.cu ;
+run test_cuda_u16_bit_width.cu ;
+run test_cuda_u16_rotl.cu ;
+run test_cuda_u16_rotr.cu ;
+run test_cuda_u16_countl_zero.cu ;
+run test_cuda_u16_countl_one.cu ;
+run test_cuda_u16_countr_zero.cu ;
+run test_cuda_u16_countr_one.cu ;
+run test_cuda_u16_popcount.cu ;
+run test_cuda_u16_byteswap.cu ;
+run test_cuda_u16_bitswap.cu ;
+
+# u32 bit tests
+run test_cuda_u32_has_single_bit.cu ;
+run test_cuda_u32_bit_ceil.cu ;
+run test_cuda_u32_bit_floor.cu ;
+run test_cuda_u32_bit_width.cu ;
+run test_cuda_u32_rotl.cu ;
+run test_cuda_u32_rotr.cu ;
+run test_cuda_u32_countl_zero.cu ;
+run test_cuda_u32_countl_one.cu ;
+run test_cuda_u32_countr_zero.cu ;
+run test_cuda_u32_countr_one.cu ;
+run test_cuda_u32_popcount.cu ;
+run test_cuda_u32_byteswap.cu ;
+run test_cuda_u32_bitswap.cu ;
+
+# u64 bit tests
+run test_cuda_u64_has_single_bit.cu ;
+run test_cuda_u64_bit_ceil.cu ;
+run test_cuda_u64_bit_floor.cu ;
+run test_cuda_u64_bit_width.cu ;
+run test_cuda_u64_rotl.cu ;
+run test_cuda_u64_rotr.cu ;
+run test_cuda_u64_countl_zero.cu ;
+run test_cuda_u64_countl_one.cu ;
+run test_cuda_u64_countr_zero.cu ;
+run test_cuda_u64_countr_one.cu ;
+run test_cuda_u64_popcount.cu ;
+run test_cuda_u64_byteswap.cu ;
+run test_cuda_u64_bitswap.cu ;
+
+# u128 bit tests
+run test_cuda_u128_has_single_bit.cu ;
+run test_cuda_u128_bit_ceil.cu ;
+run test_cuda_u128_bit_floor.cu ;
+run test_cuda_u128_bit_width.cu ;
+run test_cuda_u128_rotl.cu ;
+run test_cuda_u128_rotr.cu ;
+run test_cuda_u128_countl_zero.cu ;
+run test_cuda_u128_countl_one.cu ;
+run test_cuda_u128_countr_zero.cu ;
+run test_cuda_u128_countr_one.cu ;
+run test_cuda_u128_popcount.cu ;
+run test_cuda_u128_byteswap.cu ;
+run test_cuda_u128_bitswap.cu ;
+
+# Byte conversion tests
+
+# u8 byte conversion tests
+run test_cuda_u8_to_be.cu ;
+run test_cuda_u8_from_be.cu ;
+run test_cuda_u8_to_le.cu ;
+run test_cuda_u8_from_le.cu ;
+
+# u16 byte conversion tests
+run test_cuda_u16_to_be.cu ;
+run test_cuda_u16_from_be.cu ;
+run test_cuda_u16_to_le.cu ;
+run test_cuda_u16_from_le.cu ;
+
+# u32 byte conversion tests
+run test_cuda_u32_to_be.cu ;
+run test_cuda_u32_from_be.cu ;
+run test_cuda_u32_to_le.cu ;
+run test_cuda_u32_from_le.cu ;
+
+# u64 byte conversion tests
+run test_cuda_u64_to_be.cu ;
+run test_cuda_u64_from_be.cu ;
+run test_cuda_u64_to_le.cu ;
+run test_cuda_u64_from_le.cu ;
+
+# u128 byte conversion tests
+run test_cuda_u128_to_be.cu ;
+run test_cuda_u128_from_be.cu ;
+run test_cuda_u128_to_le.cu ;
+run test_cuda_u128_from_le.cu ;
+
+# Charconv tests
+
+# u8 charconv tests
+run test_cuda_u8_charconv.cu ;
+run test_cuda_u8_charconv_all_bases.cu ;
+
+# u16 charconv tests
+run test_cuda_u16_charconv.cu ;
+run test_cuda_u16_charconv_all_bases.cu ;
+
+# u32 charconv tests
+run test_cuda_u32_charconv.cu ;
+run test_cuda_u32_charconv_all_bases.cu ;
+
+# u64 charconv tests
+run test_cuda_u64_charconv.cu ;
+run test_cuda_u64_charconv_all_bases.cu ;
+
+# u128 charconv tests
+run test_cuda_u128_charconv.cu ;
+run test_cuda_u128_charconv_all_bases.cu ;
+
+# Integer utilities tests
+
+# u8 integer utilities tests
+run test_cuda_u8_isqrt.cu ;
+run test_cuda_u8_remove_trailing_zeros.cu ;
+run test_cuda_u8_is_power_10.cu ;
+run test_cuda_u8_is_power_2.cu ;
+run test_cuda_u8_ilog2.cu ;
+run test_cuda_u8_ilog10.cu ;
+run test_cuda_u8_ilog.cu ;
+run test_cuda_u8_ipow.cu ;
+run test_cuda_u8_abs_diff.cu ;
+run test_cuda_u8_div_ceil.cu ;
+run test_cuda_u8_next_multiple_of.cu ;
+
+# u16 integer utilities tests
+run test_cuda_u16_isqrt.cu ;
+run test_cuda_u16_remove_trailing_zeros.cu ;
+run test_cuda_u16_is_power_10.cu ;
+run test_cuda_u16_is_power_2.cu ;
+run test_cuda_u16_ilog2.cu ;
+run test_cuda_u16_ilog10.cu ;
+run test_cuda_u16_ilog.cu ;
+run test_cuda_u16_ipow.cu ;
+run test_cuda_u16_abs_diff.cu ;
+run test_cuda_u16_div_ceil.cu ;
+run test_cuda_u16_next_multiple_of.cu ;
+
+# u32 integer utilities tests
+run test_cuda_u32_isqrt.cu ;
+run test_cuda_u32_remove_trailing_zeros.cu ;
+run test_cuda_u32_is_power_10.cu ;
+run test_cuda_u32_is_power_2.cu ;
+run test_cuda_u32_ilog2.cu ;
+run test_cuda_u32_ilog10.cu ;
+run test_cuda_u32_ilog.cu ;
+run test_cuda_u32_ipow.cu ;
+run test_cuda_u32_abs_diff.cu ;
+run test_cuda_u32_div_ceil.cu ;
+run test_cuda_u32_next_multiple_of.cu ;
+
+# u64 integer utilities tests
+run test_cuda_u64_isqrt.cu ;
+run test_cuda_u64_remove_trailing_zeros.cu ;
+run test_cuda_u64_is_power_10.cu ;
+run test_cuda_u64_is_power_2.cu ;
+run test_cuda_u64_ilog2.cu ;
+run test_cuda_u64_ilog10.cu ;
+run test_cuda_u64_ilog.cu ;
+run test_cuda_u64_ipow.cu ;
+run test_cuda_u64_abs_diff.cu ;
+run test_cuda_u64_div_ceil.cu ;
+run test_cuda_u64_next_multiple_of.cu ;
+
+# u128 integer utilities tests
+run test_cuda_u128_isqrt.cu ;
+run test_cuda_u128_remove_trailing_zeros.cu ;
+run test_cuda_u128_is_power_10.cu ;
+run test_cuda_u128_is_power_2.cu ;
+run test_cuda_u128_ilog2.cu ;
+run test_cuda_u128_ilog10.cu ;
+run test_cuda_u128_ilog.cu ;
+run test_cuda_u128_ipow.cu ;
+run test_cuda_u128_abs_diff.cu ;
+run test_cuda_u128_div_ceil.cu ;
+run test_cuda_u128_next_multiple_of.cu ;
+
+# Numeric tests
+
+# u8 numeric tests
+run test_cuda_u8_gcd.cu ;
+run test_cuda_u8_lcm.cu ;
+run test_cuda_u8_midpoint.cu ;
+
+# u16 numeric tests
+run test_cuda_u16_gcd.cu ;
+run test_cuda_u16_lcm.cu ;
+run test_cuda_u16_midpoint.cu ;
+
+# u32 numeric tests
+run test_cuda_u32_gcd.cu ;
+run test_cuda_u32_lcm.cu ;
+run test_cuda_u32_midpoint.cu ;
+
+# u64 numeric tests
+run test_cuda_u64_gcd.cu ;
+run test_cuda_u64_lcm.cu ;
+run test_cuda_u64_midpoint.cu ;
+
+# u128 numeric tests
+run test_cuda_u128_gcd.cu ;
+run test_cuda_u128_lcm.cu ;
+run test_cuda_u128_midpoint.cu ;
+
+# Examples
+run ../examples/cuda.cu ;
+run ../examples/cuda_error_handling.cu ;
diff --git a/test/test_cuda_u128_abs_diff.cu b/test/test_cuda_u128_abs_diff.cu
new file mode 100644
index 0000000..fb863d7
--- /dev/null
+++ b/test/test_cuda_u128_abs_diff.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // surplus threads in the final block have no element to process
+    }
+    out[idx] = boost::safe_numbers::abs_diff(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 engine{42};  // fixed seed keeps the operands reproducible
+
+    int element_count {50000};
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs_values(element_count);
+    cuda_managed_ptr<test_type> rhs_values(element_count);
+    cuda_managed_ptr<test_type> device_results(element_count);
+
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx {0}; idx < element_count; ++idx)
+    {
+        lhs_values[idx] = test_type{lhs_dist(engine)};
+        rhs_values[idx] = test_type{rhs_dist(engine)};
+    }
+
+    int block_size {256};
+    int grid_size {(element_count + block_size - 1) / block_size};  // ceiling division covers the tail
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Launch, then wait for completion; synchronize() throws if device code recorded an error.
+    cuda_test<<<grid_size, block_size>>>(lhs_values.get(), rhs_values.get(), device_results.get(), element_count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every element on the host to serve as the reference result.
+    std::vector<test_type> host_results;
+    host_results.reserve(element_count);
+    timer.reset();
+    for (int idx {0}; idx < element_count; ++idx)
+    {
+        host_results.emplace_back(boost::safe_numbers::abs_diff(lhs_values[idx], rhs_values[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx {0}; idx < element_count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_bit_ceil.cu b/test/test_cuda_u128_bit_ceil.cu
new file mode 100644
index 0000000..52ca8ec
--- /dev/null
+++ b/test/test_cuda_u128_bit_ceil.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// One thread per element: stores bit_ceil(in[idx]) in out[idx] for every idx below numElements.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::bit_ceil(in[idx]);
+    }
+}
+
+// Applies bit_ceil to the same random u128 inputs on the GPU and on the host, then compares both result sets.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs are drawn from [0, max / 2] rather than from the full range of basis_type.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)() / basis_type{2U}};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_ceil(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_bit_floor.cu b/test/test_cuda_u128_bit_floor.cu
new file mode 100644
index 0000000..b22b8ba
--- /dev/null
+++ b/test/test_cuda_u128_bit_floor.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // element handled by this thread
+    if (idx >= numElements)
+    {
+        return;  // no element is assigned to this thread
+    }
+    out[idx] = boost::safe_numbers::bit_floor(in[idx]);
+}
+
+// Applies bit_floor to the same random u128 inputs on the GPU and on the host, then compares both result sets.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_floor(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_bit_width.cu b/test/test_cuda_u128_bit_width.cu
new file mode 100644
index 0000000..99ede69
--- /dev/null
+++ b/test/test_cuda_u128_bit_width.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// One thread per element: stores bit_width(in[idx]) as an int in out[idx] for every idx below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::bit_width(in[idx]);
+    }
+}
+
+// Computes bit_width of the same random u128 inputs on the GPU and on the host, then compares the int results.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_width(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_bitswap.cu b/test/test_cuda_u128_bitswap.cu
new file mode 100644
index 0000000..636d746
--- /dev/null
+++ b/test/test_cuda_u128_bitswap.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // element handled by this thread
+    if (idx >= numElements)
+    {
+        return;  // no element is assigned to this thread
+    }
+    out[idx] = boost::safe_numbers::bitswap(in[idx]);
+}
+
+// Applies bitswap to the same random u128 inputs on the GPU and on the host, then compares both result sets.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bitswap(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_byteswap.cu b/test/test_cuda_u128_byteswap.cu
new file mode 100644
index 0000000..8c449f8
--- /dev/null
+++ b/test/test_cuda_u128_byteswap.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// One thread per element: stores byteswap(in[idx]) in out[idx] for every idx below numElements.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::byteswap(in[idx]);
+    }
+}
+
+// Applies byteswap to the same random u128 inputs on the GPU and on the host, then compares both result sets.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::byteswap(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_charconv.cu b/test/test_cuda_u128_charconv.cu
new file mode 100644
index 0000000..7a7e2b8
--- /dev/null
+++ b/test/test_cuda_u128_charconv.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// Each thread formats in[idx] as text with to_chars, parses it back with from_chars, and stores the parsed value.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        char text[64] {};
+        const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx]);
+        test_type reparsed {};
+        boost::charconv::from_chars(text, written.ptr, reparsed);
+        out[idx] = reparsed;
+    }
+}
+
+// Round-trips random u128 values through to_chars/from_chars on the GPU and on the host and checks both.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference: the same to_chars/from_chars round trip, timed separately from the kernel.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        char buf[64] {};
+        auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])};
+        test_type parsed {};
+        boost::charconv::from_chars(buf, tc_result.ptr, parsed);
+        results.push_back(parsed);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        // Device/host agreement alone would also accept two identical failures (parsed left at its initial
+        // value on both sides), so the host round trip must additionally reproduce the original input.
+        if (output_vector[i] != results[i] || results[i] != input_vector[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_charconv_all_bases.cu b/test/test_cuda_u128_charconv_all_bases.cu
new file mode 100644
index 0000000..762ba5a
--- /dev/null
+++ b/test/test_cuda_u128_charconv_all_bases.cu
@@ -0,0 +1,111 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// For each element, round-trips the value through to_chars/from_chars in every base from 2 to 36
+// and writes to out[idx] the number of bases whose parsed value equals the input.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        int matching_bases {0};
+        for (int radix = 2; radix <= 36; ++radix)
+        {
+            // A fresh zero-initialized buffer is used for each base.
+            char text[256] {};
+            const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx], radix);
+            test_type reparsed {};
+            boost::charconv::from_chars(text, written.ptr, reparsed, radix);
+            // A base counts only when parsing the text gives back the original value.
+            matching_bases += (reparsed == in[idx]) ? 1 : 0;
+        }
+        out[idx] = matching_bases;
+    }
+}
+
+// Counts, on the GPU and on the host, how many bases in [2, 36] round-trip each random u128 through charconv.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Every base from 2 through 36 is expected to round-trip, i.e. 35 successes per element.
+    constexpr int expected_passes {36 - 2 + 1};
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        int pass_count {0};
+        for (int base = 2; base <= 36; ++base)
+        {
+            char buf[256] {};
+            auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)};
+            test_type parsed {};
+            boost::charconv::from_chars(buf, tc_result.ptr, parsed, base);
+            if (parsed == input_vector[i])
+            {
+                ++pass_count;
+            }
+        }
+        results.push_back(pass_count);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        // Host/device agreement alone would also accept identical failures, so require the full count too.
+        if (output_vector[i] != results[i] || results[i] != expected_passes)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_countl_one.cu b/test/test_cuda_u128_countl_one.cu
new file mode 100644
index 0000000..06caaef
--- /dev/null
+++ b/test/test_cuda_u128_countl_one.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // element handled by this thread
+    if (idx >= numElements)
+    {
+        return;  // no element is assigned to this thread
+    }
+    out[idx] = boost::safe_numbers::countl_one(in[idx]);
+}
+
+// Computes countl_one of the same random u128 inputs on the GPU and on the host, then compares the int results.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::countl_one(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_countl_zero.cu b/test/test_cuda_u128_countl_zero.cu
new file mode 100644
index 0000000..74cbfe6
--- /dev/null
+++ b/test/test_cuda_u128_countl_zero.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+// One thread per element: stores countl_zero(in[idx]) as an int in out[idx] for every idx below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::countl_zero(in[idx]);
+    }
+}
+
+// Computes countl_zero of the same random u128 inputs on the GPU and on the host, then compares the int results.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::countl_zero(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_countr_one.cu b/test/test_cuda_u128_countr_one.cu
new file mode 100644
index 0000000..8fa6fa7
--- /dev/null
+++ b/test/test_cuda_u128_countr_one.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // element handled by this thread
+    if (idx >= numElements)
+    {
+        return;  // no element is assigned to this thread
+    }
+    out[idx] = boost::safe_numbers::countr_one(in[idx]);
+}
+
+// Computes countr_one of the same random u128 inputs on the GPU and on the host, then compares the int results.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    const int numElements {50000};
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Inputs span the whole range of basis_type, from zero up to its maximum.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(rng)};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int threadsPerBlock {256};
+    const int blocksPerGrid {(numElements + threadsPerBlock - 1) / threadsPerBlock};
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host-side reference values, timed separately from the kernel.
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::countr_one(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_countr_zero.cu b/test/test_cuda_u128_countr_zero.cu
new file mode 100644
index 0000000..1fd6114
--- /dev/null
+++ b/test/test_cuda_u128_countr_zero.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::countr_zero(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Buffers are accessed on the host via operator[] and passed to the kernel via get().
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::countr_zero(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_div_ceil.cu b/test/test_cuda_u128_div_ceil.cu
new file mode 100644
index 0000000..a23045e
--- /dev/null
+++ b/test/test_cuda_u128_div_ceil.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element pair per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::div_ceil(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Divisors come from rhs_dist, whose lower bound is 1, so no zero divisor is generated.
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector1[idx] = test_type{lhs_dist(gen)};
+        input_vector2[idx] = test_type{rhs_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::div_ceil(input_vector1[idx], input_vector2[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_from_be.cu b/test/test_cuda_u128_from_be.cu
new file mode 100644
index 0000000..56476d9
--- /dev/null
+++ b/test/test_cuda_u128_from_be.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::from_be(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Buffers are accessed on the host via operator[] and passed to the kernel via get().
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::from_be(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_from_le.cu b/test/test_cuda_u128_from_le.cu
new file mode 100644
index 0000000..4558a42
--- /dev/null
+++ b/test/test_cuda_u128_from_le.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::from_le(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Buffers are accessed on the host via operator[] and passed to the kernel via get().
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::from_le(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_gcd.cu b/test/test_cuda_u128_gcd.cu
new file mode 100644
index 0000000..d39fb2d
--- /dev/null
+++ b/test/test_cuda_u128_gcd.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element pair per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::gcd(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Both operands are drawn over the full basis_type range, zero included.
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector1[idx] = test_type{lhs_dist(gen)};
+        input_vector2[idx] = test_type{rhs_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::gcd(input_vector1[idx], input_vector2[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_has_single_bit.cu b/test/test_cuda_u128_has_single_bit.cu
new file mode 100644
index 0000000..775398f
--- /dev/null
+++ b/test/test_cuda_u128_has_single_bit.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::has_single_bit(in[idx])); // bool stored as 0/1
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // NOTE(review): uniform draws over the full range almost never hit a power of two, so mostly the false result is exercised.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(static_cast<int>(boost::safe_numbers::has_single_bit(input_vector[idx])));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_ilog.cu b/test/test_cuda_u128_ilog.cu
new file mode 100644
index 0000000..4b61aa3
--- /dev/null
+++ b/test/test_cuda_u128_ilog.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::ilog(in[idx], test_type{static_cast<basis_type>(7)}); // second argument fixed at 7
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Inputs are drawn from [1, max], so zero is never generated.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::ilog(input_vector[idx], test_type{static_cast<basis_type>(7)}));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_ilog10.cu b/test/test_cuda_u128_ilog10.cu
new file mode 100644
index 0000000..7c2f731
--- /dev/null
+++ b/test/test_cuda_u128_ilog10.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::ilog10(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Inputs are drawn from [1, max], so zero is never generated.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::ilog10(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_ilog2.cu b/test/test_cuda_u128_ilog2.cu
new file mode 100644
index 0000000..2764202
--- /dev/null
+++ b/test/test_cuda_u128_ilog2.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::ilog2(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Inputs are drawn from [1, max], so zero is never generated.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector[idx] = test_type{value_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<int> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::ilog2(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_ipow.cu b/test/test_cuda_u128_ipow.cu
new file mode 100644
index 0000000..4caada7
--- /dev/null
+++ b/test/test_cuda_u128_ipow.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one element pair per thread
+    if (idx >= numElements)
+    {
+        return; // index past the end of the data: nothing to write
+    }
+    out[idx] = boost::safe_numbers::ipow(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed: every run draws the same input sequence.
+    std::mt19937_64 gen{42};
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // First ipow argument is drawn from [0, 10], second from [0, 2].
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, basis_type{10U}};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{0U}, basis_type{2U}};
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        input_vector1[idx] = test_type{lhs_dist(gen)};
+        input_vector2[idx] = test_type{rhs_dist(gen)};
+    }
+
+    // Ceiling division, so the grid has at least one thread per element.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    // NOTE(review): synchronize() presumably blocks until the kernel finishes and reports device errors - confirm.
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference over the same inputs; reserve() precedes reset(), so only the loop lies between reset() and elapsed().
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::ipow(input_vector1[idx], input_vector2[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device and host must agree on every element; bail out at the first difference.
+    for (int idx = 0; idx < numElements; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_is_power_10.cu b/test/test_cuda_u128_is_power_10.cu
new file mode 100644
index 0000000..5d7048f
--- /dev/null
+++ b/test/test_cuda_u128_is_power_10.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_10(in[idx]));
+}
+
+int main(void)
+{
+    std::mt19937_64 rng{42};  // fixed seed: the same inputs are generated on every run
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Uniform 128-bit draws essentially never hit a power of 10, so plant 10^0..10^38 to cover the true branch.
+    basis_type power{1U};
+    input_vector[0] = test_type{power};
+    for (int i = 1; i <= 38; ++i)
+    {
+        power = power * basis_type{10U};  // 10^38 < 2^128, so the raw value never wraps
+        input_vector[i] = test_type{power};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;  // rounded up to cover the tail
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: recompute every element on the CPU and time it.
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(static_cast<int>(boost::safe_numbers::is_power_10(input_vector[i])));
+    }
+    double t = w.elapsed();
+    // Device and host must agree on every element, including the planted powers.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_is_power_2.cu b/test/test_cuda_u128_is_power_2.cu
new file mode 100644
index 0000000..2775f07
--- /dev/null
+++ b/test/test_cuda_u128_is_power_2.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_2(in[idx]));
+}
+
+int main(void)
+{
+    std::mt19937_64 rng{42};  // fixed seed: the same inputs are generated on every run
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Uniform 128-bit draws essentially never hit a power of 2, so plant 2^0..2^127 to cover the true branch.
+    basis_type power{1U};
+    input_vector[0] = test_type{power};
+    for (int i = 1; i <= 127; ++i)
+    {
+        power = power * basis_type{2U};  // 2^127 still fits in 128 bits, so the raw value never wraps
+        input_vector[i] = test_type{power};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;  // rounded up to cover the tail
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: recompute every element on the CPU and time it.
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(static_cast<int>(boost::safe_numbers::is_power_2(input_vector[i])));
+    }
+    double t = w.elapsed();
+    // Device and host must agree on every element, including the planted powers.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u128_isqrt.cu b/test/test_cuda_u128_isqrt.cu
new file mode 100644
index 0000000..a54b27f
--- /dev/null
+++ b/test/test_cuda_u128_isqrt.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> values(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Fill the input buffer on the host with draws from [0, max] of basis_type.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        values[idx] = test_type{value_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(values.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every element on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::isqrt(values[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_lcm.cu b/test/test_cuda_u128_lcm.cu
new file mode 100644
index 0000000..ba04d7e
--- /dev/null
+++ b/test/test_cuda_u128_lcm.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::lcm(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Both operands are drawn from [1, 10]; the left one is drawn first for each element.
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{1U}, basis_type{10U}};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{1U}, basis_type{10U}};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        lhs[idx] = test_type{lhs_dist(gen)};
+        rhs[idx] = test_type{rhs_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every element on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::lcm(lhs[idx], rhs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_midpoint.cu b/test/test_cuda_u128_midpoint.cu
new file mode 100644
index 0000000..00ea8d2
--- /dev/null
+++ b/test/test_cuda_u128_midpoint.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::midpoint(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Both operands are drawn from [0, max] of basis_type; the left one is drawn first for each element.
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        lhs[idx] = test_type{lhs_dist(gen)};
+        rhs[idx] = test_type{rhs_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every element on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::midpoint(lhs[idx], rhs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_next_multiple_of.cu b/test/test_cuda_u128_next_multiple_of.cu
new file mode 100644
index 0000000..313092a
--- /dev/null
+++ b/test/test_cuda_u128_next_multiple_of.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::next_multiple_of(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Left operand comes from [0, max / 2], right operand from [1, max]; the left one is drawn first.
+    boost::random::uniform_int_distribution<basis_type> lhs_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)() / basis_type{2U}};
+    boost::random::uniform_int_distribution<basis_type> rhs_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        lhs[idx] = test_type{lhs_dist(gen)};
+        rhs[idx] = test_type{rhs_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every element on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::next_multiple_of(lhs[idx], rhs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_popcount.cu b/test/test_cuda_u128_popcount.cu
new file mode 100644
index 0000000..ad62bd6
--- /dev/null
+++ b/test/test_cuda_u128_popcount.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> values(count);
+    cuda_managed_ptr<int> device_out(count);
+    // Fill the input buffer on the host with draws from [0, max] of basis_type.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        values[idx] = test_type{value_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(values.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every element on the CPU as the reference and time it.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::popcount(values[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_remove_trailing_zeros.cu b/test/test_cuda_u128_remove_trailing_zeros.cu
new file mode 100644
index 0000000..5ef04b7
--- /dev/null
+++ b/test/test_cuda_u128_remove_trailing_zeros.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = test_type{boost::safe_numbers::remove_trailing_zeros(in[idx]).trimmed_number};
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> values(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Fill the input buffer on the host with draws from [1, max] of basis_type (zero is excluded).
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{1U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        values[idx] = test_type{value_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(values.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: recompute every trimmed number on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(values[idx]).trimmed_number});
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_rotl.cu b/test/test_cuda_u128_rotl.cu
new file mode 100644
index 0000000..3b33dd7
--- /dev/null
+++ b/test/test_cuda_u128_rotl.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::rotl(in[idx], 3);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> values(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Fill the input buffer on the host with draws from [0, max] of basis_type.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        values[idx] = test_type{value_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(values.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: apply the same rotate-left by 3 on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::rotl(values[idx], 3));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_rotr.cu b/test/test_cuda_u128_rotr.cu
new file mode 100644
index 0000000..6e259de
--- /dev/null
+++ b/test/test_cuda_u128_rotr.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;  // flat global thread index
+    if (idx >= numElements)
+    {
+        return;  // thread index lies past the end of the data
+    }
+    out[idx] = boost::safe_numbers::rotr(in[idx], 3);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Fixed seed above: the same inputs are generated on every run.
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> values(count);
+    cuda_managed_ptr<test_type> device_out(count);
+    // Fill the input buffer on the host with draws from [0, max] of basis_type.
+    boost::random::uniform_int_distribution<basis_type> value_dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        values[idx] = test_type{value_dist(gen)};
+    }
+    // Round the block count up so a partially filled last block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    // Device pass; synchronize() presumably waits for the kernel and reports device errors - confirm.
+    cuda_test<<<grid_size, block_size>>>(values.get(), device_out.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host pass: apply the same rotate-right by 3 on the CPU as the reference and time it.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::rotr(values[idx], 3));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first disagreement between device and host fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_out[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_to_be.cu b/test/test_cuda_u128_to_be.cu
new file mode 100644
index 0000000..0e1e6cf
--- /dev/null
+++ b/test/test_cuda_u128_to_be.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing to_be(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::to_be(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs to_be on the GPU and on the CPU for the same random u128 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()}; // bounds span 0 through the maximum of basis_type
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::to_be(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u128_to_le.cu b/test/test_cuda_u128_to_le.cu
new file mode 100644
index 0000000..461d47d
--- /dev/null
+++ b/test/test_cuda_u128_to_le.cu
@@ -0,0 +1,87 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include <boost/safe_numbers/detail/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u128;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing to_le(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::to_le(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs to_le on the GPU and on the CPU for the same random u128 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<basis_type> dist{basis_type{0U}, (std::numeric_limits<basis_type>::max)()}; // bounds span 0 through the maximum of basis_type
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::to_le(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_abs_diff.cu b/test/test_cuda_u16_abs_diff.cu
new file mode 100644
index 0000000..830196a
--- /dev/null
+++ b/test/test_cuda_u16_abs_diff.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing abs_diff(in1[i], in2[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs abs_diff on the GPU and on the CPU for the same random u16 input pairs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector1(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    std::uniform_int_distribution<unsigned> dist2{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // same bounds as dist; used for the second operand
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = test_type{static_cast<basis_type>(dist(rng))};
+        input_vector2[i] = test_type{static_cast<basis_type>(dist2(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_bit_ceil.cu b/test/test_cuda_u16_bit_ceil.cu
new file mode 100644
index 0000000..a8ddfb7
--- /dev/null
+++ b/test/test_cuda_u16_bit_ceil.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing bit_ceil(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::bit_ceil(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs bit_ceil on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)()) / 2U}; // NOTE(review): upper bound is max/2, presumably so the bit_ceil result always fits in test_type - confirm
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::bit_ceil(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_bit_floor.cu b/test/test_cuda_u16_bit_floor.cu
new file mode 100644
index 0000000..ef63dbe
--- /dev/null
+++ b/test/test_cuda_u16_bit_floor.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing bit_floor(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::bit_floor(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs bit_floor on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::bit_floor(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_bit_width.cu b/test/test_cuda_u16_bit_width.cu
new file mode 100644
index 0000000..2085fc5
--- /dev/null
+++ b/test/test_cuda_u16_bit_width.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing bit_width(in[i]) as an int in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::bit_width(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs bit_width on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<int> output_vector(numElements); // results are plain int, matching the kernel's int *out
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<int> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::bit_width(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_bitswap.cu b/test/test_cuda_u16_bitswap.cu
new file mode 100644
index 0000000..db46116
--- /dev/null
+++ b/test/test_cuda_u16_bitswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing bitswap(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::bitswap(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs bitswap on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::bitswap(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_byteswap.cu b/test/test_cuda_u16_byteswap.cu
new file mode 100644
index 0000000..a9fcb6d
--- /dev/null
+++ b/test/test_cuda_u16_byteswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing byteswap(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::byteswap(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs byteswap on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::byteswap(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_charconv.cu b/test/test_cuda_u16_charconv.cu
new file mode 100644
index 0000000..cd53a19
--- /dev/null
+++ b/test/test_cuda_u16_charconv.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel body: each thread formats in[i] with to_chars, parses the text back with from_chars, and stores the parsed value in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        char buf[64] {}; // zero-initialized local scratch buffer for the formatted text
+        auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])};
+        test_type parsed {};
+        boost::charconv::from_chars(buf, tc_result.ptr, parsed); // parses the range [buf, tc_result.ptr); the from_chars result is discarded
+        out[i] = parsed;
+    }
+}
+
+int main(void)
+{ // Host driver: runs a to_chars/from_chars round trip on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        char buf[64] {}; // same buffer size and call sequence as the kernel
+        auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])};
+        test_type parsed {};
+        boost::charconv::from_chars(buf, tc_result.ptr, parsed); // the from_chars result is discarded here as well
+        results.push_back(parsed);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // NOTE(review): compares device vs host only; neither is compared against input_vector[i]
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_charconv_all_bases.cu b/test/test_cuda_u16_charconv_all_bases.cu
new file mode 100644
index 0000000..50f64a7
--- /dev/null
+++ b/test/test_cuda_u16_charconv_all_bases.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel body: each thread counts, over bases 2 through 36, how many to_chars/from_chars round trips reproduce in[i], and stores the count in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        int pass_count {0};
+        for (int base = 2; base <= 36; ++base) // 35 bases in total
+        {
+            char buf[256] {}; // zero-initialized local scratch buffer, fresh for every base
+            auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)};
+            test_type parsed {};
+            boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); // parses the range [buf, tc_result.ptr); the from_chars result is discarded
+            if (parsed == in[i]) // round trip succeeded for this base
+            {
+                ++pass_count;
+            }
+        }
+        out[i] = pass_count;
+    }
+}
+
+int main(void)
+{ // Host driver: counts successful base-2..36 charconv round trips per element on the GPU and on the CPU and fails if the counts differ.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<int> output_vector(numElements); // per-element pass counts written by the kernel
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<int> results; // host-computed reference pass counts
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        int pass_count {0};
+        for (int base = 2; base <= 36; ++base) // same 35 bases as the kernel
+        {
+            char buf[256] {};
+            auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)};
+            test_type parsed {};
+            boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); // the from_chars result is discarded here as well
+            if (parsed == input_vector[i])
+            {
+                ++pass_count;
+            }
+        }
+        results.push_back(pass_count);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // NOTE(review): only host/device agreement is checked; pass_count is never compared to 35, so identical failures on both sides would pass
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_countl_one.cu b/test/test_cuda_u16_countl_one.cu
new file mode 100644
index 0000000..1fcc61c
--- /dev/null
+++ b/test/test_cuda_u16_countl_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel body: each thread handles at most one element, storing countl_one(in[i]) as an int in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // flat 1D global thread index
+
+    if (i < numElements) // threads indexed at or past numElements do nothing
+    {
+        out[i] = boost::safe_numbers::countl_one(in[i]);
+    }
+}
+
+int main(void)
+{ // Host driver: runs countl_one on the GPU and on the CPU for the same random u16 inputs and fails on the first mismatch.
+    std::mt19937_64 rng{42}; // fixed seed, so the inputs are the same on every run
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements); // NOTE(review): presumably CUDA managed memory readable from host and device - confirm in cuda_managed_ptr.hpp
+    cuda_managed_ptr<int> output_vector(numElements); // results are plain int, matching the kernel's int *out
+
+    std::uniform_int_distribution<unsigned> dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())}; // drawn as unsigned over [0, max of basis_type]; narrowed to basis_type below
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{static_cast<basis_type>(dist(rng))};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division so the grid covers every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize(); // NOTE(review): presumably waits for the kernel and surfaces device-side errors - confirm
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<int> results; // host-computed reference values
+    results.reserve(numElements);
+    w.reset(); // NOTE(review): presumably restarts the stopwatch so t covers only the host loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::countl_one(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i]) // device result must equal the host result exactly
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_countl_zero.cu b/test/test_cuda_u16_countl_zero.cu
new file mode 100644
index 0000000..e78dc50
--- /dev/null
+++ b/test/test_cuda_u16_countl_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::countl_zero(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::countl_zero(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_countr_one.cu b/test/test_cuda_u16_countr_one.cu
new file mode 100644
index 0000000..f900927
--- /dev/null
+++ b/test/test_cuda_u16_countr_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::countr_one(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::countr_one(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_countr_zero.cu b/test/test_cuda_u16_countr_zero.cu
new file mode 100644
index 0000000..4feddc9
--- /dev/null
+++ b/test/test_cuda_u16_countr_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::countr_zero(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::countr_zero(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_div_ceil.cu b/test/test_cuda_u16_div_ceil.cu
new file mode 100644
index 0000000..52013ad
--- /dev/null
+++ b/test/test_cuda_u16_div_ceil.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::div_ceil(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: two operand arrays in, one result array out.
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, max_value};
+    std::uniform_int_distribution<unsigned> rhs_dist{1U, max_value}; // second operand is never zero
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::div_ceil(lhs[k], rhs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_from_be.cu b/test/test_cuda_u16_from_be.cu
new file mode 100644
index 0000000..1eda1cd
--- /dev/null
+++ b/test/test_cuda_u16_from_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::from_be(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::from_be(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_from_le.cu b/test/test_cuda_u16_from_le.cu
new file mode 100644
index 0000000..389a8fd
--- /dev/null
+++ b/test/test_cuda_u16_from_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::from_le(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::from_le(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_gcd.cu b/test/test_cuda_u16_gcd.cu
new file mode 100644
index 0000000..0bff419
--- /dev/null
+++ b/test/test_cuda_u16_gcd.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::gcd(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: two operand arrays in, one result array out.
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, max_value};
+    std::uniform_int_distribution<unsigned> rhs_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::gcd(lhs[k], rhs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_has_single_bit.cu b/test/test_cuda_u16_has_single_bit.cu
new file mode 100644
index 0000000..39bb369
--- /dev/null
+++ b/test/test_cuda_u16_has_single_bit.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::has_single_bit(in[idx]));
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(static_cast<int>(boost::safe_numbers::has_single_bit(operands[k])));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_ilog.cu b/test/test_cuda_u16_ilog.cu
new file mode 100644
index 0000000..39bf5de
--- /dev/null
+++ b/test/test_cuda_u16_ilog.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::ilog(in[idx], test_type{static_cast<basis_type>(7)});
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value}; // zero is never drawn
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::ilog(operands[k], test_type{static_cast<basis_type>(7)}));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_ilog10.cu b/test/test_cuda_u16_ilog10.cu
new file mode 100644
index 0000000..3eff426
--- /dev/null
+++ b/test/test_cuda_u16_ilog10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::ilog10(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value}; // zero is never drawn
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::ilog10(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_ilog2.cu b/test/test_cuda_u16_ilog2.cu
new file mode 100644
index 0000000..23696c4
--- /dev/null
+++ b/test/test_cuda_u16_ilog2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // this thread's element index
+    if (idx >= numElements)
+    {
+        return; // nothing to do at or beyond numElements
+    }
+    out[idx] = boost::safe_numbers::ilog2(in[idx]);
+}
+
+int main(void)
+{
+    // A fixed seed keeps the generated inputs reproducible from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers shared with the kernel: operands in, one int result per operand out.
+    cuda_managed_ptr<test_type> operands(count);
+    cuda_managed_ptr<int> device_results(count);
+    const auto max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value}; // zero is never drawn
+    for (int k = 0; k < count; ++k)
+    {
+        operands[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Round the block count up so that every element gets a thread.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(operands.get(), device_results.get(), count);
+    error_ctx.synchronize(); // NOTE(review): presumably waits for the kernel to finish; confirm
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host to serve as the reference.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        host_results.push_back(boost::safe_numbers::ilog2(operands[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Fail on the first element where device and host disagree.
+    for (int k = 0; k < count; ++k)
+    {
+        if (device_results[k] != host_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_ipow.cu b/test/test_cuda_u16_ipow.cu
new file mode 100644
index 0000000..a45070a
--- /dev/null
+++ b/test/test_cuda_u16_ipow.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::ipow(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> bases(count);
+    cuda_managed_ptr<test_type> exponents(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Bases are drawn from [0, 10] and exponents from [0, 2].
+    std::uniform_int_distribution<unsigned> base_dist{0U, 10U};
+    std::uniform_int_distribution<unsigned> exponent_dist{0U, 2U};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        bases[idx] = test_type{static_cast<basis_type>(base_dist(gen))};
+        exponents[idx] = test_type{static_cast<basis_type>(exponent_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(bases.get(), exponents.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same ipow call applied to every input pair.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::ipow(bases[idx], exponents[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_is_power_10.cu b/test/test_cuda_u16_is_power_10.cu
new file mode 100644
index 0000000..1d680da
--- /dev/null
+++ b/test/test_cuda_u16_is_power_10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_10(in[idx]));
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> device_results(count);
+    // Inputs are drawn from [1, max(basis_type)], so zero never appears.
+    std::uniform_int_distribution<unsigned> value_dist{1U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: is_power_10 for every input, stored as an int.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(static_cast<int>(boost::safe_numbers::is_power_10(inputs[idx])));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_is_power_2.cu b/test/test_cuda_u16_is_power_2.cu
new file mode 100644
index 0000000..e53ae2c
--- /dev/null
+++ b/test/test_cuda_u16_is_power_2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_2(in[idx]));
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> device_results(count);
+    // Inputs are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> value_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: is_power_2 for every input, stored as an int.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(static_cast<int>(boost::safe_numbers::is_power_2(inputs[idx])));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_isqrt.cu b/test/test_cuda_u16_isqrt.cu
new file mode 100644
index 0000000..7f9708e
--- /dev/null
+++ b/test/test_cuda_u16_isqrt.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Inputs are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> value_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same isqrt call applied to every input.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::isqrt(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_lcm.cu b/test/test_cuda_u16_lcm.cu
new file mode 100644
index 0000000..2f2f420
--- /dev/null
+++ b/test/test_cuda_u16_lcm.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::lcm(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> first_inputs(count);
+    cuda_managed_ptr<test_type> second_inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Both operands are drawn from [1, 10].
+    std::uniform_int_distribution<unsigned> first_dist{1U, 10U};
+    std::uniform_int_distribution<unsigned> second_dist{1U, 10U};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        first_inputs[idx] = test_type{static_cast<basis_type>(first_dist(gen))};
+        second_inputs[idx] = test_type{static_cast<basis_type>(second_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(first_inputs.get(), second_inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same lcm call applied to every input pair.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::lcm(first_inputs[idx], second_inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_midpoint.cu b/test/test_cuda_u16_midpoint.cu
new file mode 100644
index 0000000..136056b
--- /dev/null
+++ b/test/test_cuda_u16_midpoint.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::midpoint(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> first_inputs(count);
+    cuda_managed_ptr<test_type> second_inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Both operands are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> first_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    std::uniform_int_distribution<unsigned> second_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        first_inputs[idx] = test_type{static_cast<basis_type>(first_dist(gen))};
+        second_inputs[idx] = test_type{static_cast<basis_type>(second_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(first_inputs.get(), second_inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same midpoint call applied to every input pair.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::midpoint(first_inputs[idx], second_inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_next_multiple_of.cu b/test/test_cuda_u16_next_multiple_of.cu
new file mode 100644
index 0000000..f597589
--- /dev/null
+++ b/test/test_cuda_u16_next_multiple_of.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::next_multiple_of(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> first_inputs(count);
+    cuda_managed_ptr<test_type> second_inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // First operand: [0, max(basis_type) / 2]; second operand: [1, max(basis_type)].
+    std::uniform_int_distribution<unsigned> first_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)()) / 2U};
+    std::uniform_int_distribution<unsigned> second_dist{1U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        first_inputs[idx] = test_type{static_cast<basis_type>(first_dist(gen))};
+        second_inputs[idx] = test_type{static_cast<basis_type>(second_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(first_inputs.get(), second_inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same next_multiple_of call on every input pair.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::next_multiple_of(first_inputs[idx], second_inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_popcount.cu b/test/test_cuda_u16_popcount.cu
new file mode 100644
index 0000000..b883bf3
--- /dev/null
+++ b/test/test_cuda_u16_popcount.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> device_results(count);
+    // Inputs are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> value_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same popcount call applied to every input.
+    std::vector<int> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::popcount(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_remove_trailing_zeros.cu b/test/test_cuda_u16_remove_trailing_zeros.cu
new file mode 100644
index 0000000..85559e2
--- /dev/null
+++ b/test/test_cuda_u16_remove_trailing_zeros.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = test_type{boost::safe_numbers::remove_trailing_zeros(in[idx]).trimmed_number};
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Inputs are drawn from [1, max(basis_type)], so zero never appears.
+    std::uniform_int_distribution<unsigned> value_dist{1U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: only the trimmed_number member is compared.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(inputs[idx]).trimmed_number});
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_rotl.cu b/test/test_cuda_u16_rotl.cu
new file mode 100644
index 0000000..94f331a
--- /dev/null
+++ b/test/test_cuda_u16_rotl.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::rotl(in[idx], 3);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Inputs are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> value_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same rotl call with the same count argument, 3.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::rotl(inputs[idx], 3));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_rotr.cu b/test/test_cuda_u16_rotr.cu
new file mode 100644
index 0000000..eeda3d0
--- /dev/null
+++ b/test/test_cuda_u16_rotr.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return; // this thread's index is past the last element
+    }
+    out[idx] = boost::safe_numbers::rotr(in[idx], 3);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};
+    // Element count shared by the device run and the host reference run.
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> device_results(count);
+    // Inputs are drawn from the full range [0, max(basis_type)].
+    std::uniform_int_distribution<unsigned> value_dist{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+    // Round the block count up so the grid spans every element.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), device_results.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Host-side reference: the same rotr call with the same count argument, 3.
+    std::vector<test_type> host_results;
+    host_results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        host_results.push_back(boost::safe_numbers::rotr(inputs[idx], 3));
+    }
+    const double host_seconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx != count; ++idx)
+    {
+        if (device_results[idx] != host_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u16_to_be.cu b/test/test_cuda_u16_to_be.cu
new file mode 100644
index 0000000..9268e37
--- /dev/null
+++ b/test/test_cuda_u16_to_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::to_be(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Values are drawn as unsigned over [0, max of basis_type] and converted to basis_type on store.
+    std::uniform_int_distribution<unsigned> sampler{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{static_cast<basis_type>(sampler(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same to_be call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::to_be(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u16_to_le.cu b/test/test_cuda_u16_to_le.cu
new file mode 100644
index 0000000..7b0ce48
--- /dev/null
+++ b/test/test_cuda_u16_to_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u16;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::to_le(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Values are drawn as unsigned over [0, max of basis_type] and converted to basis_type on store.
+    std::uniform_int_distribution<unsigned> sampler{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{static_cast<basis_type>(sampler(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same to_le call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::to_le(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_abs_diff.cu b/test/test_cuda_u32_abs_diff.cu
new file mode 100644
index 0000000..213bfe8
--- /dev/null
+++ b/test/test_cuda_u32_abs_diff.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::abs_diff(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Both operands are drawn uniformly over [0, max of basis_type], alternating on one generator.
+    std::uniform_int_distribution<basis_type> sampler_a{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> sampler_b{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector1[idx] = test_type{sampler_a(gen)};
+        input_vector2[idx] = test_type{sampler_b(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same abs_diff call applied to every input pair on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::abs_diff(input_vector1[idx], input_vector2[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_bit_ceil.cu b/test/test_cuda_u32_bit_ceil.cu
new file mode 100644
index 0000000..f1b130a
--- /dev/null
+++ b/test/test_cuda_u32_bit_ceil.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_ceil(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Inputs cover only [0, max/2]. NOTE(review): presumably so bit_ceil's result stays representable - confirm.
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)() / basis_type{2}};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same bit_ceil call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_ceil(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_bit_floor.cu b/test/test_cuda_u32_bit_floor.cu
new file mode 100644
index 0000000..987c4ba
--- /dev/null
+++ b/test/test_cuda_u32_bit_floor.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_floor(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same bit_floor call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_floor(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_bit_width.cu b/test/test_cuda_u32_bit_width.cu
new file mode 100644
index 0000000..4ea5784
--- /dev/null
+++ b/test/test_cuda_u32_bit_width.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_width(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same bit_width call applied to every input on the CPU.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bit_width(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_bitswap.cu b/test/test_cuda_u32_bitswap.cu
new file mode 100644
index 0000000..2c8c5b2
--- /dev/null
+++ b/test/test_cuda_u32_bitswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bitswap(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same bitswap call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::bitswap(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_byteswap.cu b/test/test_cuda_u32_byteswap.cu
new file mode 100644
index 0000000..300e1d9
--- /dev/null
+++ b/test/test_cuda_u32_byteswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::byteswap(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same byteswap call applied to every input on the CPU.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::byteswap(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_charconv.cu b/test/test_cuda_u32_charconv.cu
new file mode 100644
index 0000000..b618742
--- /dev/null
+++ b/test/test_cuda_u32_charconv.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }  // this thread has no element to process
+
+    // Format in[idx] as text into a zero-initialised local buffer.
+    char text[64] {};
+    const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx]);
+    // Parse the written range back; the value returned by from_chars is not inspected.
+    test_type round_trip {};
+    boost::charconv::from_chars(text, written.ptr, round_trip);
+    out[idx] = round_trip;
+}
+
+int main(void)
+{
+    std::mt19937_64 rng{42};  // fixed seed: the same inputs are generated on every run
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;  // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;  // read right after ctx.synchronize() returns
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference: the same to_chars/from_chars round trip performed on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        char buf[64] {};
+        auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])};
+        test_type parsed {};
+        boost::charconv::from_chars(buf, tc_result.ptr, parsed);
+        results.push_back(parsed);
+    }
+    double t = w.elapsed();
+
+    // The device must agree with the host, and the round trip must give back the original input;
+    // comparing only device against host would accept a conversion failure common to both sides.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i] || results[i] != input_vector[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_charconv_all_bases.cu b/test/test_cuda_u32_charconv_all_bases.cu
new file mode 100644
index 0000000..70c43e7
--- /dev/null
+++ b/test/test_cuda_u32_charconv_all_bases.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }  // this thread has no element to process
+
+    // Count the bases in [2, 36] for which formatting in[idx] and parsing the text back
+    // yields a value equal to in[idx]; that count is this thread's output.
+    int matches = 0;
+    for (int radix = 2; radix <= 36; ++radix)
+    {
+        char text[256] {};  // fresh zero-initialised buffer for each base
+        const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx], radix);
+        test_type round_trip {};
+        boost::charconv::from_chars(text, written.ptr, round_trip, radix);
+        if (round_trip == in[idx])
+        {
+            ++matches;
+        }
+    }
+    out[idx] = matches;
+}
+
+int main(void)
+{
+    std::mt19937_64 rng{42};  // fixed seed: the same inputs are generated on every run
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;  // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;  // read right after ctx.synchronize() returns
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference: per element, the number of bases in [2, 36] that survive a to_chars/from_chars round trip.
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        int pass_count {0};
+        for (int base = 2; base <= 36; ++base)
+        {
+            char buf[256] {};
+            auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)};
+            test_type parsed {};
+            boost::charconv::from_chars(buf, tc_result.ptr, parsed, base);
+            if (parsed == input_vector[i])
+            {
+                ++pass_count;
+            }
+        }
+        results.push_back(pass_count);
+    }
+    double t = w.elapsed();
+
+    // The device count must match the host count, and the host count must equal the number of
+    // bases tried; otherwise a round-trip failure common to host and device would go unnoticed.
+    const int base_count = 36 - 2 + 1;
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i] || results[i] != base_count)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_countl_one.cu b/test/test_cuda_u32_countl_one.cu
new file mode 100644
index 0000000..b5d40e5
--- /dev/null
+++ b/test/test_cuda_u32_countl_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    // Flat 1D index of this thread across the whole launch.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads indexed at or beyond numElements leave without touching memory.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::countl_one(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42};  // fixed seed: the same inputs are generated on every run
+    const int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    // Inputs are drawn uniformly over [0, max of basis_type].
+    std::uniform_int_distribution<basis_type> sampler{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != count; ++idx)
+    {
+        input_vector[idx] = test_type{sampler(gen)};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;  // rounded up so every element gets a thread
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;  // read right after ctx.synchronize() returns
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same countl_one call applied to every input on the CPU.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx != count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::countl_one(input_vector[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Advance to the first element where the device and host values differ, if there is one.
+    int bad = 0;
+    while (bad < count && !(output_vector[bad] != expected[bad]))
+    {
+        ++bad;
+    }
+    if (bad < count)
+    {
+        std::cerr << "Result verification failed at element " << bad << "!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_countl_zero.cu b/test/test_cuda_u32_countl_zero.cu
new file mode 100644
index 0000000..f5b4284
--- /dev/null
+++ b/test/test_cuda_u32_countl_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores countl_zero(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::countl_zero(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares countl_zero computed by the kernel with countl_zero computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::countl_zero(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_countr_one.cu b/test/test_cuda_u32_countr_one.cu
new file mode 100644
index 0000000..3e687cb
--- /dev/null
+++ b/test/test_cuda_u32_countr_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores countr_one(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::countr_one(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares countr_one computed by the kernel with countr_one computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::countr_one(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_countr_zero.cu b/test/test_cuda_u32_countr_zero.cu
new file mode 100644
index 0000000..99028ef
--- /dev/null
+++ b/test/test_cuda_u32_countr_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores countr_zero(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::countr_zero(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares countr_zero computed by the kernel with countr_zero computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::countr_zero(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_div_ceil.cu b/test/test_cuda_u32_div_ceil.cu
new file mode 100644
index 0000000..83def48
--- /dev/null
+++ b/test/test_cuda_u32_div_ceil.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{ // Kernel: each thread with index below numElements stores div_ceil(in1[i], in2[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares div_ceil computed by the kernel with div_ceil computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // First operand is drawn from [0, max]; second operand from [1, max], so it is never zero.
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> dist2{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist2(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_from_be.cu b/test/test_cuda_u32_from_be.cu
new file mode 100644
index 0000000..409cdda
--- /dev/null
+++ b/test/test_cuda_u32_from_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel: each thread with index below numElements stores from_be(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::from_be(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares from_be computed by the kernel with from_be computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::from_be(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_from_le.cu b/test/test_cuda_u32_from_le.cu
new file mode 100644
index 0000000..358f4f4
--- /dev/null
+++ b/test/test_cuda_u32_from_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{ // Kernel: each thread with index below numElements stores from_le(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::from_le(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares from_le computed by the kernel with from_le computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::from_le(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_gcd.cu b/test/test_cuda_u32_gcd.cu
new file mode 100644
index 0000000..0ca6178
--- /dev/null
+++ b/test/test_cuda_u32_gcd.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{ // Kernel: each thread with index below numElements stores gcd(in1[i], in2[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::gcd(in1[i], in2[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares gcd computed by the kernel with gcd computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Both operands are drawn from [0, max], so zero can occur in either position.
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> dist2{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = test_type{dist(rng)};
+        input_vector2[i] = test_type{dist2(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_has_single_bit.cu b/test/test_cuda_u32_has_single_bit.cu
new file mode 100644
index 0000000..308d0a6
--- /dev/null
+++ b/test/test_cuda_u32_has_single_bit.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores has_single_bit(in[i]), cast to int, in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = static_cast<int>(boost::safe_numbers::has_single_bit(in[i]));
+    }
+}
+
+int main(void)
+{ // Test driver: compares has_single_bit computed by the kernel with has_single_bit computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [0, numeric_limits<basis_type>::max()].
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function (result cast to int, as in the kernel), timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(static_cast<int>(boost::safe_numbers::has_single_bit(input_vector[i])));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_ilog.cu b/test/test_cuda_u32_ilog.cu
new file mode 100644
index 0000000..b98cd7b
--- /dev/null
+++ b/test/test_cuda_u32_ilog.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores ilog(in[i], 7) in out[i]; the second argument is fixed at 7.
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast<basis_type>(7)});
+    }
+}
+
+int main(void)
+{ // Test driver: compares ilog(x, 7) computed by the kernel with ilog(x, 7) computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [1, numeric_limits<basis_type>::max()], so zero is never passed to ilog.
+    std::uniform_int_distribution<basis_type> dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same call (second argument 7) over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast<basis_type>(7)}));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_ilog10.cu b/test/test_cuda_u32_ilog10.cu
new file mode 100644
index 0000000..9302d56
--- /dev/null
+++ b/test/test_cuda_u32_ilog10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores ilog10(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::ilog10(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares ilog10 computed by the kernel with ilog10 computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [1, numeric_limits<basis_type>::max()], so zero is never passed to ilog10.
+    std::uniform_int_distribution<basis_type> dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::ilog10(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_ilog2.cu b/test/test_cuda_u32_ilog2.cu
new file mode 100644
index 0000000..85b2e9d
--- /dev/null
+++ b/test/test_cuda_u32_ilog2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{ // Kernel: each thread with index below numElements stores ilog2(in[i]) in out[i].
+    int i = blockDim.x * blockIdx.x + threadIdx.x; // this thread's flat index along x
+    // Guard: threads whose index is at or past numElements do nothing.
+    if (i < numElements)
+    {
+        out[i] = boost::safe_numbers::ilog2(in[i]);
+    }
+}
+
+int main(void)
+{ // Test driver: compares ilog2 computed by the kernel with ilog2 computed on the host, element by element.
+    std::mt19937_64 rng{42}; // fixed seed
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Buffers are filled and read on the host below; their raw pointers (.get()) are passed to the kernel.
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+    // Inputs are drawn uniformly from [1, numeric_limits<basis_type>::max()], so zero is never passed to ilog2.
+    std::uniform_int_distribution<basis_type> dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = test_type{dist(rng)};
+    }
+    // Round the block count up so that blocksPerGrid * threadsPerBlock >= numElements.
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    // NOTE(review): ctx.synchronize() presumably waits for the kernel and surfaces device-side errors (see cuda_error_reporting.hpp) - confirm.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Host reference: the same function over the same inputs, timed separately after w.reset().
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::safe_numbers::ilog2(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Exact element-wise comparison; the first mismatch is reported and fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_ipow.cu b/test/test_cuda_u32_ipow.cu
new file mode 100644
index 0000000..e8c1f1d
--- /dev/null
+++ b/test/test_cuda_u32_ipow.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes ipow(in1[idx], in2[idx]) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::ipow(in1[idx], in2[idx]);
+}
+
+// Host driver: evaluates ipow on the GPU and on the CPU for the same random operand pairs
+// and fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> first_dist{basis_type{0}, basis_type{10}};
+    std::uniform_int_distribution<basis_type> second_dist{basis_type{0}, basis_type{2}};
+    for (int k = 0; k < count; ++k)
+    {
+        input_vector1[k] = test_type{first_dist(rng)};
+        input_vector2[k] = test_type{second_dist(rng)};
+    }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same operand pairs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::ipow(input_vector1[k], input_vector2[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_is_power_10.cu b/test/test_cuda_u32_is_power_10.cu
new file mode 100644
index 0000000..3d8b03c
--- /dev/null
+++ b/test/test_cuda_u32_is_power_10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes is_power_10(in[idx]), converted to int, to out[idx] for this thread's global index idx.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_10(in[idx]));
+}
+
+// Host driver: evaluates is_power_10 on the GPU and on the CPU for the same inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    // Random values are almost never powers of ten, so the first `seeded` inputs are the exact
+    // powers 10^0 .. 10^(seeded-1); otherwise only the `false` result would ever be compared.
+    constexpr int seeded = std::numeric_limits<basis_type>::digits10 + 1;
+    basis_type pow10{1};
+    std::uniform_int_distribution<basis_type> dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        input_vector[k] = test_type{(k < seeded) ? pow10 : dist(rng)};
+        if (k + 1 < seeded) { pow10 *= basis_type{10}; }
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<int> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(static_cast<int>(boost::safe_numbers::is_power_10(input_vector[k]))); }
+    const double host_seconds = w.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_is_power_2.cu b/test/test_cuda_u32_is_power_2.cu
new file mode 100644
index 0000000..381e674
--- /dev/null
+++ b/test/test_cuda_u32_is_power_2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes is_power_2(in[idx]), converted to int, to out[idx] for this thread's global index idx.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_2(in[idx]));
+}
+
+// Host driver: evaluates is_power_2 on the GPU and on the CPU for the same inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    // Random values are almost never powers of two, so the first `seeded` inputs are the exact
+    // powers 2^0 .. 2^(seeded-1); otherwise only the `false` result would ever be compared.
+    constexpr int seeded = std::numeric_limits<basis_type>::digits;
+    std::uniform_int_distribution<basis_type> dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        const basis_type value = (k < seeded) ? static_cast<basis_type>(basis_type{1} << k) : dist(rng);
+        input_vector[k] = test_type{value};
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<int> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(static_cast<int>(boost::safe_numbers::is_power_2(input_vector[k]))); }
+    const double host_seconds = w.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_isqrt.cu b/test/test_cuda_u32_isqrt.cu
new file mode 100644
index 0000000..a6fcb8c
--- /dev/null
+++ b/test/test_cuda_u32_isqrt.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes isqrt(in[idx]) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+// Host driver: evaluates isqrt on the GPU and on the CPU for the same random inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> value_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k) { input_vector[k] = test_type{value_dist(rng)}; }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::isqrt(input_vector[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_lcm.cu b/test/test_cuda_u32_lcm.cu
new file mode 100644
index 0000000..09019b9
--- /dev/null
+++ b/test/test_cuda_u32_lcm.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes lcm(in1[idx], in2[idx]) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::lcm(in1[idx], in2[idx]);
+}
+
+// Host driver: evaluates lcm on the GPU and on the CPU for the same random operand pairs
+// and fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> first_dist{basis_type{1}, basis_type{10}};
+    std::uniform_int_distribution<basis_type> second_dist{basis_type{1}, basis_type{10}};
+    for (int k = 0; k < count; ++k)
+    {
+        input_vector1[k] = test_type{first_dist(rng)};
+        input_vector2[k] = test_type{second_dist(rng)};
+    }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same operand pairs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::lcm(input_vector1[k], input_vector2[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_midpoint.cu b/test/test_cuda_u32_midpoint.cu
new file mode 100644
index 0000000..aadfcac
--- /dev/null
+++ b/test/test_cuda_u32_midpoint.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes midpoint(in1[idx], in2[idx]) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::midpoint(in1[idx], in2[idx]);
+}
+
+// Host driver: evaluates midpoint on the GPU and on the CPU for the same random operand
+// pairs and fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> first_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> second_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        input_vector1[k] = test_type{first_dist(rng)};
+        input_vector2[k] = test_type{second_dist(rng)};
+    }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same operand pairs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::midpoint(input_vector1[k], input_vector2[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_next_multiple_of.cu b/test/test_cuda_u32_next_multiple_of.cu
new file mode 100644
index 0000000..3371948
--- /dev/null
+++ b/test/test_cuda_u32_next_multiple_of.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes next_multiple_of(in1[idx], in2[idx]) to out[idx] for this thread's global index idx.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::next_multiple_of(in1[idx], in2[idx]);
+}
+
+// Host driver: evaluates next_multiple_of on the GPU and on the CPU for the same random
+// operand pairs and fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> first_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)() / basis_type{2}};
+    std::uniform_int_distribution<basis_type> second_dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        input_vector1[k] = test_type{first_dist(rng)};
+        input_vector2[k] = test_type{second_dist(rng)};
+    }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+    cuda_test<<<gridSize, blockSize>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same operand pairs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::next_multiple_of(input_vector1[k], input_vector2[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_popcount.cu b/test/test_cuda_u32_popcount.cu
new file mode 100644
index 0000000..1b2678c
--- /dev/null
+++ b/test/test_cuda_u32_popcount.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes popcount(in[idx]) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+// Host driver: evaluates popcount on the GPU and on the CPU for the same random inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> value_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k) { input_vector[k] = test_type{value_dist(rng)}; }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<int> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::popcount(input_vector[k])); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_remove_trailing_zeros.cu b/test/test_cuda_u32_remove_trailing_zeros.cu
new file mode 100644
index 0000000..1e8c9e4
--- /dev/null
+++ b/test/test_cuda_u32_remove_trailing_zeros.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Stores the trimmed_number member of remove_trailing_zeros(in[idx]) in out[idx] as a test_type.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = test_type{boost::safe_numbers::remove_trailing_zeros(in[idx]).trimmed_number};
+}
+
+// Host driver: evaluates remove_trailing_zeros on the GPU and on the CPU for the same random
+// inputs and fails on the first element where the two trimmed numbers differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> value_dist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k) { input_vector[k] = test_type{value_dist(rng)}; }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[k]).trimmed_number}); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_rotl.cu b/test/test_cuda_u32_rotl.cu
new file mode 100644
index 0000000..ed06774
--- /dev/null
+++ b/test/test_cuda_u32_rotl.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes rotl(in[idx], 3) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::rotl(in[idx], 3);
+}
+
+// Host driver: evaluates rotl(x, 3) on the GPU and on the CPU for the same random inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> value_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k) { input_vector[k] = test_type{value_dist(rng)}; }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::rotl(input_vector[k], 3)); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_rotr.cu b/test/test_cuda_u32_rotr.cu
new file mode 100644
index 0000000..9b6b7a2
--- /dev/null
+++ b/test/test_cuda_u32_rotr.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+// Writes rotr(in[idx], 3) to out[idx], where idx is this thread's global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index is at or beyond numElements have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::rotr(in[idx], 3);
+}
+
+// Host driver: evaluates rotr(x, 3) on the GPU and on the CPU for the same random inputs and
+// fails on the first element where the two results differ.
+int main(void)
+{
+    std::mt19937_64 rng{42};
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    // Buffers indexed from host code and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> value_dist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k) { input_vector[k] = test_type{value_dist(rng)}; }
+
+    // Enough blocks of blockSize threads to cover count elements (rounded up).
+    const int blockSize = 256;
+    const int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch w;
+
+    cuda_test<<<gridSize, blockSize>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Host reference values for the same inputs, timed after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    w.reset();
+    for (int k = 0; k < count; ++k) { expected.push_back(boost::safe_numbers::rotr(input_vector[k], 3)); }
+    const double host_seconds = w.elapsed();
+
+    // Compare device output against the host reference element by element.
+    for (int k = 0; k < count; ++k)
+    {
+        if (output_vector[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u32_to_be.cu b/test/test_cuda_u32_to_be.cu
new file mode 100644
index 0000000..abf22d7
--- /dev/null
+++ b/test/test_cuda_u32_to_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives to_be(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::to_be(in[idx]);
+}
+
+int main(void)
+{
+    // Applies to_be to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::to_be(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u32_to_le.cu b/test/test_cuda_u32_to_le.cu
new file mode 100644
index 0000000..2d31f30
--- /dev/null
+++ b/test/test_cuda_u32_to_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u32;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives to_le(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::to_le(in[idx]);
+}
+
+int main(void)
+{
+    // Applies to_le to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::to_le(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_abs_diff.cu b/test/test_cuda_u64_abs_diff.cu
new file mode 100644
index 0000000..038f420
--- /dev/null
+++ b/test/test_cuda_u64_abs_diff.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    // One element pair per thread: out[idx] receives abs_diff(in1[idx], in2[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::abs_diff(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Applies abs_diff to the same random input pairs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector1(count);
+    cuda_managed_ptr<test_type> input_vector2(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> lhs_uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> rhs_uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector1[idx] = test_type{lhs_uniform(engine)};
+        input_vector2[idx] = test_type{rhs_uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::abs_diff(input_vector1[idx], input_vector2[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_bit_ceil.cu b/test/test_cuda_u64_bit_ceil.cu
new file mode 100644
index 0000000..885f44c
--- /dev/null
+++ b/test/test_cuda_u64_bit_ceil.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives bit_ceil(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_ceil(in[idx]);
+}
+
+int main(void)
+{
+    // Applies bit_ceil to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Upper bound is half of basis_type's maximum; presumably this keeps bit_ceil's result representable (confirm).
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)() / basis_type{2}};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::bit_ceil(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_bit_floor.cu b/test/test_cuda_u64_bit_floor.cu
new file mode 100644
index 0000000..18b61b0
--- /dev/null
+++ b/test/test_cuda_u64_bit_floor.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives bit_floor(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_floor(in[idx]);
+}
+
+int main(void)
+{
+    // Applies bit_floor to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::bit_floor(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_bit_width.cu b/test/test_cuda_u64_bit_width.cu
new file mode 100644
index 0000000..ec04975
--- /dev/null
+++ b/test/test_cuda_u64_bit_width.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    // One element per thread: out[idx] receives the int returned by bit_width(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bit_width(in[idx]);
+}
+
+int main(void)
+{
+    // Applies bit_width to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<int> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::bit_width(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_bitswap.cu b/test/test_cuda_u64_bitswap.cu
new file mode 100644
index 0000000..ca98035
--- /dev/null
+++ b/test/test_cuda_u64_bitswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives bitswap(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::bitswap(in[idx]);
+}
+
+int main(void)
+{
+    // Applies bitswap to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::bitswap(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_byteswap.cu b/test/test_cuda_u64_byteswap.cu
new file mode 100644
index 0000000..e07beb1
--- /dev/null
+++ b/test/test_cuda_u64_byteswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] receives byteswap(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::byteswap(in[idx]);
+}
+
+int main(void)
+{
+    // Applies byteswap to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::byteswap(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_charconv.cu b/test/test_cuda_u64_charconv.cu
new file mode 100644
index 0000000..75998ff
--- /dev/null
+++ b/test/test_cuda_u64_charconv.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Round trip per thread: format in[idx] as text, parse the text back, store the parsed value.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }  // no element at this index
+
+    char text[64] {};
+    const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx]);
+
+    test_type round_tripped {};
+    boost::charconv::from_chars(text, written.ptr, round_tripped);
+    out[idx] = round_tripped;
+}
+
+int main(void)
+{
+    // Round-trips random values through to_chars/from_chars on the device and on the host, then verifies both.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Repeat the same round trip on the host, timing that pass separately.
+    std::vector<test_type> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        char text[64] {};
+        const auto written = boost::charconv::to_chars(text, text + sizeof(text), input_vector[idx]);
+        test_type round_tripped {};
+        boost::charconv::from_chars(text, written.ptr, round_tripped);
+        results.push_back(round_tripped);
+    }
+    double host_time = timer.elapsed();
+
+    // Fail on a device/host mismatch, or when the parsed value differs from the original input.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx] || results[idx] != input_vector[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_charconv_all_bases.cu b/test/test_cuda_u64_charconv_all_bases.cu
new file mode 100644
index 0000000..5de01ea
--- /dev/null
+++ b/test/test_cuda_u64_charconv_all_bases.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    // Per thread: count the bases in [2, 36] for which in[idx] survives a
+    // to_chars -> from_chars round trip, and store that count in out[idx].
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }  // no element at this index
+
+    int survivors {0};
+    int radix {2};
+    while (radix <= 36)
+    {
+        char text[256] {};
+        const auto written = boost::charconv::to_chars(text, text + sizeof(text), in[idx], radix);
+
+        test_type round_tripped {};
+        boost::charconv::from_chars(text, written.ptr, round_tripped, radix);
+        if (round_tripped == in[idx]) { ++survivors; }
+        ++radix;
+    }
+    out[idx] = survivors;
+}
+
+int main(void)
+{
+    // Counts, per input, the bases 2..36 that survive a to_chars/from_chars round trip on device and host.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Repeat the same per-base round trips on the host, timing that pass separately.
+    std::vector<int> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        int survivors {0};
+        for (int radix = 2; radix <= 36; ++radix)
+        {
+            char text[256] {};
+            const auto written = boost::charconv::to_chars(text, text + sizeof(text), input_vector[idx], radix);
+            test_type round_tripped {};
+            boost::charconv::from_chars(text, written.ptr, round_tripped, radix);
+            if (round_tripped == input_vector[idx])
+            {
+                ++survivors;
+            }
+        }
+        results.push_back(survivors);
+    }
+    double host_time = timer.elapsed();
+
+    // Fail on a device/host mismatch, or when not all 35 bases (2..36) round-tripped.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx] || results[idx] != 35)
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_countl_one.cu b/test/test_cuda_u64_countl_one.cu
new file mode 100644
index 0000000..4f52634
--- /dev/null
+++ b/test/test_cuda_u64_countl_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    // One element per thread: out[idx] receives the int returned by countl_one(in[idx]).
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Indices at or past numElements have no element to process.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::countl_one(in[idx]);
+}
+
+int main(void)
+{
+    // Applies countl_one to the same random inputs on the device and on the host, then compares.
+    std::mt19937_64 engine{42};  // fixed seed, so the generated sequence does not vary between runs
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<int> output_vector(count);
+
+    std::uniform_int_distribution<basis_type> uniform{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{uniform(engine)};
+    }
+
+    // Ceiling division, so a partially filled last block covers the tail.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context device_errors;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(input_vector.get(), output_vector.get(), count);
+    device_errors.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host, timing that pass separately.
+    std::vector<int> results;
+    results.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        results.push_back(boost::safe_numbers::countl_one(input_vector[idx]));
+    }
+    double host_time = timer.elapsed();
+
+    // The first element where device and host results differ fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_time << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_countl_zero.cu b/test/test_cuda_u64_countl_zero.cu
new file mode 100644
index 0000000..81d3d67
--- /dev/null
+++ b/test/test_cuda_u64_countl_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::countl_zero(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::countl_zero(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_countr_one.cu b/test/test_cuda_u64_countr_one.cu
new file mode 100644
index 0000000..de86742
--- /dev/null
+++ b/test/test_cuda_u64_countr_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::countr_one(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::countr_one(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_countr_zero.cu b/test/test_cuda_u64_countr_zero.cu
new file mode 100644
index 0000000..c348275
--- /dev/null
+++ b/test/test_cuda_u64_countr_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::countr_zero(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::countr_zero(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_div_ceil.cu b/test/test_cuda_u64_div_ceil.cu
new file mode 100644
index 0000000..4bd8b1b
--- /dev/null
+++ b/test/test_cuda_u64_div_ceil.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::div_ceil(in1[tid], in2[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhsInputs(elementCount);
+    cuda_managed_ptr<test_type> rhsInputs(elementCount);
+    cuda_managed_ptr<test_type> deviceResults(elementCount);
+
+    // Numerators span [0, max]; denominators span [1, max], so none is zero.
+    std::uniform_int_distribution<basis_type> lhsDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> rhsDist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        lhsInputs[idx] = test_type{lhsDist(generator)};
+        rhsInputs[idx] = test_type{rhsDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(lhsInputs.get(), rhsInputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<test_type> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::div_ceil(lhsInputs[idx], rhsInputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_from_be.cu b/test/test_cuda_u64_from_be.cu
new file mode 100644
index 0000000..e867176
--- /dev/null
+++ b/test/test_cuda_u64_from_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::from_be(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<test_type> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<test_type> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::from_be(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_from_le.cu b/test/test_cuda_u64_from_le.cu
new file mode 100644
index 0000000..29e4024
--- /dev/null
+++ b/test/test_cuda_u64_from_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::from_le(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<test_type> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<test_type> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::from_le(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_gcd.cu b/test/test_cuda_u64_gcd.cu
new file mode 100644
index 0000000..6d0dc83
--- /dev/null
+++ b/test/test_cuda_u64_gcd.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::gcd(in1[tid], in2[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhsInputs(elementCount);
+    cuda_managed_ptr<test_type> rhsInputs(elementCount);
+    cuda_managed_ptr<test_type> deviceResults(elementCount);
+
+    // Both operands are drawn uniformly from [0, max], first operand first.
+    std::uniform_int_distribution<basis_type> lhsDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> rhsDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        lhsInputs[idx] = test_type{lhsDist(generator)};
+        rhsInputs[idx] = test_type{rhsDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(lhsInputs.get(), rhsInputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<test_type> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::gcd(lhsInputs[idx], rhsInputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_has_single_bit.cu b/test/test_cuda_u64_has_single_bit.cu
new file mode 100644
index 0000000..76012cd
--- /dev/null
+++ b/test/test_cuda_u64_has_single_bit.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = static_cast<int>(boost::safe_numbers::has_single_bit(in[tid]));
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [0, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host (bool stored as int), timing the loop.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(static_cast<int>(boost::safe_numbers::has_single_bit(inputs[idx])));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_ilog.cu b/test/test_cuda_u64_ilog.cu
new file mode 100644
index 0000000..430ae89
--- /dev/null
+++ b/test/test_cuda_u64_ilog.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::ilog(in[tid], test_type{static_cast<basis_type>(7)});
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [1, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host with the same base (7), timing the loop.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::ilog(inputs[idx], test_type{static_cast<basis_type>(7)}));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_ilog10.cu b/test/test_cuda_u64_ilog10.cu
new file mode 100644
index 0000000..2c67863
--- /dev/null
+++ b/test/test_cuda_u64_ilog10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::ilog10(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [1, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::ilog10(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_ilog2.cu b/test/test_cuda_u64_ilog2.cu
new file mode 100644
index 0000000..375c119
--- /dev/null
+++ b/test/test_cuda_u64_ilog2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x; // flat global thread index
+    if (tid >= numElements)
+    {
+        return; // indices at or beyond numElements are skipped
+    }
+    out[tid] = boost::safe_numbers::ilog2(in[tid]);
+}
+
+int main(void)
+{
+    // Deterministic generator: the seed is fixed, so each run sees the same inputs.
+    std::mt19937_64 generator{42};
+    int elementCount = 50000;
+    std::cout << "[Vector operation on " << elementCount << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(elementCount);
+    cuda_managed_ptr<int> deviceResults(elementCount);
+
+    // Fill the inputs with basis_type values drawn uniformly from [1, max].
+    std::uniform_int_distribution<basis_type> valueDist{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        inputs[idx] = test_type{valueDist(generator)};
+    }
+
+    // Launch configuration: 256 threads per block, block count rounded up.
+    int blockSize = 256;
+    int gridSize = (elementCount + (blockSize - 1)) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+    boost::safe_numbers::device_error_context errorContext;
+    watch timer;
+    cuda_test<<<gridSize, blockSize>>>(inputs.get(), deviceResults.get(), elementCount);
+    errorContext.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every value on the host, timing the loop for comparison.
+    std::vector<int> hostResults;
+    hostResults.reserve(elementCount);
+    timer.reset();
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        hostResults.push_back(boost::safe_numbers::ilog2(inputs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+
+    // Any mismatch between the kernel output and the host output fails the test.
+    for (int idx = 0; idx != elementCount; ++idx)
+    {
+        if (deviceResults[idx] != hostResults[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_ipow.cu b/test/test_cuda_u64_ipow.cu
new file mode 100644
index 0000000..be09471
--- /dev/null
+++ b/test/test_cuda_u64_ipow.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::ipow(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: first operands are drawn from [0, 10], second operands from [0, 2].
+    std::uniform_int_distribution<basis_type> pickFirst{basis_type{0}, basis_type{10}};
+    std::uniform_int_distribution<basis_type> pickSecond{basis_type{0}, basis_type{2}};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        lhs[idx] = test_type{pickFirst(gen)};
+        rhs[idx] = test_type{pickSecond(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(lhs.get(), rhs.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::ipow(lhs[idx], rhs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_is_power_10.cu b/test/test_cuda_u64_is_power_10.cu
new file mode 100644
index 0000000..0f2fb55
--- /dev/null
+++ b/test/test_cuda_u64_is_power_10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_10(in[idx])); // result stored as an int
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<int> fromDevice(count);
+    // Host side: fill the input buffer with random values in [1, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(static_cast<int>(boost::safe_numbers::is_power_10(samples[idx])));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_is_power_2.cu b/test/test_cuda_u64_is_power_2.cu
new file mode 100644
index 0000000..c823c6f
--- /dev/null
+++ b/test/test_cuda_u64_is_power_2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_2(in[idx])); // result stored as an int
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<int> fromDevice(count);
+    // Host side: fill the input buffer with random values in [0, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(static_cast<int>(boost::safe_numbers::is_power_2(samples[idx])));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_isqrt.cu b/test/test_cuda_u64_isqrt.cu
new file mode 100644
index 0000000..ba5a5ae
--- /dev/null
+++ b/test/test_cuda_u64_isqrt.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: fill the input buffer with random values in [0, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::isqrt(samples[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_lcm.cu b/test/test_cuda_u64_lcm.cu
new file mode 100644
index 0000000..24dce83
--- /dev/null
+++ b/test/test_cuda_u64_lcm.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::lcm(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: both operands are drawn from [1, 10], each from its own distribution.
+    std::uniform_int_distribution<basis_type> pickFirst{basis_type{1}, basis_type{10}};
+    std::uniform_int_distribution<basis_type> pickSecond{basis_type{1}, basis_type{10}};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        lhs[idx] = test_type{pickFirst(gen)};
+        rhs[idx] = test_type{pickSecond(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(lhs.get(), rhs.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::lcm(lhs[idx], rhs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_midpoint.cu b/test/test_cuda_u64_midpoint.cu
new file mode 100644
index 0000000..bba9451
--- /dev/null
+++ b/test/test_cuda_u64_midpoint.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::midpoint(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: both operands span the full [0, max] range of the basis type.
+    std::uniform_int_distribution<basis_type> pickFirst{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    std::uniform_int_distribution<basis_type> pickSecond{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        lhs[idx] = test_type{pickFirst(gen)};
+        rhs[idx] = test_type{pickSecond(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(lhs.get(), rhs.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::midpoint(lhs[idx], rhs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_next_multiple_of.cu b/test/test_cuda_u64_next_multiple_of.cu
new file mode 100644
index 0000000..fd6f1a7
--- /dev/null
+++ b/test/test_cuda_u64_next_multiple_of.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::next_multiple_of(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: first operands are drawn from [0, max / 2], second operands from [1, max].
+    std::uniform_int_distribution<basis_type> pickFirst{basis_type{0}, (std::numeric_limits<basis_type>::max)() / basis_type{2}};
+    std::uniform_int_distribution<basis_type> pickSecond{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        lhs[idx] = test_type{pickFirst(gen)};
+        rhs[idx] = test_type{pickSecond(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(lhs.get(), rhs.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::next_multiple_of(lhs[idx], rhs[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_popcount.cu b/test/test_cuda_u64_popcount.cu
new file mode 100644
index 0000000..e48df1d
--- /dev/null
+++ b/test/test_cuda_u64_popcount.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<int> fromDevice(count);
+    // Host side: fill the input buffer with random values in [0, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::popcount(samples[idx]));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_remove_trailing_zeros.cu b/test/test_cuda_u64_remove_trailing_zeros.cu
new file mode 100644
index 0000000..0a58876
--- /dev/null
+++ b/test/test_cuda_u64_remove_trailing_zeros.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = test_type{boost::safe_numbers::remove_trailing_zeros(in[idx]).trimmed_number}; // keep only the trimmed_number member
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: fill the input buffer with random values in [1, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{1}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(samples[idx]).trimmed_number});
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_rotl.cu b/test/test_cuda_u64_rotl.cu
new file mode 100644
index 0000000..a3240b1
--- /dev/null
+++ b/test/test_cuda_u64_rotl.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::rotl(in[idx], 3); // fixed rotation amount of 3
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: fill the input buffer with random values in [0, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::rotl(samples[idx], 3));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_rotr.cu b/test/test_cuda_u64_rotr.cu
new file mode 100644
index 0000000..3cf95de
--- /dev/null
+++ b/test/test_cuda_u64_rotr.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flattened global thread index
+    if (idx >= numElements)
+    {
+        return; // threads past the end of the data have nothing to do
+    }
+    out[idx] = boost::safe_numbers::rotr(in[idx], 3); // fixed rotation amount of 3
+}
+
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed keeps the generated inputs reproducible
+
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> samples(count);
+    cuda_managed_ptr<test_type> fromDevice(count);
+    // Host side: fill the input buffer with random values in [0, max].
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        samples[idx] = test_type{pick(gen)};
+    }
+    // Ceiling division so the grid has at least one thread per element.
+    int blockSize = 256;
+    int gridSize = (count + blockSize - 1) / blockSize;
+    std::cout << "CUDA kernel launch with " << gridSize << " blocks of " << blockSize << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context errors;
+    watch timer;
+    // Launch, then synchronize through the error context before reading the timer.
+    cuda_test<<<gridSize, blockSize>>>(samples.get(), fromDevice.get(), count);
+    errors.synchronize();
+
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+    // Recompute every value on the host as the reference, timing that pass as well.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::rotr(samples[idx], 3));
+    }
+    double hostSeconds = timer.elapsed();
+    // The first element where device and host disagree fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (fromDevice[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << hostSeconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_cuda_u64_to_be.cu b/test/test_cuda_u64_to_be.cu
new file mode 100644
index 0000000..01613a0
--- /dev/null
+++ b/test/test_cuda_u64_to_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores to_be(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::to_be(in[tid]);
+}
+
+// Host driver: checks the device results of to_be(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // A fixed seed keeps the generated inputs the same on every run.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{pick(gen)};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::to_be(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u64_to_le.cu b/test/test_cuda_u64_to_le.cu
new file mode 100644
index 0000000..80ce98a
--- /dev/null
+++ b/test/test_cuda_u64_to_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u64;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores to_le(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::to_le(in[tid]);
+}
+
+// Host driver: checks the device results of to_le(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // A fixed seed keeps the generated inputs the same on every run.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<basis_type> pick{basis_type{0}, (std::numeric_limits<basis_type>::max)()};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{pick(gen)};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::to_le(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_abs_diff.cu b/test/test_cuda_u8_abs_diff.cu
new file mode 100644
index 0000000..3d0e736
--- /dev/null
+++ b/test/test_cuda_u8_abs_diff.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores abs_diff(in1[tid], in2[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::abs_diff(in1[tid], in2[tid]);
+}
+
+// Host driver: checks the device results of abs_diff(a, b) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs_inputs(count);
+    cuda_managed_ptr<test_type> rhs_inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick_lhs{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    std::uniform_int_distribution<unsigned> pick_rhs{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs_inputs[k] = test_type{static_cast<basis_type>(pick_lhs(gen))};
+        rhs_inputs[k] = test_type{static_cast<basis_type>(pick_rhs(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs_inputs.get(), rhs_inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::abs_diff(lhs_inputs[k], rhs_inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_bit_ceil.cu b/test/test_cuda_u8_bit_ceil.cu
new file mode 100644
index 0000000..f84d37d
--- /dev/null
+++ b/test/test_cuda_u8_bit_ceil.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores bit_ceil(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::bit_ceil(in[tid]);
+}
+
+// Host driver: checks the device results of bit_ceil(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed; inputs stay in the lower half of the range, presumably so bit_ceil never overflows test_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)()) / 2U};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::bit_ceil(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_bit_floor.cu b/test/test_cuda_u8_bit_floor.cu
new file mode 100644
index 0000000..5ef0598
--- /dev/null
+++ b/test/test_cuda_u8_bit_floor.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores bit_floor(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::bit_floor(in[tid]);
+}
+
+// Host driver: checks the device results of bit_floor(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::bit_floor(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_bit_width.cu b/test/test_cuda_u8_bit_width.cu
new file mode 100644
index 0000000..3ae3bbc
--- /dev/null
+++ b/test/test_cuda_u8_bit_width.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores the int returned by bit_width(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::bit_width(in[tid]);
+}
+
+// Host driver: checks the device results of bit_width(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::bit_width(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_bitswap.cu b/test/test_cuda_u8_bitswap.cu
new file mode 100644
index 0000000..d2d0d56
--- /dev/null
+++ b/test/test_cuda_u8_bitswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores bitswap(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::bitswap(in[tid]);
+}
+
+// Host driver: checks the device results of bitswap(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::bitswap(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_byteswap.cu b/test/test_cuda_u8_byteswap.cu
new file mode 100644
index 0000000..9a6e4d5
--- /dev/null
+++ b/test/test_cuda_u8_byteswap.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores byteswap(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::byteswap(in[tid]);
+}
+
+// Host driver: checks the device results of byteswap(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::byteswap(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_charconv.cu b/test/test_cuda_u8_charconv.cu
new file mode 100644
index 0000000..daaed8c
--- /dev/null
+++ b/test/test_cuda_u8_charconv.cu
@@ -0,0 +1,90 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread formats its element into a local buffer with to_chars,
+// parses that text back with from_chars, and stores the parsed value in out.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    char text[64] {};
+    const auto written {boost::charconv::to_chars(text, text + sizeof(text), in[tid])};
+    test_type round_trip {};
+    boost::charconv::from_chars(text, written.ptr, round_trip);
+    out[tid] = round_trip;
+}
+
+// Host driver: round-trips every input through to_chars/from_chars on device and on host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same to_chars/from_chars round trip, timed on its own.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        char text[64] {};
+        auto written {boost::charconv::to_chars(text, text + sizeof(text), inputs[k])};
+        test_type round_trip {};
+        boost::charconv::from_chars(text, written.ptr, round_trip);
+        expected.push_back(round_trip);
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device must match host, and the host round trip must give back the original input.
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k] || expected[k] != inputs[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_charconv_all_bases.cu b/test/test_cuda_u8_charconv_all_bases.cu
new file mode 100644
index 0000000..481caf1
--- /dev/null
+++ b/test/test_cuda_u8_charconv_all_bases.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/charconv.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: for every base from 2 to 36, each thread round-trips its element
+// through to_chars/from_chars and stores in out how many bases reproduced the input.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    int matches {0};
+    for (int radix = 2; radix <= 36; ++radix)
+    {
+        // A fresh zero-initialized buffer is used for every base.
+        char text[256] {};
+        const auto written {boost::charconv::to_chars(text, text + sizeof(text), in[tid], radix)};
+        test_type round_trip {};
+        boost::charconv::from_chars(text, written.ptr, round_trip, radix);
+        // Count this base only when the parsed value equals the original.
+        matches += (round_trip == in[tid]) ? 1 : 0;
+    }
+
+    out[tid] = matches;
+}
+
+// Host driver: counts, per input, the bases whose to_chars/from_chars round trip succeeds.
+int main(void)
+{
+    // Bases 2 through 36 inclusive are exercised, so a fully passing element scores 35.
+    constexpr int base_count = 36 - 2 + 1;
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference: the same per-base round trip, timed on its own.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        int matches {0};
+        for (int radix = 2; radix <= 36; ++radix)
+        {
+            char text[256] {};
+            auto written {boost::charconv::to_chars(text, text + sizeof(text), inputs[k], radix)};
+            test_type round_trip {};
+            boost::charconv::from_chars(text, written.ptr, round_trip, radix);
+            matches += (round_trip == inputs[k]) ? 1 : 0;
+        }
+        expected.push_back(matches);
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Device must match host, and every base must have round-tripped on the host.
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k] || expected[k] != base_count)
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_countl_one.cu b/test/test_cuda_u8_countl_one.cu
new file mode 100644
index 0000000..b5b89fe
--- /dev/null
+++ b/test/test_cuda_u8_countl_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Kernel under test: each thread handles the single element at its global index and
+// stores the int returned by countl_one(in[tid]) into out[tid].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; } // index is past the end of the arrays
+
+    out[tid] = boost::safe_numbers::countl_one(in[tid]);
+}
+
+// Host driver: checks the device results of countl_one(x) against the same call made on the host.
+int main(void)
+{
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> outputs(count);
+
+    // Fixed seed for repeatable inputs; values are drawn as unsigned and narrowed to basis_type.
+    std::mt19937_64 gen{42};
+    std::uniform_int_distribution<unsigned> pick{0U, static_cast<unsigned>((std::numeric_limits<basis_type>::max)())};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(pick(gen))};
+    }
+
+    // Round the block count up so the last, partially used block covers the tail.
+    const int block_size = 256;
+    const int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), outputs.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Host reference, timed on its own after resetting the stopwatch.
+    std::vector<int> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        expected.push_back(boost::safe_numbers::countl_one(inputs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int k = 0; k < count; ++k)
+    {
+        if (outputs[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_countl_zero.cu b/test/test_cuda_u8_countl_zero.cu
new file mode 100644
index 0000000..68ba382
--- /dev/null
+++ b/test/test_cuda_u8_countl_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = countl_zero(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::countl_zero(in[idx]);
+    }
+}
+
+// Runs countl_zero over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::countl_zero(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_countr_one.cu b/test/test_cuda_u8_countr_one.cu
new file mode 100644
index 0000000..4466c8c
--- /dev/null
+++ b/test/test_cuda_u8_countr_one.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = countr_one(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::countr_one(in[idx]);
+    }
+}
+
+// Runs countr_one over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::countr_one(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_countr_zero.cu b/test/test_cuda_u8_countr_zero.cu
new file mode 100644
index 0000000..9902dd0
--- /dev/null
+++ b/test/test_cuda_u8_countr_zero.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = countr_zero(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::countr_zero(in[idx]);
+    }
+}
+
+// Runs countr_zero over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::countr_zero(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_div_ceil.cu b/test/test_cuda_u8_div_ceil.cu
new file mode 100644
index 0000000..9a4d4f2
--- /dev/null
+++ b/test/test_cuda_u8_div_ceil.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = div_ceil(in1[i], in2[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::div_ceil(in1[idx], in2[idx]);
+    }
+}
+
+// Runs div_ceil over the same random operand pairs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs_inputs(element_count);
+    cuda_managed_ptr<test_type> rhs_inputs(element_count);
+    cuda_managed_ptr<test_type> gpu_results(element_count);
+
+    // First operands span [0, max of basis_type]; second operands start at 1, so they are never zero.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, max_value};
+    std::uniform_int_distribution<unsigned> rhs_dist{1U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        lhs_inputs[idx] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs_inputs[idx] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs_inputs.get(), rhs_inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::div_ceil(lhs_inputs[idx], rhs_inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_from_be.cu b/test/test_cuda_u8_from_be.cu
new file mode 100644
index 0000000..b914fdd
--- /dev/null
+++ b/test/test_cuda_u8_from_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = from_be(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::from_be(in[idx]);
+    }
+}
+
+// Runs from_be over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<test_type> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::from_be(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_from_le.cu b/test/test_cuda_u8_from_le.cu
new file mode 100644
index 0000000..4f669cc
--- /dev/null
+++ b/test/test_cuda_u8_from_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = from_le(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::from_le(in[idx]);
+    }
+}
+
+// Runs from_le over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<test_type> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::from_le(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_gcd.cu b/test/test_cuda_u8_gcd.cu
new file mode 100644
index 0000000..3ab02ee
--- /dev/null
+++ b/test/test_cuda_u8_gcd.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = gcd(in1[i], in2[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::gcd(in1[idx], in2[idx]);
+    }
+}
+
+// Runs gcd over the same random operand pairs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs_inputs(element_count);
+    cuda_managed_ptr<test_type> rhs_inputs(element_count);
+    cuda_managed_ptr<test_type> gpu_results(element_count);
+
+    // Both operands are drawn as unsigned within [0, max of basis_type], zero included, then cast down.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, max_value};
+    std::uniform_int_distribution<unsigned> rhs_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        lhs_inputs[idx] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs_inputs[idx] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs_inputs.get(), rhs_inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::gcd(lhs_inputs[idx], rhs_inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_has_single_bit.cu b/test/test_cuda_u8_has_single_bit.cu
new file mode 100644
index 0000000..4c30350
--- /dev/null
+++ b/test/test_cuda_u8_has_single_bit.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = has_single_bit(in[i]) converted to int, for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = static_cast<int>(boost::safe_numbers::has_single_bit(in[idx]));
+    }
+}
+
+// Runs has_single_bit over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [0, max of basis_type] and then cast down to basis_type.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host (bool results stored as int, as the kernel does).
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(static_cast<int>(boost::safe_numbers::has_single_bit(inputs[idx])));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_ilog.cu b/test/test_cuda_u8_ilog.cu
new file mode 100644
index 0000000..8fcdce1
--- /dev/null
+++ b/test/test_cuda_u8_ilog.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = ilog(in[i], 7) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::ilog(in[idx], test_type{static_cast<basis_type>(7)});
+    }
+}
+
+// Runs ilog (second argument 7) over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [1, max of basis_type] (zero never occurs), then cast down.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::ilog(inputs[idx], test_type{static_cast<basis_type>(7)}));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_ilog10.cu b/test/test_cuda_u8_ilog10.cu
new file mode 100644
index 0000000..3bde939
--- /dev/null
+++ b/test/test_cuda_u8_ilog10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = ilog10(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::ilog10(in[idx]);
+    }
+}
+
+// Runs ilog10 over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [1, max of basis_type] (zero never occurs), then cast down.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::ilog10(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_ilog2.cu b/test/test_cuda_u8_ilog2.cu
new file mode 100644
index 0000000..adf4094
--- /dev/null
+++ b/test/test_cuda_u8_ilog2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8; // type exercised by this test
+using basis_type = test_type::basis_type;
+
+// One thread per element: out[i] = ilog2(in[i]) for each i below numElements.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx < numElements) // threads indexed past the end do nothing
+    {
+        out[idx] = boost::safe_numbers::ilog2(in[idx]);
+    }
+}
+
+// Runs ilog2 over the same random inputs through the CUDA kernel and on the host, then compares.
+int main(void)
+{
+    std::mt19937_64 gen{42}; // fixed seed
+    int element_count = 50000;
+    std::cout << "[Vector operation on " << element_count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(element_count);
+    cuda_managed_ptr<int> gpu_results(element_count);
+
+    // Values are drawn as unsigned within [1, max of basis_type] (zero never occurs), then cast down.
+    const unsigned max_value = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, max_value};
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        inputs[idx] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    const int block_size = 256;
+    const int grid_size = (element_count + block_size - 1) / block_size; // rounded up to cover every element
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), element_count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Reference values computed on the host for comparison.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(element_count);
+    timer.reset();
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        cpu_results.push_back(boost::safe_numbers::ilog2(inputs[idx]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    for (int idx = 0; idx < element_count; ++idx)
+    {
+        if (gpu_results[idx] != cpu_results[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_ipow.cu b/test/test_cuda_u8_ipow.cu
new file mode 100644
index 0000000..ca643a8
--- /dev/null
+++ b/test/test_cuda_u8_ipow.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores ipow(in1[idx], in2[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::ipow(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    // First operands are drawn from [0, 10], second operands from [0, 2].
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, 10U};
+    std::uniform_int_distribution<unsigned> rhs_dist{0U, 2U};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::ipow(lhs[k], rhs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_is_power_10.cu b/test/test_cuda_u8_is_power_10.cu
new file mode 100644
index 0000000..7cee6aa
--- /dev/null
+++ b/test/test_cuda_u8_is_power_10.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores is_power_10(in[idx]), cast to int, in out[idx].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_10(in[idx]));
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(static_cast<int>(boost::safe_numbers::is_power_10(inputs[k])));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_is_power_2.cu b/test/test_cuda_u8_is_power_2.cu
new file mode 100644
index 0000000..4405f2d
--- /dev/null
+++ b/test/test_cuda_u8_is_power_2.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores is_power_2(in[idx]), cast to int, in out[idx].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = static_cast<int>(boost::safe_numbers::is_power_2(in[idx]));
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(static_cast<int>(boost::safe_numbers::is_power_2(inputs[k])));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_isqrt.cu b/test/test_cuda_u8_isqrt.cu
new file mode 100644
index 0000000..3c68a88
--- /dev/null
+++ b/test/test_cuda_u8_isqrt.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores isqrt(in[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::isqrt(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::isqrt(inputs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_lcm.cu b/test/test_cuda_u8_lcm.cu
new file mode 100644
index 0000000..636263b
--- /dev/null
+++ b/test/test_cuda_u8_lcm.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores lcm(in1[idx], in2[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::lcm(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    // Both operands are drawn from [1, 10].
+    std::uniform_int_distribution<unsigned> lhs_dist{1U, 10U};
+    std::uniform_int_distribution<unsigned> rhs_dist{1U, 10U};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::lcm(lhs[k], rhs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_midpoint.cu b/test/test_cuda_u8_midpoint.cu
new file mode 100644
index 0000000..37e2a76
--- /dev/null
+++ b/test/test_cuda_u8_midpoint.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/numeric.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores midpoint(in1[idx], in2[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::midpoint(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, upper};
+    std::uniform_int_distribution<unsigned> rhs_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::midpoint(lhs[k], rhs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_next_multiple_of.cu b/test/test_cuda_u8_next_multiple_of.cu
new file mode 100644
index 0000000..d5a95d3
--- /dev/null
+++ b/test/test_cuda_u8_next_multiple_of.cu
@@ -0,0 +1,85 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores next_multiple_of(in1[idx], in2[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::next_multiple_of(in1[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> lhs(count);
+    cuda_managed_ptr<test_type> rhs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> lhs_dist{0U, upper / 2U};
+    std::uniform_int_distribution<unsigned> rhs_dist{1U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        lhs[k] = test_type{static_cast<basis_type>(lhs_dist(gen))};
+        rhs[k] = test_type{static_cast<basis_type>(rhs_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(lhs.get(), rhs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::next_multiple_of(lhs[k], rhs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_popcount.cu b/test/test_cuda_u8_popcount.cu
new file mode 100644
index 0000000..cd27d91
--- /dev/null
+++ b/test/test_cuda_u8_popcount.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores popcount(in[idx]) in out[idx].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::popcount(in[idx]);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<int> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<int> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::popcount(inputs[k]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_remove_trailing_zeros.cu b/test/test_cuda_u8_remove_trailing_zeros.cu
new file mode 100644
index 0000000..f115383
--- /dev/null
+++ b/test/test_cuda_u8_remove_trailing_zeros.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/integer_utilities.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every in-range thread wraps remove_trailing_zeros(in[idx]).trimmed_number in test_type and stores it in out[idx].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = test_type{boost::safe_numbers::remove_trailing_zeros(in[idx]).trimmed_number};
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{1U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(inputs[k]).trimmed_number});
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_rotl.cu b/test/test_cuda_u8_rotl.cu
new file mode 100644
index 0000000..a45e622
--- /dev/null
+++ b/test/test_cuda_u8_rotl.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores rotl(in[idx], 3) in out[idx].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::rotl(in[idx], 3);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::rotl(inputs[k], 3));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_rotr.cu b/test/test_cuda_u8_rotr.cu
new file mode 100644
index 0000000..47cfd9e
--- /dev/null
+++ b/test/test_cuda_u8_rotr.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/bit.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+// Element-wise kernel: every thread whose index is in range stores rotr(in[idx], 3) in out[idx].
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Indices at or beyond numElements are skipped.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::safe_numbers::rotr(in[idx], 3);
+}
+
+int main(void)
+{
+    // Fixed seed keeps the generated inputs identical from run to run.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> inputs(count);
+    cuda_managed_ptr<test_type> gpu_results(count);
+    const unsigned upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> value_dist{0U, upper};
+    for (int k = 0; k < count; ++k)
+    {
+        inputs[k] = test_type{static_cast<basis_type>(value_dist(gen))};
+    }
+
+    // Ceiling division so the grid has at least one thread per element.
+    int block_size = 256;
+    int grid_size = (count + block_size - 1) / block_size;
+    std::cout << "CUDA kernel launch with " << grid_size << " blocks of " << block_size << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context error_ctx;
+    watch timer;
+    cuda_test<<<grid_size, block_size>>>(inputs.get(), gpu_results.get(), count);
+    error_ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute every result on the host and time that pass separately.
+    std::vector<test_type> cpu_results;
+    cpu_results.reserve(count);
+    timer.reset();
+    for (int k = 0; k < count; ++k)
+    {
+        cpu_results.push_back(boost::safe_numbers::rotr(inputs[k], 3));
+    }
+    double host_seconds = timer.elapsed();
+
+    // The first element where kernel and host results disagree fails the test.
+    for (int k = 0; k < count; ++k)
+    {
+        if (gpu_results[k] != cpu_results[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_cuda_u8_to_be.cu b/test/test_cuda_u8_to_be.cu
new file mode 100644
index 0000000..ae5801c
--- /dev/null
+++ b/test/test_cuda_u8_to_be.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Global thread index picks the element; indices past numElements write nothing.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::to_be(in[idx]);
+    }
+}
+
+int main(void)
+{
+    // Fixed seed so every run exercises the same inputs.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Draw every input uniformly from 0 through the largest basis_type value.
+    const auto upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> dist{0U, upper};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{static_cast<basis_type>(dist(gen))};
+    }
+
+    // Ceiling division: enough blocks that every element gets a thread.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (count + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute the same operation on the host as the reference, timing it too.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::to_be(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // Any disagreement between device and host output fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return EXIT_SUCCESS;
+}
diff --git a/test/test_cuda_u8_to_le.cu b/test/test_cuda_u8_to_le.cu
new file mode 100644
index 0000000..d51dd61
--- /dev/null
+++ b/test/test_cuda_u8_to_le.cu
@@ -0,0 +1,82 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/safe_numbers/unsigned_integers.hpp>
+#include <boost/safe_numbers/byte_conversions.hpp>
+#include <boost/safe_numbers/detail/cuda_error_reporting.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::safe_numbers::u8;
+using basis_type = test_type::basis_type;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Global thread index picks the element; indices past numElements write nothing.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < numElements)
+    {
+        out[idx] = boost::safe_numbers::to_le(in[idx]);
+    }
+}
+
+int main(void)
+{
+    // Fixed seed so every run exercises the same inputs.
+    std::mt19937_64 gen{42};
+    int count = 50000;
+    std::cout << "[Vector operation on " << count << " elements]" << std::endl;
+    cuda_managed_ptr<test_type> input_vector(count);
+    cuda_managed_ptr<test_type> output_vector(count);
+
+    // Draw every input uniformly from 0 through the largest basis_type value.
+    const auto upper = static_cast<unsigned>((std::numeric_limits<basis_type>::max)());
+    std::uniform_int_distribution<unsigned> dist{0U, upper};
+    for (int idx = 0; idx < count; ++idx)
+    {
+        input_vector[idx] = test_type{static_cast<basis_type>(dist(gen))};
+    }
+
+    // Ceiling division: enough blocks that every element gets a thread.
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (count + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    boost::safe_numbers::device_error_context ctx;
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), count);
+    ctx.synchronize();
+    std::cout << "CUDA kernel done in: " << timer.elapsed() << "s" << std::endl;
+
+    // Recompute the same operation on the host as the reference, timing it too.
+    std::vector<test_type> expected;
+    expected.reserve(count);
+    timer.reset();
+    for (int idx = 0; idx < count; ++idx)
+    {
+        expected.push_back(boost::safe_numbers::to_le(input_vector[idx]));
+    }
+    double host_seconds = timer.elapsed();
+
+    // Any disagreement between device and host output fails the test.
+    for (int idx = 0; idx < count; ++idx)
+    {
+        if (output_vector[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at element " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+    return EXIT_SUCCESS;
+}