diff --git a/doc/modules/ROOT/nav.adoc b/doc/modules/ROOT/nav.adoc
index c91432c1..ae2c1c61 100644
--- a/doc/modules/ROOT/nav.adoc
+++ b/doc/modules/ROOT/nav.adoc
@@ -12,6 +12,7 @@
 ** xref:examples.adoc#examples_boost_math_random[Boost Math and Random Integration]
 ** xref:examples.adoc#examples_boost_charconv[Boost.Charconv Integration]
 ** xref:examples.adoc#examples_cstdlib[`<cstdlib>` support (Combined div and mod)]
+** xref:examples.adoc#examples_cuda[Use of the library in a CUDA kernel]
 * xref:api_reference.adoc[]
 ** xref:api_reference.adoc#api_namespaces[Namespaces]
 ** xref:api_reference.adoc#api_types[Types]
diff --git a/doc/modules/ROOT/pages/examples.adoc b/doc/modules/ROOT/pages/examples.adoc
index 1e83f2d0..2db70e47 100644
--- a/doc/modules/ROOT/pages/examples.adoc
+++ b/doc/modules/ROOT/pages/examples.adoc
@@ -492,3 +492,22 @@ Verification: 142857142857142857 * 7 + 1 = 1000000000000000000
 3 / 10 = 0 remainder 3
 ----
 ====
+
+[#examples_cuda]
+== CUDA Usage
+
+.This https://github.com/cppalliance/int128/blob/develop/examples/cuda.cu[example] demonstrates how to use library types and functions inside a CUDA kernel.
+====
+[source, c++]
+----
+include::example$cuda.cu[]
+----
+
+.Expected Output
+[listing]
+----
+[Vector operation on 50000 elements]
+CUDA kernel launch with 196 blocks of 256 threads
+All CPU and GPU computed elements match!
+----
+====
diff --git a/examples/cuda.cu b/examples/cuda.cu
new file mode 100644
index 00000000..ec20577c
--- /dev/null
+++ b/examples/cuda.cu
@@ -0,0 +1,138 @@
+// Copyright Matt Borland 2026.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
(See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+// NOTE(review): the original include targets were stripped in transit;
+// restored from what the example actually uses -- verify against the repository.
+#include <boost/int128.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <stdexcept>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Calculates the GCD of 2 values on device.
+// Launched with a 1-D grid; each thread handles one element pair.
+__global__ void cuda_gcd(const test_type* in1, const test_type* in2, test_type* out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Bounds check: the grid is rounded up, so the tail block may overshoot numElements
+    if (i < numElements)
+    {
+        out[i] = boost::int128::gcd(in1[i], in2[i]);
+    }
+}
+
+// Allocate managed space so that the arrays can be used on both host and device
+void allocate(test_type** in, int numElements)
+{
+    cudaError_t err = cudaSuccess;
+    err = cudaMallocManaged(in, numElements * sizeof(test_type));
+    if (err != cudaSuccess)
+    {
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    cudaDeviceSynchronize();
+}
+
+// Release the three managed buffers (null-safe) and reset the device
+void cleanup(test_type** in1, test_type** in2, test_type** out)
+{
+    if (*in1 != nullptr)
+    {
+        cudaFree(*in1);
+        *in1 = nullptr;
+    }
+
+    if (*in2 != nullptr)
+    {
+        cudaFree(*in2);
+        *in2 = nullptr;
+    }
+
+    if (*out != nullptr)
+    {
+        cudaFree(*out);
+        *out = nullptr;
+    }
+
+    cudaDeviceReset();
+}
+
+int main()
+{
+    std::mt19937_64 rng {42}; // fixed seed for reproducible inputs
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate managed space for our inputs and GPU outputs
+    // We then fill them with random numbers
+
+    test_type* in1 = nullptr;
+    test_type* in2 = nullptr;
+    test_type* out = nullptr;
+
+    allocate(&in1, numElements);
+    allocate(&in2, numElements);
+    allocate(&out, numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index: numElements is int (avoids signed/unsigned mixing)
+    {
+        in1[i] = dist(rng);
+        in2[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceil-div so every element is covered
+
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    // Launch the CUDA kernel and check for errors
+
+    cuda_gcd<<<blocksPerGrid, threadsPerBlock>>>(in1, in2, out, numElements);
+    cudaDeviceSynchronize();
+
+    // Launch-configuration errors are sticky, so they are still visible here after the sync
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        cleanup(&in1, &in2, &out);
+        return EXIT_FAILURE;
+    }
+
+    // We now will perform the same operation using the same inputs on CPU,
+    // to compare the results for equality
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.emplace_back(boost::int128::gcd(in1[i], in2[i]));
+    }
+
+    // We can now compare that our operation on GPU and the same operation on CPU have identical results
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (out[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element: " << i << "!" << std::endl;
+            cleanup(&in1, &in2, &out);
+            return EXIT_FAILURE;
+        }
+    }
+
+    cleanup(&in1, &in2, &out);
+
+    std::cout << "All CPU and GPU computed elements match!" << std::endl;
+
+    return 0;
+}
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 1a67d48d..e7279d01 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -92,3 +92,5 @@
 run test_signed_from_chars_bases.cu ;
 run test_unsigned_literals.cu ;
 run test_signed_literals.cu ;
+
+run ../examples/cuda.cu ;