From fc4cda4852f8a23c48f163a78ca9e51b57512da0 Mon Sep 17 00:00:00 2001
From: Alexander Droste
Date: Wed, 6 May 2026 10:20:45 +0000
Subject: [PATCH] bench: CUDA host-to-device copy modes

Compare pageable host memory with cuMemHostAlloc pinned allocations
using default flags and WRITECOMBINED.

Signed-off-by: Alexander Droste
---
 vortex-cuda/benches/load_to_device_cuda.rs | 215 +++++++++++++++++----
 1 file changed, 174 insertions(+), 41 deletions(-)

diff --git a/vortex-cuda/benches/load_to_device_cuda.rs b/vortex-cuda/benches/load_to_device_cuda.rs
index a6bbbbc0e7d..fbb3ceaf2cf 100644
--- a/vortex-cuda/benches/load_to_device_cuda.rs
+++ b/vortex-cuda/benches/load_to_device_cuda.rs
@@ -7,61 +7,194 @@ mod bench_config;
 
 // Unused here but suppresses dead_code warning for the shared module.
 const _: &[(usize, &str)] = bench_config::BENCH_SIZES;
 
+use std::sync::Arc;
+
 use criterion::BatchSize;
 use criterion::BenchmarkId;
 use criterion::Criterion;
 use criterion::Throughput;
-use vortex::array::buffer::BufferHandle;
-use vortex::buffer::ByteBuffer;
-use vortex::error::VortexExpect;
-use vortex::session::VortexSession;
-use vortex_cuda::CudaSession;
+use cudarc::driver::CudaContext;
+use cudarc::driver::CudaStream;
+use cudarc::driver::HostSlice;
+use cudarc::driver::SyncOnDrop;
+use cudarc::driver::result;
+use cudarc::driver::sys::CU_MEMHOSTALLOC_WRITECOMBINED;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 
-const LOAD_SIZES: &[(usize, &str)] = &[
-    (16 * 1024 * 1024, "16MiB"),
-    (64 * 1024 * 1024, "64MiB"),
-    (256 * 1024 * 1024, "256MiB"),
-    (1024 * 1024 * 1024, "1GiB"),
+const LOAD_SIZES: &[(usize, &str)] = &[(1024 * 1024 * 1024, "1GiB")];
+
+const HOST_MEMORY_KINDS: &[(&str, Option<u32>)] = &[
+    // Pageable host memory allocated through the Rust global allocator. CUDA may need to stage or
+    // pin pages internally before the host-to-device copy can run.
+    ("pageable", None),
+    // Page-locked host memory from cuMemHostAlloc with no additional flags.
+    ("pinned_default", Some(0)),
+    // Page-locked write-combined host memory. This favors CPU writes into the source buffer but
+    // makes CPU reads from it expensive.
+    ("pinned_write_combined", Some(CU_MEMHOSTALLOC_WRITECOMBINED)),
 ];
+
+struct CudaHostBuffer {
+    ctx: Arc<CudaContext>,
+    ptr: *mut u8,
+    len: usize,
+}
+
+// TODO(0ax1): Move CudaHostBuffer out of the test logic and make
+// explicit allocation with flags part of the vortex-cuda API.
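+// Safety: `ptr` always comes from `cuMemHostAlloc` with exactly `len` addressable
+// bytes and stays valid until `Drop` frees it on the owning context; the raw
+// slice constructors below rely on those invariants.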
+impl CudaHostBuffer {
+    fn alloc(ctx: &Arc<CudaContext>, len: usize, flags: u32) -> Self {
+        ctx.bind_to_thread().expect("bind cuda context");
+        let ptr = unsafe { result::malloc_host(len, flags) }.expect("allocate cuda host buffer");
+        Self {
+            ctx: Arc::clone(ctx),
+            ptr: ptr.cast(),
+            len,
+        }
+    }
+
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
+    }
+}
+
+impl HostSlice<u8> for CudaHostBuffer {
+    fn len(&self) -> usize {
+        self.len
+    }
+
+    unsafe fn stream_synced_slice<'a>(
+        &'a self,
+        _stream: &'a CudaStream,
+    ) -> (&'a [u8], SyncOnDrop<'a>) {
+        (
+            unsafe { std::slice::from_raw_parts(self.ptr, self.len) },
+            SyncOnDrop::Sync(None),
+        )
+    }
+
+    unsafe fn stream_synced_mut_slice<'a>(
+        &'a mut self,
+        _stream: &'a CudaStream,
+    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
+        (
+            unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) },
+            SyncOnDrop::Sync(None),
+        )
+    }
+}
+
+impl Drop for CudaHostBuffer {
+    fn drop(&mut self) {
+        self.ctx.record_err(self.ctx.bind_to_thread());
+        self.ctx
+            .record_err(unsafe { result::free_host(self.ptr.cast()) });
+    }
+}
+
 fn benchmark_load_to_device(c: &mut Criterion) {
-    let mut group = c.benchmark_group("cuda");
+    // Measures a synchronized host-to-device copy after both host source and device
+    // destination have already been allocated and the source has been initialized.
+    // This isolates copy throughput for each host allocation mode as much as possible.
+    let mut copy_group = c.benchmark_group("cuda/load_to_device/memcpy_htod");
+
+    for &(size, size_name) in LOAD_SIZES {
+        copy_group.throughput(Throughput::Bytes(size as u64));
+
+        for &(name, flags) in HOST_MEMORY_KINDS {
+            copy_group.bench_with_input(BenchmarkId::new(name, size_name), &size, |b, &size| {
+                let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
+                let stream = cuda_ctx.new_stream().expect("cuda stream");
+
+                match flags {
+                    Some(flags) => b.iter_batched(
+                        || {
+                            let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
+                            source.as_mut_slice().fill(0xA5);
+                            let dest = unsafe { stream.alloc::<u8>(size) }
+                                .expect("allocate device buffer");
+                            (source, dest)
+                        },
+                        |(source, mut dest)| {
+                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                            stream.synchronize().expect("synchronize stream");
+                        },
+                        BatchSize::PerIteration,
+                    ),
+                    None => b.iter_batched(
+                        || {
+                            let mut source = vec![0u8; size];
+                            source.fill(0xA5);
+                            let dest = unsafe { stream.alloc::<u8>(size) }
+                                .expect("allocate device buffer");
+                            (source, dest)
+                        },
+                        |(source, mut dest)| {
+                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                            stream.synchronize().expect("synchronize stream");
+                        },
+                        BatchSize::PerIteration,
+                    ),
+                }
+            });
+        }
+    }
+
+    copy_group.finish();
+
+    // Measures device allocation plus host-to-device copy. Host source allocation and
+    // initialization stay in Criterion setup, so this separates device allocation cost
+    // from host allocation cost.
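+    // As in the copy-only group above, BatchSize::PerIteration reruns the setup
+    // closure for every measured iteration, so each copy sees a freshly allocated
+    // and filled source rather than one warmed by a previous iteration.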
+    let mut alloc_copy_group = c.benchmark_group("cuda/load_to_device/device_alloc_memcpy_htod");
 
     for &(size, size_name) in LOAD_SIZES {
-        group.throughput(Throughput::Bytes(size as u64));
-
-        group.bench_with_input(
-            BenchmarkId::new("cuda/load_to_device/ensure_on_device_sync", size_name),
-            &size,
-            |b, &size| {
-                let session = VortexSession::empty();
-                let cuda_ctx =
-                    CudaSession::create_execution_ctx(&session).vortex_expect("cuda ctx");
-
-                b.iter_batched(
-                    || BufferHandle::new_host(ByteBuffer::from(vec![0xA5; size])),
-                    |source| {
-                        let handle = cuda_ctx
-                            .ensure_on_device_sync(source)
-                            .vortex_expect("ensure_on_device_sync");
-                        assert!(handle.is_on_device());
-                        // Keep the explicit sync here to ensure that we measure a sync copy. In
-                        // case the default buffer allocation strategy in the future changes to use
-                        // `cuMemHostAlloc`, the htod copy would change to being async, making the
-                        // function return immediately.
-                        cuda_ctx.stream().synchronize().expect("synchronize stream");
-                    },
-                    BatchSize::PerIteration,
-                );
-
-                drop(cuda_ctx);
-            },
-        );
+        alloc_copy_group.throughput(Throughput::Bytes(size as u64));
+
+        for &(name, flags) in HOST_MEMORY_KINDS {
+            alloc_copy_group.bench_with_input(
+                BenchmarkId::new(name, size_name),
+                &size,
+                |b, &size| {
+                    let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
+                    let stream = cuda_ctx.new_stream().expect("cuda stream");
+
+                    match flags {
+                        Some(flags) => b.iter_batched(
+                            || {
+                                let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
+                                source.as_mut_slice().fill(0xA5);
+                                source
+                            },
+                            |source| {
+                                let mut dest = unsafe { stream.alloc::<u8>(size) }
+                                    .expect("allocate device buffer");
+                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                                stream.synchronize().expect("synchronize stream");
+                            },
+                            BatchSize::PerIteration,
+                        ),
+                        None => b.iter_batched(
+                            || {
+                                let mut source = vec![0u8; size];
+                                source.fill(0xA5);
+                                source
+                            },
+                            |source| {
+                                let mut dest = unsafe { stream.alloc::<u8>(size) }
+                                    .expect("allocate device buffer");
+                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                                stream.synchronize().expect("synchronize stream");
+                            },
+                            BatchSize::PerIteration,
+                        ),
+                    }
+                },
+            );
+        }
     }
 
-    group.finish();
+    alloc_copy_group.finish();
 }
 
 criterion::criterion_group! {