Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 174 additions & 41 deletions vortex-cuda/benches/load_to_device_cuda.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,61 +7,194 @@ mod bench_config;
// Unused here but suppresses dead_code warning for the shared module.
const _: &[(usize, &str)] = bench_config::BENCH_SIZES;

use std::sync::Arc;

use criterion::BatchSize;
use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use vortex::array::buffer::BufferHandle;
use vortex::buffer::ByteBuffer;
use vortex::error::VortexExpect;
use vortex::session::VortexSession;
use vortex_cuda::CudaSession;
use cudarc::driver::CudaContext;
use cudarc::driver::CudaStream;
use cudarc::driver::HostSlice;
use cudarc::driver::SyncOnDrop;
use cudarc::driver::result;
use cudarc::driver::sys::CU_MEMHOSTALLOC_WRITECOMBINED;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

/// Transfer sizes exercised by every benchmark below.
///
/// Only the 1 GiB point is kept (smaller 16/64/256 MiB points were dropped) so
/// each measurement amortizes per-copy launch overhead and reflects steady-state
/// PCIe/NVLink throughput rather than fixed costs.
const LOAD_SIZES: &[(usize, &str)] = &[(1024 * 1024 * 1024, "1GiB")];

// Host allocation strategies to compare. `None` selects a plain Rust-allocated
// (pageable) source buffer; `Some(flags)` selects page-locked memory obtained
// from cuMemHostAlloc with exactly those flags.
const HOST_MEMORY_KINDS: &[(&str, Option<u32>)] = &[
    // Pageable host memory allocated through the Rust global allocator. CUDA may need to stage or
    // pin pages internally before the host-to-device copy can run.
    ("pageable", None),
    // Page-locked host memory from cuMemHostAlloc with no additional flags.
    ("pinned_default", Some(0)),
    // Page-locked write-combined host memory. This favors CPU writes into the source buffer but
    // makes CPU reads from it expensive.
    ("pinned_write_combined", Some(CU_MEMHOSTALLOC_WRITECOMBINED)),
];

// Page-locked host allocation obtained via `result::malloc_host`
// (cuMemHostAlloc). Owns the raw pointer and frees it in `Drop` against the
// retained CUDA context.
struct CudaHostBuffer {
    // Context kept alive (and rebound) so the allocation can be freed on drop.
    ctx: Arc<CudaContext>,
    // Raw host pointer returned by `result::malloc_host`.
    ptr: *mut u8,
    // Allocation length in bytes.
    len: usize,
}

// TODO(0ax1): Move CudaHostBuffer out of the test logic and make
// explicit allocation with flags part of the vortex-cuda API.
impl CudaHostBuffer {
fn alloc(ctx: &Arc<CudaContext>, len: usize, flags: u32) -> Self {
ctx.bind_to_thread().expect("bind cuda context");
let ptr = unsafe { result::malloc_host(len, flags) }.expect("allocate cuda host buffer");
Self {
ctx: Arc::clone(ctx),
ptr: ptr.cast(),
len,
}
}

fn as_mut_slice(&mut self) -> &mut [u8] {
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
}
}

// Implementing `HostSlice` lets cudarc stream operations (e.g. `memcpy_htod`)
// read from / write into this buffer directly.
impl HostSlice<u8> for CudaHostBuffer {
    fn len(&self) -> usize {
        self.len
    }

    // Exposes the allocation for a stream operation. `SyncOnDrop::Sync(None)`
    // requests no implicit synchronization when the guard drops; the
    // benchmarks below synchronize the stream explicitly instead.
    // NOTE(review): this assumes the buffer (and thus the pinned allocation)
    // outlives any async copy issued against it — the benchmark iteration
    // keeps the source alive past its explicit `stream.synchronize()`. Confirm
    // this contract against cudarc's `HostSlice` documentation.
    unsafe fn stream_synced_slice<'a>(
        &'a self,
        _stream: &'a CudaStream,
    ) -> (&'a [u8], SyncOnDrop<'a>) {
        (
            unsafe { std::slice::from_raw_parts(self.ptr, self.len) },
            SyncOnDrop::Sync(None),
        )
    }

    // Mutable counterpart of `stream_synced_slice`; same no-implicit-sync
    // contract applies.
    unsafe fn stream_synced_mut_slice<'a>(
        &'a mut self,
        _stream: &'a CudaStream,
    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
        (
            unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) },
            SyncOnDrop::Sync(None),
        )
    }
}

impl Drop for CudaHostBuffer {
    fn drop(&mut self) {
        // Rebind the owning context before freeing. Errors are recorded on the
        // context via `record_err` instead of panicking, so a failure here
        // cannot abort during unwinding.
        self.ctx.record_err(self.ctx.bind_to_thread());
        // Free the cuMemHostAlloc allocation; `ptr` was produced by
        // `result::malloc_host` and is released exactly once here.
        self.ctx
            .record_err(unsafe { result::free_host(self.ptr.cast()) });
    }
}

/// Benchmarks host-to-device transfer throughput for each host allocation mode
/// in `HOST_MEMORY_KINDS`, at each size in `LOAD_SIZES`.
///
/// NOTE(review): this reconstructs the post-merge version of the function —
/// stale pre-merge code (a `group` binding and an `ensure_on_device_sync`
/// benchmark) was interleaved here by a bad diff merge and referenced a
/// variable that no longer exists; it has been removed.
fn benchmark_load_to_device(c: &mut Criterion) {
    // Measures a synchronized host-to-device copy after both the host source
    // and the device destination have already been allocated and the source
    // has been initialized. This isolates copy throughput for each host
    // allocation mode as much as possible.
    let mut copy_group = c.benchmark_group("cuda/load_to_device/memcpy_htod");

    for &(size, size_name) in LOAD_SIZES {
        copy_group.throughput(Throughput::Bytes(size as u64));

        for &(name, flags) in HOST_MEMORY_KINDS {
            copy_group.bench_with_input(BenchmarkId::new(name, size_name), &size, |b, &size| {
                let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
                let stream = cuda_ctx.new_stream().expect("cuda stream");

                match flags {
                    // Pinned host memory from cuMemHostAlloc with `flags`.
                    Some(flags) => b.iter_batched(
                        || {
                            let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
                            // Write every byte so all pages are committed
                            // before the timed copy.
                            source.as_mut_slice().fill(0xA5);
                            let dest = unsafe { stream.alloc::<u8>(size) }
                                .expect("allocate device buffer");
                            (source, dest)
                        },
                        |(source, mut dest)| {
                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
                            // Synchronize so a potentially async copy is fully
                            // included in the measurement.
                            stream.synchronize().expect("synchronize stream");
                        },
                        // Buffers are large; keep only one batch alive at a time.
                        BatchSize::PerIteration,
                    ),
                    // Pageable host memory from the Rust global allocator.
                    None => b.iter_batched(
                        || {
                            let mut source = vec![0u8; size];
                            source.fill(0xA5);
                            let dest = unsafe { stream.alloc::<u8>(size) }
                                .expect("allocate device buffer");
                            (source, dest)
                        },
                        |(source, mut dest)| {
                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
                            stream.synchronize().expect("synchronize stream");
                        },
                        BatchSize::PerIteration,
                    ),
                }
            });
        }
    }

    copy_group.finish();

    // Measures device allocation plus host-to-device copy. Host source
    // allocation and initialization stay in Criterion setup, so this separates
    // device allocation cost from host allocation cost.
    let mut alloc_copy_group = c.benchmark_group("cuda/load_to_device/device_alloc_memcpy_htod");

    for &(size, size_name) in LOAD_SIZES {
        alloc_copy_group.throughput(Throughput::Bytes(size as u64));

        for &(name, flags) in HOST_MEMORY_KINDS {
            alloc_copy_group.bench_with_input(
                BenchmarkId::new(name, size_name),
                &size,
                |b, &size| {
                    let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
                    let stream = cuda_ctx.new_stream().expect("cuda stream");

                    match flags {
                        Some(flags) => b.iter_batched(
                            || {
                                let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
                                source.as_mut_slice().fill(0xA5);
                                source
                            },
                            |source| {
                                // Device allocation is intentionally inside
                                // the timed section here.
                                let mut dest = unsafe { stream.alloc::<u8>(size) }
                                    .expect("allocate device buffer");
                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
                                stream.synchronize().expect("synchronize stream");
                            },
                            BatchSize::PerIteration,
                        ),
                        None => b.iter_batched(
                            || {
                                let mut source = vec![0u8; size];
                                source.fill(0xA5);
                                source
                            },
                            |source| {
                                let mut dest = unsafe { stream.alloc::<u8>(size) }
                                    .expect("allocate device buffer");
                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
                                stream.synchronize().expect("synchronize stream");
                            },
                            BatchSize::PerIteration,
                        ),
                    }
                },
            );
        }
    }

    alloc_copy_group.finish();
}

criterion::criterion_group! {
Expand Down
Loading