diff --git a/vortex-cuda/kernels/src/alp.cu b/vortex-cuda/kernels/src/alp.cu
index e34285d9d9b..95502131257 100644
--- a/vortex-cuda/kernels/src/alp.cu
+++ b/vortex-cuda/kernels/src/alp.cu
@@ -1,36 +1,92 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-#include "scalar_kernel.cuh"
-
-// ALP (Adaptive Lossless floating-Point) decode operation.
-// Converts integers to floats by multiplying by precomputed exponent factors.
-// Formula: decoded = (float)encoded * f * e
-// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
-template <typename EncodedT, typename FloatT>
-struct AlpOp {
-    FloatT f; // F10[exponents.f] - power of 10
-    FloatT e; // IF10[exponents.e] - inverse power of 10
-
-    __device__ inline FloatT operator()(EncodedT value) const {
-        return static_cast<FloatT>(value) * f * e;
+#include "patches.cuh"
+
+// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
+//
+// Each block processes one 1024-element chunk cooperatively and applies patches
+// into shared memory before writing to global memory, mirroring the strategy
+// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
+//
+// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
+// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
+// is lossless for all values ALP can produce.
+template <typename EncT, typename FloatT>
+__device__ void alp_device(const EncT *__restrict in,
+                           FloatT *__restrict out,
+                           FloatT f,
+                           FloatT e,
+                           uint64_t array_len,
+                           int thread_idx,
+                           GPUPatches &patches) {
+    constexpr int ThreadCount = 32;
+    // ThreadCount == 32 (one warp) is baked into this kernel:
+    // - __syncwarp() below is only sufficient because all threads live in one warp.
+    // - per_thread must evenly divide 1024 so the unrolled loops cover the chunk.
+    static_assert(ThreadCount == 32, "alp kernel requires exactly one warp per block");
+    static_assert(1024 % ThreadCount == 0, "ThreadCount must evenly divide 1024");
+    __shared__ FloatT shared_out[1024];
+
+    constexpr int per_thread = 1024 / ThreadCount;
+    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
+
+    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
+    // all interior blocks take the fast path with no per-element branch.
+    if (chunk_base + 1024 <= array_len) {
+#pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+        }
+    } else {
+#pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
+            if (global_idx < array_len) {
+                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+            } else {
+                shared_out[idx] = FloatT {};
+            }
+        }
     }
-};
-
-// Macro to generate ALP kernel for each type combination.
-// Input is integer (encoded), output is float (decoded).
-#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
-    extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
-                                                                 FloatType *__restrict decoded, \
-                                                                 FloatType f, \
-                                                                 FloatType e, \
-                                                                 uint64_t array_len) { \
-        scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
+    __syncwarp();
+
+    // Step 2: apply patches in parallel across the warp.
+    PatchesCursor cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
+    auto patch = cursor.next();
+    while (patch.index != 1024) {
+        shared_out[patch.index] = patch.value;
+        patch = cursor.next();
     }
+    __syncwarp();
 
-// f32 variants (ALP for f32 encodes as i32 or i64)
-GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
-GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
+    // Step 3: coalesced write-out of the full 1024-element chunk. The caller
+    // allocates `full_out` rounded up to a multiple of 1024, so every block
+    // writes entirely within bounds. Positions in `[array_len, rounded_len)`
+    // of the tail chunk hold don't-care values; the caller slices them off.
+#pragma unroll
+    for (int i = 0; i < per_thread; i++) {
+        int idx = i * ThreadCount + thread_idx;
+        out[idx] = shared_out[idx];
+    }
+}
+
+#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT) \
+    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t(const EncT *__restrict full_in, \
+                                                                       FloatT *__restrict full_out, \
+                                                                       FloatT f, \
+                                                                       FloatT e, \
+                                                                       uint64_t array_len, \
+                                                                       GPUPatches patches) { \
+        int thread_idx = threadIdx.x; \
+        auto in = full_in + (blockIdx.x * 1024); \
+        auto out = full_out + (blockIdx.x * 1024); \
+        alp_device(in, out, f, e, array_len, thread_idx, patches); \
+    }
 
-// f64 variants (ALP for f64 encodes as i64)
+// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
+// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
+GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
 GENERATE_ALP_KERNEL(i64, f64, int64_t, double)
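
The patch loop above depends on `PatchesCursor` from `patches.cuh`, which this diff does not touch. Below is a minimal sketch of the iteration contract the loop implies, assuming the contiguous ceil-division hand-out described by the tests further down (each thread takes its own clamped slice of the chunk's patches, and an `index` of 1024, one past the last valid chunk position, signals exhaustion). All type, field, and parameter names here are illustrative assumptions, not the real header:

```cpp
#include <cstdint>

// Sketch only: approximates the contract alp_device's patch loop relies on.
// The real PatchesCursor lives in patches.cuh and may differ in detail.
struct SketchPatch {
    int index;    // position within the 1024-element chunk; 1024 == exhausted
    double value; // exception value (FloatT in the real kernel)
};

struct SketchCursor {
    const uint32_t *indices; // chunk-local indices of this chunk's patch slice
    const double *values;    // exception values, parallel to `indices`
    uint32_t pos;            // this thread's current patch
    uint32_t end;            // one past this thread's last patch

    // Divide a chunk's `count` patches contiguously across `threads` threads:
    // ceil(count / threads) per thread, clamped so trailing threads may idle.
    // 40 patches over 32 threads -> threads 0..19 take 2 each, 20..31 take 0.
    __device__ SketchCursor(const uint32_t *idx, const double *val,
                            uint32_t count, uint32_t thread, uint32_t threads)
        : indices(idx), values(val) {
        uint32_t per = (count + threads - 1) / threads;
        pos = min(thread * per, count);
        end = min(pos + per, count);
    }

    __device__ SketchPatch next() {
        if (pos >= end) {
            return SketchPatch {1024, 0.0}; // sentinel ends the caller's loop
        }
        SketchPatch p {static_cast<int>(indices[pos]), values[pos]};
        pos++;
        return p;
    }
};
```

Because every lane eventually draws the sentinel, the `while (patch.index != 1024)` loop in `alp_device` needs no shared counter; the two `__syncwarp()` calls around it are the only coordination.
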
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index 068b07e7978..77dd4dc73b4 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -2,10 +2,10 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt::Debug;
-use std::sync::Arc;
 
 use async_trait::async_trait;
 use cudarc::driver::DeviceRepr;
+use cudarc::driver::LaunchConfig;
 use cudarc::driver::PushKernelArg;
 use tracing::instrument;
 use vortex::array::ArrayRef;
@@ -13,7 +13,7 @@ use vortex::array::Canonical;
 use vortex::array::arrays::PrimitiveArray;
 use vortex::array::arrays::primitive::PrimitiveDataParts;
 use vortex::array::buffer::BufferHandle;
-use vortex::array::match_each_unsigned_integer_ptype;
+use vortex::array::buffer::DeviceBufferExt;
 use vortex::dtype::NativePType;
 use vortex::encodings::alp::ALP;
 use vortex::encodings::alp::ALPArray;
@@ -30,7 +30,8 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaArrayExt;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
-use crate::kernel::patches::execute_patches;
+use crate::kernel::patches::build_gpu_patches;
+use crate::kernel::patches::types::load_patches;
 
 /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
 #[derive(Debug)]
@@ -54,6 +55,11 @@ impl CudaExecute for ALPExecutor {
     }
 }
 
+/// Threads per block. 32 threads × 32 elements = 1024 element chunks for both
+/// (f32, i32) and (f64, i64). f64 uses 8 KB of shared memory per block.
+const ALP_THREADS_PER_BLOCK: u32 = 32;
+
+#[instrument(skip_all)]
 async fn decode_alp<A>(array: ALPArray, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical>
 where
     A: ALPFloat + NativePType + DeviceRepr + Send + Sync + 'static,
 {
@@ -67,7 +73,7 @@ where
     let f: A = A::F10[exponents.f as usize];
     let e: A = A::IF10[exponents.e as usize];
 
-    // Execute child and copy to device
+    // Execute child and copy to device.
     let canonical = array.encoded().clone().execute_cuda(ctx).await?;
     let primitive = canonical.into_primitive();
     let PrimitiveDataParts {
@@ -75,42 +81,53 @@ where
     } = primitive.into_data_parts();
 
     let device_input = ctx.ensure_on_device(buffer).await?;
-
-    // Get CUDA view of input
     let input_view = device_input.cuda_view::<A::ALPInt>()?;
 
-    // Allocate output buffer
-    let output_slice = ctx.device_alloc::<A>(array_len)?;
+    // Allocate output rounded up to a full chunk: the fused kernel writes a
+    // whole 1024-element chunk per block, and we slice off any padding below.
+    let output_slice = ctx.device_alloc::<A>(array_len.next_multiple_of(1024))?;
     let output_buf = CudaDeviceBuffer::new(output_slice);
     let output_view = output_buf.as_view::<A>();
 
-    let array_len_u64 = array_len as u64;
-
-    // Load kernel function
-    let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
-    let cuda_function = ctx.load_function("alp", &kernel_ptypes)?;
+    // Patch validity does not need to be scattered: the ALP encoder strips null
+    // positions from the exception list, so patches only exist at valid
+    // positions. load_patches additionally rejects patches without
+    // chunk_offsets (required by the fused kernel's PatchesCursor).
+    let device_patches = if let Some(patches) = array.patches() {
+        Some(load_patches(&patches, ctx).await?)
+    } else {
+        None
+    };
+    let patches_arg = build_gpu_patches(device_patches.as_ref())?;
+
+    // Load the kernel: alp_{enc}_{float}_32t
+    let enc_suffix = A::ALPInt::PTYPE.to_string();
+    let float_suffix = A::PTYPE.to_string();
+    let cuda_function = ctx
+        .load_function_with_suffixes("alp", &[enc_suffix.as_str(), float_suffix.as_str(), "32t"])?;
+
+    let num_blocks = u32::try_from(array_len.div_ceil(1024))?;
+    let config = LaunchConfig {
+        grid_dim: (num_blocks, 1, 1),
+        block_dim: (ALP_THREADS_PER_BLOCK, 1, 1),
+        shared_mem_bytes: 0,
+    };
 
-    ctx.launch_kernel(&cuda_function, array_len, |args| {
+    let array_len_u64 = array_len as u64;
+    ctx.launch_kernel_config(&cuda_function, config, array_len, |args| {
         args.arg(&input_view)
             .arg(&output_view)
             .arg(&f)
             .arg(&e)
-            .arg(&array_len_u64);
+            .arg(&array_len_u64)
+            .arg(&patches_arg);
     })?;
 
-    // Check if there are any patches to decode here. Patch validity does not
-    // need to be scattered: the ALP encoder strips null positions from the
-    // exception list, so patches only exist at valid positions. execute_patches
-    // additionally guards against nullable patch values at runtime.
-    let output_buf = if let Some(patches) = array.patches() {
-        match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
-            execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
-        })
-    } else {
-        output_buf
-    };
+    // Synchronize so the device patches buffers remain alive for the kernel.
+    ctx.synchronize_stream()?;
+    drop(device_patches);
 
-    let output_handle = BufferHandle::new_device(Arc::new(output_buf));
+    let output_handle = BufferHandle::new_device(output_buf.slice_typed::<A>(0..array_len));
 
     Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
         output_handle,
         A::PTYPE,
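
The host-side launch math mirrors the kernel's fixed chunk size: `next_multiple_of(1024)` pads the output so every block can write a full chunk, and `div_ceil(1024)` counts one block per chunk, including a partial tail. The same arithmetic as a standalone C++ sketch (the constants come from this diff; the helper names do not):

```cpp
#include <cstdint>

constexpr uint64_t CHUNK = 1024; // elements per block, fixed by the kernel

// Rust's array_len.next_multiple_of(1024): size of the padded output.
constexpr uint64_t padded_len(uint64_t array_len) {
    return (array_len + CHUNK - 1) / CHUNK * CHUNK;
}

// Rust's array_len.div_ceil(1024): grid_dim.x, one block per chunk.
constexpr uint64_t num_blocks(uint64_t array_len) {
    return (array_len + CHUNK - 1) / CHUNK;
}

static_assert(padded_len(1500) == 2048, "tail chunk is padded, then sliced off");
static_assert(num_blocks(1500) == 2, "a partial tail still gets its own block");
static_assert(num_blocks(3072) == 3, "exact multiples need no padding");
```

`shared_mem_bytes` can stay 0 because `shared_out` is declared statically inside the kernel rather than as dynamic shared memory.
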
@@ -137,30 +154,38 @@ mod tests {
     use super::*;
     use crate::CanonicalCudaExt;
+    use crate::canonicalize_cpu;
     use crate::executor::CudaArrayExt;
     use crate::session::CudaSession;
 
+    /// Irrational values ALP cannot encode losslessly, guaranteed to land
+    /// in the exception list on round-trip through `alp_encode`.
+    const UNENCODABLE: f64 = std::f64::consts::PI;
+    const UNENCODABLE_F32: f32 = std::f32::consts::PI;
+
+    /// Small manually-constructed ALP array with patches. Exercises the
+    /// custom-construction path (as opposed to going through `alp_encode`).
+    /// Patches must carry `chunk_offsets` — the fused kernel requires them.
     #[crate::test]
     async fn test_cuda_alp_decompression_f32() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
-        // Create encoded values (what ALP would produce)
         // For f32 with exponents (e=0, f=2): decoded = encoded * F10[2] * IF10[0]
         //                                            = encoded * 100.0 * 1.0
-        // So encoded value of 100 -> decoded 10000.0
+        // Encoded value of 100 -> decoded 10000.0.
         let encoded_data: Vec<i32> = vec![100, 200, 300, 400, 500];
-        let exponents = Exponents { e: 0, f: 2 }; // multiply by 100
+        let exponents = Exponents { e: 0, f: 2 };
 
-        // Patches
+        // One chunk holds all 5 elements. chunk_offsets[0] = 0: chunk 0's
+        // patches begin at patch index 0.
         let patches = Patches::new(
             5,
             0,
             PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(),
             PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(),
-            None,
-        )
-        .unwrap();
+            Some(PrimitiveArray::new(buffer![0u32], Validity::NonNullable).into_array()),
+        )?;
 
         let alp_array = ALP::try_new(
             PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable)
                 .into_array(),
@@ -169,7 +194,7 @@
             exponents,
             Some(patches),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = ALPExecutor
             .execute(alp_array.into_array(), &mut cuda_ctx)
@@ -194,14 +219,14 @@
             .vortex_expect("failed to create execution context");
 
         // Values that will produce ALP exceptions at non-null positions.
-        // Nulls at positions 1 and 3; the exception at position 4 (1.23456)
-        // can't be encoded losslessly by ALP.
+        // Nulls at positions 1 and 3; the exception at position 4 can't be
+        // encoded losslessly by ALP.
        let values: Vec<Option<f32>> = vec![
             Some(1.0),
             None,
             Some(2.0),
             None,
-            Some(1.23456),
+            Some(UNENCODABLE_F32),
             Some(3.0),
             Some(4.0),
             Some(5.0),
@@ -213,7 +238,7 @@
             &mut LEGACY_SESSION.create_execution_ctx(),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = alp_array
             .into_array()
@@ -244,7 +269,181 @@
             &mut LEGACY_SESSION.create_execution_ctx(),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Multi-chunk ALP (> 1024 elements) with patches in chunks 0 and 2 but
+    /// none in chunk 1. Exercises the `PatchesCursor` branch where a
+    /// non-trailing chunk has `chunk_offsets[c] == chunk_offsets[c+1]`
+    /// (zero patches) via the offset math rather than the NULL sentinel.
+    #[crate::test]
+    async fn test_cuda_alp_multi_chunk_sparse_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values (3 chunks). Inject exceptions (values ALP can't encode
+        // losslessly) only in chunks 0 and 2; chunk 1 stays exception-free so
+        // its cursor slice is empty despite patches existing in the array.
+        let values: Buffer<f32> = (0u32..3072)
+            .map(|i| {
+                if matches!(i, 0 | 100 | 1023 | 3071) {
+                    UNENCODABLE_F32
+                } else {
+                    i as f32
+                }
+            })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Multi-chunk f64 decode with patches distributed across chunks. The f64
+    /// path (i64 → double) is otherwise only covered by the partial-tail case,
+    /// so this guards the fast-path for the (i64, f64) kernel variant.
+    #[crate::test]
+    async fn test_cuda_alp_f64_multi_chunk_with_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values (3 chunks). Sprinkle exceptions into each chunk.
+        let values: Buffer<f64> = (0u32..3072)
+            .map(|i| {
+                if matches!(i, 0 | 500 | 1024 | 1500 | 2048 | 3071) {
+                    UNENCODABLE
+                } else {
+                    i as f64
+                }
+            })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Single chunk with more patches than threads per block (32). Forces
+    /// `PatchesCursor` to split patches across multiple threads, exercising
+    /// the per-thread ceil-division and clamping math that no other test hits
+    /// (existing tests have ≤ 6 patches per chunk).
+    #[crate::test]
+    async fn test_cuda_alp_dense_patches_single_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // Build a 1024-element ALP array manually with exactly 40 patches
+        // all in the single chunk. 40 > 32 forces cursor division (each of
+        // the first 20 threads handles 2 patches; remaining threads idle).
+        const LEN: i32 = 1024;
+        const NUM_PATCHES: u32 = 40;
+
+        let exponents = Exponents { e: 0, f: 2 };
+        let encoded: Buffer<i32> = (0i32..LEN).map(|i| i * 100).collect();
+
+        let patch_indices: Buffer<u32> = (0u32..NUM_PATCHES).collect();
+        let patch_values: Buffer<f32> = (0..NUM_PATCHES).map(|i| i as f32 * 0.125 + 0.5).collect();
+
+        let patches = Patches::new(
+            LEN as usize,
+            0,
+            PrimitiveArray::new(patch_indices, Validity::NonNullable).into_array(),
+            PrimitiveArray::new(patch_values, Validity::NonNullable).into_array(),
+            Some(PrimitiveArray::new(buffer![0u32], Validity::NonNullable).into_array()),
+        )?;
+
+        let alp_array = ALP::try_new(
+            PrimitiveArray::new(encoded, Validity::NonNullable).into_array(),
+            exponents,
+            Some(patches),
+        )?;
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = ALPExecutor
+            .execute(alp_array.into_array(), &mut cuda_ctx)
+            .await
+            .vortex_expect("GPU decompression failed")
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Tail-chunk bounds check: an array whose length is not a multiple of
+    /// 1024 forces the kernel's tail-block path to bounds-check its decode
+    /// loop. Includes a patch in the tail.
+    #[crate::test]
+    async fn test_cuda_alp_partial_tail_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        let values: Buffer<f64> = (0u32..1500)
+            .map(|i| if i == 1400 { UNENCODABLE } else { i as f64 })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = alp_array
             .into_array()
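
The new tests revolve around the `chunk_offsets` layout: per the comments above, `chunk_offsets[c]` is the index of chunk `c`'s first patch, so a chunk with `chunk_offsets[c] == chunk_offsets[c+1]` owns an empty slice. A worked sketch for `test_cuda_alp_multi_chunk_sparse_patches`, with offset values derived from the test's exception positions rather than dumped from the real `load_patches` output:

```cpp
#include <cstdint>
#include <cstdio>

// Patches fall at global indices {0, 100, 1023, 3071}; chunks are 1024 wide.
// Assumed layout: chunk_offsets[c] = first patch index belonging to chunk c,
// with chunk c's slice ending where chunk c + 1 begins (or at num_patches).
int main() {
    const uint32_t num_patches = 4;
    const uint32_t n_chunks = 3;
    const uint32_t chunk_offsets[n_chunks] = {0, 3, 3}; // chunk 1 is empty

    for (uint32_t c = 0; c < n_chunks; c++) {
        uint32_t begin = chunk_offsets[c];
        uint32_t end = c + 1 < n_chunks ? chunk_offsets[c + 1] : num_patches;
        std::printf("chunk %u owns patches [%u, %u)\n", c, begin, end);
    }
    // chunk 0 owns patches [0, 3)  <- indices 0, 100, 1023
    // chunk 1 owns patches [3, 3)  <- the zero-patch branch the test targets
    // chunk 2 owns patches [3, 4)  <- index 3071
}
```
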
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
index 0059d74fd5e..29b56436feb 100644
--- a/vortex-cuda/src/kernel/encodings/bitpacked.rs
+++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -28,10 +28,8 @@ use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
-use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32;
-use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::build_gpu_patches;
 use crate::kernel::patches::types::load_patches;
-use crate::kernel::patches::types::ptype_to_chunk_offset_type;
 
 /// CUDA decoder for bit-packed arrays.
 #[derive(Debug)]
@@ -86,8 +84,6 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe
     })
 }
 
-unsafe impl DeviceRepr for GPUPatches {}
-
 #[instrument(skip_all)]
 pub(crate) async fn decode_bitpacked(
     array: BitPackedArray,
@@ -131,31 +127,7 @@
         None
     };
 
-    #[expect(clippy::cast_possible_truncation)]
-    let patches_arg = if let Some(p) = &device_patches {
-        GPUPatches {
-            chunk_offsets: p.chunk_offsets.cuda_device_ptr()? as _,
-            chunk_offset_type: ptype_to_chunk_offset_type(p.chunk_offset_ptype)?,
-            indices: p.indices.cuda_device_ptr()? as _,
-            values: p.values.cuda_device_ptr()? as _,
-            offset: p.offset as u32,
-            offset_within_chunk: p.offset_within_chunk as u32,
-            num_patches: p.num_patches as u32,
-            n_chunks: p.n_chunks as u32,
-        }
-    } else {
-        // NULL chunk_offsets signals no patches to the kernel
-        GPUPatches {
-            chunk_offsets: std::ptr::null_mut(),
-            chunk_offset_type: ChunkOffsetType_CO_U32,
-            indices: std::ptr::null_mut(),
-            values: std::ptr::null_mut(),
-            offset: 0,
-            offset_within_chunk: 0,
-            num_patches: 0,
-            n_chunks: 0,
-        }
-    };
+    let patches_arg = build_gpu_patches(device_patches.as_ref())?;
 
     ctx.launch_kernel_config(&cuda_function, config, len, |args| {
         args.arg(&input_view)
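
With `build_gpu_patches` shared, the ALP and bit-unpack launches now pass the same by-value `GPUPatches` struct, and the no-patches case is a single sentinel. Device-side, the contract reduces to a null check on `chunk_offsets`; a sketch of that guard, assuming a helper name of my own since the real check sits inside `PatchesCursor` in `patches.cuh`:

```cpp
// Illustrative guard only; mirrors the Rust-side NULL_PATCHES sentinel.
// GPUPatches is the bindgen-shared struct both kernels receive by value.
__device__ inline bool has_patches(const GPUPatches &p) {
    // build_gpu_patches() sends NULL_PATCHES when the array has no patches;
    // the null chunk_offsets pointer is the single signal kernels test.
    return p.chunk_offsets != nullptr;
}
```
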
diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs
index 07cd51c3f01..6075b664f72 100644
--- a/vortex-cuda/src/kernel/patches/mod.rs
+++ b/vortex-cuda/src/kernel/patches/mod.rs
@@ -17,14 +17,79 @@ use vortex::array::patches::Patches;
 use vortex::array::validity::Validity;
 use vortex::dtype::NativePType;
 use vortex::error::VortexResult;
+use vortex::error::vortex_bail;
 use vortex::error::vortex_ensure;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
 use crate::CudaExecutionCtx;
 use crate::executor::CudaArrayExt;
+use crate::kernel::patches::gpu::ChunkOffsetType;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64;
+use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::types::DevicePatches;
+
+// Safe because `GPUPatches` contains only raw pointers, POD integers, and an enum.
+unsafe impl DeviceRepr for GPUPatches {}
+
+impl GPUPatches {
+    /// Sentinel value passed to kernels when no patches are present. A NULL
+    /// `chunk_offsets` pointer is the signal `PatchesCursor` checks for.
+    pub(crate) const NULL_PATCHES: Self = Self {
+        chunk_offsets: std::ptr::null_mut(),
+        chunk_offset_type: ChunkOffsetType_CO_U32,
+        indices: std::ptr::null_mut(),
+        values: std::ptr::null_mut(),
+        offset: 0,
+        offset_within_chunk: 0,
+        num_patches: 0,
+        n_chunks: 0,
+    };
+}
+
+/// Convert a [`PType`] to the corresponding [`ChunkOffsetType`] for GPU patches.
+fn ptype_to_chunk_offset_type(ptype: vortex::dtype::PType) -> VortexResult<ChunkOffsetType> {
+    match ptype {
+        vortex::dtype::PType::U8 => Ok(ChunkOffsetType_CO_U8),
+        vortex::dtype::PType::U16 => Ok(ChunkOffsetType_CO_U16),
+        vortex::dtype::PType::U32 => Ok(ChunkOffsetType_CO_U32),
+        vortex::dtype::PType::U64 => Ok(ChunkOffsetType_CO_U64),
+        _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype),
+    }
+}
+
+/// Build a [`GPUPatches`] kernel argument from optional device-resident patches.
+///
+/// When `device_patches` is `None`, returns a sentinel value whose NULL
+/// `chunk_offsets` signals "no patches" to the kernel.
+pub(crate) fn build_gpu_patches(
+    device_patches: Option<&DevicePatches>,
+) -> VortexResult<GPUPatches> {
+    #[expect(clippy::cast_possible_truncation)]
+    match device_patches {
+        Some(p) => Ok(GPUPatches {
+            chunk_offsets: p.chunk_offsets.cuda_device_ptr()? as _,
+            chunk_offset_type: ptype_to_chunk_offset_type(p.chunk_offset_ptype)?,
+            indices: p.indices.cuda_device_ptr()? as _,
+            values: p.values.cuda_device_ptr()? as _,
+            offset: p.offset as u32,
+            offset_within_chunk: p.offset_within_chunk as u32,
+            num_patches: p.num_patches as u32,
+            n_chunks: p.n_chunks as u32,
+        }),
+        None => Ok(GPUPatches::NULL_PATCHES),
+    }
+}
 
 /// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`.
+///
+/// Naive scatter kernel. Kept as a reusable fallback for encoders that cannot
+/// use the chunk-based fused patching path (e.g., where `chunk_offsets` are
+/// unavailable); no production caller uses it today.
+#[allow(dead_code)]
 #[instrument(skip_all)]
 pub(crate) async fn execute_patches<
     ValuesT: NativePType + DeviceRepr,
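
For contrast with the fused in-shared-memory path, the `execute_patches` fallback retained above amounts to a plain global-memory scatter: one thread per patch, no `chunk_offsets` required. A minimal sketch of such a kernel, illustrative only since the actual kernel behind `execute_patches` is not part of this diff:

```cpp
#include <cstdint>

// Naive patch scatter: out[indices[i]] = values[i], one thread per patch.
// Needs no chunk_offsets, which is why it can back encoders that lack them,
// but it touches the decoded output in global memory a second time instead
// of patching in shared memory before the coalesced write-out.
template <typename ValueT, typename IndexT>
__global__ void scatter_patches(ValueT *__restrict out,
                                const IndexT *__restrict indices,
                                const ValueT *__restrict values,
                                uint64_t num_patches) {
    uint64_t i = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < num_patches) {
        out[indices[i]] = values[i];
    }
}
```
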