diff --git a/vortex-cuda/kernels/src/alp.cu b/vortex-cuda/kernels/src/alp.cu
index e34285d9d9b..95502131257 100644
--- a/vortex-cuda/kernels/src/alp.cu
+++ b/vortex-cuda/kernels/src/alp.cu
@@ -1,36 +1,92 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-#include "scalar_kernel.cuh"
-
-// ALP (Adaptive Lossless floating-Point) decode operation.
-// Converts integers to floats by multiplying by precomputed exponent factors.
-// Formula: decoded = (float)encoded * f * e
-// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
-template <typename EncodedT, typename FloatT>
-struct AlpOp {
-    FloatT f; // F10[exponents.f] - power of 10
-    FloatT e; // IF10[exponents.e] - inverse power of 10
-
-    __device__ inline FloatT operator()(EncodedT value) const {
-        return static_cast<FloatT>(value) * f * e;
+#include "patches.cuh"
+
+// ALP (Adaptive Lossless floating-Point) decode: out[i] = (FloatT)in[i] * f * e.
+//
+// Each block processes one 1024-element chunk cooperatively and applies patches
+// into shared memory before writing to global memory, mirroring the strategy
+// used by bit_unpack. f = F10[exponents.f], e = IF10[exponents.e].
+//
+// The cast from EncT to FloatT must preserve ALP's lossless contract: f32 is
+// only encoded as i32, and f64 is only encoded as i64. The i64 → double cast
+// is lossless for all values ALP can produce.
+template <typename EncT, typename FloatT>
+__device__ void alp_device(const EncT *__restrict in,
+                           FloatT *__restrict out,
+                           FloatT f,
+                           FloatT e,
+                           uint64_t array_len,
+                           int thread_idx,
+                           GPUPatches &patches) {
+    constexpr int ThreadCount = 32;
+    // ThreadCount == 32 (one warp) is baked into this kernel:
+    // - __syncwarp() below is only sufficient because all threads live in one warp.
+    // - per_thread must evenly divide 1024 so the unrolled loops cover the chunk.
+    static_assert(ThreadCount == 32, "alp kernel requires exactly one warp per block");
+    static_assert(1024 % ThreadCount == 0, "ThreadCount must evenly divide 1024");
+    __shared__ FloatT shared_out[1024];
+
+    constexpr int per_thread = 1024 / ThreadCount;
+    uint64_t chunk_base = static_cast<uint64_t>(blockIdx.x) * 1024;
+
+    // Step 1: decode the chunk into shared memory. The tail block is bounds-checked;
+    // all interior blocks take the fast path with no per-element branch.
+    if (chunk_base + 1024 <= array_len) {
+#pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+        }
+    } else {
+#pragma unroll
+        for (int i = 0; i < per_thread; i++) {
+            int idx = i * ThreadCount + thread_idx;
+            uint64_t global_idx = chunk_base + static_cast<uint64_t>(idx);
+            if (global_idx < array_len) {
+                shared_out[idx] = static_cast<FloatT>(in[idx]) * f * e;
+            } else {
+                shared_out[idx] = FloatT {};
+            }
+        }
     }
-};
-
-// Macro to generate ALP kernel for each type combination.
-// Input is integer (encoded), output is float (decoded).
-#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
-    extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
-                                                                 FloatType *__restrict decoded, \
-                                                                 FloatType f, \
-                                                                 FloatType e, \
-                                                                 uint64_t array_len) { \
-        scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
+    __syncwarp();
+
+    // Step 2: apply patches in parallel across the warp.
+    PatchesCursor cursor(patches, blockIdx.x, thread_idx, static_cast<uint32_t>(ThreadCount));
+    auto patch = cursor.next();
+    while (patch.index != 1024) {
+        shared_out[patch.index] = patch.value;
+        patch = cursor.next();
     }
+    __syncwarp();
 
-// f32 variants (ALP for f32 encodes as i32 or i64)
-GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
-GENERATE_ALP_KERNEL(i64, f32, int64_t, float)
+    // Step 3: coalesced write-out of the full 1024-element chunk. The caller
+    // allocates `full_out` rounded up to a multiple of 1024, so every block
+    // writes entirely within bounds. Positions in `[array_len, rounded_len)`
+    // of the tail chunk hold don't-care values; the caller slices them off.
+#pragma unroll
+    for (int i = 0; i < per_thread; i++) {
+        int idx = i * ThreadCount + thread_idx;
+        out[idx] = shared_out[idx];
+    }
+}
+
+#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncT, FloatT) \
+    extern "C" __global__ void alp_##enc_suffix##_##float_suffix##_32t(const EncT *__restrict full_in, \
+                                                                       FloatT *__restrict full_out, \
+                                                                       FloatT f, \
+                                                                       FloatT e, \
+                                                                       uint64_t array_len, \
+                                                                       GPUPatches patches) { \
+        int thread_idx = threadIdx.x; \
+        auto in = full_in + (blockIdx.x * 1024); \
+        auto out = full_out + (blockIdx.x * 1024); \
+        alp_device(in, out, f, e, array_len, thread_idx, patches); \
+    }
 
-// f64 variants (ALP for f64 encodes as i64)
+// The only ALPInt bindings produced by the encoder are (f32, i32) and (f64, i64).
+// i64 → double is lossless; i32 → float is lossless for all values ALP emits.
+GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
 GENERATE_ALP_KERNEL(i64, f64, int64_t, double)
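
The patch loop above depends on `PatchesCursor` from `patches.cuh`, which this diff does not touch. Below is a minimal sketch of the iteration contract the loop implies, assuming the contiguous ceil-division hand-out described by the tests further down (each thread takes its own clamped slice of the chunk's patches, and an `index` of 1024, one past the last valid chunk position, signals exhaustion). All type, field, and parameter names here are illustrative assumptions, not the real header:

```cpp
#include <cstdint>

// Sketch only: approximates the contract alp_device's patch loop relies on.
// The real PatchesCursor lives in patches.cuh and may differ in detail.
struct SketchPatch {
    int index;    // position within the 1024-element chunk; 1024 == exhausted
    double value; // exception value (FloatT in the real kernel)
};

struct SketchCursor {
    const uint32_t *indices; // chunk-local indices of this chunk's patch slice
    const double *values;    // exception values, parallel to `indices`
    uint32_t pos;            // this thread's current patch
    uint32_t end;            // one past this thread's last patch

    // Divide a chunk's `count` patches contiguously across `threads` threads:
    // ceil(count / threads) per thread, clamped so trailing threads may idle.
    // 40 patches over 32 threads -> threads 0..19 take 2 each, 20..31 take 0.
    __device__ SketchCursor(const uint32_t *idx, const double *val,
                            uint32_t count, uint32_t thread, uint32_t threads)
        : indices(idx), values(val) {
        uint32_t per = (count + threads - 1) / threads;
        pos = min(thread * per, count);
        end = min(pos + per, count);
    }

    __device__ SketchPatch next() {
        if (pos >= end) {
            return SketchPatch {1024, 0.0}; // sentinel ends the caller's loop
        }
        SketchPatch p {static_cast<int>(indices[pos]), values[pos]};
        pos++;
        return p;
    }
};
```

Because every lane eventually draws the sentinel, the `while (patch.index != 1024)` loop in `alp_device` needs no shared counter; the two `__syncwarp()` calls around it are the only coordination.
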
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index 068b07e7978..77dd4dc73b4 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -2,10 +2,10 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use std::fmt::Debug;
-use std::sync::Arc;
 
 use async_trait::async_trait;
 use cudarc::driver::DeviceRepr;
+use cudarc::driver::LaunchConfig;
 use cudarc::driver::PushKernelArg;
 use tracing::instrument;
 use vortex::array::ArrayRef;
@@ -13,7 +13,7 @@ use vortex::array::Canonical;
 use vortex::array::arrays::PrimitiveArray;
 use vortex::array::arrays::primitive::PrimitiveDataParts;
 use vortex::array::buffer::BufferHandle;
-use vortex::array::match_each_unsigned_integer_ptype;
+use vortex::array::buffer::DeviceBufferExt;
 use vortex::dtype::NativePType;
 use vortex::encodings::alp::ALP;
 use vortex::encodings::alp::ALPArray;
@@ -30,7 +30,8 @@ use crate::CudaDeviceBuffer;
 use crate::executor::CudaArrayExt;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
-use crate::kernel::patches::execute_patches;
+use crate::kernel::patches::build_gpu_patches;
+use crate::kernel::patches::types::load_patches;
 
 /// CUDA decoder for ALP (Adaptive Lossless floating-Point) decompression.
 #[derive(Debug)]
@@ -54,6 +55,11 @@ impl CudaExecute for ALPExecutor {
     }
 }
 
+/// Threads per block. 32 threads × 32 elements = 1024 element chunks for both
+/// (f32, i32) and (f64, i64). f64 uses 8 KB of shared memory per block.
+const ALP_THREADS_PER_BLOCK: u32 = 32;
+
+#[instrument(skip_all)]
 async fn decode_alp<A>(array: ALPArray, ctx: &mut CudaExecutionCtx) -> VortexResult<Canonical>
 where
     A: ALPFloat + NativePType + DeviceRepr + Send + Sync + 'static,
 {
@@ -67,7 +73,7 @@ where
     let f: A = A::F10[exponents.f as usize];
     let e: A = A::IF10[exponents.e as usize];
 
-    // Execute child and copy to device
+    // Execute child and copy to device.
     let canonical = array.encoded().clone().execute_cuda(ctx).await?;
     let primitive = canonical.into_primitive();
     let PrimitiveDataParts {
@@ -75,42 +81,53 @@ where
     } = primitive.into_data_parts();
 
     let device_input = ctx.ensure_on_device(buffer).await?;
-
-    // Get CUDA view of input
     let input_view = device_input.cuda_view::<A::ALPInt>()?;
 
-    // Allocate output buffer
-    let output_slice = ctx.device_alloc::<A>(array_len)?;
+    // Allocate output rounded up to a full chunk: the fused kernel writes a
+    // whole 1024-element chunk per block, and we slice off any padding below.
+    let output_slice = ctx.device_alloc::<A>(array_len.next_multiple_of(1024))?;
     let output_buf = CudaDeviceBuffer::new(output_slice);
     let output_view = output_buf.as_view::<A>();
 
-    let array_len_u64 = array_len as u64;
-
-    // Load kernel function
-    let kernel_ptypes = [A::ALPInt::PTYPE, A::PTYPE];
-    let cuda_function = ctx.load_function("alp", &kernel_ptypes)?;
+    // Patch validity does not need to be scattered: the ALP encoder strips null
+    // positions from the exception list, so patches only exist at valid
+    // positions. load_patches additionally rejects patches without
+    // chunk_offsets (required by the fused kernel's PatchesCursor).
+    let device_patches = if let Some(patches) = array.patches() {
+        Some(load_patches(&patches, ctx).await?)
+    } else {
+        None
+    };
+    let patches_arg = build_gpu_patches(device_patches.as_ref())?;
+
+    // Load the kernel: alp_{enc}_{float}_32t
+    let enc_suffix = A::ALPInt::PTYPE.to_string();
+    let float_suffix = A::PTYPE.to_string();
+    let cuda_function = ctx
+        .load_function_with_suffixes("alp", &[enc_suffix.as_str(), float_suffix.as_str(), "32t"])?;
+
+    let num_blocks = u32::try_from(array_len.div_ceil(1024))?;
+    let config = LaunchConfig {
+        grid_dim: (num_blocks, 1, 1),
+        block_dim: (ALP_THREADS_PER_BLOCK, 1, 1),
+        shared_mem_bytes: 0,
+    };
 
-    ctx.launch_kernel(&cuda_function, array_len, |args| {
+    let array_len_u64 = array_len as u64;
+    ctx.launch_kernel_config(&cuda_function, config, array_len, |args| {
         args.arg(&input_view)
             .arg(&output_view)
             .arg(&f)
             .arg(&e)
-            .arg(&array_len_u64);
+            .arg(&array_len_u64)
+            .arg(&patches_arg);
     })?;
 
-    // Check if there are any patches to decode here. Patch validity does not
-    // need to be scattered: the ALP encoder strips null positions from the
-    // exception list, so patches only exist at valid positions. execute_patches
-    // additionally guards against nullable patch values at runtime.
-    let output_buf = if let Some(patches) = array.patches() {
-        match_each_unsigned_integer_ptype!(patches.indices_ptype()?, |I| {
-            execute_patches::<A, I>(patches.clone(), output_buf, ctx).await?
-        })
-    } else {
-        output_buf
-    };
+    // Synchronize so the device patches buffers remain alive for the kernel.
+    ctx.synchronize_stream()?;
+    drop(device_patches);
 
-    let output_handle = BufferHandle::new_device(Arc::new(output_buf));
+    let output_handle = BufferHandle::new_device(output_buf.slice_typed::<A>(0..array_len));
 
     Ok(Canonical::Primitive(PrimitiveArray::from_buffer_handle(
         output_handle,
         A::PTYPE,
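
The host-side launch math mirrors the kernel's fixed chunk size: `next_multiple_of(1024)` pads the output so every block can write a full chunk, and `div_ceil(1024)` counts one block per chunk, including a partial tail. The same arithmetic as a standalone C++ sketch (the constants come from this diff; the helper names do not):

```cpp
#include <cstdint>

constexpr uint64_t CHUNK = 1024; // elements per block, fixed by the kernel

// Rust's array_len.next_multiple_of(1024): size of the padded output.
constexpr uint64_t padded_len(uint64_t array_len) {
    return (array_len + CHUNK - 1) / CHUNK * CHUNK;
}

// Rust's array_len.div_ceil(1024): grid_dim.x, one block per chunk.
constexpr uint64_t num_blocks(uint64_t array_len) {
    return (array_len + CHUNK - 1) / CHUNK;
}

static_assert(padded_len(1500) == 2048, "tail chunk is padded, then sliced off");
static_assert(num_blocks(1500) == 2, "a partial tail still gets its own block");
static_assert(num_blocks(3072) == 3, "exact multiples need no padding");
```

`shared_mem_bytes` can stay 0 because `shared_out` is declared statically inside the kernel rather than as dynamic shared memory.
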
@@ -137,30 +154,38 @@ mod tests {
     use super::*;
     use crate::CanonicalCudaExt;
+    use crate::canonicalize_cpu;
     use crate::executor::CudaArrayExt;
     use crate::session::CudaSession;
 
+    /// Irrational values ALP cannot encode losslessly, guaranteed to land
+    /// in the exception list on round-trip through `alp_encode`.
+    const UNENCODABLE: f64 = std::f64::consts::PI;
+    const UNENCODABLE_F32: f32 = std::f32::consts::PI;
+
+    /// Small manually-constructed ALP array with patches. Exercises the
+    /// custom-construction path (as opposed to going through `alp_encode`).
+    /// Patches must carry `chunk_offsets` — the fused kernel requires them.
     #[crate::test]
     async fn test_cuda_alp_decompression_f32() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
 
-        // Create encoded values (what ALP would produce)
         // For f32 with exponents (e=0, f=2): decoded = encoded * F10[2] * IF10[0]
         //                                            = encoded * 100.0 * 1.0
-        // So encoded value of 100 -> decoded 10000.0
+        // Encoded value of 100 -> decoded 10000.0.
         let encoded_data: Vec<i32> = vec![100, 200, 300, 400, 500];
-        let exponents = Exponents { e: 0, f: 2 }; // multiply by 100
+        let exponents = Exponents { e: 0, f: 2 };
 
-        // Patches
+        // One chunk holds all 5 elements. chunk_offsets[0] = 0: chunk 0's
+        // patches begin at patch index 0.
         let patches = Patches::new(
             5,
             0,
             PrimitiveArray::new(buffer![0u32, 4u32], Validity::NonNullable).into_array(),
             PrimitiveArray::new(buffer![0.0f32, 999f32], Validity::NonNullable).into_array(),
-            None,
-        )
-        .unwrap();
+            Some(PrimitiveArray::new(buffer![0u32], Validity::NonNullable).into_array()),
+        )?;
 
         let alp_array = ALP::try_new(
             PrimitiveArray::new(Buffer::from(encoded_data.clone()), Validity::NonNullable)
                 .into_array(),
@@ -169,7 +194,7 @@
             exponents,
             Some(patches),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = ALPExecutor
             .execute(alp_array.into_array(), &mut cuda_ctx)
@@ -194,14 +219,14 @@
             .vortex_expect("failed to create execution context");
 
         // Values that will produce ALP exceptions at non-null positions.
-        // Nulls at positions 1 and 3; the exception at position 4 (1.23456)
-        // can't be encoded losslessly by ALP.
+        // Nulls at positions 1 and 3; the exception at position 4 can't be
+        // encoded losslessly by ALP.
        let values: Vec<Option<f32>> = vec![
             Some(1.0),
             None,
             Some(2.0),
             None,
-            Some(1.23456),
+            Some(UNENCODABLE_F32),
             Some(3.0),
             Some(4.0),
             Some(5.0),
@@ -213,7 +238,7 @@
             &mut LEGACY_SESSION.create_execution_ctx(),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = alp_array
             .into_array()
@@ -244,7 +269,181 @@
             &mut LEGACY_SESSION.create_execution_ctx(),
         )?;
 
-        let cpu_result = crate::canonicalize_cpu(alp_array.clone())?.into_array();
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Multi-chunk ALP (> 1024 elements) with patches in chunks 0 and 2 but
+    /// none in chunk 1. Exercises the `PatchesCursor` branch where a
+    /// non-trailing chunk has `chunk_offsets[c] == chunk_offsets[c+1]`
+    /// (zero patches) via the offset math rather than the NULL sentinel.
+    #[crate::test]
+    async fn test_cuda_alp_multi_chunk_sparse_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values (3 chunks). Inject exceptions (values ALP can't encode
+        // losslessly) only in chunks 0 and 2; chunk 1 stays exception-free so
+        // its cursor slice is empty despite patches existing in the array.
+        let values: Buffer<f32> = (0u32..3072)
+            .map(|i| {
+                if matches!(i, 0 | 100 | 1023 | 3071) {
+                    UNENCODABLE_F32
+                } else {
+                    i as f32
+                }
+            })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Multi-chunk f64 decode with patches distributed across chunks. The f64
+    /// path (i64 → double) is otherwise only covered by the partial-tail case,
+    /// so this guards the fast-path for the (i64, f64) kernel variant.
+    #[crate::test]
+    async fn test_cuda_alp_f64_multi_chunk_with_patches() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // 3072 values (3 chunks). Sprinkle exceptions into each chunk.
+        let values: Buffer<f64> = (0u32..3072)
+            .map(|i| {
+                if matches!(i, 0 | 500 | 1024 | 1500 | 2048 | 3071) {
+                    UNENCODABLE
+                } else {
+                    i as f64
+                }
+            })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = alp_array
+            .into_array()
+            .execute_cuda(&mut cuda_ctx)
+            .await?
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Single chunk with more patches than threads per block (32). Forces
+    /// `PatchesCursor` to split patches across multiple threads, exercising
+    /// the per-thread ceil-division and clamping math that no other test hits
+    /// (existing tests have ≤ 6 patches per chunk).
+    #[crate::test]
+    async fn test_cuda_alp_dense_patches_single_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        // Build a 1024-element ALP array manually with exactly 40 patches
+        // all in the single chunk. 40 > 32 forces cursor division (each of
+        // the first 20 threads handles 2 patches; remaining threads idle).
+        const LEN: i32 = 1024;
+        const NUM_PATCHES: u32 = 40;
+
+        let exponents = Exponents { e: 0, f: 2 };
+        let encoded: Buffer<i32> = (0i32..LEN).map(|i| i * 100).collect();
+
+        let patch_indices: Buffer<u32> = (0u32..NUM_PATCHES).collect();
+        let patch_values: Buffer<f32> = (0..NUM_PATCHES).map(|i| i as f32 * 0.125 + 0.5).collect();
+
+        let patches = Patches::new(
+            LEN as usize,
+            0,
+            PrimitiveArray::new(patch_indices, Validity::NonNullable).into_array(),
+            PrimitiveArray::new(patch_values, Validity::NonNullable).into_array(),
+            Some(PrimitiveArray::new(buffer![0u32], Validity::NonNullable).into_array()),
+        )?;
+
+        let alp_array = ALP::try_new(
+            PrimitiveArray::new(encoded, Validity::NonNullable).into_array(),
+            exponents,
+            Some(patches),
+        )?;
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
+
+        let gpu_result = ALPExecutor
+            .execute(alp_array.into_array(), &mut cuda_ctx)
+            .await
+            .vortex_expect("GPU decompression failed")
+            .into_host()
+            .await?
+            .into_array();
+
+        assert_arrays_eq!(cpu_result, gpu_result);
+        Ok(())
+    }
+
+    /// Tail-chunk bounds check: an array whose length is not a multiple of
+    /// 1024 forces the kernel's tail-block path to bounds-check its decode
+    /// loop. Includes a patch in the tail.
+    #[crate::test]
+    async fn test_cuda_alp_partial_tail_chunk() -> VortexResult<()> {
+        let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+            .vortex_expect("failed to create execution context");
+
+        let values: Buffer<f64> = (0u32..1500)
+            .map(|i| if i == 1400 { UNENCODABLE } else { i as f64 })
+            .collect();
+        let prim = PrimitiveArray::new(values, Validity::NonNullable);
+        let alp_array = alp_encode(
+            prim.as_view(),
+            None,
+            &mut LEGACY_SESSION.create_execution_ctx(),
+        )?;
+        assert!(
+            alp_array.patches().is_some(),
+            "expected patches from ALP exceptions"
+        );
+
+        let cpu_result = canonicalize_cpu(alp_array.clone())?.into_array();
 
         let gpu_result = alp_array
             .into_array()
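
The new tests revolve around the `chunk_offsets` layout: per the comments above, `chunk_offsets[c]` is the index of chunk `c`'s first patch, so a chunk with `chunk_offsets[c] == chunk_offsets[c+1]` owns an empty slice. A worked sketch for `test_cuda_alp_multi_chunk_sparse_patches`, with offset values derived from the test's exception positions rather than dumped from the real `load_patches` output:

```cpp
#include <cstdint>
#include <cstdio>

// Patches fall at global indices {0, 100, 1023, 3071}; chunks are 1024 wide.
// Assumed layout: chunk_offsets[c] = first patch index belonging to chunk c,
// with chunk c's slice ending where chunk c + 1 begins (or at num_patches).
int main() {
    const uint32_t num_patches = 4;
    const uint32_t n_chunks = 3;
    const uint32_t chunk_offsets[n_chunks] = {0, 3, 3}; // chunk 1 is empty

    for (uint32_t c = 0; c < n_chunks; c++) {
        uint32_t begin = chunk_offsets[c];
        uint32_t end = c + 1 < n_chunks ? chunk_offsets[c + 1] : num_patches;
        std::printf("chunk %u owns patches [%u, %u)\n", c, begin, end);
    }
    // chunk 0 owns patches [0, 3)  <- indices 0, 100, 1023
    // chunk 1 owns patches [3, 3)  <- the zero-patch branch the test targets
    // chunk 2 owns patches [3, 4)  <- index 3071
}
```
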
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
index 0059d74fd5e..29b56436feb 100644
--- a/vortex-cuda/src/kernel/encodings/bitpacked.rs
+++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -28,10 +28,8 @@ use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecute;
 use crate::executor::CudaExecutionCtx;
-use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32;
-use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::build_gpu_patches;
 use crate::kernel::patches::types::load_patches;
-use crate::kernel::patches::types::ptype_to_chunk_offset_type;
 
 /// CUDA decoder for bit-packed arrays.
 #[derive(Debug)]
@@ -86,8 +84,6 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe
     })
 }
 
-unsafe impl DeviceRepr for GPUPatches {}
-
 #[instrument(skip_all)]
 pub(crate) async fn decode_bitpacked(
     array: BitPackedArray,
@@ -131,31 +127,7 @@
         None
     };
 
-    #[expect(clippy::cast_possible_truncation)]
-    let patches_arg = if let Some(p) = &device_patches {
-        GPUPatches {
-            chunk_offsets: p.chunk_offsets.cuda_device_ptr()? as _,
-            chunk_offset_type: ptype_to_chunk_offset_type(p.chunk_offset_ptype)?,
-            indices: p.indices.cuda_device_ptr()? as _,
-            values: p.values.cuda_device_ptr()? as _,
-            offset: p.offset as u32,
-            offset_within_chunk: p.offset_within_chunk as u32,
-            num_patches: p.num_patches as u32,
-            n_chunks: p.n_chunks as u32,
-        }
-    } else {
-        // NULL chunk_offsets signals no patches to the kernel
-        GPUPatches {
-            chunk_offsets: std::ptr::null_mut(),
-            chunk_offset_type: ChunkOffsetType_CO_U32,
-            indices: std::ptr::null_mut(),
-            values: std::ptr::null_mut(),
-            offset: 0,
-            offset_within_chunk: 0,
-            num_patches: 0,
-            n_chunks: 0,
-        }
-    };
+    let patches_arg = build_gpu_patches(device_patches.as_ref())?;
 
     ctx.launch_kernel_config(&cuda_function, config, len, |args| {
         args.arg(&input_view)
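
With `build_gpu_patches` shared, the ALP and bit-unpack launches now pass the same by-value `GPUPatches` struct, and the no-patches case is a single sentinel. Device-side, the contract reduces to a null check on `chunk_offsets`; a sketch of that guard, assuming a helper name of my own since the real check sits inside `PatchesCursor` in `patches.cuh`:

```cpp
// Illustrative guard only; mirrors the Rust-side NULL_PATCHES sentinel.
// GPUPatches is the bindgen-shared struct both kernels receive by value.
__device__ inline bool has_patches(const GPUPatches &p) {
    // build_gpu_patches() sends NULL_PATCHES when the array has no patches;
    // the null chunk_offsets pointer is the single signal kernels test.
    return p.chunk_offsets != nullptr;
}
```
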
diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs
index 07cd51c3f01..6075b664f72 100644
--- a/vortex-cuda/src/kernel/patches/mod.rs
+++ b/vortex-cuda/src/kernel/patches/mod.rs
@@ -17,14 +17,79 @@ use vortex::array::patches::Patches;
 use vortex::array::validity::Validity;
 use vortex::dtype::NativePType;
 use vortex::error::VortexResult;
+use vortex::error::vortex_bail;
 use vortex::error::vortex_ensure;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
 use crate::CudaExecutionCtx;
 use crate::executor::CudaArrayExt;
+use crate::kernel::patches::gpu::ChunkOffsetType;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32;
+use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64;
+use crate::kernel::patches::gpu::GPUPatches;
+use crate::kernel::patches::types::DevicePatches;
+
+// Safe because `GPUPatches` contains only raw pointers, POD integers, and an enum.
+unsafe impl DeviceRepr for GPUPatches {}
+
+impl GPUPatches {
+    /// Sentinel value passed to kernels when no patches are present. A NULL
+    /// `chunk_offsets` pointer is the signal `PatchesCursor` checks for.
+    pub(crate) const NULL_PATCHES: Self = Self {
+        chunk_offsets: std::ptr::null_mut(),
+        chunk_offset_type: ChunkOffsetType_CO_U32,
+        indices: std::ptr::null_mut(),
+        values: std::ptr::null_mut(),
+        offset: 0,
+        offset_within_chunk: 0,
+        num_patches: 0,
+        n_chunks: 0,
+    };
+}
+
+/// Convert a [`PType`] to the corresponding [`ChunkOffsetType`] for GPU patches.
+fn ptype_to_chunk_offset_type(ptype: vortex::dtype::PType) -> VortexResult<ChunkOffsetType> {
+    match ptype {
+        vortex::dtype::PType::U8 => Ok(ChunkOffsetType_CO_U8),
+        vortex::dtype::PType::U16 => Ok(ChunkOffsetType_CO_U16),
+        vortex::dtype::PType::U32 => Ok(ChunkOffsetType_CO_U32),
+        vortex::dtype::PType::U64 => Ok(ChunkOffsetType_CO_U64),
+        _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype),
+    }
+}
+
+/// Build a [`GPUPatches`] kernel argument from optional device-resident patches.
+///
+/// When `device_patches` is `None`, returns a sentinel value whose NULL
+/// `chunk_offsets` signals "no patches" to the kernel.
+pub(crate) fn build_gpu_patches(
+    device_patches: Option<&DevicePatches>,
+) -> VortexResult<GPUPatches> {
+    #[expect(clippy::cast_possible_truncation)]
+    match device_patches {
+        Some(p) => Ok(GPUPatches {
+            chunk_offsets: p.chunk_offsets.cuda_device_ptr()? as _,
+            chunk_offset_type: ptype_to_chunk_offset_type(p.chunk_offset_ptype)?,
+            indices: p.indices.cuda_device_ptr()? as _,
+            values: p.values.cuda_device_ptr()? as _,
+            offset: p.offset as u32,
+            offset_within_chunk: p.offset_within_chunk as u32,
+            num_patches: p.num_patches as u32,
+            n_chunks: p.n_chunks as u32,
+        }),
+        None => Ok(GPUPatches::NULL_PATCHES),
+    }
+}
 
 /// Apply a set of patches in-place onto a [`CudaDeviceBuffer`] holding `ValuesT`.
+///
+/// Naive scatter kernel. Kept as a reusable fallback for encoders that cannot
+/// use the chunk-based fused patching path (e.g., where `chunk_offsets` are
+/// unavailable); no production caller uses it today.
+#[allow(dead_code)]
 #[instrument(skip_all)]
 pub(crate) async fn execute_patches<
     ValuesT: NativePType + DeviceRepr,
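
For contrast with the fused in-shared-memory path, the `execute_patches` fallback retained above amounts to a plain global-memory scatter: one thread per patch, no `chunk_offsets` required. A minimal sketch of such a kernel, illustrative only since the actual kernel behind `execute_patches` is not part of this diff:

```cpp
#include <cstdint>

// Naive patch scatter: out[indices[i]] = values[i], one thread per patch.
// Needs no chunk_offsets, which is why it can back encoders that lack them,
// but it touches the decoded output in global memory a second time instead
// of patching in shared memory before the coalesced write-out.
template <typename ValueT, typename IndexT>
__global__ void scatter_patches(ValueT *__restrict out,
                                const IndexT *__restrict indices,
                                const ValueT *__restrict values,
                                uint64_t num_patches) {
    uint64_t i = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < num_patches) {
        out[indices[i]] = values[i];
    }
}
```
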