diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
index 5b81a2c2120..488e22f38d8 100644
--- a/vortex-cuda/benches/dynamic_dispatch_cuda.rs
+++ b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
@@ -123,7 +123,7 @@ struct BenchRunner {
 }
 
 impl BenchRunner {
-    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
+    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
         let plan = match DispatchPlan::new(array).vortex_expect("build_dyn_dispatch_plan") {
             DispatchPlan::Fused(plan) => plan,
             _ => unreachable!("encoding not fusable"),
@@ -201,7 +201,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -246,7 +246,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -290,7 +290,7 @@ fn bench_runend(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -344,7 +344,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
        b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -409,7 +409,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu
index 3c05baf2011..a784df201d3 100644
--- a/vortex-cuda/kernels/src/bit_unpack_16.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_16.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint16_t shared_out[1024];
+  __shared__ uint16_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 2; i++) {
-    _bit_unpack_16_lane<bit_width>(in, shared_out, reference, thread_idx * 2 + i);
+  for (int i = 0; i < FL_LANES / 32; i++) {
+    _bit_unpack_16_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < FL_CHUNK / 32; i++) {
     auto idx = i * 32 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,120 +32,120 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
 extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<16>(in, out, reference, thread_idx, patches);
 }
diff --git a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
index 48c62fde2d4..ff992fa4ffb 100644
--- a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
+++ b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_16_lane<0>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -59,7 +59,7 @@ __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -101,7 +101,7 @@ __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -145,7 +145,7 @@ __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -191,7 +191,7 @@ __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -239,7 +239,7 @@ __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -289,7 +289,7 @@ __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -341,7 +341,7 @@ __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -395,7 +395,7 @@ __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -451,7 +451,7 @@ __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -509,7 +509,7 @@ __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -569,7 +569,7 @@ __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -631,7 +631,7 @@ __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -695,7 +695,7 @@ __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -761,7 +761,7 @@ __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<16>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
 #pragma unroll
   for (int row = 0; row < 16; row++) {
     out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;
diff --git a/vortex-cuda/kernels/src/bit_unpack_32.cu b/vortex-cuda/kernels/src/bit_unpack_32.cu
index 97906f612c4..3f8fcb5c227 100644
--- a/vortex-cuda/kernels/src/bit_unpack_32.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_32.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint32_t shared_out[1024];
+  __shared__ uint32_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 1; i++) {
-    _bit_unpack_32_lane<bit_width>(in, shared_out, reference, thread_idx * 1 + i);
+  for (int i = 0; i < FL_LANES / 32; i++) {
+    _bit_unpack_32_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < FL_CHUNK / 32; i++) {
     auto idx = i * 32 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,232 +32,232 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
 extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<16>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 17));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<17>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 18));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<18>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 19));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<19>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 20));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<20>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 21));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<21>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 22));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<22>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 23));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<23>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 24));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<24>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 25));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<25>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 26));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<26>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 27));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<27>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 28));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<28>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 29));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<29>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 30));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<30>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 31));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<31>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 32));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<32>(in, out, reference, thread_idx, patches);
 }
diff --git a/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
index 4d1206e9f88..58d4195e3aa 100644
--- a/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
+++ b/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_32_lane<0>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<1>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -91,7 +91,7 @@ __device__ void _bit_unpack_32_lane<1>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<2>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -165,7 +165,7 @@ __device__ void _bit_unpack_32_lane<2>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<3>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -241,7 +241,7 @@ __device__ void _bit_unpack_32_lane<3>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<4>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -319,7 +319,7 @@ __device__ void _bit_unpack_32_lane<4>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<5>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -399,7 +399,7 @@ __device__ void _bit_unpack_32_lane<5>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<6>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -481,7 +481,7 @@ __device__ void _bit_unpack_32_lane<6>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<7>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -565,7 +565,7 @@ __device__ void _bit_unpack_32_lane<7>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<8>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -651,7 +651,7 @@ __device__ void _bit_unpack_32_lane<8>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<9>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -739,7 +739,7 @@ __device__ void _bit_unpack_32_lane<9>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<10>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_32_lane<10>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<11>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -921,7 +921,7 @@ __device__ void _bit_unpack_32_lane<11>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<12>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1015,7 +1015,7 @@ __device__ void _bit_unpack_32_lane<12>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<13>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1111,7 +1111,7 @@ __device__ void _bit_unpack_32_lane<13>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<14>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1209,7 +1209,7 @@ __device__ void _bit_unpack_32_lane<14>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<15>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1309,7 +1309,7 @@ __device__ void _bit_unpack_32_lane<15>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<16>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1411,7 +1411,7 @@ __device__ void _bit_unpack_32_lane<16>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<17>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1515,7 +1515,7 @@ __device__ void _bit_unpack_32_lane<17>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<18>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1621,7 +1621,7 @@ __device__ void _bit_unpack_32_lane<18>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<19>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1729,7 +1729,7 @@ __device__ void _bit_unpack_32_lane<19>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<20>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1839,7 +1839,7 @@ __device__ void _bit_unpack_32_lane<20>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<21>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1951,7 +1951,7 @@ __device__ void _bit_unpack_32_lane<21>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<22>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2065,7 +2065,7 @@ __device__ void _bit_unpack_32_lane<22>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<23>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2181,7 +2181,7 @@ __device__ void _bit_unpack_32_lane<23>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<24>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2299,7 +2299,7 @@ __device__ void _bit_unpack_32_lane<24>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<25>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2419,7 +2419,7 @@ __device__ void _bit_unpack_32_lane<25>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<26>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2541,7 +2541,7 @@ __device__ void _bit_unpack_32_lane<26>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<27>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2665,7 +2665,7 @@ __device__ void _bit_unpack_32_lane<27>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<28>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2791,7 +2791,7 @@ __device__ void _bit_unpack_32_lane<28>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<29>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2919,7 +2919,7 @@ __device__ void _bit_unpack_32_lane<29>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<30>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -3049,7 +3049,7 @@ __device__ void _bit_unpack_32_lane<30>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<31>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -3181,7 +3181,7 @@ __device__ void _bit_unpack_32_lane<31>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<32>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
 #pragma unroll
   for (int row = 0; row < 32; row++) {
     out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;
diff --git a/vortex-cuda/kernels/src/bit_unpack_64.cu b/vortex-cuda/kernels/src/bit_unpack_64.cu
index 6270f4f8261..ebe0b125369 100644
--- a/vortex-cuda/kernels/src/bit_unpack_64.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_64.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint64_t shared_out[1024];
+  __shared__ uint64_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 1; i++) {
-    _bit_unpack_64_lane<bit_width>(in, shared_out, reference, thread_idx * 1 + i);
+  for (int i = 0; i < FL_LANES / 16; i++) {
+    _bit_unpack_64_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 16) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 16);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 64; i++) {
+  for (int i = 0; i < FL_CHUNK / 16; i++) {
    auto idx = i * 16 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,456 +32,456 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
 extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<16>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 17));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<17>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 18));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<18>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 19));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<19>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 20));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<20>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 21));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<21>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 22));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<22>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 23));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<23>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 24));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<24>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 25));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<25>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 26));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<26>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 27));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<27>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 28));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<28>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 29));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<29>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 30));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<30>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 31));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<31>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 32));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<32>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 33 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 33));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<33>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 34 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 34));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<34>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 35 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 35));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<35>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 36 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 36));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<36>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 37 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 37));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<37>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 38 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 38));
auto in = full_in + (blockIdx.x * (FL_LANES * 38)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<38>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 39 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 39)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<39>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 40 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 40)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<40>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 41 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 41)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<41>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 42 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 42)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<42>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 43 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 43)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<43>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 44 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 44)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<44>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 45 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 45)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<45>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_46bw_16t(const 
uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 46 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 46)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<46>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 47 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 47)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<47>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 48 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 48)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<48>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 49 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 49)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<49>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 50 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 50)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<50>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 51 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 51)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<51>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 52 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 52)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<52>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 53 / sizeof(uint64_t))); - auto out = full_out + 
(blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 53)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<53>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 54 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 54)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<54>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 55 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 55)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<55>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 56 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 56)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<56>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 57 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 57)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<57>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 58 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 58)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<58>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 59 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 59)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<59>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 60 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 60)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<60>(in, out, reference, thread_idx, patches); } extern "C" __global__ void 
bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 61 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 61)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<61>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 62 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 62)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<62>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 63 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 63)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<63>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 64 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 64)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<64>(in, out, reference, thread_idx, patches); } diff --git a/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh index 0f20fee4e7d..afc17737b40 100644 --- a/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh +++ b/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh @@ -19,7 +19,7 @@ __device__ void _bit_unpack_64_lane<0>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<1>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -155,7 +155,7 @@ __device__ void _bit_unpack_64_lane<1>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<2>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -293,7 +293,7 @@ __device__ void _bit_unpack_64_lane<2>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<3>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -433,7 +433,7 @@ __device__ void _bit_unpack_64_lane<3>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<4>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned 
int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -575,7 +575,7 @@ __device__ void _bit_unpack_64_lane<4>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<5>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -719,7 +719,7 @@ __device__ void _bit_unpack_64_lane<5>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<6>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -865,7 +865,7 @@ __device__ void _bit_unpack_64_lane<6>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<7>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1013,7 +1013,7 @@ __device__ void _bit_unpack_64_lane<7>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<8>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1163,7 +1163,7 @@ __device__ void _bit_unpack_64_lane<8>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<9>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1315,7 +1315,7 @@ __device__ void _bit_unpack_64_lane<9>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<10>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1469,7 +1469,7 @@ __device__ void _bit_unpack_64_lane<10>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<11>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1625,7 +1625,7 @@ __device__ void _bit_unpack_64_lane<11>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<12>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1783,7 +1783,7 @@ __device__ void _bit_unpack_64_lane<12>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<13>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1943,7 +1943,7 @@ __device__ void _bit_unpack_64_lane<13>(const uint64_t *__restrict in, uint64_t 
template <> __device__ void _bit_unpack_64_lane<14>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2105,7 +2105,7 @@ __device__ void _bit_unpack_64_lane<14>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<15>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2269,7 +2269,7 @@ __device__ void _bit_unpack_64_lane<15>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<16>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2435,7 +2435,7 @@ __device__ void _bit_unpack_64_lane<16>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<17>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2603,7 +2603,7 @@ __device__ void _bit_unpack_64_lane<17>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<18>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2773,7 +2773,7 @@ __device__ void _bit_unpack_64_lane<18>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<19>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2945,7 +2945,7 @@ __device__ void _bit_unpack_64_lane<19>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<20>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3119,7 +3119,7 @@ __device__ void _bit_unpack_64_lane<20>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<21>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3295,7 +3295,7 @@ __device__ void _bit_unpack_64_lane<21>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<22>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3473,7 +3473,7 @@ __device__ void _bit_unpack_64_lane<22>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<23>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int 
LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3653,7 +3653,7 @@ __device__ void _bit_unpack_64_lane<23>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<24>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3835,7 +3835,7 @@ __device__ void _bit_unpack_64_lane<24>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<25>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4019,7 +4019,7 @@ __device__ void _bit_unpack_64_lane<25>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<26>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4205,7 +4205,7 @@ __device__ void _bit_unpack_64_lane<26>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<27>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4393,7 +4393,7 @@ __device__ void _bit_unpack_64_lane<27>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<28>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4583,7 +4583,7 @@ __device__ void _bit_unpack_64_lane<28>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<29>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4775,7 +4775,7 @@ __device__ void _bit_unpack_64_lane<29>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<30>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4969,7 +4969,7 @@ __device__ void _bit_unpack_64_lane<30>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<31>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5165,7 +5165,7 @@ __device__ void _bit_unpack_64_lane<31>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<32>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5363,7 +5363,7 @@ __device__ void 
_bit_unpack_64_lane<32>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<33>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5563,7 +5563,7 @@ __device__ void _bit_unpack_64_lane<33>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<34>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5765,7 +5765,7 @@ __device__ void _bit_unpack_64_lane<34>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<35>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5969,7 +5969,7 @@ __device__ void _bit_unpack_64_lane<35>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<36>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6175,7 +6175,7 @@ __device__ void _bit_unpack_64_lane<36>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<37>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6383,7 +6383,7 @@ __device__ void _bit_unpack_64_lane<37>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<38>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6593,7 +6593,7 @@ __device__ void _bit_unpack_64_lane<38>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<39>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6805,7 +6805,7 @@ __device__ void _bit_unpack_64_lane<39>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<40>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7019,7 +7019,7 @@ __device__ void _bit_unpack_64_lane<40>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<41>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7235,7 +7235,7 @@ __device__ void _bit_unpack_64_lane<41>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<42>(const uint64_t *__restrict in, uint64_t *__restrict 
out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7453,7 +7453,7 @@ __device__ void _bit_unpack_64_lane<42>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<43>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7673,7 +7673,7 @@ __device__ void _bit_unpack_64_lane<43>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<44>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7895,7 +7895,7 @@ __device__ void _bit_unpack_64_lane<44>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<45>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8119,7 +8119,7 @@ __device__ void _bit_unpack_64_lane<45>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<46>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8345,7 +8345,7 @@ __device__ void _bit_unpack_64_lane<46>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<47>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8573,7 +8573,7 @@ __device__ void _bit_unpack_64_lane<47>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<48>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8803,7 +8803,7 @@ __device__ void _bit_unpack_64_lane<48>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<49>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9035,7 +9035,7 @@ __device__ void _bit_unpack_64_lane<49>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<50>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9269,7 +9269,7 @@ __device__ void _bit_unpack_64_lane<50>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<51>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; 
@@ -9505,7 +9505,7 @@ __device__ void _bit_unpack_64_lane<51>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<52>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9743,7 +9743,7 @@ __device__ void _bit_unpack_64_lane<52>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<53>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9983,7 +9983,7 @@ __device__ void _bit_unpack_64_lane<53>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<54>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10225,7 +10225,7 @@ __device__ void _bit_unpack_64_lane<54>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<55>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10469,7 +10469,7 @@ __device__ void _bit_unpack_64_lane<55>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<56>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10715,7 +10715,7 @@ __device__ void _bit_unpack_64_lane<56>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<57>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10963,7 +10963,7 @@ __device__ void _bit_unpack_64_lane<57>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<58>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11213,7 +11213,7 @@ __device__ void _bit_unpack_64_lane<58>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<59>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11465,7 +11465,7 @@ __device__ void _bit_unpack_64_lane<59>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<60>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11719,7 +11719,7 @@ __device__ void _bit_unpack_64_lane<60>(const uint64_t *__restrict in, uint64_t template <> __device__ void 
_bit_unpack_64_lane<61>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11975,7 +11975,7 @@ __device__ void _bit_unpack_64_lane<61>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<62>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -12233,7 +12233,7 @@ __device__ void _bit_unpack_64_lane<62>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<63>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -12493,7 +12493,7 @@ __device__ void _bit_unpack_64_lane<63>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<64>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < 64; row++) { out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; diff --git a/vortex-cuda/kernels/src/bit_unpack_8.cu b/vortex-cuda/kernels/src/bit_unpack_8.cu index 064970d650f..b2fcfd26f04 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8.cu +++ b/vortex-cuda/kernels/src/bit_unpack_8.cu @@ -4,19 +4,19 @@ template __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx, GPUPatches& patches) { - __shared__ uint8_t shared_out[1024]; + __shared__ uint8_t shared_out[FL_CHUNK]; // Step 1: Unpack into shared memory #pragma unroll - for (int i = 0; i < 4; i++) { - _bit_unpack_8_lane(in, shared_out, reference, thread_idx * 4 + i); + for (int i = 0; i < FL_LANES / 32; i++) { + _bit_unpack_8_lane(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i); } __syncwarp(); // Step 2: Apply patches to shared memory in parallel PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32); auto patch = cursor.next(); - while (patch.index != 1024) { + while (patch.index != FL_CHUNK) { shared_out[patch.index] = patch.value; patch = cursor.next(); } @@ -24,7 +24,7 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re // Step 3: Copy to global memory #pragma unroll - for (int i = 0; i < 32; i++) { + for (int i = 0; i < FL_CHUNK / 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } @@ -32,64 +32,64 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 0)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<0>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = 
full_in + (blockIdx.x * (128 * 1 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 1)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<1>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 2)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<2>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 3)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<3>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 4)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<4>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 5)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<5>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 6)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<6>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 7)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<7>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 8)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<8>(in, out, reference, thread_idx, patches); } diff --git 
a/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh index 64fb8ce39a8..d9c3550cb18 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh +++ b/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh @@ -19,7 +19,7 @@ __device__ void _bit_unpack_8_lane<0>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<1>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -43,7 +43,7 @@ __device__ void _bit_unpack_8_lane<1>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<2>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -69,7 +69,7 @@ __device__ void _bit_unpack_8_lane<2>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<3>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -97,7 +97,7 @@ __device__ void _bit_unpack_8_lane<3>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<4>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -127,7 +127,7 @@ __device__ void _bit_unpack_8_lane<4>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<5>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -159,7 +159,7 @@ __device__ void _bit_unpack_8_lane<5>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<6>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -193,7 +193,7 @@ __device__ void _bit_unpack_8_lane<6>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<7>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -229,7 +229,7 @@ __device__ void _bit_unpack_8_lane<7>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<8>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < 8; row++) { out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index d3950485435..9a413af17c1 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -52,6 +52,7 @@ #include "bit_unpack.cuh" #include 
"dynamic_dispatch.h" +#include "patches.cuh" #include "types.cuh" // ═══════════════════════════════════════════════════════════════════════════ @@ -109,8 +110,12 @@ __shared__ uint64_t runend_cursors[BLOCK_SIZE]; // ═══════════════════════════════════════════════════════════════════════════ /// Apply one scalar operation to N values in registers. +/// +/// `abs_pos` is the absolute output position of the first value to process. +/// It is used by scalar operations that apply patches, e.g. ALP. template -__device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem) { +__device__ inline void +scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t abs_pos = 0) { switch (op.op_code) { case ScalarOp::FOR: { const T ref = static_cast(op.params.frame_of_ref.reference); @@ -134,6 +139,33 @@ __device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__r float r = static_cast(static_cast(values[i])) * f * e; values[i] = static_cast(__float_as_uint(r)); } + // Apply ALP patches: override positions whose float value couldn't + // be reconstructed through the ALP encode/decode cycle. + // Per-value cursor — with a slice offset, a tile's N values can + // straddle two FL chunks, so each value needs its own lookup. + if (op.params.alp.patches_ptr != 0) { + const auto &patches = *reinterpret_cast(op.params.alp.patches_ptr); + // The sliced chunk_offsets array starts at original chunk + // (offset / FL_CHUNK). PatchesCursor indexes from 0, so + // subtract that base to get the index into chunk_offsets. + const uint32_t chunk_start = patches.offset / FL_CHUNK; +#pragma unroll + for (uint32_t i = 0; i < N; ++i) { + uint64_t my_pos = (N > 1) ? abs_pos + i * blockDim.x + threadIdx.x : abs_pos; + uint64_t orig = my_pos + patches.offset; + uint32_t chunk = static_cast(orig / FL_CHUNK) - chunk_start; + uint32_t within = static_cast(orig % FL_CHUNK); + PatchesCursor cursor(patches, chunk, 0, 1); + auto patch = cursor.next(); + while (patch.index != FL_CHUNK) { + if (patch.index == within) { + values[i] = patch.value; + break; + } + patch = cursor.next(); + } + } + } break; } case ScalarOp::DICT: { @@ -149,6 +181,24 @@ __device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__r } } +// ═══════════════════════════════════════════════════════════════════════════ +// Patches +// ═══════════════════════════════════════════════════════════════════════════ + +/// Scatter patches for a single chunk into `out` using PatchesCursor. +/// All threads in the block cooperate. Caller must issue __syncthreads() +/// afterward if other threads read from `out`. 
+template <typename T> +__device__ __forceinline__ void +scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chunk) { + PatchesCursor cursor(patches, chunk, threadIdx.x, blockDim.x); + auto patch = cursor.next(); + while (patch.index != FL_CHUNK) { + out[patch.index] = patch.value; + patch = cursor.next(); + } +} + // ═══════════════════════════════════════════════════════════════════════════ // Source ops // ═══════════════════════════════════════════════════════════════════════════ @@ -162,11 +212,8 @@ __device__ inline void bitunpack(const T *__restrict packed, uint64_t chunk_start, uint32_t chunk_len, const struct SourceOp &src) { - constexpr uint32_t T_BITS = sizeof(T) * 8; - constexpr uint32_t FL_CHUNK = 1024; - constexpr uint32_t LANES = FL_CHUNK / T_BITS; const uint32_t bw = src.params.bitunpack.bit_width; - const uint32_t words_per_block = LANES * bw; + const uint32_t words_per_block = FL_LANES * bw; const uint32_t elem_off = src.params.bitunpack.element_offset; const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK; const uint64_t first_block = (chunk_start + elem_off) / FL_CHUNK; @@ -177,9 +224,15 @@ __device__ inline void bitunpack(const T *__restrict packed, for (uint32_t c = 0; c < n_chunks; ++c) { const T *src_chunk = packed + (first_block + c) * words_per_block; T *chunk_dst = dst + c * FL_CHUNK; - for (uint32_t lane = threadIdx.x; lane < LANES; lane += blockDim.x) { + for (uint32_t lane = threadIdx.x; lane < FL_LANES; lane += blockDim.x) { bit_unpack_lane(src_chunk, chunk_dst, 0, lane, bw); } + // Apply BitPacked patches inline, matching the standalone kernel pattern. + if (src.params.bitunpack.patches_ptr != 0) { + __syncthreads(); + const auto &patches = *reinterpret_cast<const GPUPatches *>(src.params.bitunpack.patches_ptr); + scatter_patches_chunk(patches, chunk_dst, first_block + c); + } } } @@ -282,15 +335,12 @@ __device__ void execute_output_stage(T *__restrict output, // Cap at 4 values per thread per tile to minimise register pressure. constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4; const uint32_t tile_size = blockDim.x * VALUES_PER_TILE; + const auto &src = stage.source; const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr); const PTypeTag ptype = stage.source_ptype; if (src.op_code == SourceOp::RUNEND) { - // Seed each thread's cursor with the run containing its first - // strided position. The RUNEND arm in source_op advances the - // cursor monotonically, so this avoids a full binary search on - // every element. const T *ends = reinterpret_cast<const T *>(smem + src.params.runend.ends_smem_byte_offset); runend_cursors[threadIdx.x] = upper_bound(ends, src.params.runend.num_runs, @@ -301,10 +351,6 @@ uint32_t chunk_len; const T *smem_src = nullptr; - // BITUNPACK uses smem scratch, so the outer loop advances one - // chunk at a time. LOAD, SEQUENCE, and RUNEND need no smem - // scratch, so chunk_len = block_len (single outer iteration); - // tiling happens in the inner tile_idx loop.
if (src.op_code == SourceOp::BITUNPACK) { chunk_len = bitunpack_tile_len(stage, block_len, elem_idx); T *scratch = reinterpret_cast(smem + stage.smem_byte_offset); @@ -313,10 +359,10 @@ __device__ void execute_output_stage(T *__restrict output, block_start + elem_idx, chunk_len, src); - constexpr uint32_t FL_CHUNK = 1024; // FastLanes chunk size const uint32_t align = (block_start + elem_idx + src.params.bitunpack.element_offset) % FL_CHUNK; smem_src = scratch + align; - // Write barrier: all threads finished bitunpack, safe to read from scratch. + // Write barrier: all threads finished bitunpack (and any + // patches), safe to read from scratch. __syncthreads(); } else { chunk_len = block_len; @@ -337,7 +383,7 @@ __device__ void execute_output_stage(T *__restrict output, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(values, stage.scalar_ops[op], smem); + scalar_op(values, stage.scalar_ops[op], smem, tile_start); } #pragma unroll @@ -359,7 +405,7 @@ __device__ void execute_output_stage(T *__restrict output, source_op(&val, src, raw_input, ptype, smem_src, i, gpos, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, gpos); } __stcs(&output[gpos], val); } @@ -392,24 +438,27 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) { const auto &src = stage.source; if (src.op_code == SourceOp::BITUNPACK) { + T *raw_smem = smem_out; bitunpack(reinterpret_cast(stage.input_ptr), smem_out, 0, stage.len, src); - smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE; // Write barrier: cooperative bitunpack finished, safe to read - // decoded elements in the scalar-op loop below. + // decoded elements below. __syncthreads(); + smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE; + if (stage.num_scalar_ops > 0) { - for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) { - T val = smem_out[i]; + for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) { + T val = smem_out[elem_idx]; for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, elem_idx); } - smem_out[i] = val; + smem_out[elem_idx] = val; } // Write barrier: scalar ops applied in-place, smem region is // now fully populated for subsequent stages to read. __syncthreads(); } + } else { if (src.op_code == SourceOp::RUNEND) { // Seed each thread's cursor with the run containing its first @@ -421,13 +470,13 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) { upper_bound(ends, src.params.runend.num_runs, threadIdx.x + src.params.runend.offset); } const void *raw_input = reinterpret_cast(stage.input_ptr); - for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) { + for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) { T val; - source_op(&val, src, raw_input, stage.source_ptype, nullptr, 0, i, smem); + source_op(&val, src, raw_input, stage.source_ptype, nullptr, 0, elem_idx, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, elem_idx); } - smem_out[i] = val; + smem_out[elem_idx] = val; } // Write barrier: smem region is fully populated for subsequent // stages to read. 
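
Taken together, the changes in this file mean patches are applied inside the op that owns them, so every downstream scalar op observes already-patched values. For the FoR-over-BitPacked-with-patches case that the tests exercise, the ordering can be stated as a minimal CPU reference; the names here are illustrative only.

```rust
/// Illustrative CPU reference for fused FoR(BitPacked + patches):
/// patches belong to the bitunpack op, so they replace the unpacked
/// residuals *before* the FOR scalar op adds the reference.
fn decode_for_bitpacked(residuals: &[u32], patches: &[(usize, u32)], reference: u32) -> Vec<u32> {
    let mut out = residuals.to_vec();
    for &(idx, value) in patches {
        out[idx] = value; // bitunpack-owned patches: raw residual values
    }
    for v in &mut out {
        *v = v.wrapping_add(reference); // FOR scalar op runs after patching
    }
    out
}

fn main() {
    // Residual 5 patched to 500, then reference 42 added on top.
    let out = decode_for_bitpacked(&[1, 5, 3], &[(1, 500)], 42);
    assert_eq!(out, vec![43, 542, 45]);
}
```
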
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h index 95540c51581..9bba3f50a04 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.h +++ b/vortex-cuda/kernels/src/dynamic_dispatch.h @@ -103,11 +103,18 @@ extern "C" { #endif /// Parameters for source ops, which decode data into a stage's shared memory region. +/// +/// patches_ptr lives on the union variant that owns it (BitunpackParams, +/// AlpParams) — not per-stage — so the pointer is tied to its op. +/// Adding a u64 can grow the union and every ScalarOp/SourceOp read in the +/// tile loop; SourceParams was already 24 B (no change), ScalarParams grew +/// 8→16 B (no measurable impact — tile loop is compute-bound). union SourceParams { /// Unpack FastLanes bit-packed data. struct BitunpackParams { uint8_t bit_width; uint32_t element_offset; // Sub-byte offset + uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none) } bitunpack; /// Copy from global to shared memory. @@ -157,6 +164,7 @@ union ScalarParams { struct AlpParams { float f; float e; + uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none) } alp; /// Dictionary gather: use current value as index into decoded values in smem. @@ -193,6 +201,7 @@ struct PackedStage { uint32_t len; // number of elements this stage produces struct SourceOp source; + uint8_t num_scalar_ops; enum PTypeTag source_ptype; // PType produced by the source op }; @@ -220,11 +229,12 @@ struct __attribute__((aligned(8))) PlanHeader { /// change the type; the final output PType is given by the last scalar op's /// `output_ptype` (or `source_ptype` if there are no scalar ops). struct Stage { - uint64_t input_ptr; // encoded input in global memory - uint32_t smem_byte_offset; // byte offset within dynamic shared memory - uint32_t len; // elements produced - enum PTypeTag source_ptype; // PType produced by the source op - struct SourceOp source; // source decode op + uint64_t input_ptr; // encoded input in global memory + uint32_t smem_byte_offset; // byte offset within dynamic shared memory + uint32_t len; // elements produced + enum PTypeTag source_ptype; // PType produced by the source op + struct SourceOp source; // source decode op + uint8_t num_scalar_ops; // number of scalar ops const struct ScalarOp *scalar_ops; // scalar decode ops }; @@ -247,6 +257,7 @@ __device__ inline Stage parse_stage(const uint8_t *&cursor) { .len = packed_stage->len, .source_ptype = packed_stage->source_ptype, .source = packed_stage->source, + .num_scalar_ops = packed_stage->num_scalar_ops, .scalar_ops = ops, }; diff --git a/vortex-cuda/kernels/src/fastlanes_common.cuh b/vortex-cuda/kernels/src/fastlanes_common.cuh index 660a1c554f4..8536de26789 100644 --- a/vortex-cuda/kernels/src/fastlanes_common.cuh +++ b/vortex-cuda/kernels/src/fastlanes_common.cuh @@ -8,8 +8,25 @@ // FastLanes ordering array __constant__ int FL_ORDER[] = {0, 4, 2, 6, 1, 5, 3, 7}; +// FastLanes organises every 1024-element vector into a transposed layout +// of FL_LANES columns × (1024 / FL_LANES) rows. Each column is a "lane" +// that can be processed independently of every other lane, which is what +// makes all FastLanes encodings (FFOR, DELTA, RLE, ALP, …) fully +// data-parallel. One CUDA thread or one CPU SIMD lane handles one +// FastLanes lane. +// +// Paper: https://ir.cwi.nl/pub/35881/35881.pdf +// Repo: https://github.com/cwida/FastLanes + +/// FastLanes chunk size in elements. 
+constexpr uint32_t FL_CHUNK = 1024; + +/// Number of FastLanes lanes for element type T (1024 / bit-width). +template +constexpr uint32_t FL_LANES = FL_CHUNK / (sizeof(T) * 8); + // Compute the index in the FastLanes layout #define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane) // Create a mask with 'width' bits set -#define MASK(T, width) (((T)1 << width) - 1) \ No newline at end of file +#define MASK(T, width) (((T)1 << width) - 1) diff --git a/vortex-cuda/kernels/src/patches.cuh b/vortex-cuda/kernels/src/patches.cuh index 66a97b935e9..fa9ff18def9 100644 --- a/vortex-cuda/kernels/src/patches.cuh +++ b/vortex-cuda/kernels/src/patches.cuh @@ -3,6 +3,7 @@ #pragma once +#include "fastlanes_common.cuh" #include "patches.h" /// Load a chunk offset value, dispatching on the runtime type. @@ -21,8 +22,8 @@ __device__ inline uint32_t load_chunk_offset(const GPUPatches &patches, uint32_t } /// A single patch: a within-chunk index and its replacement value. -/// A sentinel patch has index == 1024, which can never match a valid -/// within-chunk position (0–1023). +/// A sentinel patch has index == FL_CHUNK, which can never match a valid +/// within-chunk position (0–FL_CHUNK-1). template struct Patch { uint16_t index; @@ -38,7 +39,7 @@ struct Patch { /// /// PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32); /// auto patch = cursor.next(); -/// while (patch.index != 1024) { +/// while (patch.index != FL_CHUNK) { /// shared_out[patch.index] = patch.value; /// patch = cursor.next(); /// } @@ -89,15 +90,15 @@ public: // The iterator returns indices relative to the start of the chunk. // `chunk_base` is the index of the first element within a chunk, accounting // for the slice offset. - chunk_base = chunk * 1024 + patches.offset; - chunk_base -= min(chunk_base, patches.offset % 1024); + chunk_base = chunk * FL_CHUNK + patches.offset; + chunk_base -= min(chunk_base, patches.offset % FL_CHUNK); } /// Return the current patch (with within-chunk index) and advance, /// or a sentinel {1024, 0} if exhausted. __device__ Patch next() { if (remaining == 0) { - return {1024, T {}}; + return {FL_CHUNK, T {}}; } uint16_t within_chunk = static_cast(*indices - chunk_base); Patch patch = {within_chunk, *values}; @@ -110,6 +111,6 @@ public: private: const uint32_t *indices; const T *values; - uint8_t remaining; + uint32_t remaining; uint32_t chunk_base; }; diff --git a/vortex-cuda/src/bit_unpack_gen.rs b/vortex-cuda/src/bit_unpack_gen.rs index 8d5eda920cd..2482c0996b8 100644 --- a/vortex-cuda/src/bit_unpack_gen.rs +++ b/vortex-cuda/src/bit_unpack_gen.rs @@ -48,12 +48,7 @@ fn write_row(output: &mut impl Write, bits: usize, bit_width: usize, row: usize) /// loop. For all other bit widths, emits pre-computed per-row bit extraction /// with register-cached `src` words — identical to the original hand-unrolled /// codegen, preserving minimal memory loads and zero extra work. 
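
The FL_LANES template above makes the lane count a pure function of the element width. For reference, a sketch of the resulting table, which the `generate_lane_decoder` specializations below rely on:

```rust
const FL_CHUNK: u32 = 1024;

/// FastLanes lane count for an element width in bits: FL_CHUNK / bits.
/// One lane per CUDA thread (or CPU SIMD lane); rows per lane = bits.
const fn fl_lanes(elem_bits: u32) -> u32 {
    FL_CHUNK / elem_bits
}

fn main() {
    assert_eq!(fl_lanes(8), 128); // u8:  128 lanes x 8 rows
    assert_eq!(fl_lanes(16), 64); // u16: 64 lanes x 16 rows
    assert_eq!(fl_lanes(32), 32); // u32: 32 lanes x 32 rows
    assert_eq!(fl_lanes(64), 16); // u64: 16 lanes x 64 rows
}
```
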
-fn generate_lane_decoder( - output: &mut impl Write, - bits: usize, - lanes: usize, - bit_width: usize, -) -> io::Result<()> { +fn generate_lane_decoder(output: &mut impl Write, bits: usize, bit_width: usize) -> io::Result<()> { if bit_width == 0 { write!( output, @@ -71,7 +66,7 @@ __device__ void _bit_unpack_{bits}_lane<0>(const uint{bits}_t *__restrict in, ui output, r#"template <> __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, unsigned int lane) {{ - unsigned int LANE_COUNT = {lanes}; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < {bits}; row++) {{ out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; @@ -84,7 +79,7 @@ __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restr output, r#"template <> __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, unsigned int lane) {{ - unsigned int LANE_COUNT = {lanes}; + constexpr unsigned int LANE_COUNT = FL_LANES; uint{bits}_t src; uint{bits}_t tmp; src = in[lane]; @@ -142,29 +137,25 @@ __device__ __noinline__ void bit_unpack_{bits}_lane( fn generate_device_kernel_template( output: &mut impl Write, bits: usize, - lanes: usize, thread_count: usize, ) -> io::Result<()> { - let per_thread_loop_count = lanes / thread_count; - let shared_copy_ncount = 1024 / thread_count; - write!( output, r#"template __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches) {{ - __shared__ uint{bits}_t shared_out[1024]; + __shared__ uint{bits}_t shared_out[FL_CHUNK]; // Step 1: Unpack into shared memory #pragma unroll - for (int i = 0; i < {per_thread_loop_count}; i++) {{ - _bit_unpack_{bits}_lane(in, shared_out, reference, thread_idx * {per_thread_loop_count} + i); + for (int i = 0; i < FL_LANES / {thread_count}; i++) {{ + _bit_unpack_{bits}_lane(in, shared_out, reference, thread_idx * (FL_LANES / {thread_count}) + i); }} __syncwarp(); // Step 2: Apply patches to shared memory in parallel PatchesCursor cursor(patches, blockIdx.x, thread_idx, {thread_count}); auto patch = cursor.next(); - while (patch.index != 1024) {{ + while (patch.index != FL_CHUNK) {{ shared_out[patch.index] = patch.value; patch = cursor.next(); }} @@ -172,7 +163,7 @@ __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uin // Step 3: Copy to global memory #pragma unroll - for (int i = 0; i < {shared_copy_ncount}; i++) {{ + for (int i = 0; i < FL_CHUNK / {thread_count}; i++) {{ auto idx = i * {thread_count} + thread_idx; out[idx] = shared_out[idx]; }} @@ -194,8 +185,8 @@ fn generate_global_kernel( output, r#"extern "C" __global__ void {func_name}(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference, GPUPatches patches) {{ int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * {bit_width})); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_{bits}_device<{bit_width}>(in, out, reference, thread_idx, patches); }} "# @@ -210,7 +201,6 @@ fn generate_global_kernel( /// not pull in the 129 standalone bit-unpack kernel entry points. 
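
The loop bounds the generator previously baked in as `per_thread_loop_count` and `shared_copy_ncount` now fall out of FL_LANES and FL_CHUNK. A quick check of the arithmetic, assuming the 32-thread configuration the 16-bit kernels use; the helper name is illustrative.

```rust
/// Per-thread work in the generated device kernel: step 1 unpacks
/// FL_LANES / thread_count lanes per thread, step 3 copies
/// FL_CHUNK / thread_count elements per thread. These expressions replace
/// the old per_thread_loop_count and shared_copy_ncount template variables.
fn per_thread_work(elem_bits: u32, thread_count: u32) -> (u32, u32) {
    let fl_lanes = 1024 / elem_bits;
    (fl_lanes / thread_count, 1024 / thread_count)
}

fn main() {
    // 16-bit kernels with 32 threads: 2 lanes and 32 copies per thread,
    // exactly the values the generated code used to hard-code.
    assert_eq!(per_thread_work(16, 32), (2, 32));
}
```
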
pub fn generate_cuda_unpack_lanes(output: &mut impl Write) -> io::Result<()> { let bits = T::T; - let lanes = T::LANES; write!( output, @@ -230,7 +220,7 @@ __device__ void _bit_unpack_{bits}_lane(const uint{bits}_t *__restrict in, uint{ // Lane-decoder template specializations (one per bit width). for bit_width in 0..=bits { - generate_lane_decoder(output, bits, lanes, bit_width)?; + generate_lane_decoder(output, bits, bit_width)?; writeln!(output)?; } @@ -250,7 +240,6 @@ pub fn generate_cuda_unpack_kernels( thread_count: usize, ) -> io::Result<()> { let bits = T::T; - let lanes = T::LANES; write!( output, @@ -262,7 +251,7 @@ pub fn generate_cuda_unpack_kernels( )?; // Device kernel template (written once, instantiated per bit width). - generate_device_kernel_template(output, bits, lanes, thread_count)?; + generate_device_kernel_template(output, bits, thread_count)?; writeln!(output)?; // Thin extern "C" global-kernel wrappers (one per bit width). diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs index 847fa39340d..46aa35d0465 100644 --- a/vortex-cuda/src/dynamic_dispatch/mod.rs +++ b/vortex-cuda/src/dynamic_dispatch/mod.rs @@ -313,6 +313,7 @@ impl SourceOp { bitunpack: SourceParams_BitunpackParams { bit_width, element_offset: u32::from(element_offset), + patches_ptr: 0, }, }, } @@ -393,7 +394,11 @@ impl ScalarOp { op_code: ScalarOp_ScalarOpCode_ALP, output_ptype: PTypeTag_PTYPE_F32, params: ScalarParams { - alp: ScalarParams_AlpParams { f, e }, + alp: ScalarParams_AlpParams { + f, + e, + patches_ptr: 0, + }, }, } } @@ -493,6 +498,7 @@ impl MaterializedPlan { #[cfg(test)] mod tests { + use std::ops::Range; use std::sync::Arc; use cudarc::driver::DevicePtr; @@ -515,6 +521,7 @@ mod tests { use vortex::encodings::alp::alp_encode; use vortex::encodings::fastlanes::BitPacked; use vortex::encodings::fastlanes::BitPackedArray; + use vortex::encodings::fastlanes::BitPackedArrayExt; use vortex::encodings::fastlanes::FoR; use vortex::encodings::fastlanes::FoRArrayExt; use vortex::encodings::runend::RunEnd; @@ -550,7 +557,7 @@ mod tests { fn dispatch_plan( array: &vortex::array::ArrayRef, - ctx: &CudaExecutionCtx, + ctx: &mut CudaExecutionCtx, ) -> VortexResult { match DispatchPlan::new(array)? 
{ DispatchPlan::Fused(plan) => plan.materialize(ctx), @@ -778,8 +785,8 @@ mod tests { .collect(); let bp = bitpacked_array_u32(bit_width, len); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&bp.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -803,8 +810,8 @@ mod tests { let bp = bitpacked_array_u32(bit_width, len); let for_arr = FoR::try_new(bp.into_array(), Scalar::from(reference))?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -830,7 +837,7 @@ mod tests { let values_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); let re = RunEnd::new(ends_arr, values_arr, cuda_ctx.execution_ctx()); - let plan = dispatch_plan(&re.into_array(), &cuda_ctx)?; + let plan = dispatch_plan(&re.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -870,8 +877,8 @@ mod tests { let dict = DictArray::try_new(codes_bp.into_array(), dict_for.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -907,8 +914,8 @@ mod tests { None, ); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&tree.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&tree.into_array(), &mut cuda_ctx)?; let actual = run_dispatch_plan_f32(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -940,8 +947,8 @@ mod tests { )?; let zz = ZigZag::try_new(bp.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&zz.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&zz.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -970,7 +977,7 @@ mod tests { let re = RunEnd::new(ends_arr, values_arr, cuda_ctx.execution_ctx()); let for_arr = FoR::try_new(re.into_array(), Scalar::from(reference))?; - let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -998,8 +1005,8 @@ mod tests { let dict = DictArray::try_new(codes_prim.into_array(), values_prim.into_array())?; let for_arr = FoR::try_new(dict.into_array(), Scalar::from(reference))?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = 
dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1030,8 +1037,8 @@ mod tests { let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable); let dict = DictArray::try_new(codes_for.into_array(), values_prim.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1059,8 +1066,8 @@ mod tests { let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1203,8 +1210,8 @@ mod tests { let expected: Vec = data[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1258,8 +1265,8 @@ mod tests { let sliced = zz.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1308,8 +1315,8 @@ mod tests { .map(|&c| dict_values[c as usize]) .collect(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1357,8 +1364,8 @@ mod tests { let sliced = bp.into_array().slice(slice_start..slice_end)?; let expected: Vec = data[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1410,8 +1417,8 @@ mod tests { let sliced = for_arr.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let 
actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1475,8 +1482,8 @@ mod tests { let sliced = dict.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1507,8 +1514,8 @@ mod tests { let seq = Sequence::try_new_typed(base, multiplier, Nullability::NonNullable, len)?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&seq.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1540,8 +1547,8 @@ mod tests { let seq = Sequence::try_new_typed(base, multiplier, Nullability::NonNullable, len)?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&seq.into_array(), &mut cuda_ctx)?; let actual_u32 = run_dynamic_dispatch_plan( &cuda_ctx, @@ -2127,4 +2134,438 @@ mod tests { assert_eq!(prim.validity()?.nullability(), Nullability::Nullable); Ok(()) } + + // --------------------------------------------------------------- + // Patch tests — fused dynamic dispatch with exception values + // --------------------------------------------------------------- + + #[rstest] + #[case::unsliced(3000, None)] + #[case::mid_slice(5000, Some(500..3500))] + #[case::start_slice(5000, Some(0..1000))] + #[case::chunk_aligned(5000, Some(1024..3000))] + #[crate::test] + fn test_bitpacked_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 1000 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let (array, expected) = if let Some(range) = slice_range { + let sliced = bp.into_array().slice(range.clone())?; + (sliced, values[range].to_vec()) + } else { + (bp.into_array(), values) + }; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dynamic_dispatch_plan( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + assert_eq!(actual, expected); + Ok(()) + } + + #[rstest] + #[case::unsliced(3000, None)] + #[case::mid_slice(5000, Some(500..3500))] + #[crate::test] + fn test_for_bitpacked_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let bit_width: u8 = 6; + let reference = 42u32; + let max_val = (1u32 << bit_width) - 1; + let residuals: Vec = (0..len) + .map(|i| { + if i % 200 == 0 { + 500 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + let all_values: Vec = residuals.iter().map(|&v| v + reference).collect(); + + let prim = 
PrimitiveArray::new(Buffer::from(residuals), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + let for_arr = FoR::try_new(bp.into_array(), Scalar::from(reference))?; + + let (array, expected) = if let Some(range) = slice_range { + let sliced = for_arr.into_array().slice(range.clone())?; + (sliced, all_values[range].to_vec()) + } else { + (for_arr.into_array(), all_values) + }; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dynamic_dispatch_plan( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + assert_eq!(actual, expected); + Ok(()) + } + + #[rstest] + #[case::unsliced(2000, None)] + #[case::mid_slice(5000, Some(100..4000))] + #[case::large_offset(5000, Some(1500..4500))] + #[crate::test] + fn test_alp_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let mut values: Vec = (0..len).map(|i| (i as f32) * 1.1).collect(); + // Insert exception values that ALP can't encode. + values[0] = 99.9; + values[500] = std::f32::consts::PI; + values[1024] = std::f32::consts::E; + if len > 2048 { + values[2048] = std::f32::consts::LN_2; + } + if len > 3333 { + values[3333] = std::f32::consts::SQRT_2; + } + + let float_prim = PrimitiveArray::new(Buffer::from(values), NonNullable); + let encoded = alp_encode( + float_prim.as_view(), + None, + &mut LEGACY_SESSION.create_execution_ctx(), + )? + .into_array(); + + let (array, base_offset) = if let Some(ref range) = slice_range { + (encoded.slice(range.clone())?, range.start) + } else { + (encoded, 0) + }; + + // Decode on CPU as ground truth (accounts for ALP precision loss + patches). + let cpu_decoded = array + .clone() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())?; + let expected: Vec = cpu_decoded.as_slice::().to_vec(); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dispatch_plan_f32( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + for (i, (&a, &e)) in actual.iter().zip(expected.iter()).enumerate() { + assert!( + a.to_bits() == e.to_bits(), + "mismatch at index {i} (original index {}): gpu={a} cpu={e} (bits: {:#010x} vs {:#010x})", + i + base_offset, + a.to_bits(), + e.to_bits(), + ); + } + Ok(()) + } + + // --------------------------------------------------------------- + // Additional patch tests — typed widths, edge cases, composites + // --------------------------------------------------------------- + + /// u8 BitPacked with patches (bit_width=3, patch values > 7). 
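
The typed tests that follow (u8/u16/u64) all build data of the same shape: values cycle through the packable range, with a periodic outlier that exceeds the maximum storable in `bit_width` bits and therefore must become a patch. A hypothetical helper capturing that shape:

```rust
/// Hypothetical helper mirroring the data used by the typed patch tests:
/// values cycle through 0..=max_val, with an `outlier` every `stride`
/// elements that exceeds max_val and so forces the encoder to emit a patch.
fn patchy_values(len: usize, bit_width: u32, stride: usize, outlier: u32) -> Vec<u32> {
    let max_val = (1u32 << bit_width) - 1;
    (0..len)
        .map(|i| {
            if i % stride == 0 {
                outlier // > max_val, so the encoder must emit a patch
            } else {
                (i as u32) % (max_val + 1)
            }
        })
        .collect()
}

fn main() {
    let vals = patchy_values(3000, 4, 100, 1000);
    assert_eq!(vals[0], 1000);
    // 30 outliers (indices 0, 100, ..., 2900) exceed max_val = 15.
    assert_eq!(vals.iter().filter(|&&v| v > 15).count(), 30);
}
```
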
+ #[crate::test] + async fn test_bitpacked_with_patches_u8() -> VortexResult<()> { + let bit_width: u8 = 3; + let len = 3000usize; + let max_val = (1u8 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 200u8 + } else { + (i as u8) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// u16 BitPacked with patches (bit_width=6, patch values > 63). + #[crate::test] + async fn test_bitpacked_with_patches_u16() -> VortexResult<()> { + let bit_width: u8 = 6; + let len = 3000usize; + let max_val = (1u16 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 150 == 0 { + 5000u16 + } else { + (i as u16) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// u64 BitPacked with patches (bit_width=4, patch values > 15). + #[crate::test] + async fn test_bitpacked_with_patches_u64() -> VortexResult<()> { + let bit_width: u8 = 4; + let len = 3000usize; + let max_val = (1u64 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 200 == 0 { + 1_000_000u64 + } else { + (i as u64) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// Dict where codes are BitPacked u32 with patches exceeding the bit width. + #[crate::test] + fn test_dict_bitpacked_codes_with_patches() -> VortexResult<()> { + let dict_values: Vec = (0..256).map(|i| i * 1000 + 42).collect(); + let len = 3000; + let bit_width: u8 = 4; + let max_code = (1u32 << bit_width) - 1; + // Some codes exceed max_code (15), creating patches in the BitPacked codes. 
+ let codes: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 100u32 // exceeds max_code=15, becomes a patch + } else { + (i as u32) % (max_code + 1) + } + }) + .collect(); + let expected: Vec = codes.iter().map(|&c| dict_values[c as usize]).collect(); + + let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable); + let codes_bp = BitPacked::encode( + &codes_prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(codes_bp.patches().is_some(), "expected patches on codes"); + + let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable); + let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, expected); + Ok(()) + } + + /// Patches placed exactly at FastLanes chunk boundaries (1024-element chunks). + #[crate::test] + fn test_bitpacked_patches_at_chunk_boundaries() -> VortexResult<()> { + let len = 4096usize; + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + let mut values: Vec = (0..len).map(|i| (i as u32) % (max_val + 1)).collect(); + // Place patches at FL chunk boundaries (1024 elements per chunk). + values[1023] = 1000; // end of chunk 0 + values[1024] = 2000; // start of chunk 1 + values[2047] = 3000; // end of chunk 1 + values[2048] = 4000; // start of chunk 2 + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } + + /// Large array (100k elements) spanning many blocks with sparse patches. + #[crate::test] + fn test_bitpacked_large_array_with_patches() -> VortexResult<()> { + let len = 100_000usize; + let bit_width: u8 = 6; + let max_val = (1u32 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 500 == 0 { + 1000 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } + + /// Nullable BitPacked with patches — validity must survive through fused + /// dispatch alongside patch application. + #[crate::test] + async fn test_nullable_bitpacked_with_patches() -> VortexResult<()> { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + + let len = 3000usize; + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + + // Every 7th element is null; every 100th (non-null) element is a patch. 
+ let values: Vec> = (0..len) + .map(|i| { + if i % 7 == 0 { + None + } else if i % 100 == 0 { + Some(1000u32) // exceeds max_val=15, becomes a patch + } else { + Some((i as u32) % (max_val + 1)) + } + }) + .collect(); + + let prim = PrimitiveArray::from_option_iter(values.iter().copied()); + let cpu = crate::canonicalize_cpu(prim.clone())?.into_array(); + + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let gpu = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx) + .await? + .into_host() + .await? + .into_array(); + + vortex::array::assert_arrays_eq!(cpu, gpu); + Ok(()) + } + + /// Extreme case: ALL values are patches (bit_width=1, every value > 1). + #[crate::test] + fn test_bitpacked_all_patches() -> VortexResult<()> { + let bit_width: u8 = 1; + let len = 2000usize; + // All values >= 2, so every single element exceeds max storable (1) and + // becomes a patch. + let values: Vec = (0..len).map(|i| (i as u32) + 2).collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } } diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index bec93d4336f..1b63630906c 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -16,12 +16,14 @@ use vortex::array::arrays::Slice; use vortex::array::arrays::dict::DictArraySlotsExt; use vortex::array::arrays::slice::SliceArrayExt; use vortex::array::buffer::BufferHandle; +use vortex::array::patches::Patches; use vortex::array::validity::Validity; use vortex::dtype::PType; use vortex::encodings::alp::ALP; use vortex::encodings::alp::ALPArrayExt; use vortex::encodings::alp::ALPArraySlotsExt; use vortex::encodings::alp::ALPFloat; +use vortex::encodings::alp::Exponents; use vortex::encodings::fastlanes::BitPacked; use vortex::encodings::fastlanes::BitPackedArrayExt; use vortex::encodings::fastlanes::FoR; @@ -45,6 +47,7 @@ use super::ptype_to_tag; use super::tag_to_ptype; use crate::CudaBufferExt; use crate::CudaExecutionCtx; +use crate::kernel::load_patches_sync; /// A plan whose source buffers have been copied to the device, ready for kernel launch. pub struct MaterializedPlan { @@ -69,10 +72,10 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool { let id = array.encoding_id(); if id == ALP.id() { let arr = array.as_::(); - return arr.patches().is_none() && arr.dtype().as_ptype() == PType::F32; + return arr.dtype().as_ptype() == PType::F32; } if id == BitPacked.id() { - return array.as_::().patches().is_none(); + return true; } if id == Dict.id() { let arr = array.as_::(); @@ -119,9 +122,17 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool { } /// An unmaterialized stage: a source op, scalar ops, and optional source buffer reference. 
+/// +/// Patches are tied to their owning ops, mirroring the CUDA side where +/// `patches_ptr` lives on `BitunpackParams` / `AlpParams`: +/// - `source_patches` for the source op (BitPacked exceptions) +/// - Each scalar op carries its own `Option` (ALP exceptions) struct Stage { source: SourceOp, - scalar_ops: Vec, + /// Patches from the source op (e.g. BitPacked overflow exceptions). + source_patches: Option, + /// Scalar ops with optional per-op patches (e.g. ALP float exceptions). + scalar_ops: Vec<(ScalarOp, Option)>, /// Index into `FusedPlan::source_buffers`, or `None` /// for sources that don't read from a device buffer. source_buffer_index: Option, @@ -133,6 +144,7 @@ impl Stage { fn new(source: SourceOp, source_buffer_index: Option, source_ptype: PTypeTag) -> Self { Self { source, + source_patches: None, scalar_ops: vec![], source_buffer_index, source_ptype, @@ -240,7 +252,7 @@ impl DispatchPlan { /// - Validity is propagated from the root array to the output. Nullable /// arrays are supported, but Dict with nullable codes and RunEnd with /// nullable ends are rejected to guard against out-of-bounds access. - /// - `BitPackedArray` and `ALPArray` with patches are not supported. + /// - `BitPackedArray` and `ALPArray` with patches are supported. /// - Only f32 ALP is supported (kernel stores multipliers as `float`). pub fn new(array: &ArrayRef) -> VortexResult { if PType::try_from(array.dtype()).is_err() || !is_dyn_dispatch_compatible(array) { @@ -340,7 +352,7 @@ impl FusedPlan { } /// Copy source buffers to the device, producing a [`MaterializedPlan`]. - pub fn materialize(self, ctx: &CudaExecutionCtx) -> VortexResult { + pub fn materialize(self, ctx: &mut CudaExecutionCtx) -> VortexResult { let shared_mem_bytes = self.dynamic_shared_mem_bytes(); let mut device_buffers = Vec::new(); @@ -367,20 +379,37 @@ impl FusedPlan { // Byte offsets are passed directly to the C ABI — the kernel now // indexes shared memory by byte offset and casts to the correct type // using source_ptype / output_ptype. - let stages: Vec = self - .stages - .iter() - .map(|(stage, smem_byte_offset, len)| { - MaterializedStage::new( - resolve_ptr(stage), - *smem_byte_offset, - *len, - stage.source_ptype, - stage.source, - &stage.scalar_ops, - ) - }) - .collect(); + let mut stages: Vec = Vec::new(); + for (stage, smem_byte_offset, len) in &self.stages { + let mut source = stage.source; + + // Upload source patches (e.g. BitPacked exceptions). + if let Some(patches) = &stage.source_patches { + let (ptr, bufs) = load_patches_sync(patches, ctx)?; + source.params.bitunpack.patches_ptr = ptr; + device_buffers.extend(bufs); + } + + // Upload patches for each scalar op that carries them. 
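
The loop that follows pairs each ScalarOp with its optional patches and uploads them. The essential contract here is lifetime, not layout: the raw `u64` pointer written into the op params owns nothing, so the returned buffer handles must be kept alive until the launch completes. A minimal model with stand-in types; none of these names are vortex-cuda APIs.

```rust
/// Stand-in for a device buffer handle: the pointer handed to the kernel
/// is only valid while this allocation lives.
struct DeviceAlloc(Vec<u8>);

fn upload(bytes: &[u8]) -> (u64, DeviceAlloc) {
    let alloc = DeviceAlloc(bytes.to_vec());
    (alloc.0.as_ptr() as u64, alloc)
}

fn main() {
    let mut device_buffers: Vec<DeviceAlloc> = Vec::new();
    // 0 means "no patches" in the kernel params; replaced on upload.
    let (patches_ptr, alloc) = upload(&[1, 2, 3]);
    device_buffers.push(alloc); // dropped only after the "launch" completes
    assert_ne!(patches_ptr, 0);
}
```
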
+ let mut scalar_ops: Vec = Vec::with_capacity(stage.scalar_ops.len()); + for (mut op, patches) in stage.scalar_ops.clone() { + if let Some(patches) = &patches { + let (ptr, bufs) = load_patches_sync(patches, ctx)?; + op.params.alp.patches_ptr = ptr; + device_buffers.extend(bufs); + } + scalar_ops.push(op); + } + + stages.push(MaterializedStage::new( + resolve_ptr(stage), + *smem_byte_offset, + *len, + stage.source_ptype, + source, + &scalar_ops, + )); + } Ok(MaterializedPlan { dispatch_plan: CudaDispatchPlan::new(stages, self.output_ptype), @@ -398,7 +427,7 @@ impl FusedPlan { pub fn materialize_with_subtrees( mut self, subtree_buffers: Vec, - ctx: &CudaExecutionCtx, + ctx: &mut CudaExecutionCtx, ) -> VortexResult { for (slot, buf) in zip_eq( self.source_buffers.iter_mut().filter(|s| s.is_none()), @@ -465,7 +494,8 @@ impl FusedPlan { /// SliceArray → resolve the slice via reduce/execute rules. /// /// When the plan builder encounters a `SliceArray`, it resolves the slice - /// by invoking the child's `reduce_parent`, `execute_parent`. + /// by invoking the child's `reduce_parent`. If that fails (e.g. ALP + /// doesn't implement it), we manually slice the child's sub-arrays. fn walk_slice( &mut self, array: ArrayRef, @@ -478,6 +508,26 @@ impl FusedPlan { return self.walk(reduced, pending_subtrees); } + // ALP doesn't implement reduce_parent — slice encoded child and + // patches manually (Patches::slice adjusts offsets for the range). + if child.encoding_id() == ALP.id() { + let alp = child.as_::(); + let offset = slice_arr.data().slice_range().start; + let len = array.len(); + let sliced_encoded = alp.encoded().clone().slice(offset..offset + len)?; + let sliced_patches = alp + .patches() + .map(|p| p.slice(offset..offset + len)) + .transpose()? + .flatten(); + return self.walk_alp_inner( + sliced_encoded, + sliced_patches, + alp.exponents(), + pending_subtrees, + ); + } + vortex_bail!( "Cannot resolve SliceArray wrapping {:?} in dynamic dispatch plan builder", child.encoding_id() @@ -498,20 +548,18 @@ impl FusedPlan { fn walk_bitpacked(&mut self, array: ArrayRef) -> VortexResult { let bp = array.as_::(); - if bp.patches().is_some() { - vortex_bail!("Dynamic dispatch does not support BitPackedArray with patches"); - } - let source_ptype = ptype_to_tag(PType::try_from(bp.dtype()).map_err(|_| { vortex_err!("BitPacked must have primitive dtype, got {:?}", bp.dtype()) })?); let buf_index = self.source_buffers.len(); self.source_buffers.push(Some(bp.packed().clone())); - Ok(Stage::new( + let mut stage = Stage::new( SourceOp::bitunpack(bp.bit_width(), bp.offset()), Some(buf_index), source_ptype, - )) + ); + stage.source_patches = bp.patches(); + Ok(stage) } fn walk_for( @@ -537,7 +585,7 @@ impl FusedPlan { .cast::()?; pipeline .scalar_ops - .push(ScalarOp::frame_of_ref(ref_u64, output_ptype)); + .push((ScalarOp::frame_of_ref(ref_u64, output_ptype), None)); Ok(pipeline) } @@ -553,7 +601,9 @@ impl FusedPlan { })?); let mut pipeline = self.walk(encoded, pending_subtrees)?; - pipeline.scalar_ops.push(ScalarOp::zigzag(output_ptype)); + pipeline + .scalar_ops + .push((ScalarOp::zigzag(output_ptype), None)); Ok(pipeline) } @@ -563,26 +613,29 @@ impl FusedPlan { pending_subtrees: &mut Vec, ) -> VortexResult { let alp = array.as_::(); + self.walk_alp_inner( + alp.encoded().clone(), + alp.patches(), + alp.exponents(), + pending_subtrees, + ) + } - if alp.patches().is_some() { - vortex_bail!("Dynamic dispatch does not support ALPArray with patches"); - } - - let ptype = alp.dtype().as_ptype(); - if ptype 
!= PType::F32 { - vortex_bail!( - "Dynamic dispatch only supports f32 ALP, got {:?}", - alp.dtype() - ); - } - - let exponents = alp.exponents(); + /// Shared ALP logic for both `walk_alp` and `walk_slice` (Slice(ALP)). + fn walk_alp_inner( + &mut self, + encoded: ArrayRef, + patches: Option, + exponents: Exponents, + pending_subtrees: &mut Vec, + ) -> VortexResult { let alp_f = ::F10[exponents.f as usize]; let alp_e = ::IF10[exponents.e as usize]; - let encoded = alp.encoded().clone(); let mut pipeline = self.walk(encoded, pending_subtrees)?; - pipeline.scalar_ops.push(ScalarOp::alp(alp_f, alp_e)); + pipeline + .scalar_ops + .push((ScalarOp::alp(alp_f, alp_e), patches)); Ok(pipeline) } @@ -645,9 +698,9 @@ impl FusedPlan { }; // DICT scalar op: pass byte offset directly (C ABI uses byte offsets). // output_ptype is the values' ptype — DICT transforms codes → values. - pipeline.scalar_ops.push(ScalarOp::dict( - values_smem_byte_offset, - ptype_to_tag(values_ptype), + pipeline.scalar_ops.push(( + ScalarOp::dict(values_smem_byte_offset, ptype_to_tag(values_ptype)), + None, )); Ok(pipeline) } @@ -727,7 +780,7 @@ impl FusedPlan { let final_ptype = spec .scalar_ops .last() - .map(|op| op.output_ptype) + .map(|(op, _)| op.output_ptype) .unwrap_or(spec.source_ptype); let final_elem_bytes = tag_to_ptype(final_ptype).byte_width() as u32; let elem_bytes = final_elem_bytes.max(self.output_elem_bytes); diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 1853557e809..0059d74fd5e 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -21,7 +21,6 @@ use vortex::encodings::fastlanes::BitPackedArray; use vortex::encodings::fastlanes::BitPackedDataParts; use vortex::encodings::fastlanes::unpack_iter::BitPacked as BitPackedUnpack; use vortex::error::VortexResult; -use vortex::error::vortex_bail; use vortex::error::vortex_ensure; use vortex::error::vortex_err; @@ -29,13 +28,10 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::kernel::patches::gpu::ChunkOffsetType; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16; use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64; use crate::kernel::patches::gpu::GPUPatches; use crate::kernel::patches::types::load_patches; +use crate::kernel::patches::types::ptype_to_chunk_offset_type; /// CUDA decoder for bit-packed arrays. #[derive(Debug)] @@ -92,17 +88,6 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe unsafe impl DeviceRepr for GPUPatches {} -/// Convert a PType to the corresponding ChunkOffsetType for GPU patches. 
-fn ptype_to_chunk_offset_type(ptype: vortex::dtype::PType) -> VortexResult { - match ptype { - vortex::dtype::PType::U8 => Ok(ChunkOffsetType_CO_U8), - vortex::dtype::PType::U16 => Ok(ChunkOffsetType_CO_U16), - vortex::dtype::PType::U32 => Ok(ChunkOffsetType_CO_U32), - vortex::dtype::PType::U64 => Ok(ChunkOffsetType_CO_U64), - _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype), - } -} - #[instrument(skip_all)] pub(crate) async fn decode_bitpacked( array: BitPackedArray, diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 68f40a15005..2c9dfe0d033 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -34,6 +34,7 @@ pub use encodings::ZstdKernelPrep; pub use encodings::zstd_kernel_prepare; pub(crate) use encodings::*; pub(crate) use filter::FilterExecutor; +pub(crate) use patches::types::load_patches_sync; pub(crate) use slice::SliceExecutor; use crate::CudaKernelEvents; diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index 350c5210e8a..e8e3ce5bfc6 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -3,18 +3,29 @@ //! GPU patches loading for fused exception patching during bit-unpacking. +use std::mem::size_of; + use num_traits::ToPrimitive; use vortex::array::buffer::BufferHandle; +use vortex::buffer::Alignment; use vortex::buffer::Buffer; use vortex::buffer::BufferMut; +use vortex::buffer::ByteBufferMut; use vortex::dtype::PType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::patches::Patches; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use crate::CudaBufferExt; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; +use crate::kernel::patches::gpu::ChunkOffsetType; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64; +use crate::kernel::patches::gpu::GPUPatches; /// A set of device-resident patches. pub struct DevicePatches { @@ -28,7 +39,7 @@ pub struct DevicePatches { pub(crate) n_chunks: usize, } -/// Load patches for GPU use. +/// Load patches for GPU use (async). /// /// # Errors /// @@ -40,17 +51,16 @@ pub(crate) async fn load_patches( ) -> VortexResult { let offset = patches.offset(); let offset_within_chunk = patches.offset_within_chunk().unwrap_or_default(); - let array_len = patches.array_len(); - // Get or compute chunk_offsets let Some(co) = patches.chunk_offsets() else { vortex_bail!("cannot execute_cuda for patched BitPacked array without chunk_offsets") }; - let (chunk_offsets, chunk_offset_ptype) = { + let (chunk_offsets, chunk_offset_ptype, n_chunks) = { let co_canonical = co.clone().execute_cuda(ctx).await?.into_primitive(); let ptype = co_canonical.ptype(); - (co_canonical.buffer_handle().clone(), ptype) + let len = co_canonical.len(); + (co_canonical.buffer_handle().clone(), ptype, len) }; // Load indices - must be converted to u32 for GPU use @@ -93,7 +103,6 @@ pub(crate) async fn load_patches( let values = ctx.ensure_on_device(values.buffer_handle().clone()).await?; let num_patches = patches.num_patches(); - let n_chunks = array_len.div_ceil(1024); Ok(DevicePatches { chunk_offsets, @@ -107,6 +116,76 @@ pub(crate) async fn load_patches( }) } +/// Convert a PType to the corresponding `ChunkOffsetType` for GPU patches. 
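
Before the `ChunkOffsetType` helper below, a note on the n_chunks change in `load_patches` above: the chunk count must come from the (possibly sliced) chunk_offsets array, not from the sliced element count, because a slice can straddle chunk boundaries. A small worked check with illustrative numbers:

```rust
fn main() {
    const FL_CHUNK: usize = 1024;
    // Slice 1000..2100 of a larger array touches original chunks 0, 1,
    // and 2, so the sliced chunk_offsets array has 3 entries...
    let (start, end) = (1000usize, 2100usize);
    let covered_chunks = (end - 1) / FL_CHUNK - start / FL_CHUNK + 1;
    assert_eq!(covered_chunks, 3);
    // ...while dividing the sliced length by 1024 says only 2 chunks.
    assert_eq!((end - start).div_ceil(FL_CHUNK), 2);
    // Using the latter would drop the last chunk's patches, which is why
    // n_chunks now tracks the chunk_offsets length.
}
```
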
+pub(crate) fn ptype_to_chunk_offset_type(ptype: PType) -> VortexResult { + match ptype { + PType::U8 => Ok(ChunkOffsetType_CO_U8), + PType::U16 => Ok(ChunkOffsetType_CO_U16), + PType::U32 => Ok(ChunkOffsetType_CO_U32), + PType::U64 => Ok(ChunkOffsetType_CO_U64), + _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype), + } +} + +/// Build a [`GPUPatches`] struct from [`DevicePatches`], serialize it to +/// bytes, and upload to the device. Returns the device pointer and a buffer +/// handle that must be kept alive for the kernel launch. +fn build_gpu_patches( + dp: &DevicePatches, + ctx: &CudaExecutionCtx, +) -> VortexResult<(BufferHandle, u64)> { + // Zero-initialize to avoid uninitialized padding bytes (e.g. between + // chunk_offset_type and indices) which would be UB when serialized. + let mut gpu_patches: GPUPatches = unsafe { std::mem::zeroed() }; + gpu_patches.chunk_offsets = dp.chunk_offsets.cuda_device_ptr()? as _; + gpu_patches.chunk_offset_type = ptype_to_chunk_offset_type(dp.chunk_offset_ptype)?; + gpu_patches.indices = dp.indices.cuda_device_ptr()? as _; + gpu_patches.values = dp.values.cuda_device_ptr()? as _; + #[expect(clippy::cast_possible_truncation)] + { + gpu_patches.offset = dp.offset as u32; + gpu_patches.offset_within_chunk = dp.offset_within_chunk as u32; + gpu_patches.num_patches = dp.num_patches as u32; + // n_chunks must match the chunk_offsets array length, not array_len / 1024. + // When patches are sliced, chunk_offsets is sliced to only include chunks + // overlapping the slice range — matching the CPU's patch_chunk which uses + // chunk_offsets_slice.len(). + gpu_patches.n_chunks = dp.n_chunks as u32; + } + + let bytes = unsafe { + std::slice::from_raw_parts( + std::ptr::from_ref(&gpu_patches).cast::(), + size_of::(), + ) + }; + let mut buf = + ByteBufferMut::with_capacity_aligned(size_of::(), Alignment::of::()); + buf.extend_from_slice(bytes); + let gpu_buf = ctx.ensure_on_device_sync(BufferHandle::new_host(buf.freeze()))?; + let ptr = gpu_buf.cuda_device_ptr()?; + Ok((gpu_buf, ptr)) +} + +/// Sync wrapper: load patches via [`load_patches`] (blocking), then build and +/// upload a [`GPUPatches`] struct. Returns the device pointer and all buffer +/// handles that must be kept alive for the kernel launch. +pub(crate) fn load_patches_sync( + patches: &Patches, + ctx: &mut CudaExecutionCtx, +) -> VortexResult<(u64, Vec)> { + let device_patches = futures::executor::block_on(load_patches(patches, ctx))?; + let (gpu_buf, ptr) = build_gpu_patches(&device_patches, ctx)?; + + let DevicePatches { + chunk_offsets, + indices, + values, + .. + } = device_patches; + Ok((ptr, vec![chunk_offsets, indices, values, gpu_buf])) +} + #[cfg(test)] mod tests { use vortex_array::IntoArray;
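
As a closing note on `build_gpu_patches`: serializing a `#[repr(C)]` struct byte-for-byte only works if the padding bytes are defined, which is why the struct is zero-initialized before its fields are set. A stand-in sketch of the same pattern; `Params` is hypothetical, and the exact padding depends on the target.

```rust
use std::mem::{size_of, zeroed};

/// Stand-in for GPUPatches: a #[repr(C)] struct with implicit padding
/// between a 4-byte field and an 8-byte field on 64-bit targets. Mirrors
/// build_gpu_patches: zero-init, set fields, view as bytes for upload.
#[repr(C)]
struct Params {
    tag: u32,
    // 4 implicit padding bytes here (on a 64-bit target)
    ptr: u64,
}

fn main() {
    let mut p: Params = unsafe { zeroed() }; // padding starts out zeroed
    p.tag = 7;
    p.ptr = 0xdead_beef;
    let bytes = unsafe {
        std::slice::from_raw_parts(std::ptr::from_ref(&p).cast::<u8>(), size_of::<Params>())
    };
    // 4 (tag) + 4 (padding) + 8 (ptr) = 16 bytes serialized.
    assert_eq!(bytes.len(), 16);
}
```
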