diff --git a/vortex-cuda/benches/dynamic_dispatch_cuda.rs b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
index 5b81a2c2120..488e22f38d8 100644
--- a/vortex-cuda/benches/dynamic_dispatch_cuda.rs
+++ b/vortex-cuda/benches/dynamic_dispatch_cuda.rs
@@ -123,7 +123,7 @@ struct BenchRunner {
 }
 
 impl BenchRunner {
-    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
+    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &mut CudaExecutionCtx) -> Self {
         let plan = match DispatchPlan::new(array).vortex_expect("build_dyn_dispatch_plan") {
             DispatchPlan::Fused(plan) => plan,
             _ => unreachable!("encoding not fusable"),
@@ -201,7 +201,7 @@ fn bench_for_bitpacked(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -246,7 +246,7 @@ fn bench_dict_bp_codes(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -290,7 +290,7 @@ fn bench_runend(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -344,7 +344,7 @@ fn bench_dict_bp_codes_bp_for_values(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
        b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
@@ -409,7 +409,7 @@ fn bench_alp_for_bitpacked(c: &mut Criterion) {
         let mut cuda_ctx =
             CudaSession::create_execution_ctx(&VortexSession::empty()).vortex_expect("ctx");
 
-        let bench_runner = BenchRunner::new(&array, n, &cuda_ctx);
+        let bench_runner = BenchRunner::new(&array, n, &mut cuda_ctx);
 
         b.iter_custom(|iters| {
             let mut total_time = Duration::ZERO;
diff --git a/vortex-cuda/kernels/src/bit_unpack_16.cu b/vortex-cuda/kernels/src/bit_unpack_16.cu
index 3c05baf2011..a784df201d3 100644
--- a/vortex-cuda/kernels/src/bit_unpack_16.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_16.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint16_t shared_out[1024];
+  __shared__ uint16_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 2; i++) {
-    _bit_unpack_16_lane<bit_width>(in, shared_out, reference, thread_idx * 2 + i);
+  for (int i = 0; i < FL_LANES / 32; i++) {
+    _bit_unpack_16_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < FL_CHUNK / 32; i++) {
     auto idx = i * 32 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,120 +32,120 @@ __device__ void _bit_unpack_16_device(const uint16_t *__restrict in, uint16_t *_
 extern "C" __global__ void bit_unpack_16_0bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_1bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_2bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_3bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_4bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_5bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_6bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_7bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_8bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_9bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_10bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_11bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_12bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_13bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_14bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_15bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_16_16bw_32t(const uint16_t *__restrict full_in, uint16_t *__restrict full_out, uint16_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint16_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_16_device<16>(in, out, reference, thread_idx, patches);
 }
diff --git a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
index 48c62fde2d4..ff992fa4ffb 100644
--- a/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
+++ b/vortex-cuda/kernels/src/bit_unpack_16_lanes.cuh
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_16_lane<0>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -59,7 +59,7 @@ __device__ void _bit_unpack_16_lane<1>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -101,7 +101,7 @@ __device__ void _bit_unpack_16_lane<2>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -145,7 +145,7 @@ __device__ void _bit_unpack_16_lane<3>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -191,7 +191,7 @@ __device__ void _bit_unpack_16_lane<4>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -239,7 +239,7 @@ __device__ void _bit_unpack_16_lane<5>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -289,7 +289,7 @@ __device__ void _bit_unpack_16_lane<6>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -341,7 +341,7 @@ __device__ void _bit_unpack_16_lane<7>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -395,7 +395,7 @@ __device__ void _bit_unpack_16_lane<8>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -451,7 +451,7 @@ __device__ void _bit_unpack_16_lane<9>(const uint16_t *__restrict in, uint16_t *
 
 template <>
 __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -509,7 +509,7 @@ __device__ void _bit_unpack_16_lane<10>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -569,7 +569,7 @@ __device__ void _bit_unpack_16_lane<11>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -631,7 +631,7 @@ __device__ void _bit_unpack_16_lane<12>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -695,7 +695,7 @@ __device__ void _bit_unpack_16_lane<13>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -761,7 +761,7 @@ __device__ void _bit_unpack_16_lane<14>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint16_t src;
   uint16_t tmp;
   src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_16_lane<15>(const uint16_t *__restrict in, uint16_t
 
 template <>
 __device__ void _bit_unpack_16_lane<16>(const uint16_t *__restrict in, uint16_t *__restrict out, uint16_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 64;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
 #pragma unroll
   for (int row = 0; row < 16; row++) {
     out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;
diff --git a/vortex-cuda/kernels/src/bit_unpack_32.cu b/vortex-cuda/kernels/src/bit_unpack_32.cu
index 97906f612c4..3f8fcb5c227 100644
--- a/vortex-cuda/kernels/src/bit_unpack_32.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_32.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint32_t shared_out[1024];
+  __shared__ uint32_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 1; i++) {
-    _bit_unpack_32_lane<bit_width>(in, shared_out, reference, thread_idx * 1 + i);
+  for (int i = 0; i < FL_LANES / 32; i++) {
+    _bit_unpack_32_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 32; i++) {
+  for (int i = 0; i < FL_CHUNK / 32; i++) {
     auto idx = i * 32 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,232 +32,232 @@ __device__ void _bit_unpack_32_device(const uint32_t *__restrict in, uint32_t *_
 extern "C" __global__ void bit_unpack_32_0bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_1bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_2bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_3bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_4bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_5bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_6bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_7bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_8bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_9bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_10bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_11bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_12bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_13bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_14bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_15bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_16bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<16>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_17bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 17));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<17>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_18bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 18));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<18>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_19bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 19));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<19>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_20bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 20));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<20>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_21bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 21));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<21>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_22bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 22));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<22>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_23bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 23));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<23>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_24bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 24));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<24>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_25bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 25));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<25>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_26bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 26));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<26>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_27bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 27));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<27>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_28bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 28));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<28>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_29bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 29));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<29>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_30bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 30));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<30>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_31bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 31));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<31>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_32_32bw_32t(const uint32_t *__restrict full_in, uint32_t *__restrict full_out, uint32_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint32_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 32));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_32_device<32>(in, out, reference, thread_idx, patches);
 }
diff --git a/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
index 4d1206e9f88..58d4195e3aa 100644
--- a/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
+++ b/vortex-cuda/kernels/src/bit_unpack_32_lanes.cuh
@@ -19,7 +19,7 @@ __device__ void _bit_unpack_32_lane<0>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<1>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -91,7 +91,7 @@ __device__ void _bit_unpack_32_lane<1>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<2>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -165,7 +165,7 @@ __device__ void _bit_unpack_32_lane<2>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<3>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -241,7 +241,7 @@ __device__ void _bit_unpack_32_lane<3>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<4>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -319,7 +319,7 @@ __device__ void _bit_unpack_32_lane<4>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<5>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -399,7 +399,7 @@ __device__ void _bit_unpack_32_lane<5>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<6>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -481,7 +481,7 @@ __device__ void _bit_unpack_32_lane<6>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<7>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -565,7 +565,7 @@ __device__ void _bit_unpack_32_lane<7>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<8>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -651,7 +651,7 @@ __device__ void _bit_unpack_32_lane<8>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<9>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -739,7 +739,7 @@ __device__ void _bit_unpack_32_lane<9>(const uint32_t *__restrict in, uint32_t *
 
 template <>
 __device__ void _bit_unpack_32_lane<10>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -829,7 +829,7 @@ __device__ void _bit_unpack_32_lane<10>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<11>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -921,7 +921,7 @@ __device__ void _bit_unpack_32_lane<11>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<12>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1015,7 +1015,7 @@ __device__ void _bit_unpack_32_lane<12>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<13>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1111,7 +1111,7 @@ __device__ void _bit_unpack_32_lane<13>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<14>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1209,7 +1209,7 @@ __device__ void _bit_unpack_32_lane<14>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<15>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1309,7 +1309,7 @@ __device__ void _bit_unpack_32_lane<15>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<16>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1411,7 +1411,7 @@ __device__ void _bit_unpack_32_lane<16>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<17>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1515,7 +1515,7 @@ __device__ void _bit_unpack_32_lane<17>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<18>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1621,7 +1621,7 @@ __device__ void _bit_unpack_32_lane<18>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<19>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1729,7 +1729,7 @@ __device__ void _bit_unpack_32_lane<19>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<20>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1839,7 +1839,7 @@ __device__ void _bit_unpack_32_lane<20>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<21>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -1951,7 +1951,7 @@ __device__ void _bit_unpack_32_lane<21>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<22>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2065,7 +2065,7 @@ __device__ void _bit_unpack_32_lane<22>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<23>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2181,7 +2181,7 @@ __device__ void _bit_unpack_32_lane<23>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<24>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2299,7 +2299,7 @@ __device__ void _bit_unpack_32_lane<24>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<25>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2419,7 +2419,7 @@ __device__ void _bit_unpack_32_lane<25>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<26>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2541,7 +2541,7 @@ __device__ void _bit_unpack_32_lane<26>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<27>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2665,7 +2665,7 @@ __device__ void _bit_unpack_32_lane<27>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<28>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2791,7 +2791,7 @@ __device__ void _bit_unpack_32_lane<28>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<29>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -2919,7 +2919,7 @@ __device__ void _bit_unpack_32_lane<29>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<30>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -3049,7 +3049,7 @@ __device__ void _bit_unpack_32_lane<30>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<31>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
   uint32_t src;
   uint32_t tmp;
   src = in[lane];
@@ -3181,7 +3181,7 @@ __device__ void _bit_unpack_32_lane<31>(const uint32_t *__restrict in, uint32_t
 
 template <>
 __device__ void _bit_unpack_32_lane<32>(const uint32_t *__restrict in, uint32_t *__restrict out, uint32_t reference, unsigned int lane) {
-  unsigned int LANE_COUNT = 32;
+  constexpr unsigned int LANE_COUNT = FL_LANES;
 #pragma unroll
   for (int row = 0; row < 32; row++) {
     out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference;
diff --git a/vortex-cuda/kernels/src/bit_unpack_64.cu b/vortex-cuda/kernels/src/bit_unpack_64.cu
index 6270f4f8261..ebe0b125369 100644
--- a/vortex-cuda/kernels/src/bit_unpack_64.cu
+++ b/vortex-cuda/kernels/src/bit_unpack_64.cu
@@ -4,19 +4,19 @@ template <int bit_width>
 __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, int thread_idx, GPUPatches& patches) {
-  __shared__ uint64_t shared_out[1024];
+  __shared__ uint64_t shared_out[FL_CHUNK];
 
   // Step 1: Unpack into shared memory
 #pragma unroll
-  for (int i = 0; i < 1; i++) {
-    _bit_unpack_64_lane<bit_width>(in, shared_out, reference, thread_idx * 1 + i);
+  for (int i = 0; i < FL_LANES / 16; i++) {
+    _bit_unpack_64_lane<bit_width>(in, shared_out, reference, thread_idx * (FL_LANES / 16) + i);
   }
   __syncwarp();
 
   // Step 2: Apply patches to shared memory in parallel
   PatchesCursor cursor(patches, blockIdx.x, thread_idx, 16);
   auto patch = cursor.next();
-  while (patch.index != 1024) {
+  while (patch.index != FL_CHUNK) {
     shared_out[patch.index] = patch.value;
     patch = cursor.next();
   }
@@ -24,7 +24,7 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
   // Step 3: Copy to global memory
 #pragma unroll
-  for (int i = 0; i < 64; i++) {
+  for (int i = 0; i < FL_CHUNK / 16; i++) {
    auto idx = i * 16 + thread_idx;
     out[idx] = shared_out[idx];
   }
 }
@@ -32,456 +32,456 @@ __device__ void _bit_unpack_64_device(const uint64_t *__restrict in, uint64_t *_
 extern "C" __global__ void bit_unpack_64_0bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 0));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<0>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_1bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 1 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 1));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<1>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_2bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 2));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<2>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_3bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 3));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<3>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_4bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 4));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<4>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_5bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 5));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<5>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_6bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 6));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<6>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_7bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 7));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<7>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_8bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 8));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<8>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_9bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 9 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 9));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<9>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_10bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 10 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 10));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<10>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_11bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 11 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 11));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<11>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_12bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 12 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 12));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<12>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_13bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 13 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 13));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<13>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_14bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 14 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 14));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<14>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_15bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 15 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 15));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<15>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_16bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 16 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 16));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<16>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_17bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 17 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 17));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<17>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_18bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 18 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 18));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<18>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_19bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 19 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 19));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<19>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_20bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 20 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 20));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<20>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_21bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 21 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 21));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<21>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_22bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 22 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 22));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<22>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_23bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 23 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 23));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<23>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_24bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 24 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 24));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<24>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_25bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 25 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 25));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<25>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_26bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 26 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 26));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<26>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_27bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 27 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 27));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<27>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_28bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 28 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 28));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<28>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_29bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 29 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 29));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<29>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_30bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 30 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 30));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<30>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_31bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 31 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 31));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<31>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_32bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 32 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 32));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<32>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_33bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 33 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 33));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<33>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_34bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 34 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 34));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<34>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_35bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 35 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 35));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<35>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_36bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 36 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 36));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<36>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_37bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 37 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 37));
+  auto out = full_out + (blockIdx.x * FL_CHUNK);
   _bit_unpack_64_device<37>(in, out, reference, thread_idx, patches);
 }
 
 extern "C" __global__ void bit_unpack_64_38bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) {
   int thread_idx = threadIdx.x;
-  auto in = full_in + (blockIdx.x * (128 * 38 / sizeof(uint64_t)));
-  auto out = full_out + (blockIdx.x * 1024);
+  auto in = full_in + (blockIdx.x * (FL_LANES * 38));
auto in = full_in + (blockIdx.x * (FL_LANES * 38)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<38>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_39bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 39 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 39)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<39>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_40bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 40 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 40)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<40>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_41bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 41 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 41)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<41>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_42bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 42 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 42)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<42>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_43bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 43 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 43)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<43>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_44bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 44 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 44)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<44>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_45bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 45 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 45)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<45>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_46bw_16t(const 
uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 46 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 46)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<46>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_47bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 47 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 47)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<47>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_48bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 48 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 48)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<48>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_49bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 49 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 49)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<49>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_50bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 50 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 50)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<50>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_51bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 51 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 51)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<51>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_52bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 52 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 52)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<52>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_53bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 53 / sizeof(uint64_t))); - auto out = full_out + 
(blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 53)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<53>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_54bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 54 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 54)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<54>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_55bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 55 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 55)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<55>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_56bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 56 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 56)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<56>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_57bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 57 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 57)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<57>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_58bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 58 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 58)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<58>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_59bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 59 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 59)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<59>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_60bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 60 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 60)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<60>(in, out, reference, thread_idx, patches); } extern "C" __global__ void 
bit_unpack_64_61bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 61 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 61)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<61>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_62bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 62 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 62)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<62>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_63bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 63 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 63)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<63>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_64_64bw_16t(const uint64_t *__restrict full_in, uint64_t *__restrict full_out, uint64_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 64 / sizeof(uint64_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 64)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_64_device<64>(in, out, reference, thread_idx, patches); } diff --git a/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh index 0f20fee4e7d..afc17737b40 100644 --- a/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh +++ b/vortex-cuda/kernels/src/bit_unpack_64_lanes.cuh @@ -19,7 +19,7 @@ __device__ void _bit_unpack_64_lane<0>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<1>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -155,7 +155,7 @@ __device__ void _bit_unpack_64_lane<1>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<2>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -293,7 +293,7 @@ __device__ void _bit_unpack_64_lane<2>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<3>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -433,7 +433,7 @@ __device__ void _bit_unpack_64_lane<3>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<4>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned 
int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -575,7 +575,7 @@ __device__ void _bit_unpack_64_lane<4>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<5>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -719,7 +719,7 @@ __device__ void _bit_unpack_64_lane<5>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<6>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -865,7 +865,7 @@ __device__ void _bit_unpack_64_lane<6>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<7>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1013,7 +1013,7 @@ __device__ void _bit_unpack_64_lane<7>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<8>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1163,7 +1163,7 @@ __device__ void _bit_unpack_64_lane<8>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<9>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1315,7 +1315,7 @@ __device__ void _bit_unpack_64_lane<9>(const uint64_t *__restrict in, uint64_t * template <> __device__ void _bit_unpack_64_lane<10>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1469,7 +1469,7 @@ __device__ void _bit_unpack_64_lane<10>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<11>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1625,7 +1625,7 @@ __device__ void _bit_unpack_64_lane<11>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<12>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1783,7 +1783,7 @@ __device__ void _bit_unpack_64_lane<12>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<13>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -1943,7 +1943,7 @@ __device__ void _bit_unpack_64_lane<13>(const uint64_t *__restrict in, uint64_t 
template <> __device__ void _bit_unpack_64_lane<14>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2105,7 +2105,7 @@ __device__ void _bit_unpack_64_lane<14>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<15>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2269,7 +2269,7 @@ __device__ void _bit_unpack_64_lane<15>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<16>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2435,7 +2435,7 @@ __device__ void _bit_unpack_64_lane<16>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<17>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2603,7 +2603,7 @@ __device__ void _bit_unpack_64_lane<17>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<18>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2773,7 +2773,7 @@ __device__ void _bit_unpack_64_lane<18>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<19>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -2945,7 +2945,7 @@ __device__ void _bit_unpack_64_lane<19>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<20>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3119,7 +3119,7 @@ __device__ void _bit_unpack_64_lane<20>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<21>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3295,7 +3295,7 @@ __device__ void _bit_unpack_64_lane<21>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<22>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3473,7 +3473,7 @@ __device__ void _bit_unpack_64_lane<22>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<23>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int 
LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3653,7 +3653,7 @@ __device__ void _bit_unpack_64_lane<23>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<24>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -3835,7 +3835,7 @@ __device__ void _bit_unpack_64_lane<24>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<25>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4019,7 +4019,7 @@ __device__ void _bit_unpack_64_lane<25>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<26>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4205,7 +4205,7 @@ __device__ void _bit_unpack_64_lane<26>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<27>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4393,7 +4393,7 @@ __device__ void _bit_unpack_64_lane<27>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<28>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4583,7 +4583,7 @@ __device__ void _bit_unpack_64_lane<28>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<29>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4775,7 +4775,7 @@ __device__ void _bit_unpack_64_lane<29>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<30>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -4969,7 +4969,7 @@ __device__ void _bit_unpack_64_lane<30>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<31>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5165,7 +5165,7 @@ __device__ void _bit_unpack_64_lane<31>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<32>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5363,7 +5363,7 @@ __device__ void 
_bit_unpack_64_lane<32>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<33>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5563,7 +5563,7 @@ __device__ void _bit_unpack_64_lane<33>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<34>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5765,7 +5765,7 @@ __device__ void _bit_unpack_64_lane<34>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<35>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -5969,7 +5969,7 @@ __device__ void _bit_unpack_64_lane<35>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<36>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6175,7 +6175,7 @@ __device__ void _bit_unpack_64_lane<36>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<37>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6383,7 +6383,7 @@ __device__ void _bit_unpack_64_lane<37>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<38>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6593,7 +6593,7 @@ __device__ void _bit_unpack_64_lane<38>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<39>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -6805,7 +6805,7 @@ __device__ void _bit_unpack_64_lane<39>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<40>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7019,7 +7019,7 @@ __device__ void _bit_unpack_64_lane<40>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<41>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7235,7 +7235,7 @@ __device__ void _bit_unpack_64_lane<41>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<42>(const uint64_t *__restrict in, uint64_t *__restrict 
out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7453,7 +7453,7 @@ __device__ void _bit_unpack_64_lane<42>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<43>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7673,7 +7673,7 @@ __device__ void _bit_unpack_64_lane<43>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<44>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -7895,7 +7895,7 @@ __device__ void _bit_unpack_64_lane<44>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<45>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8119,7 +8119,7 @@ __device__ void _bit_unpack_64_lane<45>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<46>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8345,7 +8345,7 @@ __device__ void _bit_unpack_64_lane<46>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<47>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8573,7 +8573,7 @@ __device__ void _bit_unpack_64_lane<47>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<48>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -8803,7 +8803,7 @@ __device__ void _bit_unpack_64_lane<48>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<49>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9035,7 +9035,7 @@ __device__ void _bit_unpack_64_lane<49>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<50>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9269,7 +9269,7 @@ __device__ void _bit_unpack_64_lane<50>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<51>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; 
@@ -9505,7 +9505,7 @@ __device__ void _bit_unpack_64_lane<51>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<52>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9743,7 +9743,7 @@ __device__ void _bit_unpack_64_lane<52>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<53>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -9983,7 +9983,7 @@ __device__ void _bit_unpack_64_lane<53>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<54>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10225,7 +10225,7 @@ __device__ void _bit_unpack_64_lane<54>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<55>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10469,7 +10469,7 @@ __device__ void _bit_unpack_64_lane<55>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<56>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10715,7 +10715,7 @@ __device__ void _bit_unpack_64_lane<56>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<57>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -10963,7 +10963,7 @@ __device__ void _bit_unpack_64_lane<57>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<58>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11213,7 +11213,7 @@ __device__ void _bit_unpack_64_lane<58>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<59>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11465,7 +11465,7 @@ __device__ void _bit_unpack_64_lane<59>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<60>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11719,7 +11719,7 @@ __device__ void _bit_unpack_64_lane<60>(const uint64_t *__restrict in, uint64_t template <> __device__ void 
_bit_unpack_64_lane<61>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -11975,7 +11975,7 @@ __device__ void _bit_unpack_64_lane<61>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<62>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -12233,7 +12233,7 @@ __device__ void _bit_unpack_64_lane<62>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<63>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; uint64_t src; uint64_t tmp; src = in[lane]; @@ -12493,7 +12493,7 @@ __device__ void _bit_unpack_64_lane<63>(const uint64_t *__restrict in, uint64_t template <> __device__ void _bit_unpack_64_lane<64>(const uint64_t *__restrict in, uint64_t *__restrict out, uint64_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 16; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < 64; row++) { out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; diff --git a/vortex-cuda/kernels/src/bit_unpack_8.cu b/vortex-cuda/kernels/src/bit_unpack_8.cu index 064970d650f..b2fcfd26f04 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8.cu +++ b/vortex-cuda/kernels/src/bit_unpack_8.cu @@ -4,19 +4,19 @@ template __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, int thread_idx, GPUPatches& patches) { - __shared__ uint8_t shared_out[1024]; + __shared__ uint8_t shared_out[FL_CHUNK]; // Step 1: Unpack into shared memory #pragma unroll - for (int i = 0; i < 4; i++) { - _bit_unpack_8_lane(in, shared_out, reference, thread_idx * 4 + i); + for (int i = 0; i < FL_LANES / 32; i++) { + _bit_unpack_8_lane(in, shared_out, reference, thread_idx * (FL_LANES / 32) + i); } __syncwarp(); // Step 2: Apply patches to shared memory in parallel PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32); auto patch = cursor.next(); - while (patch.index != 1024) { + while (patch.index != FL_CHUNK) { shared_out[patch.index] = patch.value; patch = cursor.next(); } @@ -24,7 +24,7 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re // Step 3: Copy to global memory #pragma unroll - for (int i = 0; i < 32; i++) { + for (int i = 0; i < FL_CHUNK / 32; i++) { auto idx = i * 32 + thread_idx; out[idx] = shared_out[idx]; } @@ -32,64 +32,64 @@ __device__ void _bit_unpack_8_device(const uint8_t *__restrict in, uint8_t *__re extern "C" __global__ void bit_unpack_8_0bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 0 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 0)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<0>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_1bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = 
full_in + (blockIdx.x * (128 * 1 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 1)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<1>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_2bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 2 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 2)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<2>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_3bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 3 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 3)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<3>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_4bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 4 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 4)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<4>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_5bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 5 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 5)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<5>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_6bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 6 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 6)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<6>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_7bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 7 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 7)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<7>(in, out, reference, thread_idx, patches); } extern "C" __global__ void bit_unpack_8_8bw_32t(const uint8_t *__restrict full_in, uint8_t *__restrict full_out, uint8_t reference, GPUPatches patches) { int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * 8 / sizeof(uint8_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * 8)); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_8_device<8>(in, out, reference, thread_idx, patches); } diff --git 
a/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh b/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh index 64fb8ce39a8..d9c3550cb18 100644 --- a/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh +++ b/vortex-cuda/kernels/src/bit_unpack_8_lanes.cuh @@ -19,7 +19,7 @@ __device__ void _bit_unpack_8_lane<0>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<1>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -43,7 +43,7 @@ __device__ void _bit_unpack_8_lane<1>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<2>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -69,7 +69,7 @@ __device__ void _bit_unpack_8_lane<2>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<3>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -97,7 +97,7 @@ __device__ void _bit_unpack_8_lane<3>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<4>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -127,7 +127,7 @@ __device__ void _bit_unpack_8_lane<4>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<5>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -159,7 +159,7 @@ __device__ void _bit_unpack_8_lane<5>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<6>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -193,7 +193,7 @@ __device__ void _bit_unpack_8_lane<6>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<7>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; uint8_t src; uint8_t tmp; src = in[lane]; @@ -229,7 +229,7 @@ __device__ void _bit_unpack_8_lane<7>(const uint8_t *__restrict in, uint8_t *__r template <> __device__ void _bit_unpack_8_lane<8>(const uint8_t *__restrict in, uint8_t *__restrict out, uint8_t reference, unsigned int lane) { - unsigned int LANE_COUNT = 128; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < 8; row++) { out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu index d3950485435..9a413af17c1 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.cu +++ b/vortex-cuda/kernels/src/dynamic_dispatch.cu @@ -52,6 +52,7 @@ #include "bit_unpack.cuh" #include 
"dynamic_dispatch.h" +#include "patches.cuh" #include "types.cuh" // ═══════════════════════════════════════════════════════════════════════════ @@ -109,8 +110,12 @@ __shared__ uint64_t runend_cursors[BLOCK_SIZE]; // ═══════════════════════════════════════════════════════════════════════════ /// Apply one scalar operation to N values in registers. +/// +/// `abs_pos` is the absolute output position of the first value to process. +/// It is used by scalar operations that apply patches, e.g. ALP. template -__device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem) { +__device__ inline void +scalar_op(T *values, const struct ScalarOp &op, char *__restrict smem, uint64_t abs_pos = 0) { switch (op.op_code) { case ScalarOp::FOR: { const T ref = static_cast(op.params.frame_of_ref.reference); @@ -134,6 +139,33 @@ __device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__r float r = static_cast(static_cast(values[i])) * f * e; values[i] = static_cast(__float_as_uint(r)); } + // Apply ALP patches: override positions whose float value couldn't + // be reconstructed through the ALP encode/decode cycle. + // Per-value cursor — with a slice offset, a tile's N values can + // straddle two FL chunks, so each value needs its own lookup. + if (op.params.alp.patches_ptr != 0) { + const auto &patches = *reinterpret_cast(op.params.alp.patches_ptr); + // The sliced chunk_offsets array starts at original chunk + // (offset / FL_CHUNK). PatchesCursor indexes from 0, so + // subtract that base to get the index into chunk_offsets. + const uint32_t chunk_start = patches.offset / FL_CHUNK; +#pragma unroll + for (uint32_t i = 0; i < N; ++i) { + uint64_t my_pos = (N > 1) ? abs_pos + i * blockDim.x + threadIdx.x : abs_pos; + uint64_t orig = my_pos + patches.offset; + uint32_t chunk = static_cast(orig / FL_CHUNK) - chunk_start; + uint32_t within = static_cast(orig % FL_CHUNK); + PatchesCursor cursor(patches, chunk, 0, 1); + auto patch = cursor.next(); + while (patch.index != FL_CHUNK) { + if (patch.index == within) { + values[i] = patch.value; + break; + } + patch = cursor.next(); + } + } + } break; } case ScalarOp::DICT: { @@ -149,6 +181,24 @@ __device__ inline void scalar_op(T *values, const struct ScalarOp &op, char *__r } } +// ═══════════════════════════════════════════════════════════════════════════ +// Patches +// ═══════════════════════════════════════════════════════════════════════════ + +/// Scatter patches for a single chunk into `out` using PatchesCursor. +/// All threads in the block cooperate. Caller must issue __syncthreads() +/// afterward if other threads read from `out`. 
+template <typename T> +__device__ __forceinline__ void +scatter_patches_chunk(const GPUPatches &patches, T *__restrict out, uint32_t chunk) { + PatchesCursor cursor(patches, chunk, threadIdx.x, blockDim.x); + auto patch = cursor.next(); + while (patch.index != FL_CHUNK) { + out[patch.index] = patch.value; + patch = cursor.next(); + } +} + // ═══════════════════════════════════════════════════════════════════════════ // Source ops // ═══════════════════════════════════════════════════════════════════════════ @@ -162,11 +212,8 @@ __device__ inline void bitunpack(const T *__restrict packed, uint64_t chunk_start, uint32_t chunk_len, const struct SourceOp &src) { - constexpr uint32_t T_BITS = sizeof(T) * 8; - constexpr uint32_t FL_CHUNK = 1024; - constexpr uint32_t LANES = FL_CHUNK / T_BITS; const uint32_t bw = src.params.bitunpack.bit_width; - const uint32_t words_per_block = LANES * bw; + const uint32_t words_per_block = FL_LANES * bw; const uint32_t elem_off = src.params.bitunpack.element_offset; const uint32_t dst_off = (chunk_start + elem_off) % FL_CHUNK; const uint64_t first_block = (chunk_start + elem_off) / FL_CHUNK; @@ -177,9 +224,15 @@ __device__ inline void bitunpack(const T *__restrict packed, for (uint32_t c = 0; c < n_chunks; ++c) { const T *src_chunk = packed + (first_block + c) * words_per_block; T *chunk_dst = dst + c * FL_CHUNK; - for (uint32_t lane = threadIdx.x; lane < LANES; lane += blockDim.x) { + for (uint32_t lane = threadIdx.x; lane < FL_LANES; lane += blockDim.x) { bit_unpack_lane(src_chunk, chunk_dst, 0, lane, bw); } + // Apply BitPacked patches inline, matching the standalone kernel pattern. + if (src.params.bitunpack.patches_ptr != 0) { + __syncthreads(); + const auto &patches = *reinterpret_cast<const GPUPatches *>(src.params.bitunpack.patches_ptr); + scatter_patches_chunk(patches, chunk_dst, first_block + c); + } } } @@ -282,15 +335,12 @@ __device__ void execute_output_stage(T *__restrict output, // Cap at 4 values per thread per tile to minimise register pressure. constexpr uint32_t VALUES_PER_TILE = (32 / sizeof(T)) < 4 ? (32 / sizeof(T)) : 4; const uint32_t tile_size = blockDim.x * VALUES_PER_TILE; + const auto &src = stage.source; const void *raw_input = reinterpret_cast<const void *>(stage.input_ptr); const PTypeTag ptype = stage.source_ptype; if (src.op_code == SourceOp::RUNEND) { - // Seed each thread's cursor with the run containing its first - // strided position. The RUNEND arm in source_op advances the - // cursor monotonically, so this avoids a full binary search on - // every element. const T *ends = reinterpret_cast<const T *>(smem + src.params.runend.ends_smem_byte_offset); runend_cursors[threadIdx.x] = upper_bound(ends, src.params.runend.num_runs, @@ -301,10 +351,6 @@ uint32_t chunk_len; const T *smem_src = nullptr; - // BITUNPACK uses smem scratch, so the outer loop advances one - // chunk at a time. LOAD, SEQUENCE, and RUNEND need no smem - // scratch, so chunk_len = block_len (single outer iteration); - // tiling happens in the inner tile_idx loop.
if (src.op_code == SourceOp::BITUNPACK) { chunk_len = bitunpack_tile_len(stage, block_len, elem_idx); T *scratch = reinterpret_cast(smem + stage.smem_byte_offset); @@ -313,10 +359,10 @@ __device__ void execute_output_stage(T *__restrict output, block_start + elem_idx, chunk_len, src); - constexpr uint32_t FL_CHUNK = 1024; // FastLanes chunk size const uint32_t align = (block_start + elem_idx + src.params.bitunpack.element_offset) % FL_CHUNK; smem_src = scratch + align; - // Write barrier: all threads finished bitunpack, safe to read from scratch. + // Write barrier: all threads finished bitunpack (and any + // patches), safe to read from scratch. __syncthreads(); } else { chunk_len = block_len; @@ -337,7 +383,7 @@ __device__ void execute_output_stage(T *__restrict output, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(values, stage.scalar_ops[op], smem); + scalar_op(values, stage.scalar_ops[op], smem, tile_start); } #pragma unroll @@ -359,7 +405,7 @@ __device__ void execute_output_stage(T *__restrict output, source_op(&val, src, raw_input, ptype, smem_src, i, gpos, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, gpos); } __stcs(&output[gpos], val); } @@ -392,24 +438,27 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) { const auto &src = stage.source; if (src.op_code == SourceOp::BITUNPACK) { + T *raw_smem = smem_out; bitunpack(reinterpret_cast(stage.input_ptr), smem_out, 0, stage.len, src); - smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE; // Write barrier: cooperative bitunpack finished, safe to read - // decoded elements in the scalar-op loop below. + // decoded elements below. __syncthreads(); + smem_out += src.params.bitunpack.element_offset % SMEM_TILE_SIZE; + if (stage.num_scalar_ops > 0) { - for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) { - T val = smem_out[i]; + for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) { + T val = smem_out[elem_idx]; for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, elem_idx); } - smem_out[i] = val; + smem_out[elem_idx] = val; } // Write barrier: scalar ops applied in-place, smem region is // now fully populated for subsequent stages to read. __syncthreads(); } + } else { if (src.op_code == SourceOp::RUNEND) { // Seed each thread's cursor with the run containing its first @@ -421,13 +470,13 @@ __device__ void execute_input_stage(const Stage &stage, char *__restrict smem) { upper_bound(ends, src.params.runend.num_runs, threadIdx.x + src.params.runend.offset); } const void *raw_input = reinterpret_cast(stage.input_ptr); - for (uint32_t i = threadIdx.x; i < stage.len; i += blockDim.x) { + for (uint32_t elem_idx = threadIdx.x; elem_idx < stage.len; elem_idx += blockDim.x) { T val; - source_op(&val, src, raw_input, stage.source_ptype, nullptr, 0, i, smem); + source_op(&val, src, raw_input, stage.source_ptype, nullptr, 0, elem_idx, smem); for (uint8_t op = 0; op < stage.num_scalar_ops; ++op) { - scalar_op(&val, stage.scalar_ops[op], smem); + scalar_op(&val, stage.scalar_ops[op], smem, elem_idx); } - smem_out[i] = val; + smem_out[elem_idx] = val; } // Write barrier: smem region is fully populated for subsequent // stages to read. 
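
Taken together, the changes in this file mean patches are applied inside the op that owns them, so every downstream scalar op observes already-patched values. For the FoR-over-BitPacked-with-patches case that the tests exercise, the ordering can be stated as a minimal CPU reference; the names here are illustrative only.

```rust
/// Illustrative CPU reference for fused FoR(BitPacked + patches):
/// patches belong to the bitunpack op, so they replace the unpacked
/// residuals *before* the FOR scalar op adds the reference.
fn decode_for_bitpacked(residuals: &[u32], patches: &[(usize, u32)], reference: u32) -> Vec<u32> {
    let mut out = residuals.to_vec();
    for &(idx, value) in patches {
        out[idx] = value; // bitunpack-owned patches: raw residual values
    }
    for v in &mut out {
        *v = v.wrapping_add(reference); // FOR scalar op runs after patching
    }
    out
}

fn main() {
    // Residual 5 patched to 500, then reference 42 added on top.
    let out = decode_for_bitpacked(&[1, 5, 3], &[(1, 500)], 42);
    assert_eq!(out, vec![43, 542, 45]);
}
```
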
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h index 95540c51581..9bba3f50a04 100644 --- a/vortex-cuda/kernels/src/dynamic_dispatch.h +++ b/vortex-cuda/kernels/src/dynamic_dispatch.h @@ -103,11 +103,18 @@ extern "C" { #endif /// Parameters for source ops, which decode data into a stage's shared memory region. +/// +/// patches_ptr lives on the union variant that owns it (BitunpackParams, +/// AlpParams) — not per-stage — so the pointer is tied to its op. +/// Adding a u64 can grow the union and every ScalarOp/SourceOp read in the +/// tile loop; SourceParams was already 24 B (no change), ScalarParams grew +/// 8→16 B (no measurable impact — tile loop is compute-bound). union SourceParams { /// Unpack FastLanes bit-packed data. struct BitunpackParams { uint8_t bit_width; uint32_t element_offset; // Sub-byte offset + uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none) } bitunpack; /// Copy from global to shared memory. @@ -157,6 +164,7 @@ union ScalarParams { struct AlpParams { float f; float e; + uint64_t patches_ptr; // device pointer to GPUPatches struct (0 = none) } alp; /// Dictionary gather: use current value as index into decoded values in smem. @@ -193,6 +201,7 @@ struct PackedStage { uint32_t len; // number of elements this stage produces struct SourceOp source; + uint8_t num_scalar_ops; enum PTypeTag source_ptype; // PType produced by the source op }; @@ -220,11 +229,12 @@ struct __attribute__((aligned(8))) PlanHeader { /// change the type; the final output PType is given by the last scalar op's /// `output_ptype` (or `source_ptype` if there are no scalar ops). struct Stage { - uint64_t input_ptr; // encoded input in global memory - uint32_t smem_byte_offset; // byte offset within dynamic shared memory - uint32_t len; // elements produced - enum PTypeTag source_ptype; // PType produced by the source op - struct SourceOp source; // source decode op + uint64_t input_ptr; // encoded input in global memory + uint32_t smem_byte_offset; // byte offset within dynamic shared memory + uint32_t len; // elements produced + enum PTypeTag source_ptype; // PType produced by the source op + struct SourceOp source; // source decode op + uint8_t num_scalar_ops; // number of scalar ops const struct ScalarOp *scalar_ops; // scalar decode ops }; @@ -247,6 +257,7 @@ __device__ inline Stage parse_stage(const uint8_t *&cursor) { .len = packed_stage->len, .source_ptype = packed_stage->source_ptype, .source = packed_stage->source, + .num_scalar_ops = packed_stage->num_scalar_ops, .scalar_ops = ops, }; diff --git a/vortex-cuda/kernels/src/fastlanes_common.cuh b/vortex-cuda/kernels/src/fastlanes_common.cuh index 660a1c554f4..8536de26789 100644 --- a/vortex-cuda/kernels/src/fastlanes_common.cuh +++ b/vortex-cuda/kernels/src/fastlanes_common.cuh @@ -8,8 +8,25 @@ // FastLanes ordering array __constant__ int FL_ORDER[] = {0, 4, 2, 6, 1, 5, 3, 7}; +// FastLanes organises every 1024-element vector into a transposed layout +// of FL_LANES columns × (1024 / FL_LANES) rows. Each column is a "lane" +// that can be processed independently of every other lane, which is what +// makes all FastLanes encodings (FFOR, DELTA, RLE, ALP, …) fully +// data-parallel. One CUDA thread or one CPU SIMD lane handles one +// FastLanes lane. +// +// Paper: https://ir.cwi.nl/pub/35881/35881.pdf +// Repo: https://github.com/cwida/FastLanes + +/// FastLanes chunk size in elements. 
+constexpr uint32_t FL_CHUNK = 1024; + +/// Number of FastLanes lanes for element type T (1024 / bit-width). +template +constexpr uint32_t FL_LANES = FL_CHUNK / (sizeof(T) * 8); + // Compute the index in the FastLanes layout #define INDEX(row, lane) (FL_ORDER[row / 8] * 16 + (row % 8) * 128 + lane) // Create a mask with 'width' bits set -#define MASK(T, width) (((T)1 << width) - 1) \ No newline at end of file +#define MASK(T, width) (((T)1 << width) - 1) diff --git a/vortex-cuda/kernels/src/patches.cuh b/vortex-cuda/kernels/src/patches.cuh index 66a97b935e9..fa9ff18def9 100644 --- a/vortex-cuda/kernels/src/patches.cuh +++ b/vortex-cuda/kernels/src/patches.cuh @@ -3,6 +3,7 @@ #pragma once +#include "fastlanes_common.cuh" #include "patches.h" /// Load a chunk offset value, dispatching on the runtime type. @@ -21,8 +22,8 @@ __device__ inline uint32_t load_chunk_offset(const GPUPatches &patches, uint32_t } /// A single patch: a within-chunk index and its replacement value. -/// A sentinel patch has index == 1024, which can never match a valid -/// within-chunk position (0–1023). +/// A sentinel patch has index == FL_CHUNK, which can never match a valid +/// within-chunk position (0–FL_CHUNK-1). template struct Patch { uint16_t index; @@ -38,7 +39,7 @@ struct Patch { /// /// PatchesCursor cursor(patches, blockIdx.x, thread_idx, 32); /// auto patch = cursor.next(); -/// while (patch.index != 1024) { +/// while (patch.index != FL_CHUNK) { /// shared_out[patch.index] = patch.value; /// patch = cursor.next(); /// } @@ -89,15 +90,15 @@ public: // The iterator returns indices relative to the start of the chunk. // `chunk_base` is the index of the first element within a chunk, accounting // for the slice offset. - chunk_base = chunk * 1024 + patches.offset; - chunk_base -= min(chunk_base, patches.offset % 1024); + chunk_base = chunk * FL_CHUNK + patches.offset; + chunk_base -= min(chunk_base, patches.offset % FL_CHUNK); } /// Return the current patch (with within-chunk index) and advance, /// or a sentinel {1024, 0} if exhausted. __device__ Patch next() { if (remaining == 0) { - return {1024, T {}}; + return {FL_CHUNK, T {}}; } uint16_t within_chunk = static_cast(*indices - chunk_base); Patch patch = {within_chunk, *values}; @@ -110,6 +111,6 @@ public: private: const uint32_t *indices; const T *values; - uint8_t remaining; + uint32_t remaining; uint32_t chunk_base; }; diff --git a/vortex-cuda/src/bit_unpack_gen.rs b/vortex-cuda/src/bit_unpack_gen.rs index 8d5eda920cd..2482c0996b8 100644 --- a/vortex-cuda/src/bit_unpack_gen.rs +++ b/vortex-cuda/src/bit_unpack_gen.rs @@ -48,12 +48,7 @@ fn write_row(output: &mut impl Write, bits: usize, bit_width: usize, row: usize) /// loop. For all other bit widths, emits pre-computed per-row bit extraction /// with register-cached `src` words — identical to the original hand-unrolled /// codegen, preserving minimal memory loads and zero extra work. 
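
The FL_LANES template above makes the lane count a pure function of the element width. For reference, a sketch of the resulting table, which the `generate_lane_decoder` specializations below rely on:

```rust
const FL_CHUNK: u32 = 1024;

/// FastLanes lane count for an element width in bits: FL_CHUNK / bits.
/// One lane per CUDA thread (or CPU SIMD lane); rows per lane = bits.
const fn fl_lanes(elem_bits: u32) -> u32 {
    FL_CHUNK / elem_bits
}

fn main() {
    assert_eq!(fl_lanes(8), 128); // u8:  128 lanes x 8 rows
    assert_eq!(fl_lanes(16), 64); // u16: 64 lanes x 16 rows
    assert_eq!(fl_lanes(32), 32); // u32: 32 lanes x 32 rows
    assert_eq!(fl_lanes(64), 16); // u64: 16 lanes x 64 rows
}
```
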
-fn generate_lane_decoder( - output: &mut impl Write, - bits: usize, - lanes: usize, - bit_width: usize, -) -> io::Result<()> { +fn generate_lane_decoder(output: &mut impl Write, bits: usize, bit_width: usize) -> io::Result<()> { if bit_width == 0 { write!( output, @@ -71,7 +66,7 @@ __device__ void _bit_unpack_{bits}_lane<0>(const uint{bits}_t *__restrict in, ui output, r#"template <> __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, unsigned int lane) {{ - unsigned int LANE_COUNT = {lanes}; + constexpr unsigned int LANE_COUNT = FL_LANES; #pragma unroll for (int row = 0; row < {bits}; row++) {{ out[INDEX(row, lane)] = in[LANE_COUNT * row + lane] + reference; @@ -84,7 +79,7 @@ __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restr output, r#"template <> __device__ void _bit_unpack_{bits}_lane<{bit_width}>(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, unsigned int lane) {{ - unsigned int LANE_COUNT = {lanes}; + constexpr unsigned int LANE_COUNT = FL_LANES; uint{bits}_t src; uint{bits}_t tmp; src = in[lane]; @@ -142,29 +137,25 @@ __device__ __noinline__ void bit_unpack_{bits}_lane( fn generate_device_kernel_template( output: &mut impl Write, bits: usize, - lanes: usize, thread_count: usize, ) -> io::Result<()> { - let per_thread_loop_count = lanes / thread_count; - let shared_copy_ncount = 1024 / thread_count; - write!( output, r#"template __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches) {{ - __shared__ uint{bits}_t shared_out[1024]; + __shared__ uint{bits}_t shared_out[FL_CHUNK]; // Step 1: Unpack into shared memory #pragma unroll - for (int i = 0; i < {per_thread_loop_count}; i++) {{ - _bit_unpack_{bits}_lane(in, shared_out, reference, thread_idx * {per_thread_loop_count} + i); + for (int i = 0; i < FL_LANES / {thread_count}; i++) {{ + _bit_unpack_{bits}_lane(in, shared_out, reference, thread_idx * (FL_LANES / {thread_count}) + i); }} __syncwarp(); // Step 2: Apply patches to shared memory in parallel PatchesCursor cursor(patches, blockIdx.x, thread_idx, {thread_count}); auto patch = cursor.next(); - while (patch.index != 1024) {{ + while (patch.index != FL_CHUNK) {{ shared_out[patch.index] = patch.value; patch = cursor.next(); }} @@ -172,7 +163,7 @@ __device__ void _bit_unpack_{bits}_device(const uint{bits}_t *__restrict in, uin // Step 3: Copy to global memory #pragma unroll - for (int i = 0; i < {shared_copy_ncount}; i++) {{ + for (int i = 0; i < FL_CHUNK / {thread_count}; i++) {{ auto idx = i * {thread_count} + thread_idx; out[idx] = shared_out[idx]; }} @@ -194,8 +185,8 @@ fn generate_global_kernel( output, r#"extern "C" __global__ void {func_name}(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference, GPUPatches patches) {{ int thread_idx = threadIdx.x; - auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t))); - auto out = full_out + (blockIdx.x * 1024); + auto in = full_in + (blockIdx.x * (FL_LANES * {bit_width})); + auto out = full_out + (blockIdx.x * FL_CHUNK); _bit_unpack_{bits}_device<{bit_width}>(in, out, reference, thread_idx, patches); }} "# @@ -210,7 +201,6 @@ fn generate_global_kernel( /// not pull in the 129 standalone bit-unpack kernel entry points. 
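
The loop bounds the generator previously baked in as `per_thread_loop_count` and `shared_copy_ncount` now fall out of FL_LANES and FL_CHUNK. A quick check of the arithmetic, assuming the 32-thread configuration the 16-bit kernels use; the helper name is illustrative.

```rust
/// Per-thread work in the generated device kernel: step 1 unpacks
/// FL_LANES / thread_count lanes per thread, step 3 copies
/// FL_CHUNK / thread_count elements per thread. These expressions replace
/// the old per_thread_loop_count and shared_copy_ncount template variables.
fn per_thread_work(elem_bits: u32, thread_count: u32) -> (u32, u32) {
    let fl_lanes = 1024 / elem_bits;
    (fl_lanes / thread_count, 1024 / thread_count)
}

fn main() {
    // 16-bit kernels with 32 threads: 2 lanes and 32 copies per thread,
    // exactly the values the generated code used to hard-code.
    assert_eq!(per_thread_work(16, 32), (2, 32));
}
```
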
pub fn generate_cuda_unpack_lanes(output: &mut impl Write) -> io::Result<()> { let bits = T::T; - let lanes = T::LANES; write!( output, @@ -230,7 +220,7 @@ __device__ void _bit_unpack_{bits}_lane(const uint{bits}_t *__restrict in, uint{ // Lane-decoder template specializations (one per bit width). for bit_width in 0..=bits { - generate_lane_decoder(output, bits, lanes, bit_width)?; + generate_lane_decoder(output, bits, bit_width)?; writeln!(output)?; } @@ -250,7 +240,6 @@ pub fn generate_cuda_unpack_kernels( thread_count: usize, ) -> io::Result<()> { let bits = T::T; - let lanes = T::LANES; write!( output, @@ -262,7 +251,7 @@ pub fn generate_cuda_unpack_kernels( )?; // Device kernel template (written once, instantiated per bit width). - generate_device_kernel_template(output, bits, lanes, thread_count)?; + generate_device_kernel_template(output, bits, thread_count)?; writeln!(output)?; // Thin extern "C" global-kernel wrappers (one per bit width). diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs index 847fa39340d..46aa35d0465 100644 --- a/vortex-cuda/src/dynamic_dispatch/mod.rs +++ b/vortex-cuda/src/dynamic_dispatch/mod.rs @@ -313,6 +313,7 @@ impl SourceOp { bitunpack: SourceParams_BitunpackParams { bit_width, element_offset: u32::from(element_offset), + patches_ptr: 0, }, }, } @@ -393,7 +394,11 @@ impl ScalarOp { op_code: ScalarOp_ScalarOpCode_ALP, output_ptype: PTypeTag_PTYPE_F32, params: ScalarParams { - alp: ScalarParams_AlpParams { f, e }, + alp: ScalarParams_AlpParams { + f, + e, + patches_ptr: 0, + }, }, } } @@ -493,6 +498,7 @@ impl MaterializedPlan { #[cfg(test)] mod tests { + use std::ops::Range; use std::sync::Arc; use cudarc::driver::DevicePtr; @@ -515,6 +521,7 @@ mod tests { use vortex::encodings::alp::alp_encode; use vortex::encodings::fastlanes::BitPacked; use vortex::encodings::fastlanes::BitPackedArray; + use vortex::encodings::fastlanes::BitPackedArrayExt; use vortex::encodings::fastlanes::FoR; use vortex::encodings::fastlanes::FoRArrayExt; use vortex::encodings::runend::RunEnd; @@ -550,7 +557,7 @@ mod tests { fn dispatch_plan( array: &vortex::array::ArrayRef, - ctx: &CudaExecutionCtx, + ctx: &mut CudaExecutionCtx, ) -> VortexResult { match DispatchPlan::new(array)? 
{ DispatchPlan::Fused(plan) => plan.materialize(ctx), @@ -778,8 +785,8 @@ mod tests { .collect(); let bp = bitpacked_array_u32(bit_width, len); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&bp.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -803,8 +810,8 @@ mod tests { let bp = bitpacked_array_u32(bit_width, len); let for_arr = FoR::try_new(bp.into_array(), Scalar::from(reference))?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -830,7 +837,7 @@ mod tests { let values_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); let re = RunEnd::new(ends_arr, values_arr, cuda_ctx.execution_ctx()); - let plan = dispatch_plan(&re.into_array(), &cuda_ctx)?; + let plan = dispatch_plan(&re.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -870,8 +877,8 @@ mod tests { let dict = DictArray::try_new(codes_bp.into_array(), dict_for.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -907,8 +914,8 @@ mod tests { None, ); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&tree.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&tree.into_array(), &mut cuda_ctx)?; let actual = run_dispatch_plan_f32(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -940,8 +947,8 @@ mod tests { )?; let zz = ZigZag::try_new(bp.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&zz.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&zz.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -970,7 +977,7 @@ mod tests { let re = RunEnd::new(ends_arr, values_arr, cuda_ctx.execution_ctx()); let for_arr = FoR::try_new(re.into_array(), Scalar::from(reference))?; - let plan = dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -998,8 +1005,8 @@ mod tests { let dict = DictArray::try_new(codes_prim.into_array(), values_prim.into_array())?; let for_arr = FoR::try_new(dict.into_array(), Scalar::from(reference))?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = 
dispatch_plan(&for_arr.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&for_arr.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1030,8 +1037,8 @@ mod tests { let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable); let dict = DictArray::try_new(codes_for.into_array(), values_prim.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1059,8 +1066,8 @@ mod tests { let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&dict.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; @@ -1203,8 +1210,8 @@ mod tests { let expected: Vec = data[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1258,8 +1265,8 @@ mod tests { let sliced = zz.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1308,8 +1315,8 @@ mod tests { .map(|&c| dict_values[c as usize]) .collect(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1357,8 +1364,8 @@ mod tests { let sliced = bp.into_array().slice(slice_start..slice_end)?; let expected: Vec = data[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1410,8 +1417,8 @@ mod tests { let sliced = for_arr.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let 
actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1475,8 +1482,8 @@ mod tests { let sliced = dict.into_array().slice(slice_start..slice_end)?; let expected: Vec = all_decoded[slice_start..slice_end].to_vec(); - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&sliced, &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&sliced, &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1507,8 +1514,8 @@ mod tests { let seq = Sequence::try_new_typed(base, multiplier, Nullability::NonNullable, len)?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&seq.into_array(), &mut cuda_ctx)?; let actual = run_dynamic_dispatch_plan( &cuda_ctx, @@ -1540,8 +1547,8 @@ mod tests { let seq = Sequence::try_new_typed(base, multiplier, Nullability::NonNullable, len)?; - let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; - let plan = dispatch_plan(&seq.into_array(), &cuda_ctx)?; + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&seq.into_array(), &mut cuda_ctx)?; let actual_u32 = run_dynamic_dispatch_plan( &cuda_ctx, @@ -2127,4 +2134,438 @@ mod tests { assert_eq!(prim.validity()?.nullability(), Nullability::Nullable); Ok(()) } + + // --------------------------------------------------------------- + // Patch tests — fused dynamic dispatch with exception values + // --------------------------------------------------------------- + + #[rstest] + #[case::unsliced(3000, None)] + #[case::mid_slice(5000, Some(500..3500))] + #[case::start_slice(5000, Some(0..1000))] + #[case::chunk_aligned(5000, Some(1024..3000))] + #[crate::test] + fn test_bitpacked_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 1000 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let (array, expected) = if let Some(range) = slice_range { + let sliced = bp.into_array().slice(range.clone())?; + (sliced, values[range].to_vec()) + } else { + (bp.into_array(), values) + }; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dynamic_dispatch_plan( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + assert_eq!(actual, expected); + Ok(()) + } + + #[rstest] + #[case::unsliced(3000, None)] + #[case::mid_slice(5000, Some(500..3500))] + #[crate::test] + fn test_for_bitpacked_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let bit_width: u8 = 6; + let reference = 42u32; + let max_val = (1u32 << bit_width) - 1; + let residuals: Vec = (0..len) + .map(|i| { + if i % 200 == 0 { + 500 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + let all_values: Vec = residuals.iter().map(|&v| v + reference).collect(); + + let prim = 
PrimitiveArray::new(Buffer::from(residuals), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + let for_arr = FoR::try_new(bp.into_array(), Scalar::from(reference))?; + + let (array, expected) = if let Some(range) = slice_range { + let sliced = for_arr.into_array().slice(range.clone())?; + (sliced, all_values[range].to_vec()) + } else { + (for_arr.into_array(), all_values) + }; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dynamic_dispatch_plan( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + assert_eq!(actual, expected); + Ok(()) + } + + #[rstest] + #[case::unsliced(2000, None)] + #[case::mid_slice(5000, Some(100..4000))] + #[case::large_offset(5000, Some(1500..4500))] + #[crate::test] + fn test_alp_with_patches( + #[case] len: usize, + #[case] slice_range: Option>, + ) -> VortexResult<()> { + let mut values: Vec = (0..len).map(|i| (i as f32) * 1.1).collect(); + // Insert exception values that ALP can't encode. + values[0] = 99.9; + values[500] = std::f32::consts::PI; + values[1024] = std::f32::consts::E; + if len > 2048 { + values[2048] = std::f32::consts::LN_2; + } + if len > 3333 { + values[3333] = std::f32::consts::SQRT_2; + } + + let float_prim = PrimitiveArray::new(Buffer::from(values), NonNullable); + let encoded = alp_encode( + float_prim.as_view(), + None, + &mut LEGACY_SESSION.create_execution_ctx(), + )? + .into_array(); + + let (array, base_offset) = if let Some(ref range) = slice_range { + (encoded.slice(range.clone())?, range.start) + } else { + (encoded, 0) + }; + + // Decode on CPU as ground truth (accounts for ALP precision loss + patches). + let cpu_decoded = array + .clone() + .execute::(&mut LEGACY_SESSION.create_execution_ctx())?; + let expected: Vec = cpu_decoded.as_slice::().to_vec(); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&array, &mut cuda_ctx)?; + let actual = run_dispatch_plan_f32( + &cuda_ctx, + expected.len(), + &plan.dispatch_plan, + plan.shared_mem_bytes, + )?; + for (i, (&a, &e)) in actual.iter().zip(expected.iter()).enumerate() { + assert!( + a.to_bits() == e.to_bits(), + "mismatch at index {i} (original index {}): gpu={a} cpu={e} (bits: {:#010x} vs {:#010x})", + i + base_offset, + a.to_bits(), + e.to_bits(), + ); + } + Ok(()) + } + + // --------------------------------------------------------------- + // Additional patch tests — typed widths, edge cases, composites + // --------------------------------------------------------------- + + /// u8 BitPacked with patches (bit_width=3, patch values > 7). 
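
The typed tests that follow (u8/u16/u64) all build data of the same shape: values cycle through the packable range, with a periodic outlier that exceeds the maximum storable in `bit_width` bits and therefore must become a patch. A hypothetical helper capturing that shape:

```rust
/// Hypothetical helper mirroring the data used by the typed patch tests:
/// values cycle through 0..=max_val, with an `outlier` every `stride`
/// elements that exceeds max_val and so forces the encoder to emit a patch.
fn patchy_values(len: usize, bit_width: u32, stride: usize, outlier: u32) -> Vec<u32> {
    let max_val = (1u32 << bit_width) - 1;
    (0..len)
        .map(|i| {
            if i % stride == 0 {
                outlier // > max_val, so the encoder must emit a patch
            } else {
                (i as u32) % (max_val + 1)
            }
        })
        .collect()
}

fn main() {
    let vals = patchy_values(3000, 4, 100, 1000);
    assert_eq!(vals[0], 1000);
    // 30 outliers (indices 0, 100, ..., 2900) exceed max_val = 15.
    assert_eq!(vals.iter().filter(|&&v| v > 15).count(), 30);
}
```
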
+ #[crate::test] + async fn test_bitpacked_with_patches_u8() -> VortexResult<()> { + let bit_width: u8 = 3; + let len = 3000usize; + let max_val = (1u8 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 200u8 + } else { + (i as u8) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// u16 BitPacked with patches (bit_width=6, patch values > 63). + #[crate::test] + async fn test_bitpacked_with_patches_u16() -> VortexResult<()> { + let bit_width: u8 = 6; + let len = 3000usize; + let max_val = (1u16 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 150 == 0 { + 5000u16 + } else { + (i as u16) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// u64 BitPacked with patches (bit_width=4, patch values > 15). + #[crate::test] + async fn test_bitpacked_with_patches_u64() -> VortexResult<()> { + let bit_width: u8 = 4; + let len = 3000usize; + let max_val = (1u64 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 200 == 0 { + 1_000_000u64 + } else { + (i as u64) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let canonical = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx).await?; + let result = CanonicalCudaExt::into_host(canonical).await?.into_array(); + + let expected_arr = PrimitiveArray::new(Buffer::from(values), NonNullable).into_array(); + vortex::array::assert_arrays_eq!(expected_arr, result); + Ok(()) + } + + /// Dict where codes are BitPacked u32 with patches exceeding the bit width. + #[crate::test] + fn test_dict_bitpacked_codes_with_patches() -> VortexResult<()> { + let dict_values: Vec = (0..256).map(|i| i * 1000 + 42).collect(); + let len = 3000; + let bit_width: u8 = 4; + let max_code = (1u32 << bit_width) - 1; + // Some codes exceed max_code (15), creating patches in the BitPacked codes. 
+ let codes: Vec = (0..len) + .map(|i| { + if i % 100 == 0 { + 100u32 // exceeds max_code=15, becomes a patch + } else { + (i as u32) % (max_code + 1) + } + }) + .collect(); + let expected: Vec = codes.iter().map(|&c| dict_values[c as usize]).collect(); + + let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable); + let codes_bp = BitPacked::encode( + &codes_prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(codes_bp.patches().is_some(), "expected patches on codes"); + + let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable); + let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?; + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&dict.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, expected); + Ok(()) + } + + /// Patches placed exactly at FastLanes chunk boundaries (1024-element chunks). + #[crate::test] + fn test_bitpacked_patches_at_chunk_boundaries() -> VortexResult<()> { + let len = 4096usize; + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + let mut values: Vec = (0..len).map(|i| (i as u32) % (max_val + 1)).collect(); + // Place patches at FL chunk boundaries (1024 elements per chunk). + values[1023] = 1000; // end of chunk 0 + values[1024] = 2000; // start of chunk 1 + values[2047] = 3000; // end of chunk 1 + values[2048] = 4000; // start of chunk 2 + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } + + /// Large array (100k elements) spanning many blocks with sparse patches. + #[crate::test] + fn test_bitpacked_large_array_with_patches() -> VortexResult<()> { + let len = 100_000usize; + let bit_width: u8 = 6; + let max_val = (1u32 << bit_width) - 1; + let values: Vec = (0..len) + .map(|i| { + if i % 500 == 0 { + 1000 + } else { + (i as u32) % (max_val + 1) + } + }) + .collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } + + /// Nullable BitPacked with patches — validity must survive through fused + /// dispatch alongside patch application. + #[crate::test] + async fn test_nullable_bitpacked_with_patches() -> VortexResult<()> { + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + + let len = 3000usize; + let bit_width: u8 = 4; + let max_val = (1u32 << bit_width) - 1; + + // Every 7th element is null; every 100th (non-null) element is a patch. 
+ let values: Vec> = (0..len) + .map(|i| { + if i % 7 == 0 { + None + } else if i % 100 == 0 { + Some(1000u32) // exceeds max_val=15, becomes a patch + } else { + Some((i as u32) % (max_val + 1)) + } + }) + .collect(); + + let prim = PrimitiveArray::from_option_iter(values.iter().copied()); + let cpu = crate::canonicalize_cpu(prim.clone())?.into_array(); + + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let gpu = try_gpu_dispatch(&bp.into_array(), &mut cuda_ctx) + .await? + .into_host() + .await? + .into_array(); + + vortex::array::assert_arrays_eq!(cpu, gpu); + Ok(()) + } + + /// Extreme case: ALL values are patches (bit_width=1, every value > 1). + #[crate::test] + fn test_bitpacked_all_patches() -> VortexResult<()> { + let bit_width: u8 = 1; + let len = 2000usize; + // All values >= 2, so every single element exceeds max storable (1) and + // becomes a patch. + let values: Vec = (0..len).map(|i| (i as u32) + 2).collect(); + + let prim = PrimitiveArray::new(Buffer::from(values.clone()), NonNullable); + let bp = BitPacked::encode( + &prim.into_array(), + bit_width, + &mut LEGACY_SESSION.create_execution_ctx(), + )?; + assert!(bp.patches().is_some(), "expected patches"); + + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?; + let plan = dispatch_plan(&bp.into_array(), &mut cuda_ctx)?; + let actual = + run_dynamic_dispatch_plan(&cuda_ctx, len, &plan.dispatch_plan, plan.shared_mem_bytes)?; + assert_eq!(actual, values); + Ok(()) + } } diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs index bec93d4336f..1b63630906c 100644 --- a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs +++ b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs @@ -16,12 +16,14 @@ use vortex::array::arrays::Slice; use vortex::array::arrays::dict::DictArraySlotsExt; use vortex::array::arrays::slice::SliceArrayExt; use vortex::array::buffer::BufferHandle; +use vortex::array::patches::Patches; use vortex::array::validity::Validity; use vortex::dtype::PType; use vortex::encodings::alp::ALP; use vortex::encodings::alp::ALPArrayExt; use vortex::encodings::alp::ALPArraySlotsExt; use vortex::encodings::alp::ALPFloat; +use vortex::encodings::alp::Exponents; use vortex::encodings::fastlanes::BitPacked; use vortex::encodings::fastlanes::BitPackedArrayExt; use vortex::encodings::fastlanes::FoR; @@ -45,6 +47,7 @@ use super::ptype_to_tag; use super::tag_to_ptype; use crate::CudaBufferExt; use crate::CudaExecutionCtx; +use crate::kernel::load_patches_sync; /// A plan whose source buffers have been copied to the device, ready for kernel launch. pub struct MaterializedPlan { @@ -69,10 +72,10 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool { let id = array.encoding_id(); if id == ALP.id() { let arr = array.as_::(); - return arr.patches().is_none() && arr.dtype().as_ptype() == PType::F32; + return arr.dtype().as_ptype() == PType::F32; } if id == BitPacked.id() { - return array.as_::().patches().is_none(); + return true; } if id == Dict.id() { let arr = array.as_::(); @@ -119,9 +122,17 @@ fn is_dyn_dispatch_compatible(array: &ArrayRef) -> bool { } /// An unmaterialized stage: a source op, scalar ops, and optional source buffer reference. 
+/// +/// Patches are tied to their owning ops, mirroring the CUDA side where +/// `patches_ptr` lives on `BitunpackParams` / `AlpParams`: +/// - `source_patches` for the source op (BitPacked exceptions) +/// - Each scalar op carries its own `Option` (ALP exceptions) struct Stage { source: SourceOp, - scalar_ops: Vec, + /// Patches from the source op (e.g. BitPacked overflow exceptions). + source_patches: Option, + /// Scalar ops with optional per-op patches (e.g. ALP float exceptions). + scalar_ops: Vec<(ScalarOp, Option)>, /// Index into `FusedPlan::source_buffers`, or `None` /// for sources that don't read from a device buffer. source_buffer_index: Option, @@ -133,6 +144,7 @@ impl Stage { fn new(source: SourceOp, source_buffer_index: Option, source_ptype: PTypeTag) -> Self { Self { source, + source_patches: None, scalar_ops: vec![], source_buffer_index, source_ptype, @@ -240,7 +252,7 @@ impl DispatchPlan { /// - Validity is propagated from the root array to the output. Nullable /// arrays are supported, but Dict with nullable codes and RunEnd with /// nullable ends are rejected to guard against out-of-bounds access. - /// - `BitPackedArray` and `ALPArray` with patches are not supported. + /// - `BitPackedArray` and `ALPArray` with patches are supported. /// - Only f32 ALP is supported (kernel stores multipliers as `float`). pub fn new(array: &ArrayRef) -> VortexResult { if PType::try_from(array.dtype()).is_err() || !is_dyn_dispatch_compatible(array) { @@ -340,7 +352,7 @@ impl FusedPlan { } /// Copy source buffers to the device, producing a [`MaterializedPlan`]. - pub fn materialize(self, ctx: &CudaExecutionCtx) -> VortexResult { + pub fn materialize(self, ctx: &mut CudaExecutionCtx) -> VortexResult { let shared_mem_bytes = self.dynamic_shared_mem_bytes(); let mut device_buffers = Vec::new(); @@ -367,20 +379,37 @@ impl FusedPlan { // Byte offsets are passed directly to the C ABI — the kernel now // indexes shared memory by byte offset and casts to the correct type // using source_ptype / output_ptype. - let stages: Vec = self - .stages - .iter() - .map(|(stage, smem_byte_offset, len)| { - MaterializedStage::new( - resolve_ptr(stage), - *smem_byte_offset, - *len, - stage.source_ptype, - stage.source, - &stage.scalar_ops, - ) - }) - .collect(); + let mut stages: Vec = Vec::new(); + for (stage, smem_byte_offset, len) in &self.stages { + let mut source = stage.source; + + // Upload source patches (e.g. BitPacked exceptions). + if let Some(patches) = &stage.source_patches { + let (ptr, bufs) = load_patches_sync(patches, ctx)?; + source.params.bitunpack.patches_ptr = ptr; + device_buffers.extend(bufs); + } + + // Upload patches for each scalar op that carries them. 
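
The loop that follows pairs each ScalarOp with its optional patches and uploads them. The essential contract here is lifetime, not layout: the raw `u64` pointer written into the op params owns nothing, so the returned buffer handles must be kept alive until the launch completes. A minimal model with stand-in types; none of these names are vortex-cuda APIs.

```rust
/// Stand-in for a device buffer handle: the pointer handed to the kernel
/// is only valid while this allocation lives.
struct DeviceAlloc(Vec<u8>);

fn upload(bytes: &[u8]) -> (u64, DeviceAlloc) {
    let alloc = DeviceAlloc(bytes.to_vec());
    (alloc.0.as_ptr() as u64, alloc)
}

fn main() {
    let mut device_buffers: Vec<DeviceAlloc> = Vec::new();
    // 0 means "no patches" in the kernel params; replaced on upload.
    let (patches_ptr, alloc) = upload(&[1, 2, 3]);
    device_buffers.push(alloc); // dropped only after the "launch" completes
    assert_ne!(patches_ptr, 0);
}
```
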
+ let mut scalar_ops: Vec = Vec::with_capacity(stage.scalar_ops.len()); + for (mut op, patches) in stage.scalar_ops.clone() { + if let Some(patches) = &patches { + let (ptr, bufs) = load_patches_sync(patches, ctx)?; + op.params.alp.patches_ptr = ptr; + device_buffers.extend(bufs); + } + scalar_ops.push(op); + } + + stages.push(MaterializedStage::new( + resolve_ptr(stage), + *smem_byte_offset, + *len, + stage.source_ptype, + source, + &scalar_ops, + )); + } Ok(MaterializedPlan { dispatch_plan: CudaDispatchPlan::new(stages, self.output_ptype), @@ -398,7 +427,7 @@ impl FusedPlan { pub fn materialize_with_subtrees( mut self, subtree_buffers: Vec, - ctx: &CudaExecutionCtx, + ctx: &mut CudaExecutionCtx, ) -> VortexResult { for (slot, buf) in zip_eq( self.source_buffers.iter_mut().filter(|s| s.is_none()), @@ -465,7 +494,8 @@ impl FusedPlan { /// SliceArray → resolve the slice via reduce/execute rules. /// /// When the plan builder encounters a `SliceArray`, it resolves the slice - /// by invoking the child's `reduce_parent`, `execute_parent`. + /// by invoking the child's `reduce_parent`. If that fails (e.g. ALP + /// doesn't implement it), we manually slice the child's sub-arrays. fn walk_slice( &mut self, array: ArrayRef, @@ -478,6 +508,26 @@ impl FusedPlan { return self.walk(reduced, pending_subtrees); } + // ALP doesn't implement reduce_parent — slice encoded child and + // patches manually (Patches::slice adjusts offsets for the range). + if child.encoding_id() == ALP.id() { + let alp = child.as_::(); + let offset = slice_arr.data().slice_range().start; + let len = array.len(); + let sliced_encoded = alp.encoded().clone().slice(offset..offset + len)?; + let sliced_patches = alp + .patches() + .map(|p| p.slice(offset..offset + len)) + .transpose()? + .flatten(); + return self.walk_alp_inner( + sliced_encoded, + sliced_patches, + alp.exponents(), + pending_subtrees, + ); + } + vortex_bail!( "Cannot resolve SliceArray wrapping {:?} in dynamic dispatch plan builder", child.encoding_id() @@ -498,20 +548,18 @@ impl FusedPlan { fn walk_bitpacked(&mut self, array: ArrayRef) -> VortexResult { let bp = array.as_::(); - if bp.patches().is_some() { - vortex_bail!("Dynamic dispatch does not support BitPackedArray with patches"); - } - let source_ptype = ptype_to_tag(PType::try_from(bp.dtype()).map_err(|_| { vortex_err!("BitPacked must have primitive dtype, got {:?}", bp.dtype()) })?); let buf_index = self.source_buffers.len(); self.source_buffers.push(Some(bp.packed().clone())); - Ok(Stage::new( + let mut stage = Stage::new( SourceOp::bitunpack(bp.bit_width(), bp.offset()), Some(buf_index), source_ptype, - )) + ); + stage.source_patches = bp.patches(); + Ok(stage) } fn walk_for( @@ -537,7 +585,7 @@ impl FusedPlan { .cast::()?; pipeline .scalar_ops - .push(ScalarOp::frame_of_ref(ref_u64, output_ptype)); + .push((ScalarOp::frame_of_ref(ref_u64, output_ptype), None)); Ok(pipeline) } @@ -553,7 +601,9 @@ impl FusedPlan { })?); let mut pipeline = self.walk(encoded, pending_subtrees)?; - pipeline.scalar_ops.push(ScalarOp::zigzag(output_ptype)); + pipeline + .scalar_ops + .push((ScalarOp::zigzag(output_ptype), None)); Ok(pipeline) } @@ -563,26 +613,29 @@ impl FusedPlan { pending_subtrees: &mut Vec, ) -> VortexResult { let alp = array.as_::(); + self.walk_alp_inner( + alp.encoded().clone(), + alp.patches(), + alp.exponents(), + pending_subtrees, + ) + } - if alp.patches().is_some() { - vortex_bail!("Dynamic dispatch does not support ALPArray with patches"); - } - - let ptype = alp.dtype().as_ptype(); - if ptype 
!= PType::F32 { - vortex_bail!( - "Dynamic dispatch only supports f32 ALP, got {:?}", - alp.dtype() - ); - } - - let exponents = alp.exponents(); + /// Shared ALP logic for both `walk_alp` and `walk_slice` (Slice(ALP)). + fn walk_alp_inner( + &mut self, + encoded: ArrayRef, + patches: Option, + exponents: Exponents, + pending_subtrees: &mut Vec, + ) -> VortexResult { let alp_f = ::F10[exponents.f as usize]; let alp_e = ::IF10[exponents.e as usize]; - let encoded = alp.encoded().clone(); let mut pipeline = self.walk(encoded, pending_subtrees)?; - pipeline.scalar_ops.push(ScalarOp::alp(alp_f, alp_e)); + pipeline + .scalar_ops + .push((ScalarOp::alp(alp_f, alp_e), patches)); Ok(pipeline) } @@ -645,9 +698,9 @@ impl FusedPlan { }; // DICT scalar op: pass byte offset directly (C ABI uses byte offsets). // output_ptype is the values' ptype — DICT transforms codes → values. - pipeline.scalar_ops.push(ScalarOp::dict( - values_smem_byte_offset, - ptype_to_tag(values_ptype), + pipeline.scalar_ops.push(( + ScalarOp::dict(values_smem_byte_offset, ptype_to_tag(values_ptype)), + None, )); Ok(pipeline) } @@ -727,7 +780,7 @@ impl FusedPlan { let final_ptype = spec .scalar_ops .last() - .map(|op| op.output_ptype) + .map(|(op, _)| op.output_ptype) .unwrap_or(spec.source_ptype); let final_elem_bytes = tag_to_ptype(final_ptype).byte_width() as u32; let elem_bytes = final_elem_bytes.max(self.output_elem_bytes); diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 1853557e809..0059d74fd5e 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -21,7 +21,6 @@ use vortex::encodings::fastlanes::BitPackedArray; use vortex::encodings::fastlanes::BitPackedDataParts; use vortex::encodings::fastlanes::unpack_iter::BitPacked as BitPackedUnpack; use vortex::error::VortexResult; -use vortex::error::vortex_bail; use vortex::error::vortex_ensure; use vortex::error::vortex_err; @@ -29,13 +28,10 @@ use crate::CudaBufferExt; use crate::CudaDeviceBuffer; use crate::executor::CudaExecute; use crate::executor::CudaExecutionCtx; -use crate::kernel::patches::gpu::ChunkOffsetType; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16; use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32; -use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64; use crate::kernel::patches::gpu::GPUPatches; use crate::kernel::patches::types::load_patches; +use crate::kernel::patches::types::ptype_to_chunk_offset_type; /// CUDA decoder for bit-packed arrays. #[derive(Debug)] @@ -92,17 +88,6 @@ pub fn bitpacked_cuda_launch_config(output_width: usize, len: usize) -> VortexRe unsafe impl DeviceRepr for GPUPatches {} -/// Convert a PType to the corresponding ChunkOffsetType for GPU patches. 
-fn ptype_to_chunk_offset_type(ptype: vortex::dtype::PType) -> VortexResult { - match ptype { - vortex::dtype::PType::U8 => Ok(ChunkOffsetType_CO_U8), - vortex::dtype::PType::U16 => Ok(ChunkOffsetType_CO_U16), - vortex::dtype::PType::U32 => Ok(ChunkOffsetType_CO_U32), - vortex::dtype::PType::U64 => Ok(ChunkOffsetType_CO_U64), - _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype), - } -} - #[instrument(skip_all)] pub(crate) async fn decode_bitpacked( array: BitPackedArray, diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs index 68f40a15005..2c9dfe0d033 100644 --- a/vortex-cuda/src/kernel/mod.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -34,6 +34,7 @@ pub use encodings::ZstdKernelPrep; pub use encodings::zstd_kernel_prepare; pub(crate) use encodings::*; pub(crate) use filter::FilterExecutor; +pub(crate) use patches::types::load_patches_sync; pub(crate) use slice::SliceExecutor; use crate::CudaKernelEvents; diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index 350c5210e8a..e8e3ce5bfc6 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -3,18 +3,29 @@ //! GPU patches loading for fused exception patching during bit-unpacking. +use std::mem::size_of; + use num_traits::ToPrimitive; use vortex::array::buffer::BufferHandle; +use vortex::buffer::Alignment; use vortex::buffer::Buffer; use vortex::buffer::BufferMut; +use vortex::buffer::ByteBufferMut; use vortex::dtype::PType; use vortex_array::match_each_unsigned_integer_ptype; use vortex_array::patches::Patches; use vortex_error::VortexResult; use vortex_error::vortex_bail; +use crate::CudaBufferExt; use crate::CudaExecutionCtx; use crate::executor::CudaArrayExt; +use crate::kernel::patches::gpu::ChunkOffsetType; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U8; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U16; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U32; +use crate::kernel::patches::gpu::ChunkOffsetType_CO_U64; +use crate::kernel::patches::gpu::GPUPatches; /// A set of device-resident patches. pub struct DevicePatches { @@ -28,7 +39,7 @@ pub struct DevicePatches { pub(crate) n_chunks: usize, } -/// Load patches for GPU use. +/// Load patches for GPU use (async). /// /// # Errors /// @@ -40,17 +51,16 @@ pub(crate) async fn load_patches( ) -> VortexResult { let offset = patches.offset(); let offset_within_chunk = patches.offset_within_chunk().unwrap_or_default(); - let array_len = patches.array_len(); - // Get or compute chunk_offsets let Some(co) = patches.chunk_offsets() else { vortex_bail!("cannot execute_cuda for patched BitPacked array without chunk_offsets") }; - let (chunk_offsets, chunk_offset_ptype) = { + let (chunk_offsets, chunk_offset_ptype, n_chunks) = { let co_canonical = co.clone().execute_cuda(ctx).await?.into_primitive(); let ptype = co_canonical.ptype(); - (co_canonical.buffer_handle().clone(), ptype) + let len = co_canonical.len(); + (co_canonical.buffer_handle().clone(), ptype, len) }; // Load indices - must be converted to u32 for GPU use @@ -93,7 +103,6 @@ pub(crate) async fn load_patches( let values = ctx.ensure_on_device(values.buffer_handle().clone()).await?; let num_patches = patches.num_patches(); - let n_chunks = array_len.div_ceil(1024); Ok(DevicePatches { chunk_offsets, @@ -107,6 +116,76 @@ pub(crate) async fn load_patches( }) } +/// Convert a PType to the corresponding `ChunkOffsetType` for GPU patches. 
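
Before the `ChunkOffsetType` helper below, a note on the n_chunks change in `load_patches` above: the chunk count must come from the (possibly sliced) chunk_offsets array, not from the sliced element count, because a slice can straddle chunk boundaries. A small worked check with illustrative numbers:

```rust
fn main() {
    const FL_CHUNK: usize = 1024;
    // Slice 1000..2100 of a larger array touches original chunks 0, 1,
    // and 2, so the sliced chunk_offsets array has 3 entries...
    let (start, end) = (1000usize, 2100usize);
    let covered_chunks = (end - 1) / FL_CHUNK - start / FL_CHUNK + 1;
    assert_eq!(covered_chunks, 3);
    // ...while dividing the sliced length by 1024 says only 2 chunks.
    assert_eq!((end - start).div_ceil(FL_CHUNK), 2);
    // Using the latter would drop the last chunk's patches, which is why
    // n_chunks now tracks the chunk_offsets length.
}
```
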
+pub(crate) fn ptype_to_chunk_offset_type(ptype: PType) -> VortexResult { + match ptype { + PType::U8 => Ok(ChunkOffsetType_CO_U8), + PType::U16 => Ok(ChunkOffsetType_CO_U16), + PType::U32 => Ok(ChunkOffsetType_CO_U32), + PType::U64 => Ok(ChunkOffsetType_CO_U64), + _ => vortex_bail!("Invalid PType for chunk_offsets: {:?}", ptype), + } +} + +/// Build a [`GPUPatches`] struct from [`DevicePatches`], serialize it to +/// bytes, and upload to the device. Returns the device pointer and a buffer +/// handle that must be kept alive for the kernel launch. +fn build_gpu_patches( + dp: &DevicePatches, + ctx: &CudaExecutionCtx, +) -> VortexResult<(BufferHandle, u64)> { + // Zero-initialize to avoid uninitialized padding bytes (e.g. between + // chunk_offset_type and indices) which would be UB when serialized. + let mut gpu_patches: GPUPatches = unsafe { std::mem::zeroed() }; + gpu_patches.chunk_offsets = dp.chunk_offsets.cuda_device_ptr()? as _; + gpu_patches.chunk_offset_type = ptype_to_chunk_offset_type(dp.chunk_offset_ptype)?; + gpu_patches.indices = dp.indices.cuda_device_ptr()? as _; + gpu_patches.values = dp.values.cuda_device_ptr()? as _; + #[expect(clippy::cast_possible_truncation)] + { + gpu_patches.offset = dp.offset as u32; + gpu_patches.offset_within_chunk = dp.offset_within_chunk as u32; + gpu_patches.num_patches = dp.num_patches as u32; + // n_chunks must match the chunk_offsets array length, not array_len / 1024. + // When patches are sliced, chunk_offsets is sliced to only include chunks + // overlapping the slice range — matching the CPU's patch_chunk which uses + // chunk_offsets_slice.len(). + gpu_patches.n_chunks = dp.n_chunks as u32; + } + + let bytes = unsafe { + std::slice::from_raw_parts( + std::ptr::from_ref(&gpu_patches).cast::(), + size_of::(), + ) + }; + let mut buf = + ByteBufferMut::with_capacity_aligned(size_of::(), Alignment::of::()); + buf.extend_from_slice(bytes); + let gpu_buf = ctx.ensure_on_device_sync(BufferHandle::new_host(buf.freeze()))?; + let ptr = gpu_buf.cuda_device_ptr()?; + Ok((gpu_buf, ptr)) +} + +/// Sync wrapper: load patches via [`load_patches`] (blocking), then build and +/// upload a [`GPUPatches`] struct. Returns the device pointer and all buffer +/// handles that must be kept alive for the kernel launch. +pub(crate) fn load_patches_sync( + patches: &Patches, + ctx: &mut CudaExecutionCtx, +) -> VortexResult<(u64, Vec)> { + let device_patches = futures::executor::block_on(load_patches(patches, ctx))?; + let (gpu_buf, ptr) = build_gpu_patches(&device_patches, ctx)?; + + let DevicePatches { + chunk_offsets, + indices, + values, + .. + } = device_patches; + Ok((ptr, vec![chunk_offsets, indices, values, gpu_buf])) +} + #[cfg(test)] mod tests { use vortex_array::IntoArray;
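
As a closing note on `build_gpu_patches`: serializing a `#[repr(C)]` struct byte-for-byte only works if the padding bytes are defined, which is why the struct is zero-initialized before its fields are set. A stand-in sketch of the same pattern; `Params` is hypothetical, and the exact padding depends on the target.

```rust
use std::mem::{size_of, zeroed};

/// Stand-in for GPUPatches: a #[repr(C)] struct with implicit padding
/// between a 4-byte field and an 8-byte field on 64-bit targets. Mirrors
/// build_gpu_patches: zero-init, set fields, view as bytes for upload.
#[repr(C)]
struct Params {
    tag: u32,
    // 4 implicit padding bytes here (on a 64-bit target)
    ptr: u64,
}

fn main() {
    let mut p: Params = unsafe { zeroed() }; // padding starts out zeroed
    p.tag = 7;
    p.ptr = 0xdead_beef;
    let bytes = unsafe {
        std::slice::from_raw_parts(std::ptr::from_ref(&p).cast::<u8>(), size_of::<Params>())
    };
    // 4 (tag) + 4 (padding) + 8 (ptr) = 16 bytes serialized.
    assert_eq!(bytes.len(), 16);
}
```
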