diff --git a/vortex-array/src/arrays/dict/array.rs b/vortex-array/src/arrays/dict/array.rs index 4e203001114..d55d5e09f62 100644 --- a/vortex-array/src/arrays/dict/array.rs +++ b/vortex-array/src/arrays/dict/array.rs @@ -45,6 +45,12 @@ pub struct DictArray { pub(super) all_values_referenced: bool, } +pub struct DictArrayParts { + pub codes: ArrayRef, + pub values: ArrayRef, + pub dtype: DType, +} + impl DictArray { /// Build a new `DictArray` without validating the codes or values. /// @@ -114,8 +120,12 @@ impl DictArray { Ok(unsafe { Self::new_unchecked(codes, values) }) } - pub fn into_parts(self) -> (ArrayRef, ArrayRef) { - (self.codes, self.values) + pub fn into_parts(self) -> DictArrayParts { + DictArrayParts { + codes: self.codes, + values: self.values, + dtype: self.dtype, + } } #[inline] diff --git a/vortex-array/src/arrow/executor/dictionary.rs b/vortex-array/src/arrow/executor/dictionary.rs index 8f25ab072dc..9d6b1c33710 100644 --- a/vortex-array/src/arrow/executor/dictionary.rs +++ b/vortex-array/src/arrow/executor/dictionary.rs @@ -15,6 +15,7 @@ use vortex_error::vortex_bail; use crate::ArrayRef; use crate::ExecutionCtx; use crate::arrays::DictArray; +use crate::arrays::DictArrayParts; use crate::arrays::DictVTable; use crate::arrow::ArrowArrayExecutor; @@ -47,7 +48,7 @@ fn dict_to_dict( values_type: &DataType, ctx: &mut ExecutionCtx, ) -> VortexResult { - let (codes, values) = array.into_parts(); + let DictArrayParts { codes, values, .. } = array.into_parts(); let codes = codes.execute_arrow(Some(codes_type), ctx)?; let values = values.execute_arrow(Some(values_type), ctx)?; diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs index 284291a8ead..1cfa2ff69b3 100644 --- a/vortex-cuda/benches/for_cuda.rs +++ b/vortex-cuda/benches/for_cuda.rs @@ -199,7 +199,7 @@ fn benchmark_for_u8(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -248,7 +248,7 @@ fn benchmark_for_u16(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -297,7 +297,7 @@ fn benchmark_for_u32(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); @@ -346,7 +346,7 @@ fn benchmark_for_u64(c: &mut Criterion) { &for_array, |b, for_array| { b.iter_custom(|iters| { - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); let encoded = for_array.encoded(); diff --git a/vortex-cuda/src/executor.rs b/vortex-cuda/src/executor.rs index a2bab00c5dc..557774d2a10 100644 --- a/vortex-cuda/src/executor.rs +++ b/vortex-cuda/src/executor.rs @@ -17,20 +17,18 @@ use cudarc::driver::LaunchArgs; use cudarc::driver::result; use cudarc::driver::result::memcpy_htod_async; use cudarc::driver::sys; -use 
cudarc::driver::sys::CUevent_flags; use futures::future::BoxFuture; use kanal::Sender; use result::stream; use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; -use vortex_array::VortexSessionExecute; +use vortex_array::ExecutionCtx; use vortex_array::buffer::BufferHandle; use vortex_buffer::Buffer; use vortex_dtype::PType; use vortex_error::VortexResult; use vortex_error::vortex_err; -use vortex_session::VortexSession; use crate::CudaDeviceBuffer; use crate::CudaSession; @@ -115,109 +113,23 @@ pub struct CudaKernelEvents { pub after_launch: CudaEvent, } -/// Convenience macro to launch a CUDA kernel. -/// -/// The kernel gets launched on the stream of the execution context. -/// -/// The kernel launch config: -/// LaunchConfig { -/// grid_dim: (array.len() / 2048, 1, 1), -/// block_dim: (64, 1, 1), -/// shared_mem_bytes: 0, -/// }; -/// 64 threads are used per block which corresponds to 2 warps. -/// Each block handles 2048 elements. Each thread handles 32 elements. -/// The last block and thread are allowed to have less elements. -/// -/// Note: A macro is necessary to unroll the launch builder arguments. -/// -/// # Returns -/// -/// A pair of CUDA events submitted before and after the kernel. -/// Depending on `CUevent_flags` these events can contain timestamps. Use -/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to -/// enable timestamps. -#[macro_export] -macro_rules! launch_cuda_kernel { - ( - execution_ctx: $ctx:expr, - module: $module:expr, - ptypes: $ptypes:expr, - launch_args: [$($arg:expr),* $(,)?], - event_recording: $event_recording:expr, - array_len: $len:expr - ) => {{ - let cuda_function = $ctx.load_function($module, $ptypes)?; - let mut launch_builder = $ctx.launch_builder(&cuda_function); - - $( - launch_builder.arg(&$arg); - )* - - $crate::executor::launch_cuda_kernel_impl(&mut launch_builder, $event_recording, $len)? - }}; -} - -/// Launches a CUDA kernel with the passed launch builder. -/// -/// # Arguments -/// -/// * `launch_builder` - Configured launch builder -/// * `array_len` - Length of the array to process -/// -/// # Returns -/// -/// A pair of CUDA events submitted before and after the kernel. -/// Depending on `CUevent_flags` these events can contain timestamps. Use -/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to -/// enable timestamps. -pub fn launch_cuda_kernel_impl( - launch_builder: &mut LaunchArgs, - event_flags: CUevent_flags, - array_len: usize, -) -> VortexResult { - let num_chunks = u32::try_from(array_len.div_ceil(2048))?; - - let config = cudarc::driver::LaunchConfig { - grid_dim: (num_chunks, 1, 1), - block_dim: (64, 1, 1), - shared_mem_bytes: 0, - }; - - launch_builder.record_kernel_launch(event_flags); - - unsafe { - launch_builder - .launch(config) - .map_err(|e| vortex_err!("Failed to launch kernel: {}", e)) - .and_then(|events| { - events - .ok_or_else(|| vortex_err!("CUDA events not recorded")) - .map(|(before_launch, after_launch)| CudaKernelEvents { - before_launch, - after_launch, - }) - }) - } -} - /// CUDA execution context. /// /// Provides access to the CUDA context and stream for kernel execution. /// Handles memory allocation and data transfers between host and device. pub struct CudaExecutionCtx { stream: Arc, - vortex_session: VortexSession, + ctx: ExecutionCtx, cuda_session: CudaSession, } impl CudaExecutionCtx { /// Creates a new CUDA execution context. 
- pub(crate) fn new(stream: Arc, vortex_session: VortexSession) -> Self { - let cuda_session = vortex_session.cuda_session().clone(); + pub(crate) fn new(stream: Arc, ctx: ExecutionCtx) -> Self { + let cuda_session = ctx.session().cuda_session().clone(); Self { stream, - vortex_session, + ctx, cuda_session, } } @@ -351,8 +263,8 @@ pub trait CudaArrayExt: Array { #[async_trait] impl CudaArrayExt for ArrayRef { async fn execute_cuda(self, ctx: &mut CudaExecutionCtx) -> VortexResult { - if self.is_canonical() { - return self.to_canonical(); + if self.is_canonical() || self.is_empty() { + return self.execute(&mut ctx.ctx); } let Some(support) = ctx.cuda_session.kernel(&self.encoding_id()) else { @@ -360,8 +272,7 @@ impl CudaArrayExt for ArrayRef { encoding = %self.encoding_id(), "No CUDA support registered for encoding, falling back to CPU execution" ); - let mut array_ctx = ctx.vortex_session.create_execution_ctx(); - return self.execute(&mut array_ctx); + return self.execute(&mut ctx.ctx); }; tracing::debug!( diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs new file mode 100644 index 00000000000..0f217001019 --- /dev/null +++ b/vortex-cuda/src/kernel/arrays/dict.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use async_trait::async_trait; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::arrays::DictVTable; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::executor::CudaExecute; +use crate::executor::CudaExecutionCtx; + +/// CUDA executor for dictionary-encoded arrays. +#[derive(Debug)] +pub struct DictExecutor; + +#[async_trait] +impl CudaExecute for DictExecutor { + async fn execute( + &self, + array: ArrayRef, + _ctx: &mut CudaExecutionCtx, + ) -> VortexResult { + let _dict_array = array + .try_into::() + .ok() + .vortex_expect("Array is not a Dict array"); + + todo!() + } +} diff --git a/vortex-cuda/src/kernel/arrays/mod.rs b/vortex-cuda/src/kernel/arrays/mod.rs new file mode 100644 index 00000000000..6c9821b2308 --- /dev/null +++ b/vortex-cuda/src/kernel/arrays/mod.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod dict; +pub use dict::DictExecutor; diff --git a/vortex-cuda/src/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs similarity index 85% rename from vortex-cuda/src/for_.rs rename to vortex-cuda/src/kernel/encodings/for_.rs index 05bd9a61a0b..33d04be614f 100644 --- a/vortex-cuda/src/for_.rs +++ b/vortex-cuda/src/kernel/encodings/for_.rs @@ -5,7 +5,6 @@ use async_trait::async_trait; use cudarc::driver::DeviceRepr; use cudarc::driver::PushKernelArg; use cudarc::driver::sys::CUevent_flags::CU_EVENT_DISABLE_TIMING; -use vortex_array::Array; use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::arrays::PrimitiveArray; @@ -13,8 +12,8 @@ use vortex_dtype::NativePType; use vortex_dtype::match_each_native_simd_ptype; use vortex_error::VortexExpect; use vortex_error::VortexResult; -use vortex_error::vortex_err; use vortex_fastlanes::FoRArray; +use vortex_fastlanes::FoRVTable; use crate::CudaBufferExt; use crate::executor::CudaArrayExt; @@ -24,39 +23,32 @@ use crate::launch_cuda_kernel; /// CUDA executor for frame-of-reference. 
#[derive(Debug)] -pub struct ForExecutor; +pub struct FoRExecutor; #[async_trait] -impl CudaExecute for ForExecutor { +impl CudaExecute for FoRExecutor { async fn execute( &self, array: ArrayRef, ctx: &mut CudaExecutionCtx, ) -> VortexResult { let for_array = array - .as_any() - .downcast_ref::() - .ok_or_else(|| vortex_err!("Array is not a FOR array"))?; - - execute_for(for_array, ctx).await + .try_into::() + .ok() + .vortex_expect("Array is not a FOR array"); + + // Excludes f16 support. + match_each_native_simd_ptype!(for_array.ptype(), |T| { + execute_for_typed::(for_array, ctx).await + }) } } -async fn execute_for(array: &FoRArray, ctx: &mut CudaExecutionCtx) -> VortexResult { - if array.is_empty() { - return array.to_array().to_canonical(); - } - - // Excludes f16 support. - match_each_native_simd_ptype!(array.ptype(), |T| { - execute_for_typed::(array, ctx).await - }) -} - async fn execute_for_typed( - array: &FoRArray, + array: FoRArray, ctx: &mut CudaExecutionCtx, ) -> VortexResult { + assert!(!array.is_empty()); let reference = array .reference_scalar() .as_primitive() @@ -114,7 +106,7 @@ mod tests { return; } - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); // Create u8 offset values that cycle through 0-255, creating 5000 elements @@ -128,7 +120,8 @@ mod tests { .vortex_expect("failed to create FoR array"); // Decompress on the GPU. - let result = execute_for(&for_array, &mut cuda_ctx) + let result = FoRExecutor + .execute(for_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed"); @@ -150,7 +143,7 @@ mod tests { return; } - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); // Create u16 offset values that cycle through 0-5000, creating 5000 elements @@ -163,7 +156,8 @@ mod tests { .vortex_expect("failed to create FoR array"); // Decompress on the GPU. - let result = execute_for(&for_array, &mut cuda_ctx) + let result = FoRExecutor + .execute(for_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed"); @@ -185,7 +179,7 @@ mod tests { return; } - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); // Create u32 offset values that cycle through 0-5000, creating 5000 elements @@ -198,7 +192,8 @@ mod tests { .vortex_expect("failed to create FoR array"); // Decompress on the GPU. - let result = execute_for(&for_array, &mut cuda_ctx) + let result = FoRExecutor + .execute(for_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed"); @@ -220,7 +215,7 @@ mod tests { return; } - let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty()) + let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty()) .vortex_expect("failed to create execution context"); // Create u64 offset values that cycle through 0-5000, creating 5000 elements @@ -233,7 +228,8 @@ mod tests { .vortex_expect("failed to create FoR array"); // Decompress on the GPU. 
- let result = execute_for(&for_array, &mut cuda_ctx) + let result = FoRExecutor + .execute(for_array.to_array(), &mut cuda_ctx) .await .vortex_expect("GPU decompression failed"); diff --git a/vortex-cuda/src/kernel/encodings/mod.rs b/vortex-cuda/src/kernel/encodings/mod.rs new file mode 100644 index 00000000000..ae57c8649fd --- /dev/null +++ b/vortex-cuda/src/kernel/encodings/mod.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod for_; +pub use for_::FoRExecutor; diff --git a/vortex-cuda/src/kernel.rs b/vortex-cuda/src/kernel/mod.rs similarity index 53% rename from vortex-cuda/src/kernel.rs rename to vortex-cuda/src/kernel/mod.rs index 01a896fc59b..a4efa4727ed 100644 --- a/vortex-cuda/src/kernel.rs +++ b/vortex-cuda/src/kernel/mod.rs @@ -12,12 +12,108 @@ use std::sync::Arc; use cudarc::driver::CudaContext; use cudarc::driver::CudaFunction; use cudarc::driver::CudaModule; +use cudarc::driver::LaunchArgs; +use cudarc::driver::sys::CUevent_flags; use cudarc::nvrtc::Ptx; use vortex_dtype::PType; use vortex_error::VortexResult; use vortex_error::vortex_err; use vortex_utils::aliases::dash_map::DashMap; +mod arrays; +mod encodings; + +pub use arrays::DictExecutor; +pub use encodings::FoRExecutor; + +use crate::CudaKernelEvents; + +/// Convenience macro to launch a CUDA kernel. +/// +/// The kernel gets launched on the stream of the execution context. +/// +/// The kernel launch config: +/// LaunchConfig { +/// grid_dim: (array.len() / 2048, 1, 1), +/// block_dim: (64, 1, 1), +/// shared_mem_bytes: 0, +/// }; +/// 64 threads are used per block which corresponds to 2 warps. +/// Each block handles 2048 elements. Each thread handles 32 elements. +/// The last block and thread are allowed to have less elements. +/// +/// Note: A macro is necessary to unroll the launch builder arguments. +/// +/// # Returns +/// +/// A pair of CUDA events submitted before and after the kernel. +/// Depending on `CUevent_flags` these events can contain timestamps. Use +/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to +/// enable timestamps. +#[macro_export] +macro_rules! launch_cuda_kernel { + ( + execution_ctx: $ctx:expr, + module: $module:expr, + ptypes: $ptypes:expr, + launch_args: [$($arg:expr),* $(,)?], + event_recording: $event_recording:expr, + array_len: $len:expr + ) => {{ + let cuda_function = $ctx.load_function($module, $ptypes)?; + let mut launch_builder = $ctx.launch_builder(&cuda_function); + + $( + launch_builder.arg(&$arg); + )* + + $crate::launch_cuda_kernel_impl(&mut launch_builder, $event_recording, $len)? + }}; +} + +/// Launches a CUDA kernel with the passed launch builder. +/// +/// # Arguments +/// +/// * `launch_builder` - Configured launch builder +/// * `array_len` - Length of the array to process +/// +/// # Returns +/// +/// A pair of CUDA events submitted before and after the kernel. +/// Depending on `CUevent_flags` these events can contain timestamps. Use +/// `CU_EVENT_DISABLE_TIMING` for minimal overhead and `CU_EVENT_DEFAULT` to +/// enable timestamps. 
+pub fn launch_cuda_kernel_impl( + launch_builder: &mut LaunchArgs, + event_flags: CUevent_flags, + array_len: usize, +) -> VortexResult { + let num_chunks = u32::try_from(array_len.div_ceil(2048))?; + + let config = cudarc::driver::LaunchConfig { + grid_dim: (num_chunks, 1, 1), + block_dim: (64, 1, 1), + shared_mem_bytes: 0, + }; + + launch_builder.record_kernel_launch(event_flags); + + unsafe { + launch_builder + .launch(config) + .map_err(|e| vortex_err!("Failed to launch kernel: {}", e)) + .and_then(|events| { + events + .ok_or_else(|| vortex_err!("CUDA events not recorded")) + .map(|(before_launch, after_launch)| CudaKernelEvents { + before_launch, + after_launch, + }) + }) + } +} + /// Loader for CUDA kernels with PTX caching. /// /// Handles loading PTX files, compiling modules, and loading functions. diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs index be6399a8c32..8075a70283a 100644 --- a/vortex-cuda/src/lib.rs +++ b/vortex-cuda/src/lib.rs @@ -5,7 +5,6 @@ mod device_buffer; pub mod executor; -mod for_; mod kernel; mod session; @@ -15,8 +14,12 @@ pub use device_buffer::CudaBufferExt; pub use device_buffer::CudaDeviceBuffer; pub use executor::CudaExecutionCtx; pub use executor::CudaKernelEvents; -use for_::ForExecutor; +use kernel::DictExecutor; +use kernel::FoRExecutor; +pub use kernel::launch_cuda_kernel_impl; pub use session::CudaSession; +use vortex_array::arrays::DictVTable; +use vortex_fastlanes::FoRVTable; /// Check if the NVIDIA CUDA Compiler is available. pub fn has_nvcc() -> bool { @@ -29,6 +32,6 @@ pub fn has_nvcc() -> bool { /// Registers CUDA kernels. pub fn initialize_cuda(session: &CudaSession) { tracing::info!("Registering CUDA kernels"); - session.register_kernel("fastlanes.for".into(), &ForExecutor); - // TODO(0ax1): Register additional executors + session.register_kernel(FoRVTable::ID, &FoRExecutor); + session.register_kernel(DictVTable::ID, &DictExecutor); } diff --git a/vortex-cuda/src/session.rs b/vortex-cuda/src/session.rs index 6b33d10e753..1e33b5c3d5e 100644 --- a/vortex-cuda/src/session.rs +++ b/vortex-cuda/src/session.rs @@ -5,6 +5,7 @@ use std::fmt::Debug; use std::sync::Arc; use cudarc::driver::CudaContext; +use vortex_array::VortexSessionExecute; use vortex_array::vtable::ArrayId; use vortex_dtype::PType; use vortex_error::VortexResult; @@ -39,7 +40,7 @@ impl CudaSession { } /// Creates a new CUDA execution context. - pub fn new_ctx( + pub fn create_execution_ctx( vortex_session: vortex_session::VortexSession, ) -> VortexResult { let stream = vortex_session @@ -47,7 +48,10 @@ impl CudaSession { .context .new_stream() .map_err(|e| vortex_err!("Failed to create CUDA stream: {}", e))?; - Ok(CudaExecutionCtx::new(stream, vortex_session)) + Ok(CudaExecutionCtx::new( + stream, + vortex_session.create_execution_ctx(), + )) } /// Registers CUDA support for an array encoding.
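
Note: a minimal usage sketch of the new DictArray::into_parts API, mirroring the call site in vortex-array/src/arrow/executor/dictionary.rs above. split_dict is a hypothetical helper, not part of this change.

use vortex_array::ArrayRef;
use vortex_array::arrays::{DictArray, DictArrayParts};

// Hypothetical helper: destructure a DictArray into its codes and values.
// The `..` pattern ignores the dtype now carried in DictArrayParts, exactly
// as the Arrow dictionary executor does in this diff.
fn split_dict(array: DictArray) -> (ArrayRef, ArrayRef) {
    let DictArrayParts { codes, values, .. } = array.into_parts();
    (codes, values)
}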
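
Note: a sketch of GPU decompression through the renamed CudaSession::create_execution_ctx entry point. decode_on_gpu is a hypothetical wrapper over the APIs touched in this diff, assuming CUDA kernels have already been registered (e.g. via initialize_cuda).

use vortex_array::{ArrayRef, Canonical};
use vortex_cuda::CudaSession;
use vortex_cuda::executor::CudaArrayExt;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

// Hypothetical wrapper: build a CUDA execution context from an empty session
// and decompress an array on the GPU. Encodings without a registered CUDA
// kernel fall back to CPU execution inside execute_cuda.
async fn decode_on_gpu(array: ArrayRef) -> VortexResult<Canonical> {
    let mut cuda_ctx = CudaSession::create_execution_ctx(VortexSession::empty())?;
    array.execute_cuda(&mut cuda_ctx).await
}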