diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..8c47255eb 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,6 +2,8 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/2dmrope.h" +#include "infiniop/ops/3dmrope.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" #include "infiniop/ops/attention.h" diff --git a/include/infiniop/ops/2dmrope.h b/include/infiniop/ops/2dmrope.h new file mode 100644 index 000000000..78fa6a22c --- /dev/null +++ b/include/infiniop/ops/2dmrope.h @@ -0,0 +1,32 @@ +#ifndef __INFINIOP_2DMROPE_API_H__ +#define __INFINIOP_2DMROPE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMRoPE2DDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMRoPE2DDescriptor( + infiniopHandle_t handle, + infiniopMRoPE2DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table); + +__C __export infiniStatus_t infiniopGetMRoPE2DWorkspaceSize(infiniopMRoPE2DDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMRoPE2D( + infiniopMRoPE2DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMRoPE2DDescriptor(infiniopMRoPE2DDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/3dmrope.h b/include/infiniop/ops/3dmrope.h new file mode 100644 index 000000000..954f3e1a0 --- /dev/null +++ b/include/infiniop/ops/3dmrope.h @@ -0,0 +1,34 @@ +#ifndef __INFINIOP_3DMROPE_API_H__ +#define __INFINIOP_3DMROPE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMRoPE3DDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMRoPE3DDescriptor( + infiniopHandle_t handle, + infiniopMRoPE3DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table, + infiniopTensorDescriptor_t rope_section); + +__C __export infiniStatus_t infiniopGetMRoPE3DWorkspaceSize(infiniopMRoPE3DDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMRoPE3D( + infiniopMRoPE3DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void const *pos_ids, + void const *sin_table, + void const *cos_table, + void const *rope_section, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMRoPE3DDescriptor(infiniopMRoPE3DDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 06af369ef..6ca352af8 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -39,6 +39,8 @@ def run_tests(args): "topkrouter.py", "topksoftmax.py", "zeros.py", + "2dmrope.py", + "3dmrope.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop/ops/2dmrope/2dmrope.h b/src/infiniop/ops/2dmrope/2dmrope.h new file mode 100644 index 000000000..24ff3d9e3 --- /dev/null +++ b/src/infiniop/ops/2dmrope/2dmrope.h @@ -0,0 +1,140 @@ +#ifndef __2DMROPE_H__ +#define __2DMROPE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::mrope2d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MRoPE2DInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + MRoPE2DInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t pos_desc, \ + infiniopTensorDescriptor_t sin_desc, \ + infiniopTensorDescriptor_t cos_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *pos_ids, \ + const void *sin_table, \ + const void *cos_table, \ + void *stream) const; \ + }; \ + } + +class MRoPE2DInfo { +private: + MRoPE2DInfo() = default; + +public: + infiniDtype_t data_type, pos_type; + size_t seqlen, nhead, dhead, table_len, table_dim; + ptrdiff_t + y_stride_seqlen, + y_stride_nhead, + x_stride_seqlen, + x_stride_nhead; + + static utils::Result createMRoPE2DInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t pos_desc, + infiniopTensorDescriptor_t sin_desc, + infiniopTensorDescriptor_t cos_desc) { + CHECK_OR_RETURN( + y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = y_desc->dtype(); + const infiniDtype_t pos_type = pos_desc->dtype(); + CHECK_OR_RETURN(data_type == x_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + // // sin_table and cos_table should be float32 for precision + // CHECK_OR_RETURN(sin_desc->dtype() == INFINI_DTYPE_F32 && cos_desc->dtype() == INFINI_DTYPE_F32, + // INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE_ANY_INT(pos_type); + + CHECK_OR_RETURN(y_desc->ndim() == 3 + && x_desc->ndim() == 3 + && pos_desc->ndim() == 2 + && sin_desc->ndim() == 2 + && cos_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + const auto nhead = y_desc->dim(0), + seqlen = y_desc->dim(1), + dhead = y_desc->dim(2), + table_len = sin_desc->dim(0), + table_dim = sin_desc->dim(1); + printf("y_desc->dim(0): %zu, y_desc->dim(1): %zu, y_desc->dim(2): %zu\n", y_desc->dim(0), y_desc->dim(1), y_desc->dim(2)); + printf("x_desc->dim(0): %zu, x_desc->dim(1): %zu, x_desc->dim(2): %zu\n", x_desc->dim(0), x_desc->dim(1), x_desc->dim(2)); + printf("pos_desc->dim(0): %zu, pos_desc->dim(1): %zu\n", pos_desc->dim(0), pos_desc->dim(1)); + printf("sin_desc->dim(0): %zu, sin_desc->dim(1): %zu\n", sin_desc->dim(0), sin_desc->dim(1)); + printf("cos_desc->dim(0): %zu, cos_desc->dim(1): %zu\n", cos_desc->dim(0), cos_desc->dim(1)); + printf("nhead: %zu, seqlen: %zu, dhead: %zu, table_len: %zu, table_dim: %zu\n", nhead, seqlen, dhead, table_len, table_dim); + + CHECK_OR_RETURN(nhead == x_desc->dim(0) + && seqlen == x_desc->dim(1) && seqlen == pos_desc->dim(0) + && dhead == x_desc->dim(2) + && table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1) + && pos_desc->dim(1) == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + CHECK_OR_RETURN(dhead == table_dim * 4, INFINI_STATUS_BAD_TENSOR_SHAPE); // 2D MRoPE: dhead = table_dim * 4 + // Last dimension of x and y must be contiguous + CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES); + // sin table and cos table must be totally contiguous + CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + // pos_ids must be contiguous + CHECK_OR_RETURN(pos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + return utils::Result(MRoPE2DInfo{ + data_type, + pos_type, + seqlen, + nhead, + dhead, + table_len, + table_dim, + y_desc->stride(1), + y_desc->stride(0), + x_desc->stride(1), + x_desc->stride(0), + }); + } +}; + +#endif diff --git a/src/infiniop/ops/2dmrope/cuda/mrope.cuh b/src/infiniop/ops/2dmrope/cuda/mrope.cuh new file mode 100644 index 000000000..24d8a324f --- /dev/null +++ b/src/infiniop/ops/2dmrope/cuda/mrope.cuh @@ -0,0 +1,42 @@ +template +static __device__ void padding( + Ta *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Ta const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tp const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table) { + + // n = gridDim.y + // nh_h = gridDim.x + int nh_l = blockDim.y, + dh_div_2 = blockDim.x, + it = blockIdx.y, + ih_h = blockIdx.x, + ih_l = threadIdx.y, + ih = ih_h * nh_l + ih_l, + i = threadIdx.x; + + // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组 + auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i; + auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2; + auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i; + auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2; + + // 获取位置索引 + // 2 维 mrope 的 w, h 维度均分 d_div_2,每个分到 d_div_2 / 2 + int id_h = i / (dh_div_2 / 2); // w, h 的维度索引 + int id_l = i % (dh_div_2 / 2); // w, h 维度内索引 + auto pos = pos_[it * 2 + id_h]; // 2 维 pos 的 shape: [it, 2], strides: [2, 1] + float sin = sin_table[pos * (dh_div_2 / 2) + id_l], + cos = cos_table[pos * (dh_div_2 / 2) + id_l], + a = x1[0], + b = x2[0]; + + // 应用旋转并写入 y + y1[0] = Ta(a * cos - b * sin); + y2[0] = Ta(a * sin + b * cos); +} diff --git a/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu new file mode 100644 index 000000000..2e0e58a1a --- /dev/null +++ b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu @@ -0,0 +1,166 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "2dmrope_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/mrope.cuh" + +namespace op::mrope2d::nvidia { + + struct Descriptor::Opaque { + std::shared_ptr internal; + }; + + Descriptor::~Descriptor() { + delete _opaque; + } + + infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t pos_desc, + infiniopTensorDescriptor_t sin_desc, + infiniopTensorDescriptor_t cos_desc) { + + auto handle_nvidia = reinterpret_cast(handle); + + auto info = MRoPE2DInfo::createMRoPE2DInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc); + CHECK_RESULT(info); + + // Create descriptor + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle_nvidia->internal()}, + handle_nvidia->device, + handle_nvidia->device_id); + return INFINI_STATUS_SUCCESS; + } + + template + __global__ void mrope2d_kernel( + Tdata *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Tdata const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tpos const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table, + int const dh_div_2) { + + padding( + y_, stride_token_y, stride_head_y, + x_, stride_token_x, stride_head_x, + pos_, sin_table, cos_table); + } + + template + __global__ void mrope2d_kernel( + Tdata *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Tdata const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tpos const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table) { + + padding( + y_, stride_token_y, stride_head_y, + x_, stride_token_x, stride_head_x, + pos_, sin_table, cos_table); + } + + template + infiniStatus_t calculateMRoPE2D(const MRoPE2DInfo &info, + int block_size, + Tdata *y, + const Tdata *x, + const Tpos *pos_ids, + const float *sin_table, + const float *cos_table, + cudaStream_t stream) { + auto dimy = uint32_t(info.seqlen), // grid.y = n + dimx = uint32_t(info.nhead); // grid.x = nh_h + int dh_div_2 = info.dhead / 2; + int nh_l = 1; // 每个 block 处理的 head 数量 + + // 注意:Rust 中的顺序是 (grid.y, grid.x), (block.y, block.x) + // 所以 CUDA 调用应该是 (grid.x, grid.y), (block.x, block.y) + dim3 gridDim(dimx, dimy); // (nh_h, n) + dim3 blockDim(dh_div_2, nh_l); // (dh_div_2, nh_l) + + mrope2d_kernel<<>>( + y, info.y_stride_seqlen, info.y_stride_nhead, + x, info.x_stride_seqlen, info.x_stride_nhead, + pos_ids, sin_table, cos_table); + + return INFINI_STATUS_SUCCESS; + } + +#define CALCULATE_MROPE2D(TDATA, TPOS) \ + calculateMRoPE2D(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (const TPOS *)pos_ids, \ + (const float *)sin_table, \ + (const float *)cos_table, \ + (cudaStream_t)stream) + +#define MROPE2D_TYPE(TDATA) \ + switch (_info.pos_type) { \ + case INFINI_DTYPE_U8: \ + return CALCULATE_MROPE2D(TDATA, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_MROPE2D(TDATA, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_MROPE2D(TDATA, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_MROPE2D(TDATA, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_MROPE2D(TDATA, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_MROPE2D(TDATA, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_MROPE2D(TDATA, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_MROPE2D(TDATA, int64_t); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *pos_ids, + const void *sin_table, + const void *cos_table, + void *stream) const { + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + MROPE2D_TYPE(half); + case INFINI_DTYPE_BF16: + MROPE2D_TYPE(cuda_bfloat16); + case INFINI_DTYPE_F32: + MROPE2D_TYPE(float); + case INFINI_DTYPE_F64: + MROPE2D_TYPE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + +#undef MROPE2D_TYPE +#undef CALCULATE_MROPE2D + +} diff --git a/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh new file mode 100644 index 000000000..2e14e3730 --- /dev/null +++ b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_2DMROPE_CUDA_H__ +#define __INFINIOP_2DMROPE_CUDA_H__ + +#include "../2dmrope.h" + +DESCRIPTOR(nvidia) + +#endif // __INFINIOP_2DMROPE_CUDA_H__ diff --git a/src/infiniop/ops/2dmrope/operator.cc b/src/infiniop/ops/2dmrope/operator.cc new file mode 100644 index 000000000..2b09a17e0 --- /dev/null +++ b/src/infiniop/ops/2dmrope/operator.cc @@ -0,0 +1,204 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/2dmrope.h" + +#ifdef ENABLE_CPU_API +#include "cpu/2dmrope_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/2dmrope_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/2dmrope_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/2dmrope_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/2dmrope_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/2dmrope_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/2dmrope_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMRoPE2DDescriptor( + infiniopHandle_t handle, + infiniopMRoPE2DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mrope2d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + pos_ids, \ + sin_table, \ + cos_table) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetMRoPE2DWorkspaceSize(infiniopMRoPE2DDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMRoPE2D( + infiniopMRoPE2DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *pos_ids, + const void *sin_table, + const void *cos_table, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t +infiniopDestroyMRoPE2DDescriptor(infiniopMRoPE2DDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef DELETE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/3dmrope/3dmrope.h b/src/infiniop/ops/3dmrope/3dmrope.h new file mode 100644 index 000000000..c39687c71 --- /dev/null +++ b/src/infiniop/ops/3dmrope/3dmrope.h @@ -0,0 +1,141 @@ +#ifndef __3DMROPE_H__ +#define __3DMROPE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::mrope3d::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + MRoPE3DInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + MRoPE3DInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t pos_desc, \ + infiniopTensorDescriptor_t sin_desc, \ + infiniopTensorDescriptor_t cos_desc, \ + infiniopTensorDescriptor_t rope_section_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + const void *pos_ids, \ + const void *sin_table, \ + const void *cos_table, \ + const void *rope_section, \ + void *stream) const; \ + }; \ + } + +class MRoPE3DInfo { +private: + MRoPE3DInfo() = default; + +public: + infiniDtype_t data_type, pos_type, section_type; + size_t seqlen, nhead, dhead, table_len, table_dim; + ptrdiff_t + y_stride_seqlen, + y_stride_nhead, + x_stride_seqlen, + x_stride_nhead; + + static utils::Result createMRoPE3DInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t pos_desc, + infiniopTensorDescriptor_t sin_desc, + infiniopTensorDescriptor_t cos_desc, + infiniopTensorDescriptor_t rope_section_desc) { + CHECK_OR_RETURN( + y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr && rope_section_desc != nullptr, + INFINI_STATUS_NULL_POINTER); + + const infiniDtype_t data_type = y_desc->dtype(); + const infiniDtype_t pos_type = pos_desc->dtype(); + const infiniDtype_t section_type = rope_section_desc->dtype(); + CHECK_OR_RETURN(data_type == x_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + // // sin_table and cos_table should be float32 for precision + // CHECK_OR_RETURN(sin_desc->dtype() == INFINI_DTYPE_F32 && cos_desc->dtype() == INFINI_DTYPE_F32, + // INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(), + INFINI_STATUS_BAD_TENSOR_DTYPE); + CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE_ANY_INT(pos_type); + CHECK_DTYPE_ANY_INT(section_type); + + CHECK_OR_RETURN(y_desc->ndim() == 3 + && x_desc->ndim() == 3 + && pos_desc->ndim() == 2 + && sin_desc->ndim() == 2 + && cos_desc->ndim() == 2 + && rope_section_desc->ndim() == 1, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + const auto nhead = y_desc->dim(0), + seqlen = y_desc->dim(1), + dhead = y_desc->dim(2), + table_len = sin_desc->dim(0), + table_dim = sin_desc->dim(1); + + CHECK_OR_RETURN(nhead == x_desc->dim(0) + && seqlen == x_desc->dim(1) && seqlen == pos_desc->dim(0) + && dhead == x_desc->dim(2) + && table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1) + && pos_desc->dim(1) == 3 + && rope_section_desc->dim(0) == 3, + INFINI_STATUS_BAD_TENSOR_SHAPE); + + CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE); + // Last dimension of x and y must be contiguous + CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES); + // sin table and cos table must be totally contiguous + CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + // pos_ids and rope_section must be contiguous + CHECK_OR_RETURN(pos_desc->isContiguous() && rope_section_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES); + + return utils::Result(MRoPE3DInfo{ + data_type, + pos_type, + section_type, + seqlen, + nhead, + dhead, + table_len, + table_dim, + y_desc->stride(1), + y_desc->stride(0), + x_desc->stride(1), + x_desc->stride(0), + }); + } +}; + +#endif diff --git a/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh b/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh new file mode 100644 index 000000000..82f352f08 --- /dev/null +++ b/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh @@ -0,0 +1,50 @@ +template +static __device__ void padding( + Ta *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Ta const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tp const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table, + Tp const *__restrict__ rope_section_ +) { + + // n = gridDim.y + // nh_h = gridDim.x + int nh_l = blockDim.y, + dh_div_2 = blockDim.x, + it = blockIdx.y, + ih_h = blockIdx.x, + ih_l = threadIdx.y, + ih = ih_h * nh_l + ih_l, + i = threadIdx.x; + + // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组 + auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i; + auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2; + auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i; + auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2; + + // 寻找 i 在 rope_section 中的位置 + int thw = 0; + for (int j = 0; j < 3; j++) { + if (i < rope_section_[j]) { + thw = j; + break; + } + } + + // 获取位置索引 + auto pos = pos_[it * 3 + thw]; // 3 维 pos 的 shape: [n, 3], strides: [3, 1] + float sin = sin_table[pos * dh_div_2 + i], + cos = cos_table[pos * dh_div_2 + i], + a = x1[0], + b = x2[0]; + + // 应用旋转并写入 y + y1[0] = Ta(a * cos - b * sin); + y2[0] = Ta(a * sin + b * cos); +} diff --git a/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh b/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh new file mode 100644 index 000000000..7b9952e39 --- /dev/null +++ b/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh @@ -0,0 +1,48 @@ +// 优化版本的3D MRoPE CUDA内核 +template +static __device__ void padding( + Ta *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Ta const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tp const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table, + Tp const *__restrict__ rope_section_) { + + // n = gridDim.y + // nh_h = gridDim.x + int nh_l = blockDim.y, + dh_div_2 = blockDim.x, + it = blockIdx.y, + ih_h = blockIdx.x, + ih_l = threadIdx.y, + ih = ih_h * nh_l + ih_l, + i = threadIdx.x; + + // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组 + auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i; + auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2; + auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i; + auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2; + + // 优化版本:使用条件表达式替代循环 + // rope_section 是累积值,例如 [32, 64, 96] 对应原始的 [32, 32, 32] + // i 范围是 [0, dh_div_2),即 [0, 96) + // 逻辑:找到第一个 rope_section_[j] > i 的 j + int thw = (i < rope_section_[0]) ? 0 : + (i < rope_section_[1]) ? 1 : 2; + + // 获取位置索引 + auto pos = pos_[it * 3 + thw]; // 3 维 pos 的 shape: [n, 3], strides: [3, 1] + float sin = sin_table[pos * dh_div_2 + i], + cos = cos_table[pos * dh_div_2 + i], + a = x1[0], + b = x2[0]; + + // 应用旋转并写入 y + y1[0] = Ta(a * cos - b * sin); + y2[0] = Ta(a * sin + b * cos); +} diff --git a/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu new file mode 100644 index 000000000..e97a33bfb --- /dev/null +++ b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu @@ -0,0 +1,172 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "3dmrope_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/mrope_3d_optimized.cuh" + +namespace op::mrope3d::nvidia { + + struct Descriptor::Opaque { + std::shared_ptr internal; + }; + + Descriptor::~Descriptor() { + delete _opaque; + } + + infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t pos_desc, + infiniopTensorDescriptor_t sin_desc, + infiniopTensorDescriptor_t cos_desc, + infiniopTensorDescriptor_t rope_section_desc) { + + auto handle_nvidia = reinterpret_cast(handle); + + auto info = MRoPE3DInfo::createMRoPE3DInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, rope_section_desc); + CHECK_RESULT(info); + + // Create descriptor + *desc_ptr = new Descriptor( + info.take(), + 0, + new Opaque{handle_nvidia->internal()}, + handle_nvidia->device, + handle_nvidia->device_id); + return INFINI_STATUS_SUCCESS; + } + + template + __global__ void mrope3d_kernel( + Tdata *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Tdata const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tpos const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table, + Tpos const *__restrict__ rope_section_, + int const dh_div_2) { + + padding( + y_, stride_token_y, stride_head_y, + x_, stride_token_x, stride_head_x, + pos_, sin_table, cos_table, rope_section_); + } + + template + __global__ void mrope3d_kernel( + Tdata *__restrict__ y_, + int const stride_token_y, + int const stride_head_y, + Tdata const *__restrict__ x_, + int const stride_token_x, + int const stride_head_x, + Tpos const *__restrict__ pos_, + float const *__restrict__ sin_table, + float const *__restrict__ cos_table, + Tpos const *__restrict__ rope_section_) { + + padding( + y_, stride_token_y, stride_head_y, + x_, stride_token_x, stride_head_x, + pos_, sin_table, cos_table, rope_section_); + } + + template + infiniStatus_t calculateMRoPE3D(const MRoPE3DInfo &info, + int block_size, + Tdata *y, + const Tdata *x, + const Tpos *pos_ids, + const float *sin_table, + const float *cos_table, + const Tpos *rope_section, + cudaStream_t stream) { + auto dimy = uint32_t(info.seqlen), // grid.y = n + dimx = uint32_t(info.nhead); // grid.x = nh_h + int dh_div_2 = info.dhead / 2; + int nh_l = 1; // 每个 block 处理的 head 数量 + + // 注意:Rust 中的顺序是 (grid.y, grid.x), (block.y, block.x) + // 所以 CUDA 调用应该是 (grid.x, grid.y), (block.x, block.y) + dim3 gridDim(dimx, dimy); // (nh_h, n) + dim3 blockDim(dh_div_2, nh_l); // (dh_div_2, nh_l) + + mrope3d_kernel<<>>( + y, info.y_stride_seqlen, info.y_stride_nhead, + x, info.x_stride_seqlen, info.x_stride_nhead, + pos_ids, sin_table, cos_table, rope_section); + + return INFINI_STATUS_SUCCESS; + } + +#define CALCULATE_MROPE3D(TDATA, TPOS) \ + calculateMRoPE3D(_info, \ + _opaque->internal->maxThreadsPerBlock(), \ + (TDATA *)y, \ + (const TDATA *)x, \ + (const TPOS *)pos_ids, \ + (const float *)sin_table, \ + (const float *)cos_table, \ + (const TPOS *)rope_section, \ + (cudaStream_t)stream) + +#define MROPE3D_TYPE(TDATA) \ + switch (_info.pos_type) { \ + case INFINI_DTYPE_U8: \ + return CALCULATE_MROPE3D(TDATA, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_MROPE3D(TDATA, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_MROPE3D(TDATA, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_MROPE3D(TDATA, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_MROPE3D(TDATA, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_MROPE3D(TDATA, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_MROPE3D(TDATA, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_MROPE3D(TDATA, int64_t); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *pos_ids, + const void *sin_table, + const void *cos_table, + const void *rope_section, + void *stream) const { + + switch (_info.data_type) { + case INFINI_DTYPE_F16: + MROPE3D_TYPE(half); + case INFINI_DTYPE_BF16: + MROPE3D_TYPE(cuda_bfloat16); + case INFINI_DTYPE_F32: + MROPE3D_TYPE(float); + case INFINI_DTYPE_F64: + MROPE3D_TYPE(double); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + +#undef MROPE3D_TYPE +#undef CALCULATE_MROPE3D + +} diff --git a/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh new file mode 100644 index 000000000..fb070abe8 --- /dev/null +++ b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __INFINIOP_3DMROPE_CUDA_H__ +#define __INFINIOP_3DMROPE_CUDA_H__ + +#include "../3dmrope.h" + +DESCRIPTOR(nvidia) + +#endif // __INFINIOP_3DMROPE_CUDA_H__ diff --git a/src/infiniop/ops/3dmrope/operator.cc b/src/infiniop/ops/3dmrope/operator.cc new file mode 100644 index 000000000..5725556d5 --- /dev/null +++ b/src/infiniop/ops/3dmrope/operator.cc @@ -0,0 +1,207 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/3dmrope.h" + +#ifdef ENABLE_CPU_API +#include "cpu/3dmrope_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/3dmrope_nvidia.cuh" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/3dmrope_ascend.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/3dmrope_bang.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/3dmrope_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/3dmrope_kunlun.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/3dmrope_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMRoPE3DDescriptor( + infiniopHandle_t handle, + infiniopMRoPE3DDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t pos_ids, + infiniopTensorDescriptor_t sin_table, + infiniopTensorDescriptor_t cos_table, + infiniopTensorDescriptor_t rope_section) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mrope3d::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y, \ + x, \ + pos_ids, \ + sin_table, \ + cos_table, \ + rope_section) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetMRoPE3DWorkspaceSize(infiniopMRoPE3DDescriptor_t desc, + size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMRoPE3D( + infiniopMRoPE3DDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + const void *pos_ids, + const void *sin_table, + const void *cos_table, + const void *rope_section, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, rope_section, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t +infiniopDestroyMRoPE3DDescriptor(infiniopMRoPE3DDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif + } + +#undef DELETE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/test/infiniop/2dmrope.py b/test/infiniop/2dmrope.py new file mode 100644 index 000000000..5f7b4417c --- /dev/null +++ b/test/infiniop/2dmrope.py @@ -0,0 +1,342 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceEnum, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration based on Qwen2-VL +# ============================================================================== + +# 使用真实的图像参数进行测试 +H, W, D_PATCH = 336, 476, 14 +HP = H // D_PATCH # 24 +WP = W // D_PATCH # 34 + +# 根据 pos_ids.rs 的实现,计算实际的序列长度 + + +def calculate_2d_seq_len(h, w, d_patch): + hp = h // d_patch + wp = w // d_patch + count = 0 + for y in range(0, hp, 2): + for x in range(0, wp, 2): + for dy in range(2): + for dx in range(2): + if y + dy < hp and x + dx < wp: + count += 1 + return count + + +ACTUAL_SEQ_LEN = calculate_2d_seq_len(H, W, D_PATCH) + +# 注意:根据 Qwen2-VL,形状是 [nhead, seqlen, dhead] +_TEST_CASES_ = [ + # (shape, x_strides, y_strides) - 形状:[nhead, seqlen, dhead] + # 2D MRoPE: dhead = table_dim * 4 + ((32, ACTUAL_SEQ_LEN, 128), None, None), # 大规模测试: 32头, 128维 + ((16, ACTUAL_SEQ_LEN, 64), None, None), # 中等规模: 16头, 64维 + ((8, ACTUAL_SEQ_LEN, 32), None, None), # 小规模: 8头, 32维 +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +# InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 8e-3, "rtol": 1e-4}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-9}, +} + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + # Inplace.INPLACE_X, # 先测试非原地操作 +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 1 +NUM_ITERATIONS = 3 + + +def generate_2d_pos_ids(h, w, d_patch, device, dtype=InfiniDtype.I32): + """Generate 2D position IDs according to pos_ids.rs implementation""" + hp = h // d_patch + wp = w // d_patch + pos = [] + + # Following the Rust implementation exactly + for y in range(0, hp, 2): + for x in range(0, wp, 2): + for dy in range(2): + for dx in range(2): + if y + dy < hp and x + dx < wp: + pos.append([y + dy, x + dx]) + + pos_ids = torch.tensor(pos, dtype=torch.int32) + return TestTensor.from_torch(pos_ids, dtype, device) + + +def rotate_half(x): + """Rotates half the hidden dims of the input. (Qwen2-VL style)""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision_reference(tensor, freqs): + """ + Reference implementation based on Qwen2-VL's apply_rotary_pos_emb_vision + tensor: [nhead, seqlen, dhead] + freqs: [seqlen, dhead//2] or similar + """ + orig_dtype = tensor.dtype + tensor = tensor.float() + + # freqs should contain the angles for rotation + cos = freqs.cos() # [seqlen, dhead//2] + sin = freqs.sin() # [seqlen, dhead//2] + + # Expand to match tensor dimensions + # cos/sin: [seqlen, dhead//2] -> [seqlen, dhead] + cos = cos.repeat(1, 2) # [seqlen, dhead] + sin = sin.repeat(1, 2) # [seqlen, dhead] + + # Add batch dimension for broadcasting with [nhead, seqlen, dhead] + cos = cos.unsqueeze(0) # [1, seqlen, dhead] + sin = sin.unsqueeze(0) # [1, seqlen, dhead] + + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output + + +def multimodal_rotary_embedding_2d_reference(ans, t, pos_ids, sin_table, cos_table, device): + """ + 2D MRoPE reference implementation based on Qwen2-VL style + t: [nhead, seqlen, dhead] + pos_ids: [seqlen, 2] - (h, w) positions + sin_table/cos_table: [max_pos, dhead//4] - table for each dimension + """ + nhead, seqlen, dhead = t.shape + dt = t.dtype + assert dhead % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE." + + dhead_div_2 = dhead // 2 + dhead_div_4 = dhead // 4 + + if device == InfiniDeviceEnum.CPU: + t = t.float() + sin_table = sin_table.float() + cos_table = cos_table.float() + + # Create frequency tensor for each position + # This mimics the freqs parameter in apply_rotary_pos_emb_vision + freqs = torch.zeros(seqlen, dhead_div_2, + dtype=torch.float32, device=t.device) + + for seq_idx in range(seqlen): + for i in range(dhead_div_2): + # 2 维 mrope 的 w, h 维度均分 dhead_div_2,每个分到 dhead_div_4 + dim_idx = i // dhead_div_4 # 0 for h, 1 for w + within_dim_idx = i % dhead_div_4 # index within dimension + + pos = pos_ids[seq_idx, dim_idx].item() + freqs[seq_idx, i] = torch.atan2( + sin_table[pos, within_dim_idx], + cos_table[pos, within_dim_idx] + ) + + # Apply rotary embedding using Qwen2-VL style + ans[:] = apply_rotary_pos_emb_vision_reference(t, freqs) + + +def sin_cos_table_2d(max_pos, dim, device, theta, dtype): + """Generate sin/cos table for 2D MRoPE""" + assert dim % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE." + dh_div_4 = dim // 4 + + # Create frequency for each dimension component + freqs = 1.0 / (theta ** (torch.arange(0, dh_div_4, 1).float() / dh_div_4)) + pos = torch.arange(0, max_pos, dtype=torch.float32) + angles = torch.outer(pos, freqs) + + return ( + TestTensor.from_torch(torch.sin(angles), InfiniDtype.F32, device), + TestTensor.from_torch(torch.cos(angles), InfiniDtype.F32, device), + ) + + +def test( + handle, + device, + shape, + x_strides=None, + y_strides=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float32, + sync=None, +): + nhead, seqlen, dhead = shape + + # For 2D MRoPE, dh must be divisible by 4 + if dhead % 4 != 0: + return + + print( + f"Testing 2D MRoPE (Qwen2-VL style) on {InfiniDeviceNames[device]} with shape:[{nhead}, {seqlen}, {dhead}] dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(shape, x_strides, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_strides != y_strides: + return + y = x + else: + y = TestTensor(shape, y_strides, dtype, device) + + # Generate 2D position IDs using real parameters + h, w, d_patch = H, W, D_PATCH + pos_ids = generate_2d_pos_ids(h, w, d_patch, device) + + # Verify the sequence length matches + assert pos_ids.shape[0] == seqlen, f"pos_ids length {pos_ids.shape[0]} != seqlen {seqlen}" + + max_pos = pos_ids.torch_tensor().max().item() + 1 + + # Generate sin/cos tables + sin_table, cos_table = sin_cos_table_2d( + max_pos, dhead, device, 10000.0, dtype) + + # Compute reference result using Qwen2-VL style + multimodal_rotary_embedding_2d_reference( + y.torch_tensor(), + x.torch_tensor(), + pos_ids.torch_tensor(), + sin_table.torch_tensor(), + cos_table.torch_tensor(), + device, + ) + + # Create descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMRoPE2DDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + pos_ids.descriptor, + sin_table.descriptor, + cos_table.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, x, pos_ids, sin_table, cos_table]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMRoPE2DWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_mrope2d(): + check_error( + LIBINFINIOP.infiniopMRoPE2D( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + pos_ids.data(), + sin_table.data(), + cos_table.data(), + None, + ) + ) + + lib_mrope2d() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + print(f"Expected shape: {y.torch_tensor().shape}") + print(f"Actual shape: {y.actual_tensor().shape}") + print(f"pos_ids shape: {pos_ids.torch_tensor().shape}") + print(f"sin_table shape: {sin_table.torch_tensor().shape}") + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + success = torch.allclose( + y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + if success: + print("✅ Test PASSED!") + else: + print("❌ Test FAILED!") + if not DEBUG: + print("Run with --debug to see detailed comparison") + # Show a brief comparison + diff = torch.abs(y.actual_tensor() - y.torch_tensor()) + print(f"Max absolute difference: {diff.max().item():.6f}") + print(f"Mean absolute difference: {diff.mean().item():.6f}") + + assert success + + check_error(LIBINFINIOP.infiniopDestroyMRoPE2DDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + print(f"2D MRoPE Test Configuration (Qwen2-VL style):") + print(f" Image size: {H}x{W}, Patch size: {D_PATCH}") + print(f" Calculated sequence length: {ACTUAL_SEQ_LEN}") + print(f" Tensor shape format: [nhead, seqlen, dhead]") + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m2D MRoPE Test (Qwen2-VL style) passed!\033[0m") diff --git a/test/infiniop/2dmrope_fixed.py b/test/infiniop/2dmrope_fixed.py new file mode 100644 index 000000000..e3572bfe8 --- /dev/null +++ b/test/infiniop/2dmrope_fixed.py @@ -0,0 +1,307 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceEnum, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules + +# 使用真实的图像参数进行测试 +H, W, D_PATCH = 336, 476, 14 +HP = H // D_PATCH # 24 +WP = W // D_PATCH # 34 + +# 根据 pos_ids.rs 的实现,计算实际的序列长度 +# pos_ids.rs: ptr 从 0 开始,按 2x2 块遍历 +def calculate_2d_seq_len(h, w, d_patch): + hp = h // d_patch + wp = w // d_patch + count = 0 + for y in range(0, hp, 2): + for x in range(0, wp, 2): + for dy in range(2): + for dx in range(2): + if y + dy < hp and x + dx < wp: + count += 1 + return count + +ACTUAL_SEQ_LEN = calculate_2d_seq_len(H, W, D_PATCH) + +_TEST_CASES_ = [ + # (shape, x_strides, y_strides) - 使用实际计算的序列长度 + ((ACTUAL_SEQ_LEN, 32, 128), None, None), # 2D MRoPE: dhead = table_dim * 4, so 128 = 32 * 4 + ((ACTUAL_SEQ_LEN, 16, 64), None, None), # 64 = 16 * 4 +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-3}, +} + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def generate_2d_pos_ids(h, w, d_patch, device, dtype=InfiniDtype.I32): + """Generate 2D position IDs according to pos_ids.rs implementation""" + hp = h // d_patch + wp = w // d_patch + pos = [] + + # Following the Rust implementation exactly + for y in range(0, hp, 2): + for x in range(0, wp, 2): + for dy in range(2): + for dx in range(2): + if y + dy < hp and x + dx < wp: + pos.append([y + dy, x + dx]) + + pos_ids = torch.tensor(pos, dtype=torch.int32) + return TestTensor.from_torch(pos_ids, dtype, device) + + +def multimodal_rotary_embedding_2d(ans, t, pos_ids, sin, cos, device): + """ + 2D MRoPE implementation for reference + pos_ids shape: [seq_len, 2] - (h, w) positions + sin/cos shape: [max_pos, dh//4] - table for each dimension + """ + seq_len, n_head, dh = t.shape + dt = t.dtype + assert dh % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE." + + dh_div_4 = dh // 4 + dh_div_2 = dh // 2 + + if device == InfiniDeviceEnum.CPU: + t = t.float() + sin = sin.float() + cos = cos.float() + + # Apply rotation based on your .cuh implementation + for seq_idx in range(seq_len): + for head_idx in range(n_head): + for i in range(dh_div_2): + # 2 维 mrope 的 w, h 维度均分 d_div_2,每个分到 d_div_2 / 2 + id_h = i // (dh_div_2 // 2) # w, h 的维度索引 + id_l = i % (dh_div_2 // 2) # w, h 维度内索引 + pos = pos_ids[seq_idx, id_h].item() # 2 维 pos 的 shape: [seq_len, 2], strides: [2, 1] + + sin_val = sin[pos, id_l].item() + cos_val = cos[pos, id_l].item() + + # Apply rotation + a = t[seq_idx, head_idx, i].item() + b = t[seq_idx, head_idx, i + dh_div_2].item() + + ans[seq_idx, head_idx, i] = (a * cos_val - b * sin_val) + ans[seq_idx, head_idx, i + dh_div_2] = (a * sin_val + b * cos_val) + + if device == InfiniDeviceEnum.CPU: + ans = ans.to(dt) + + +def sin_cos_table_2d(max_pos, dim, device, theta, dtype): + """Generate sin/cos table for 2D MRoPE""" + assert dim % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE." + dh_div_4 = dim // 4 + + # Create frequency for each dimension component + freqs = 1.0 / (theta ** (torch.arange(0, dh_div_4, 1).float() / dh_div_4)) + pos = torch.arange(0, max_pos, dtype=torch.float32) + angles = torch.outer(pos, freqs) + + return ( + TestTensor.from_torch(torch.sin(angles), dtype, device), + TestTensor.from_torch(torch.cos(angles), dtype, device), + ) + + +def test( + handle, + device, + shape, + x_strides=None, + y_strides=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float32, + sync=None, +): + seq_len, n_head, dh = shape + + # For 2D MRoPE, dh must be divisible by 4 + if dh % 4 != 0: + return + + print( + f"Testing 2D MRoPE on {InfiniDeviceNames[device]} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + x = TestTensor(shape, x_strides, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_strides != y_strides: + return + y = x + else: + y = TestTensor(shape, y_strides, dtype, device) + + # Generate 2D position IDs using real parameters + h, w, d_patch = H, W, D_PATCH + pos_ids = generate_2d_pos_ids(h, w, d_patch, device) + + # Verify the sequence length matches + assert pos_ids.shape[0] == seq_len, f"pos_ids length {pos_ids.shape[0]} != seq_len {seq_len}" + + max_pos = pos_ids.torch_tensor().max().item() + 1 + + # Generate sin/cos tables + sin_table, cos_table = sin_cos_table_2d(max_pos, dh, device, 10000.0, dtype) + + # Compute reference result + multimodal_rotary_embedding_2d( + y.torch_tensor(), + x.torch_tensor(), + pos_ids.torch_tensor(), + sin_table.torch_tensor(), + cos_table.torch_tensor(), + device, + ) + + # Create descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMRoPE2DDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + pos_ids.descriptor, + sin_table.descriptor, + cos_table.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, x, pos_ids, sin_table, cos_table]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMRoPE2DWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_mrope2d(): + check_error( + LIBINFINIOP.infiniopMRoPE2D( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + pos_ids.data(), + sin_table.data(), + cos_table.data(), + None, + ) + ) + + lib_mrope2d() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: multimodal_rotary_embedding_2d( + y.torch_tensor(), + x.torch_tensor(), + pos_ids.torch_tensor(), + sin_table.torch_tensor(), + cos_table.torch_tensor(), + device, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + "InfiniOP", + lib_mrope2d, + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + check_error(LIBINFINIOP.infiniopDestroyMRoPE2DDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + print(f"2D MRoPE Test Configuration:") + print(f" Image size: {H}x{W}, Patch size: {D_PATCH}") + print(f" Calculated sequence length: {ACTUAL_SEQ_LEN}") + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m2D MRoPE Test passed!\033[0m") diff --git a/test/infiniop/3dmrope.py b/test/infiniop/3dmrope.py new file mode 100644 index 000000000..db0630de3 --- /dev/null +++ b/test/infiniop/3dmrope.py @@ -0,0 +1,406 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceEnum, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration based on Qwen2-VL for 3D MRoPE +# ============================================================================== + +# 3D parameters for video/temporal modeling +T_DIM = 3 +H_IMG = 224 +W_IMG = 224 +D_PATCH = 14 +PRE_TEXT_LEN = 4 +POST_TEXT_LEN = 5 + + +def calculate_3d_seq_len(t, h, w, d_patch, pre_text_len, post_text_len): + """Calculate sequence length according to 3D pos_ids.rs implementation""" + spatial_merge_size = 2 + t_len = t + h_len = h // d_patch // spatial_merge_size + w_len = w // d_patch // spatial_merge_size + vision_len = t_len * h_len * w_len + total_len = pre_text_len + vision_len + post_text_len + return total_len + + +ACTUAL_SEQ_LEN = calculate_3d_seq_len( + T_DIM, H_IMG, W_IMG, D_PATCH, PRE_TEXT_LEN, POST_TEXT_LEN) + +# 注意:根据 Qwen2-VL,形状是 [nhead, seqlen, dhead] +# 3D MRoPE: dhead = table_dim * 2 +_TEST_CASES_ = [ + # (shape, x_strides, y_strides) - 形状:[nhead, seqlen, dhead] + # 3D MRoPE: dhead = table_dim * 2 + # ((32, ACTUAL_SEQ_LEN, 128), None, None), # 大规模测试: 32头, 128维 + # ((16, ACTUAL_SEQ_LEN, 64), None, None), # 中等规模: 16头, 64维 + ((8, ACTUAL_SEQ_LEN, 32), None, None), # 小规模: 8头, 32维 +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +# Tolerance map for different data types (stricter for testing optimization) +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 5e-4, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 5e-4, "rtol": 1e-5}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-9}, +} + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + # Inplace.INPLACE_X, # 先测试非原地操作 +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +DEBUG = False +PROFILE = False +NUM_PRERUN = 1 +NUM_ITERATIONS = 3 + + +def generate_3d_pos_ids(t, h, w, d_patch, pre_text_len, post_text_len, device, dtype=InfiniDtype.I32): + """Generate 3D position IDs according to pos_ids.rs implementation""" + spatial_merge_size = 2 + t_len = t + h_len = h // d_patch // spatial_merge_size + w_len = w // d_patch // spatial_merge_size + vision_len = t_len * h_len * w_len + total_len = pre_text_len + vision_len + post_text_len + + pos = [] + idx = 0 + + # 图像前文本 + for i in range(pre_text_len): + pos.append([i, i, i]) + idx += 1 + + # 图像 + img_start_pos = pre_text_len + for t_idx in range(t_len): + for h_idx in range(h_len): + for w_idx in range(w_len): + t_pos = img_start_pos + t_idx + h_pos = img_start_pos + h_idx + w_pos = img_start_pos + w_idx + pos.append([t_pos, h_pos, w_pos]) + idx += 1 + + # 图像后文本 + t_max_pos = img_start_pos + t_len - 1 + h_max_pos = img_start_pos + h_len - 1 + w_max_pos = img_start_pos + w_len - 1 + image_max_pos = max(t_max_pos, h_max_pos, w_max_pos) + text_start_pos = image_max_pos + 1 + for i in range(post_text_len): + pos_val = text_start_pos + i + pos.append([pos_val, pos_val, pos_val]) + idx += 1 + + assert idx == total_len + pos_ids = torch.tensor(pos, dtype=torch.int32) + return TestTensor.from_torch(pos_ids, dtype, device) + + +def rotate_half(x): + """Rotates half the hidden dims of the input. (Qwen2-VL style)""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision_reference(tensor, freqs): + """ + Reference implementation based on Qwen2-VL's apply_rotary_pos_emb_vision + tensor: [nhead, seqlen, dhead] + freqs: [seqlen, dhead//2] or similar + """ + orig_dtype = tensor.dtype + tensor = tensor.float() + + # freqs should contain the angles for rotation + cos = freqs.cos() # [seqlen, dhead//2] + sin = freqs.sin() # [seqlen, dhead//2] + + # Expand to match tensor dimensions + # cos/sin: [seqlen, dhead//2] -> [seqlen, dhead] + cos = cos.repeat(1, 2) # [seqlen, dhead] + sin = sin.repeat(1, 2) # [seqlen, dhead] + + # Add batch dimension for broadcasting with [nhead, seqlen, dhead] + cos = cos.unsqueeze(0) # [1, seqlen, dhead] + sin = sin.unsqueeze(0) # [1, seqlen, dhead] + + output = (tensor * cos) + (rotate_half(tensor) * sin) + output = output.to(orig_dtype) + return output + + +def multimodal_rotary_embedding_3d_reference(ans, t, pos_ids, sin_table, cos_table, rope_section, device): + """ + 3D MRoPE reference implementation based on Qwen2-VL style + t: [nhead, seqlen, dhead] + pos_ids: [seqlen, 3] - (t, h, w) positions + sin_table/cos_table: [max_pos, dhead//2] - table for all dimensions + rope_section: [3] - section boundaries for t, h, w dimensions + """ + nhead, seqlen, dhead = t.shape + dt = t.dtype + assert dhead % 2 == 0, "Embedding dimension must be divisible by 2 for 3D MRoPE." + + dhead_div_2 = dhead // 2 + + if device == InfiniDeviceEnum.CPU: + t = t.float() + sin_table = sin_table.float() + cos_table = cos_table.float() + + # Process each sequence position and head directly (matching CUDA kernel logic) + for seq_idx in range(seqlen): + for head_idx in range(nhead): + for i in range(dhead_div_2): + # Find i in rope_section (matching CUDA kernel logic) + thw = 0 + for j in range(3): + if i < rope_section[j].item(): + thw = j + break + + # Get position index + pos = pos_ids[seq_idx, thw].item() + + sin_val = sin_table[pos, i].item() + cos_val = cos_table[pos, i].item() + + a = t[head_idx, seq_idx, i].item() + b = t[head_idx, seq_idx, i + dhead_div_2].item() + + ans[head_idx, seq_idx, i] = a * cos_val - b * sin_val + ans[head_idx, seq_idx, i + dhead_div_2] = a * \ + sin_val + b * cos_val + + +def sin_cos_table_3d(max_pos, dim, device, theta, dtype): + """Generate sin/cos table for 3D MRoPE""" + assert dim % 2 == 0, "Embedding dimension must be divisible by 2 for 3D MRoPE." + dh_div_2 = dim // 2 + + # Create frequency for all dimensions (unified table) + freqs = 1.0 / (theta ** (torch.arange(0, dh_div_2, 1).float() / dh_div_2)) + pos = torch.arange(0, max_pos, dtype=torch.float32) + angles = torch.outer(pos, freqs) + + return ( + TestTensor.from_torch(torch.sin(angles), InfiniDtype.F32, device), + TestTensor.from_torch(torch.cos(angles), InfiniDtype.F32, device), + ) + + +def test( + handle, + device, + shape, + x_strides=None, + y_strides=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float32, + sync=None, +): + nhead, seqlen, dhead = shape + + # For 3D MRoPE, dh must be divisible by 2 + if dhead % 2 != 0: + return + + print( + f"Testing 3D MRoPE (Qwen2-VL style) on {InfiniDeviceNames[device]} with shape:[{nhead}, {seqlen}, {dhead}] dtype:{InfiniDtypeNames[dtype]}" + ) + + x = TestTensor(shape, x_strides, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_strides != y_strides: + return + y = x + else: + y = TestTensor(shape, y_strides, dtype, device) + + # Generate 3D position IDs using real parameters + t_dim, h_dim, w_dim, d_patch = T_DIM, H_IMG, W_IMG, D_PATCH + pre_text_len, post_text_len = PRE_TEXT_LEN, POST_TEXT_LEN + + pos_ids = generate_3d_pos_ids( + t_dim, h_dim, w_dim, d_patch, pre_text_len, post_text_len, device) + + # Verify the sequence length matches + assert pos_ids.shape[0] == seqlen, f"pos_ids length {pos_ids.shape[0]} != seqlen {seqlen}" + + # Calculate max_pos_val for sin/cos table + spatial_merge_size = 2 + t_len = t_dim + h_len = h_dim // d_patch // spatial_merge_size + w_len = w_dim // d_patch // spatial_merge_size + img_start_pos = pre_text_len + t_max_pos = img_start_pos + t_len - 1 + h_max_pos = img_start_pos + h_len - 1 + w_max_pos = img_start_pos + w_len - 1 + image_max_pos = max(t_max_pos, h_max_pos, w_max_pos) + text_start_pos = image_max_pos + 1 + max_pos_val = text_start_pos + post_text_len + + # Generate sin/cos tables + sin_table, cos_table = sin_cos_table_3d( + max_pos_val, dhead, device, 10000.0, dtype) + + # rope_section represents the accumulated dimensions for t, h, w + # For example, if dhead=32, we might split as [10, 11, 11] -> accumulated [10, 21, 32] + dhead_div_2 = dhead // 2 + section_t = dhead_div_2 // 3 + section_h = dhead_div_2 // 3 + section_w = dhead_div_2 - section_t - section_h # remainder + rope_section = TestTensor.from_torch( + torch.tensor([section_t, section_t + section_h, + dhead_div_2], dtype=torch.int32), + InfiniDtype.I32, + device + ) + + # Compute reference result using Qwen2-VL style + multimodal_rotary_embedding_3d_reference( + y.torch_tensor(), + x.torch_tensor(), + pos_ids.torch_tensor(), + sin_table.torch_tensor(), + cos_table.torch_tensor(), + rope_section.torch_tensor(), + device, + ) + + # Create descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMRoPE3DDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + pos_ids.descriptor, + sin_table.descriptor, + cos_table.descriptor, + rope_section.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, x, pos_ids, sin_table, cos_table, rope_section]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMRoPE3DWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_mrope3d(): + check_error( + LIBINFINIOP.infiniopMRoPE3D( + descriptor, + workspace.data(), + workspace_size.value, + y.data(), + x.data(), + pos_ids.data(), + sin_table.data(), + cos_table.data(), + rope_section.data(), + None, + ) + ) + + lib_mrope3d() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + print(f"Expected shape: {y.torch_tensor().shape}") + print(f"Actual shape: {y.actual_tensor().shape}") + print(f"pos_ids shape: {pos_ids.torch_tensor().shape}") + print(f"sin_table shape: {sin_table.torch_tensor().shape}") + print(f"rope_section shape: {rope_section.torch_tensor().shape}") + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + success = torch.allclose( + y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + if success: + print("✅ Test PASSED!") + else: + print("❌ Test FAILED!") + if not DEBUG: + print("Run with --debug to see detailed comparison") + # Show a brief comparison + diff = torch.abs(y.actual_tensor() - y.torch_tensor()) + print(f"Max absolute difference: {diff.max().item():.6f}") + print(f"Mean absolute difference: {diff.mean().item():.6f}") + + assert success + + check_error(LIBINFINIOP.infiniopDestroyMRoPE3DDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + print(f"3D MRoPE Test Configuration (Qwen2-VL style):") + print( + f" Video/Image size: {T_DIM}x{H_IMG}x{W_IMG}, Patch size: {D_PATCH}") + print( + f" Pre-text length: {PRE_TEXT_LEN}, Post-text length: {POST_TEXT_LEN}") + print(f" Calculated sequence length: {ACTUAL_SEQ_LEN}") + print(f" Tensor shape format: [nhead, seqlen, dhead]") + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m3D MRoPE Test (Qwen2-VL style) passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 283bdb1cd..fa8044629 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -458,6 +458,82 @@ def rope_(lib): infiniopOperatorDescriptor_t, ] +@OpRegister.operator +def mrope2d_(lib): + lib.infiniopCreateMRoPE2DDescriptor.restype = c_int32 + lib.infiniopCreateMRoPE2DDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMRoPE2DWorkspaceSize.restype = c_int32 + lib.infiniopGetMRoPE2DWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMRoPE2D.restype = c_int32 + lib.infiniopMRoPE2D.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMRoPE2DDescriptor.restype = c_int32 + lib.infiniopDestroyMRoPE2DDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def mrope3d_(lib): + lib.infiniopCreateMRoPE3DDescriptor.restype = c_int32 + lib.infiniopCreateMRoPE3DDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMRoPE3DWorkspaceSize.restype = c_int32 + lib.infiniopGetMRoPE3DWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMRoPE3D.restype = c_int32 + lib.infiniopMRoPE3D.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMRoPE3DDescriptor.restype = c_int32 + lib.infiniopDestroyMRoPE3DDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + @OpRegister.operator def sub_(lib):