diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..8c47255eb 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -2,6 +2,8 @@
 #define __INFINIOP_API_H__
 
 #include "infiniop/handle.h"
+#include "infiniop/ops/2dmrope.h"
+#include "infiniop/ops/3dmrope.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/add_rms_norm.h"
 #include "infiniop/ops/attention.h"
diff --git a/include/infiniop/ops/2dmrope.h b/include/infiniop/ops/2dmrope.h
new file mode 100644
index 000000000..78fa6a22c
--- /dev/null
+++ b/include/infiniop/ops/2dmrope.h
@@ -0,0 +1,32 @@
+#ifndef __INFINIOP_2DMROPE_API_H__
+#define __INFINIOP_2DMROPE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopMRoPE2DDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateMRoPE2DDescriptor(
+    infiniopHandle_t handle,
+    infiniopMRoPE2DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t pos_ids,
+    infiniopTensorDescriptor_t sin_table,
+    infiniopTensorDescriptor_t cos_table);
+
+__C __export infiniStatus_t infiniopGetMRoPE2DWorkspaceSize(infiniopMRoPE2DDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopMRoPE2D(
+    infiniopMRoPE2DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void const *pos_ids,
+    void const *sin_table,
+    void const *cos_table,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyMRoPE2DDescriptor(infiniopMRoPE2DDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/3dmrope.h b/include/infiniop/ops/3dmrope.h
new file mode 100644
index 000000000..954f3e1a0
--- /dev/null
+++ b/include/infiniop/ops/3dmrope.h
@@ -0,0 +1,34 @@
+#ifndef __INFINIOP_3DMROPE_API_H__
+#define __INFINIOP_3DMROPE_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopMRoPE3DDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateMRoPE3DDescriptor(
+    infiniopHandle_t handle,
+    infiniopMRoPE3DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t pos_ids,
+    infiniopTensorDescriptor_t sin_table,
+    infiniopTensorDescriptor_t cos_table,
+    infiniopTensorDescriptor_t rope_section);
+
+__C __export infiniStatus_t infiniopGetMRoPE3DWorkspaceSize(infiniopMRoPE3DDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopMRoPE3D(
+    infiniopMRoPE3DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void const *pos_ids,
+    void const *sin_table,
+    void const *cos_table,
+    void const *rope_section,
+    void *stream);
+
+__C __export infiniStatus_t infiniopDestroyMRoPE3DDescriptor(infiniopMRoPE3DDescriptor_t desc);
+
+#endif
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 06af369ef..6ca352af8 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -39,6 +39,8 @@ def run_tests(args):
         "topkrouter.py",
         "topksoftmax.py",
         "zeros.py",
+        "2dmrope.py",
+        "3dmrope.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infiniop/ops/2dmrope/2dmrope.h b/src/infiniop/ops/2dmrope/2dmrope.h
new file mode 100644
index 000000000..24ff3d9e3
--- /dev/null
+++ b/src/infiniop/ops/2dmrope/2dmrope.h
@@ -0,0 +1,140 @@
+#ifndef __2DMROPE_H__
+#define __2DMROPE_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <cstdio>
+
+#define DESCRIPTOR(NAMESPACE)                                    \
+                                                                 \
+    namespace op::mrope2d::NAMESPACE {                           \
+    class Descriptor final : public InfiniopDescriptor {         \
+        struct Opaque;                                           \
+        Opaque *_opaque;                                         \
+        MRoPE2DInfo _info;                                       \
+        size_t _workspace_size;                                  \
+                                                                 \
+        Descriptor(                                              \
+            MRoPE2DInfo info,                                    \
+            size_t workspace_size_,                              \
+            Opaque *opaque,                                      \
+            infiniDevice_t device_type,                          \
+            int device_id)                                       \
+            : InfiniopDescriptor{device_type, device_id},        \
+              _opaque(opaque),                                   \
+              _info(info),                                       \
+              _workspace_size(workspace_size_) {}                \
+                                                                 \
+    public:                                                      \
+        ~Descriptor();                                           \
+                                                                 \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                                 \
+        static infiniStatus_t create(                            \
+            infiniopHandle_t handle,                             \
+            Descriptor **desc_ptr,                               \
+            infiniopTensorDescriptor_t y_desc,                   \
+            infiniopTensorDescriptor_t x_desc,                   \
+            infiniopTensorDescriptor_t pos_desc,                 \
+            infiniopTensorDescriptor_t sin_desc,                 \
+            infiniopTensorDescriptor_t cos_desc);                \
+                                                                 \
+        infiniStatus_t calculate(                                \
+            void *workspace,                                     \
+            size_t workspace_size,                               \
+            void *y,                                             \
+            const void *x,                                       \
+            const void *pos_ids,                                 \
+            const void *sin_table,                               \
+            const void *cos_table,                               \
+            void *stream) const;                                 \
+    };                                                           \
+    }
+
+class MRoPE2DInfo {
+private:
+    MRoPE2DInfo() = default;
+
+public:
+    infiniDtype_t data_type, pos_type;
+    size_t seqlen, nhead, dhead, table_len, table_dim;
+    ptrdiff_t
+        y_stride_seqlen,
+        y_stride_nhead,
+        x_stride_seqlen,
+        x_stride_nhead;
+
+    static utils::Result<MRoPE2DInfo> createMRoPE2DInfo(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t pos_desc,
+        infiniopTensorDescriptor_t sin_desc,
+        infiniopTensorDescriptor_t cos_desc) {
+        CHECK_OR_RETURN(
+            y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t data_type = y_desc->dtype();
+        const infiniDtype_t pos_type = pos_desc->dtype();
+        CHECK_OR_RETURN(data_type == x_desc->dtype(),
+                        INFINI_STATUS_BAD_TENSOR_DTYPE);
+        // // sin_table and cos_table should be float32 for precision
+        // CHECK_OR_RETURN(sin_desc->dtype() == INFINI_DTYPE_F32 && cos_desc->dtype() == INFINI_DTYPE_F32,
+        //                 INFINI_STATUS_BAD_TENSOR_DTYPE);
+        CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(),
+                        INFINI_STATUS_BAD_TENSOR_DTYPE);
+        CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+        CHECK_DTYPE_ANY_INT(pos_type);
+
+        CHECK_OR_RETURN(y_desc->ndim() == 3
+                            && x_desc->ndim() == 3
+                            && pos_desc->ndim() == 2
+                            && sin_desc->ndim() == 2
+                            && cos_desc->ndim() == 2,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        const auto nhead = y_desc->dim(0),
+                   seqlen = y_desc->dim(1),
+                   dhead = y_desc->dim(2),
+                   table_len = sin_desc->dim(0),
+                   table_dim = sin_desc->dim(1);
+        printf("y_desc->dim(0): %zu, y_desc->dim(1): %zu, y_desc->dim(2): %zu\n", y_desc->dim(0), y_desc->dim(1), y_desc->dim(2));
+        printf("x_desc->dim(0): %zu, x_desc->dim(1): %zu, x_desc->dim(2): %zu\n", x_desc->dim(0), x_desc->dim(1), x_desc->dim(2));
+        printf("pos_desc->dim(0): %zu, pos_desc->dim(1): %zu\n", pos_desc->dim(0), pos_desc->dim(1));
+        printf("sin_desc->dim(0): %zu, sin_desc->dim(1): %zu\n", sin_desc->dim(0), sin_desc->dim(1));
+        printf("cos_desc->dim(0): %zu, cos_desc->dim(1): %zu\n", cos_desc->dim(0), cos_desc->dim(1));
+        printf("nhead: %zu, seqlen: %zu, dhead: %zu, table_len: %zu, table_dim: %zu\n", nhead, seqlen, dhead, table_len, table_dim);
+
+        CHECK_OR_RETURN(nhead == x_desc->dim(0)
+                            && seqlen == x_desc->dim(1) && seqlen == pos_desc->dim(0)
+                            && dhead == x_desc->dim(2)
+                            && table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1)
+                            && pos_desc->dim(1) == 2,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        CHECK_OR_RETURN(dhead == table_dim * 4, INFINI_STATUS_BAD_TENSOR_SHAPE); // 2D MRoPE: dhead = table_dim * 4
+        // Last dimension of x and y must be contiguous
+        CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
+        // sin table and cos table must be totally contiguous
+        CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+        // pos_ids must be contiguous
+        CHECK_OR_RETURN(pos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+        return utils::Result<MRoPE2DInfo>(MRoPE2DInfo{
+            data_type,
+            pos_type,
+            seqlen,
+            nhead,
+            dhead,
+            table_len,
+            table_dim,
+            y_desc->stride(1),
+            y_desc->stride(0),
+            x_desc->stride(1),
+            x_desc->stride(0),
+        });
+    }
+};
+
+#endif
diff --git a/src/infiniop/ops/2dmrope/cuda/mrope.cuh b/src/infiniop/ops/2dmrope/cuda/mrope.cuh
new file mode 100644
index 000000000..24d8a324f
--- /dev/null
+++ b/src/infiniop/ops/2dmrope/cuda/mrope.cuh
@@ -0,0 +1,42 @@
+template <class Tp, class Ta>
+static __device__ void padding(
+    Ta *__restrict__ y_,
+    int const stride_token_y,
+    int const stride_head_y,
+    Ta const *__restrict__ x_,
+    int const stride_token_x,
+    int const stride_head_x,
+    Tp const *__restrict__ pos_,
+    float const *__restrict__ sin_table,
+    float const *__restrict__ cos_table) {
+
+    // n = gridDim.y
+    // nh_h = gridDim.x
+    int nh_l = blockDim.y,
+        dh_div_2 = blockDim.x,
+        it = blockIdx.y,
+        ih_h = blockIdx.x,
+        ih_l = threadIdx.y,
+        ih = ih_h * nh_l + ih_l,
+        i = threadIdx.x;
+
+    // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组
+    auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i;
+    auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2;
+    auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i;
+    auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2;
+
+    // 获取位置索引
+    // 2 维 mrope 的 w, h 维度均分 d_div_2，每个分到 d_div_2 / 2
+    int id_h = i / (dh_div_2 / 2);  // w, h 的维度索引
+    int id_l = i % (dh_div_2 / 2);  // w, h 维度内索引
+    auto pos = pos_[it * 2 + id_h]; // 2 维 pos 的 shape: [it, 2], strides: [2, 1]
+    float sin = sin_table[pos * (dh_div_2 / 2) + id_l],
+          cos = cos_table[pos * (dh_div_2 / 2) + id_l],
+          a = x1[0],
+          b = x2[0];
+
+    // 应用旋转并写入 y
+    y1[0] = Ta(a * cos - b * sin);
+    y2[0] = Ta(a * sin + b * cos);
+}
diff --git a/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu
new file mode 100644
index 000000000..2e0e58a1a
--- /dev/null
+++ b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cu
@@ -0,0 +1,166 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "2dmrope_nvidia.cuh"
+
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/mrope.cuh"
+
+namespace op::mrope2d::nvidia {
+
+    struct Descriptor::Opaque {
+        std::shared_ptr<device::nvidia::Handle::Internal> internal;
+    };
+
+    Descriptor::~Descriptor() {
+        delete _opaque;
+    }
+
+    infiniStatus_t Descriptor::create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t pos_desc,
+        infiniopTensorDescriptor_t sin_desc,
+        infiniopTensorDescriptor_t cos_desc) {
+
+        auto handle_nvidia = reinterpret_cast<device::nvidia::Handle *>(handle);
+
+        auto info = MRoPE2DInfo::createMRoPE2DInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
+        CHECK_RESULT(info);
+
+        // Create descriptor
+        *desc_ptr = new Descriptor(
+            info.take(),
+            0,
+            new Opaque{handle_nvidia->internal()},
+            handle_nvidia->device,
+            handle_nvidia->device_id);
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    template<typename Tdata, typename Tpos>
+    __global__ void mrope2d_kernel(
+        Tdata *__restrict__ y_,
+        int const stride_token_y,
+        int const stride_head_y,
+        Tdata const *__restrict__ x_,
+        int const stride_token_x,
+        int const stride_head_x,
+        Tpos const *__restrict__ pos_,
+        float const *__restrict__ sin_table,
+        float const *__restrict__ cos_table,
+        int const dh_div_2) {
+
+        padding<Tpos, Tdata>(
+            y_, stride_token_y, stride_head_y,
+            x_, stride_token_x, stride_head_x,
+            pos_, sin_table, cos_table);
+    }
+
+    template<typename Tdata, typename Tpos>
+    __global__ void mrope2d_kernel(
+        Tdata *__restrict__ y_,
+        int const stride_token_y,
+        int const stride_head_y,
+        Tdata const *__restrict__ x_,
+        int const stride_token_x,
+        int const stride_head_x,
+        Tpos const *__restrict__ pos_,
+        float const *__restrict__ sin_table,
+        float const *__restrict__ cos_table) {
+
+        padding<Tpos, Tdata>(
+            y_, stride_token_y, stride_head_y,
+            x_, stride_token_x, stride_head_x,
+            pos_, sin_table, cos_table);
+    }
+
+    template <typename Tdata, typename Tpos>
+    infiniStatus_t calculateMRoPE2D(const MRoPE2DInfo &info,
+                                    int block_size,
+                                    Tdata *y,
+                                    const Tdata *x,
+                                    const Tpos *pos_ids,
+                                    const float *sin_table,
+                                    const float *cos_table,
+                                    cudaStream_t stream) {
+        auto dimy = uint32_t(info.seqlen),    // grid.y = n
+             dimx = uint32_t(info.nhead);     // grid.x = nh_h
+        int dh_div_2 = info.dhead / 2;
+        int nh_l = 1; // 每个 block 处理的 head 数量
+
+        // 注意：Rust 中的顺序是 (grid.y, grid.x), (block.y, block.x)
+        // 所以 CUDA 调用应该是 (grid.x, grid.y), (block.x, block.y)
+        dim3 gridDim(dimx, dimy);    // (nh_h, n)
+        dim3 blockDim(dh_div_2, nh_l); // (dh_div_2, nh_l)
+
+        mrope2d_kernel<<<gridDim, blockDim, 0, stream>>>(
+            y, info.y_stride_seqlen, info.y_stride_nhead,
+            x, info.x_stride_seqlen, info.x_stride_nhead,
+            pos_ids, sin_table, cos_table);
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+#define CALCULATE_MROPE2D(TDATA, TPOS)                     \
+    calculateMRoPE2D(_info,                                \
+                     _opaque->internal->maxThreadsPerBlock(), \
+                     (TDATA *)y,                           \
+                     (const TDATA *)x,                     \
+                     (const TPOS *)pos_ids,                \
+                     (const float *)sin_table,             \
+                     (const float *)cos_table,             \
+                     (cudaStream_t)stream)
+
+#define MROPE2D_TYPE(TDATA)                     \
+    switch (_info.pos_type) {                  \
+    case INFINI_DTYPE_U8:                      \
+        return CALCULATE_MROPE2D(TDATA, uint8_t);  \
+    case INFINI_DTYPE_U16:                     \
+        return CALCULATE_MROPE2D(TDATA, uint16_t); \
+    case INFINI_DTYPE_U32:                     \
+        return CALCULATE_MROPE2D(TDATA, uint32_t); \
+    case INFINI_DTYPE_U64:                     \
+        return CALCULATE_MROPE2D(TDATA, uint64_t); \
+    case INFINI_DTYPE_I8:                      \
+        return CALCULATE_MROPE2D(TDATA, int8_t);   \
+    case INFINI_DTYPE_I16:                     \
+        return CALCULATE_MROPE2D(TDATA, int16_t);  \
+    case INFINI_DTYPE_I32:                     \
+        return CALCULATE_MROPE2D(TDATA, int32_t);  \
+    case INFINI_DTYPE_I64:                     \
+        return CALCULATE_MROPE2D(TDATA, int64_t);  \
+    default:                                   \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+    }
+
+    infiniStatus_t Descriptor::calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        const void *pos_ids,
+        const void *sin_table,
+        const void *cos_table,
+        void *stream) const {
+
+        switch (_info.data_type) {
+        case INFINI_DTYPE_F16:
+            MROPE2D_TYPE(half);
+        case INFINI_DTYPE_BF16:
+            MROPE2D_TYPE(cuda_bfloat16);
+        case INFINI_DTYPE_F32:
+            MROPE2D_TYPE(float);
+        case INFINI_DTYPE_F64:
+            MROPE2D_TYPE(double);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+#undef MROPE2D_TYPE
+#undef CALCULATE_MROPE2D
+
+}
diff --git a/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh
new file mode 100644
index 000000000..2e14e3730
--- /dev/null
+++ b/src/infiniop/ops/2dmrope/nvidia/2dmrope_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __INFINIOP_2DMROPE_CUDA_H__
+#define __INFINIOP_2DMROPE_CUDA_H__
+
+#include "../2dmrope.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INFINIOP_2DMROPE_CUDA_H__
diff --git a/src/infiniop/ops/2dmrope/operator.cc b/src/infiniop/ops/2dmrope/operator.cc
new file mode 100644
index 000000000..2b09a17e0
--- /dev/null
+++ b/src/infiniop/ops/2dmrope/operator.cc
@@ -0,0 +1,204 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/2dmrope.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/2dmrope_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/2dmrope_nvidia.cuh"
+#endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/2dmrope_ascend.h"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/2dmrope_bang.h"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/2dmrope_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/2dmrope_kunlun.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/2dmrope_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateMRoPE2DDescriptor(
+    infiniopHandle_t handle,
+    infiniopMRoPE2DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t pos_ids,
+    infiniopTensorDescriptor_t sin_table,
+    infiniopTensorDescriptor_t cos_table) {
+
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        return op::mrope2d::NAMESPACE::Descriptor::create(                  \
+            handle,                                                         \
+            reinterpret_cast<op::mrope2d::NAMESPACE::Descriptor **>(desc_ptr), \
+            y,                                                              \
+            x,                                                              \
+            pos_ids,                                                        \
+            sin_table,                                                      \
+            cos_table)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+
+#undef CREATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetMRoPE2DWorkspaceSize(infiniopMRoPE2DDescriptor_t desc,
+                                                   size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                      \
+    case CASE:                                                                                    \
+        *size = reinterpret_cast<const op::mrope2d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopMRoPE2D(
+    infiniopMRoPE2DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *pos_ids,
+    const void *sin_table,
+    const void *cos_table,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                                 \
+        return reinterpret_cast<const op::mrope2d::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef CALCULATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t
+infiniopDestroyMRoPE2DDescriptor(infiniopMRoPE2DDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<const op::mrope2d::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef DELETE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
diff --git a/src/infiniop/ops/3dmrope/3dmrope.h b/src/infiniop/ops/3dmrope/3dmrope.h
new file mode 100644
index 000000000..c39687c71
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/3dmrope.h
@@ -0,0 +1,141 @@
+#ifndef __3DMROPE_H__
+#define __3DMROPE_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+
+#define DESCRIPTOR(NAMESPACE)                                    \
+                                                                 \
+    namespace op::mrope3d::NAMESPACE {                           \
+    class Descriptor final : public InfiniopDescriptor {        \
+        struct Opaque;                                           \
+        Opaque *_opaque;                                         \
+        MRoPE3DInfo _info;                                       \
+        size_t _workspace_size;                                  \
+                                                                 \
+        Descriptor(                                              \
+            MRoPE3DInfo info,                                    \
+            size_t workspace_size_,                              \
+            Opaque *opaque,                                      \
+            infiniDevice_t device_type,                          \
+            int device_id)                                       \
+            : InfiniopDescriptor{device_type, device_id},        \
+              _opaque(opaque),                                   \
+              _info(info),                                       \
+              _workspace_size(workspace_size_) {}                \
+                                                                 \
+    public:                                                      \
+        ~Descriptor();                                           \
+                                                                 \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                                 \
+        static infiniStatus_t create(                            \
+            infiniopHandle_t handle,                             \
+            Descriptor **desc_ptr,                               \
+            infiniopTensorDescriptor_t y_desc,                   \
+            infiniopTensorDescriptor_t x_desc,                   \
+            infiniopTensorDescriptor_t pos_desc,                 \
+            infiniopTensorDescriptor_t sin_desc,                 \
+            infiniopTensorDescriptor_t cos_desc,                 \
+            infiniopTensorDescriptor_t rope_section_desc);       \
+                                                                 \
+        infiniStatus_t calculate(                                \
+            void *workspace,                                     \
+            size_t workspace_size,                               \
+            void *y,                                             \
+            const void *x,                                       \
+            const void *pos_ids,                                 \
+            const void *sin_table,                               \
+            const void *cos_table,                               \
+            const void *rope_section,                            \
+            void *stream) const;                                 \
+    };                                                           \
+    }
+
+class MRoPE3DInfo {
+private:
+    MRoPE3DInfo() = default;
+
+public:
+    infiniDtype_t data_type, pos_type, section_type;
+    size_t seqlen, nhead, dhead, table_len, table_dim;
+    ptrdiff_t
+        y_stride_seqlen,
+        y_stride_nhead,
+        x_stride_seqlen,
+        x_stride_nhead;
+
+    static utils::Result<MRoPE3DInfo> createMRoPE3DInfo(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t pos_desc,
+        infiniopTensorDescriptor_t sin_desc,
+        infiniopTensorDescriptor_t cos_desc,
+        infiniopTensorDescriptor_t rope_section_desc) {
+        CHECK_OR_RETURN(
+            y_desc != nullptr && x_desc != nullptr && pos_desc != nullptr && sin_desc != nullptr && cos_desc != nullptr && rope_section_desc != nullptr,
+            INFINI_STATUS_NULL_POINTER);
+
+        const infiniDtype_t data_type = y_desc->dtype();
+        const infiniDtype_t pos_type = pos_desc->dtype();
+        const infiniDtype_t section_type = rope_section_desc->dtype();
+        CHECK_OR_RETURN(data_type == x_desc->dtype(),
+                        INFINI_STATUS_BAD_TENSOR_DTYPE);
+        // // sin_table and cos_table should be float32 for precision
+        // CHECK_OR_RETURN(sin_desc->dtype() == INFINI_DTYPE_F32 && cos_desc->dtype() == INFINI_DTYPE_F32,
+        //                 INFINI_STATUS_BAD_TENSOR_DTYPE);
+        CHECK_OR_RETURN(data_type == x_desc->dtype() && data_type == sin_desc->dtype() && data_type == cos_desc->dtype(),
+        INFINI_STATUS_BAD_TENSOR_DTYPE);
+        CHECK_DTYPE(data_type, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+        CHECK_DTYPE_ANY_INT(pos_type);
+        CHECK_DTYPE_ANY_INT(section_type);
+
+        CHECK_OR_RETURN(y_desc->ndim() == 3
+                            && x_desc->ndim() == 3
+                            && pos_desc->ndim() == 2
+                            && sin_desc->ndim() == 2
+                            && cos_desc->ndim() == 2
+                            && rope_section_desc->ndim() == 1,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        const auto nhead = y_desc->dim(0),
+                   seqlen = y_desc->dim(1),
+                   dhead = y_desc->dim(2),
+                   table_len = sin_desc->dim(0),
+                   table_dim = sin_desc->dim(1);
+
+        CHECK_OR_RETURN(nhead == x_desc->dim(0)
+                            && seqlen == x_desc->dim(1) && seqlen == pos_desc->dim(0)
+                            && dhead == x_desc->dim(2)
+                            && table_len == cos_desc->dim(0) && table_dim == cos_desc->dim(1)
+                            && pos_desc->dim(1) == 3
+                            && rope_section_desc->dim(0) == 3,
+                        INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+        CHECK_OR_RETURN(dhead == table_dim * 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
+        // Last dimension of x and y must be contiguous
+        CHECK_OR_RETURN(y_desc->stride(2) == 1 && x_desc->stride(2) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);
+        // sin table and cos table must be totally contiguous
+        CHECK_OR_RETURN(sin_desc->isContiguous() && cos_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+        // pos_ids and rope_section must be contiguous
+        CHECK_OR_RETURN(pos_desc->isContiguous() && rope_section_desc->isContiguous(), INFINI_STATUS_BAD_TENSOR_STRIDES);
+
+        return utils::Result<MRoPE3DInfo>(MRoPE3DInfo{
+            data_type,
+            pos_type,
+            section_type,
+            seqlen,
+            nhead,
+            dhead,
+            table_len,
+            table_dim,
+            y_desc->stride(1),
+            y_desc->stride(0),
+            x_desc->stride(1),
+            x_desc->stride(0),
+        });
+    }
+};
+
+#endif
diff --git a/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh b/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh
new file mode 100644
index 000000000..82f352f08
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/cuda/mrope_3d.cuh
@@ -0,0 +1,50 @@
+template <class Tp, class Ta>
+static __device__ void padding(
+    Ta *__restrict__ y_,
+    int const stride_token_y,
+    int const stride_head_y,
+    Ta const *__restrict__ x_,
+    int const stride_token_x,
+    int const stride_head_x,
+    Tp const *__restrict__ pos_,
+    float const *__restrict__ sin_table,
+    float const *__restrict__ cos_table,
+    Tp const *__restrict__ rope_section_
+) {
+
+    // n = gridDim.y
+    // nh_h = gridDim.x
+    int nh_l = blockDim.y,
+        dh_div_2 = blockDim.x,
+        it = blockIdx.y,
+        ih_h = blockIdx.x,
+        ih_l = threadIdx.y,
+        ih = ih_h * nh_l + ih_l,
+        i = threadIdx.x;
+
+    // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组
+    auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i;
+    auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2;
+    auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i;
+    auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2;
+
+    // 寻找 i 在 rope_section 中的位置
+    int thw = 0;
+    for (int j = 0; j < 3; j++) {
+        if (i < rope_section_[j]) {
+            thw = j;
+            break;
+        }
+    }
+
+    // 获取位置索引
+    auto pos = pos_[it * 3 + thw]; // 3 维 pos 的 shape: [n, 3], strides: [3, 1]
+    float sin = sin_table[pos * dh_div_2 + i],
+          cos = cos_table[pos * dh_div_2 + i],
+          a = x1[0],
+          b = x2[0];
+
+    // 应用旋转并写入 y
+    y1[0] = Ta(a * cos - b * sin);
+    y2[0] = Ta(a * sin + b * cos);
+}
diff --git a/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh b/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh
new file mode 100644
index 000000000..7b9952e39
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/cuda/mrope_3d_optimized.cuh
@@ -0,0 +1,48 @@
+// 优化版本的3D MRoPE CUDA内核
+template <class Tp, class Ta>
+static __device__ void padding(
+    Ta *__restrict__ y_,
+    int const stride_token_y,
+    int const stride_head_y,
+    Ta const *__restrict__ x_,
+    int const stride_token_x,
+    int const stride_head_x,
+    Tp const *__restrict__ pos_,
+    float const *__restrict__ sin_table,
+    float const *__restrict__ cos_table,
+    Tp const *__restrict__ rope_section_) {
+
+    // n = gridDim.y
+    // nh_h = gridDim.x
+    int nh_l = blockDim.y,
+        dh_div_2 = blockDim.x,
+        it = blockIdx.y,
+        ih_h = blockIdx.x,
+        ih_l = threadIdx.y,
+        ih = ih_h * nh_l + ih_l,
+        i = threadIdx.x;
+
+    // 计算 x 和 y 的位置, 每相距 d_div_2 的两个为一组
+    auto x1 = x_ + it * stride_token_x + ih * stride_head_x + i;
+    auto x2 = x_ + it * stride_token_x + ih * stride_head_x + i + dh_div_2;
+    auto y1 = y_ + it * stride_token_y + ih * stride_head_y + i;
+    auto y2 = y_ + it * stride_token_y + ih * stride_head_y + i + dh_div_2;
+
+    // 优化版本：使用条件表达式替代循环
+    // rope_section 是累积值，例如 [32, 64, 96] 对应原始的 [32, 32, 32]
+    // i 范围是 [0, dh_div_2)，即 [0, 96)
+    // 逻辑：找到第一个 rope_section_[j] > i 的 j
+    int thw = (i < rope_section_[0]) ? 0 :
+              (i < rope_section_[1]) ? 1 : 2;
+
+    // 获取位置索引
+    auto pos = pos_[it * 3 + thw]; // 3 维 pos 的 shape: [n, 3], strides: [3, 1]
+    float sin = sin_table[pos * dh_div_2 + i],
+          cos = cos_table[pos * dh_div_2 + i],
+          a = x1[0],
+          b = x2[0];
+
+    // 应用旋转并写入 y
+    y1[0] = Ta(a * cos - b * sin);
+    y2[0] = Ta(a * sin + b * cos);
+}
diff --git a/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu
new file mode 100644
index 000000000..e97a33bfb
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cu
@@ -0,0 +1,172 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "3dmrope_nvidia.cuh"
+
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include "../cuda/mrope_3d_optimized.cuh"
+
+namespace op::mrope3d::nvidia {
+
+    struct Descriptor::Opaque {
+        std::shared_ptr<device::nvidia::Handle::Internal> internal;
+    };
+
+    Descriptor::~Descriptor() {
+        delete _opaque;
+    }
+
+    infiniStatus_t Descriptor::create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        infiniopTensorDescriptor_t pos_desc,
+        infiniopTensorDescriptor_t sin_desc,
+        infiniopTensorDescriptor_t cos_desc,
+        infiniopTensorDescriptor_t rope_section_desc) {
+
+        auto handle_nvidia = reinterpret_cast<device::nvidia::Handle *>(handle);
+
+        auto info = MRoPE3DInfo::createMRoPE3DInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc, rope_section_desc);
+        CHECK_RESULT(info);
+
+        // Create descriptor
+        *desc_ptr = new Descriptor(
+            info.take(),
+            0,
+            new Opaque{handle_nvidia->internal()},
+            handle_nvidia->device,
+            handle_nvidia->device_id);
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    template<typename Tdata, typename Tpos>
+    __global__ void mrope3d_kernel(
+        Tdata *__restrict__ y_,
+        int const stride_token_y,
+        int const stride_head_y,
+        Tdata const *__restrict__ x_,
+        int const stride_token_x,
+        int const stride_head_x,
+        Tpos const *__restrict__ pos_,
+        float const *__restrict__ sin_table,
+        float const *__restrict__ cos_table,
+        Tpos const *__restrict__ rope_section_,
+        int const dh_div_2) {
+
+        padding<Tpos, Tdata>(
+            y_, stride_token_y, stride_head_y,
+            x_, stride_token_x, stride_head_x,
+            pos_, sin_table, cos_table, rope_section_);
+    }
+
+    template<typename Tdata, typename Tpos>
+    __global__ void mrope3d_kernel(
+        Tdata *__restrict__ y_,
+        int const stride_token_y,
+        int const stride_head_y,
+        Tdata const *__restrict__ x_,
+        int const stride_token_x,
+        int const stride_head_x,
+        Tpos const *__restrict__ pos_,
+        float const *__restrict__ sin_table,
+        float const *__restrict__ cos_table,
+        Tpos const *__restrict__ rope_section_) {
+
+        padding<Tpos, Tdata>(
+            y_, stride_token_y, stride_head_y,
+            x_, stride_token_x, stride_head_x,
+            pos_, sin_table, cos_table, rope_section_);
+    }
+
+    template <typename Tdata, typename Tpos>
+    infiniStatus_t calculateMRoPE3D(const MRoPE3DInfo &info,
+                                    int block_size,
+                                    Tdata *y,
+                                    const Tdata *x,
+                                    const Tpos *pos_ids,
+                                    const float *sin_table,
+                                    const float *cos_table,
+                                    const Tpos *rope_section,
+                                    cudaStream_t stream) {
+        auto dimy = uint32_t(info.seqlen),    // grid.y = n
+             dimx = uint32_t(info.nhead);     // grid.x = nh_h
+        int dh_div_2 = info.dhead / 2;
+        int nh_l = 1; // 每个 block 处理的 head 数量
+
+        // 注意：Rust 中的顺序是 (grid.y, grid.x), (block.y, block.x)
+        // 所以 CUDA 调用应该是 (grid.x, grid.y), (block.x, block.y)
+        dim3 gridDim(dimx, dimy);    // (nh_h, n)
+        dim3 blockDim(dh_div_2, nh_l); // (dh_div_2, nh_l)
+
+        mrope3d_kernel<<<gridDim, blockDim, 0, stream>>>(
+            y, info.y_stride_seqlen, info.y_stride_nhead,
+            x, info.x_stride_seqlen, info.x_stride_nhead,
+            pos_ids, sin_table, cos_table, rope_section);
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+#define CALCULATE_MROPE3D(TDATA, TPOS)                     \
+    calculateMRoPE3D(_info,                                \
+                     _opaque->internal->maxThreadsPerBlock(), \
+                     (TDATA *)y,                           \
+                     (const TDATA *)x,                     \
+                     (const TPOS *)pos_ids,                \
+                     (const float *)sin_table,             \
+                     (const float *)cos_table,             \
+                     (const TPOS *)rope_section,           \
+                     (cudaStream_t)stream)
+
+#define MROPE3D_TYPE(TDATA)                     \
+    switch (_info.pos_type) {                  \
+    case INFINI_DTYPE_U8:                      \
+        return CALCULATE_MROPE3D(TDATA, uint8_t);  \
+    case INFINI_DTYPE_U16:                     \
+        return CALCULATE_MROPE3D(TDATA, uint16_t); \
+    case INFINI_DTYPE_U32:                     \
+        return CALCULATE_MROPE3D(TDATA, uint32_t); \
+    case INFINI_DTYPE_U64:                     \
+        return CALCULATE_MROPE3D(TDATA, uint64_t); \
+    case INFINI_DTYPE_I8:                      \
+        return CALCULATE_MROPE3D(TDATA, int8_t);   \
+    case INFINI_DTYPE_I16:                     \
+        return CALCULATE_MROPE3D(TDATA, int16_t);  \
+    case INFINI_DTYPE_I32:                     \
+        return CALCULATE_MROPE3D(TDATA, int32_t);  \
+    case INFINI_DTYPE_I64:                     \
+        return CALCULATE_MROPE3D(TDATA, int64_t);  \
+    default:                                   \
+        return INFINI_STATUS_BAD_TENSOR_DTYPE; \
+    }
+
+    infiniStatus_t Descriptor::calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *y,
+        const void *x,
+        const void *pos_ids,
+        const void *sin_table,
+        const void *cos_table,
+        const void *rope_section,
+        void *stream) const {
+
+        switch (_info.data_type) {
+        case INFINI_DTYPE_F16:
+            MROPE3D_TYPE(half);
+        case INFINI_DTYPE_BF16:
+            MROPE3D_TYPE(cuda_bfloat16);
+        case INFINI_DTYPE_F32:
+            MROPE3D_TYPE(float);
+        case INFINI_DTYPE_F64:
+            MROPE3D_TYPE(double);
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+
+#undef MROPE3D_TYPE
+#undef CALCULATE_MROPE3D
+
+}
diff --git a/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh
new file mode 100644
index 000000000..fb070abe8
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/nvidia/3dmrope_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __INFINIOP_3DMROPE_CUDA_H__
+#define __INFINIOP_3DMROPE_CUDA_H__
+
+#include "../3dmrope.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __INFINIOP_3DMROPE_CUDA_H__
diff --git a/src/infiniop/ops/3dmrope/operator.cc b/src/infiniop/ops/3dmrope/operator.cc
new file mode 100644
index 000000000..5725556d5
--- /dev/null
+++ b/src/infiniop/ops/3dmrope/operator.cc
@@ -0,0 +1,207 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/3dmrope.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/3dmrope_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/3dmrope_nvidia.cuh"
+#endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/3dmrope_ascend.h"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/3dmrope_bang.h"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/3dmrope_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/3dmrope_kunlun.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/3dmrope_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateMRoPE3DDescriptor(
+    infiniopHandle_t handle,
+    infiniopMRoPE3DDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    infiniopTensorDescriptor_t pos_ids,
+    infiniopTensorDescriptor_t sin_table,
+    infiniopTensorDescriptor_t cos_table,
+    infiniopTensorDescriptor_t rope_section) {
+
+#define CREATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        return op::mrope3d::NAMESPACE::Descriptor::create(                  \
+            handle,                                                         \
+            reinterpret_cast<op::mrope3d::NAMESPACE::Descriptor **>(desc_ptr), \
+            y,                                                              \
+            x,                                                              \
+            pos_ids,                                                        \
+            sin_table,                                                      \
+            cos_table,                                                      \
+            rope_section)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+    }
+
+#undef CREATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetMRoPE3DWorkspaceSize(infiniopMRoPE3DDescriptor_t desc,
+                                                   size_t *size) {
+#define GET(CASE, NAMESPACE)                                                                      \
+    case CASE:                                                                                    \
+        *size = reinterpret_cast<const op::mrope3d::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopMRoPE3D(
+    infiniopMRoPE3DDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    const void *pos_ids,
+    const void *sin_table,
+    const void *cos_table,
+    const void *rope_section,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                             \
+    case CASE:                                                                 \
+        return reinterpret_cast<const op::mrope3d::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, pos_ids, sin_table, cos_table, rope_section, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef CALCULATE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t
+infiniopDestroyMRoPE3DDescriptor(infiniopMRoPE3DDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                  \
+        delete reinterpret_cast<const op::mrope3d::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+    }
+
+#undef DELETE
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
diff --git a/test/infiniop/2dmrope.py b/test/infiniop/2dmrope.py
new file mode 100644
index 000000000..5f7b4417c
--- /dev/null
+++ b/test/infiniop/2dmrope.py
@@ -0,0 +1,342 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceEnum,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration based on Qwen2-VL
+# ==============================================================================
+
+# 使用真实的图像参数进行测试
+H, W, D_PATCH = 336, 476, 14
+HP = H // D_PATCH  # 24
+WP = W // D_PATCH  # 34
+
+# 根据 pos_ids.rs 的实现，计算实际的序列长度
+
+
+def calculate_2d_seq_len(h, w, d_patch):
+    hp = h // d_patch
+    wp = w // d_patch
+    count = 0
+    for y in range(0, hp, 2):
+        for x in range(0, wp, 2):
+            for dy in range(2):
+                for dx in range(2):
+                    if y + dy < hp and x + dx < wp:
+                        count += 1
+    return count
+
+
+ACTUAL_SEQ_LEN = calculate_2d_seq_len(H, W, D_PATCH)
+
+# 注意：根据 Qwen2-VL，形状是 [nhead, seqlen, dhead]
+_TEST_CASES_ = [
+    # (shape, x_strides, y_strides) - 形状：[nhead, seqlen, dhead]
+    # 2D MRoPE: dhead = table_dim * 4
+    ((32, ACTUAL_SEQ_LEN, 128), None, None),   # 大规模测试: 32头, 128维
+    ((16, ACTUAL_SEQ_LEN, 64), None, None),    # 中等规模: 16头, 64维
+    ((8, ACTUAL_SEQ_LEN, 32), None, None),     # 小规模: 8头, 32维
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16]
+# InfiniDtype.BF16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-4},
+    InfiniDtype.BF16: {"atol": 8e-3, "rtol": 1e-4},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-9},
+}
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    # Inplace.INPLACE_X,  # 先测试非原地操作
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 1
+NUM_ITERATIONS = 3
+
+
+def generate_2d_pos_ids(h, w, d_patch, device, dtype=InfiniDtype.I32):
+    """Generate 2D position IDs according to pos_ids.rs implementation"""
+    hp = h // d_patch
+    wp = w // d_patch
+    pos = []
+
+    # Following the Rust implementation exactly
+    for y in range(0, hp, 2):
+        for x in range(0, wp, 2):
+            for dy in range(2):
+                for dx in range(2):
+                    if y + dy < hp and x + dx < wp:
+                        pos.append([y + dy, x + dx])
+
+    pos_ids = torch.tensor(pos, dtype=torch.int32)
+    return TestTensor.from_torch(pos_ids, dtype, device)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input. (Qwen2-VL style)"""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb_vision_reference(tensor, freqs):
+    """
+    Reference implementation based on Qwen2-VL's apply_rotary_pos_emb_vision
+    tensor: [nhead, seqlen, dhead]
+    freqs: [seqlen, dhead//2] or similar
+    """
+    orig_dtype = tensor.dtype
+    tensor = tensor.float()
+
+    # freqs should contain the angles for rotation
+    cos = freqs.cos()  # [seqlen, dhead//2]
+    sin = freqs.sin()  # [seqlen, dhead//2]
+
+    # Expand to match tensor dimensions
+    # cos/sin: [seqlen, dhead//2] -> [seqlen, dhead]
+    cos = cos.repeat(1, 2)  # [seqlen, dhead]
+    sin = sin.repeat(1, 2)  # [seqlen, dhead]
+
+    # Add batch dimension for broadcasting with [nhead, seqlen, dhead]
+    cos = cos.unsqueeze(0)  # [1, seqlen, dhead]
+    sin = sin.unsqueeze(0)  # [1, seqlen, dhead]
+
+    output = (tensor * cos) + (rotate_half(tensor) * sin)
+    output = output.to(orig_dtype)
+    return output
+
+
+def multimodal_rotary_embedding_2d_reference(ans, t, pos_ids, sin_table, cos_table, device):
+    """
+    2D MRoPE reference implementation based on Qwen2-VL style
+    t: [nhead, seqlen, dhead]
+    pos_ids: [seqlen, 2] - (h, w) positions
+    sin_table/cos_table: [max_pos, dhead//4] - table for each dimension
+    """
+    nhead, seqlen, dhead = t.shape
+    dt = t.dtype
+    assert dhead % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE."
+
+    dhead_div_2 = dhead // 2
+    dhead_div_4 = dhead // 4
+
+    if device == InfiniDeviceEnum.CPU:
+        t = t.float()
+        sin_table = sin_table.float()
+        cos_table = cos_table.float()
+
+    # Create frequency tensor for each position
+    # This mimics the freqs parameter in apply_rotary_pos_emb_vision
+    freqs = torch.zeros(seqlen, dhead_div_2,
+                        dtype=torch.float32, device=t.device)
+
+    for seq_idx in range(seqlen):
+        for i in range(dhead_div_2):
+            # 2 维 mrope 的 w, h 维度均分 dhead_div_2，每个分到 dhead_div_4
+            dim_idx = i // dhead_div_4  # 0 for h, 1 for w
+            within_dim_idx = i % dhead_div_4  # index within dimension
+
+            pos = pos_ids[seq_idx, dim_idx].item()
+            freqs[seq_idx, i] = torch.atan2(
+                sin_table[pos, within_dim_idx],
+                cos_table[pos, within_dim_idx]
+            )
+
+    # Apply rotary embedding using Qwen2-VL style
+    ans[:] = apply_rotary_pos_emb_vision_reference(t, freqs)
+
+
+def sin_cos_table_2d(max_pos, dim, device, theta, dtype):
+    """Generate sin/cos table for 2D MRoPE"""
+    assert dim % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE."
+    dh_div_4 = dim // 4
+
+    # Create frequency for each dimension component
+    freqs = 1.0 / (theta ** (torch.arange(0, dh_div_4, 1).float() / dh_div_4))
+    pos = torch.arange(0, max_pos, dtype=torch.float32)
+    angles = torch.outer(pos, freqs)
+
+    return (
+        TestTensor.from_torch(torch.sin(angles), InfiniDtype.F32, device),
+        TestTensor.from_torch(torch.cos(angles), InfiniDtype.F32, device),
+    )
+
+
+def test(
+    handle,
+    device,
+    shape,
+    x_strides=None,
+    y_strides=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float32,
+    sync=None,
+):
+    nhead, seqlen, dhead = shape
+
+    # For 2D MRoPE, dh must be divisible by 4
+    if dhead % 4 != 0:
+        return
+
+    print(
+        f"Testing 2D MRoPE (Qwen2-VL style) on {InfiniDeviceNames[device]} with shape:[{nhead}, {seqlen}, {dhead}] dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(shape, x_strides, dtype, device)
+    if inplace == Inplace.INPLACE_X:
+        if x_strides != y_strides:
+            return
+        y = x
+    else:
+        y = TestTensor(shape, y_strides, dtype, device)
+
+    # Generate 2D position IDs using real parameters
+    h, w, d_patch = H, W, D_PATCH
+    pos_ids = generate_2d_pos_ids(h, w, d_patch, device)
+
+    # Verify the sequence length matches
+    assert pos_ids.shape[0] == seqlen, f"pos_ids length {pos_ids.shape[0]} != seqlen {seqlen}"
+
+    max_pos = pos_ids.torch_tensor().max().item() + 1
+
+    # Generate sin/cos tables
+    sin_table, cos_table = sin_cos_table_2d(
+        max_pos, dhead, device, 10000.0, dtype)
+
+    # Compute reference result using Qwen2-VL style
+    multimodal_rotary_embedding_2d_reference(
+        y.torch_tensor(),
+        x.torch_tensor(),
+        pos_ids.torch_tensor(),
+        sin_table.torch_tensor(),
+        cos_table.torch_tensor(),
+        device,
+    )
+
+    # Create descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateMRoPE2DDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            pos_ids.descriptor,
+            sin_table.descriptor,
+            cos_table.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [y, x, pos_ids, sin_table, cos_table]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetMRoPE2DWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_mrope2d():
+        check_error(
+            LIBINFINIOP.infiniopMRoPE2D(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                pos_ids.data(),
+                sin_table.data(),
+                cos_table.data(),
+                None,
+            )
+        )
+
+    lib_mrope2d()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        print(f"Expected shape: {y.torch_tensor().shape}")
+        print(f"Actual shape: {y.actual_tensor().shape}")
+        print(f"pos_ids shape: {pos_ids.torch_tensor().shape}")
+        print(f"sin_table shape: {sin_table.torch_tensor().shape}")
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    success = torch.allclose(
+        y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+    if success:
+        print("✅ Test PASSED!")
+    else:
+        print("❌ Test FAILED!")
+        if not DEBUG:
+            print("Run with --debug to see detailed comparison")
+            # Show a brief comparison
+            diff = torch.abs(y.actual_tensor() - y.torch_tensor())
+            print(f"Max absolute difference: {diff.max().item():.6f}")
+            print(f"Mean absolute difference: {diff.mean().item():.6f}")
+
+    assert success
+
+    check_error(LIBINFINIOP.infiniopDestroyMRoPE2DDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    print(f"2D MRoPE Test Configuration (Qwen2-VL style):")
+    print(f"  Image size: {H}x{W}, Patch size: {D_PATCH}")
+    print(f"  Calculated sequence length: {ACTUAL_SEQ_LEN}")
+    print(f"  Tensor shape format: [nhead, seqlen, dhead]")
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92m2D MRoPE Test (Qwen2-VL style) passed!\033[0m")
diff --git a/test/infiniop/2dmrope_fixed.py b/test/infiniop/2dmrope_fixed.py
new file mode 100644
index 000000000..e3572bfe8
--- /dev/null
+++ b/test/infiniop/2dmrope_fixed.py
@@ -0,0 +1,307 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceEnum,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+
+# 使用真实的图像参数进行测试
+H, W, D_PATCH = 336, 476, 14
+HP = H // D_PATCH  # 24
+WP = W // D_PATCH  # 34
+
+# 根据 pos_ids.rs 的实现，计算实际的序列长度
+# pos_ids.rs: ptr 从 0 开始，按 2x2 块遍历
+def calculate_2d_seq_len(h, w, d_patch):
+    hp = h // d_patch
+    wp = w // d_patch
+    count = 0
+    for y in range(0, hp, 2):
+        for x in range(0, wp, 2):
+            for dy in range(2):
+                for dx in range(2):
+                    if y + dy < hp and x + dx < wp:
+                        count += 1
+    return count
+
+ACTUAL_SEQ_LEN = calculate_2d_seq_len(H, W, D_PATCH)
+
+_TEST_CASES_ = [
+    # (shape, x_strides, y_strides) - 使用实际计算的序列长度
+    ((ACTUAL_SEQ_LEN, 32, 128), None, None),  # 2D MRoPE: dhead = table_dim * 4, so 128 = 32 * 4
+    ((ACTUAL_SEQ_LEN, 16, 64), None, None),   # 64 = 16 * 4
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-3},
+}
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def generate_2d_pos_ids(h, w, d_patch, device, dtype=InfiniDtype.I32):
+    """Generate 2D position IDs according to pos_ids.rs implementation"""
+    hp = h // d_patch
+    wp = w // d_patch
+    pos = []
+
+    # Following the Rust implementation exactly
+    for y in range(0, hp, 2):
+        for x in range(0, wp, 2):
+            for dy in range(2):
+                for dx in range(2):
+                    if y + dy < hp and x + dx < wp:
+                        pos.append([y + dy, x + dx])
+
+    pos_ids = torch.tensor(pos, dtype=torch.int32)
+    return TestTensor.from_torch(pos_ids, dtype, device)
+
+
+def multimodal_rotary_embedding_2d(ans, t, pos_ids, sin, cos, device):
+    """
+    2D MRoPE implementation for reference
+    pos_ids shape: [seq_len, 2] - (h, w) positions
+    sin/cos shape: [max_pos, dh//4] - table for each dimension
+    """
+    seq_len, n_head, dh = t.shape
+    dt = t.dtype
+    assert dh % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE."
+
+    dh_div_4 = dh // 4
+    dh_div_2 = dh // 2
+
+    if device == InfiniDeviceEnum.CPU:
+        t = t.float()
+        sin = sin.float()
+        cos = cos.float()
+
+    # Apply rotation based on your .cuh implementation
+    for seq_idx in range(seq_len):
+        for head_idx in range(n_head):
+            for i in range(dh_div_2):
+                # 2 维 mrope 的 w, h 维度均分 d_div_2，每个分到 d_div_2 / 2
+                id_h = i // (dh_div_2 // 2)  # w, h 的维度索引
+                id_l = i % (dh_div_2 // 2)   # w, h 维度内索引
+                pos = pos_ids[seq_idx, id_h].item()  # 2 维 pos 的 shape: [seq_len, 2], strides: [2, 1]
+
+                sin_val = sin[pos, id_l].item()
+                cos_val = cos[pos, id_l].item()
+
+                # Apply rotation
+                a = t[seq_idx, head_idx, i].item()
+                b = t[seq_idx, head_idx, i + dh_div_2].item()
+
+                ans[seq_idx, head_idx, i] = (a * cos_val - b * sin_val)
+                ans[seq_idx, head_idx, i + dh_div_2] = (a * sin_val + b * cos_val)
+
+    if device == InfiniDeviceEnum.CPU:
+        ans = ans.to(dt)
+
+
+def sin_cos_table_2d(max_pos, dim, device, theta, dtype):
+    """Generate sin/cos table for 2D MRoPE"""
+    assert dim % 4 == 0, "Embedding dimension must be divisible by 4 for 2D MRoPE."
+    dh_div_4 = dim // 4
+
+    # Create frequency for each dimension component
+    freqs = 1.0 / (theta ** (torch.arange(0, dh_div_4, 1).float() / dh_div_4))
+    pos = torch.arange(0, max_pos, dtype=torch.float32)
+    angles = torch.outer(pos, freqs)
+
+    return (
+        TestTensor.from_torch(torch.sin(angles), dtype, device),
+        TestTensor.from_torch(torch.cos(angles), dtype, device),
+    )
+
+
+def test(
+    handle,
+    device,
+    shape,
+    x_strides=None,
+    y_strides=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float32,
+    sync=None,
+):
+    seq_len, n_head, dh = shape
+
+    # For 2D MRoPE, dh must be divisible by 4
+    if dh % 4 != 0:
+        return
+
+    print(
+        f"Testing 2D MRoPE on {InfiniDeviceNames[device]} with shape:{shape} x_strides:{x_strides} y_strides:{y_strides} and dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    x = TestTensor(shape, x_strides, dtype, device)
+    if inplace == Inplace.INPLACE_X:
+        if x_strides != y_strides:
+            return
+        y = x
+    else:
+        y = TestTensor(shape, y_strides, dtype, device)
+
+    # Generate 2D position IDs using real parameters
+    h, w, d_patch = H, W, D_PATCH
+    pos_ids = generate_2d_pos_ids(h, w, d_patch, device)
+
+    # Verify the sequence length matches
+    assert pos_ids.shape[0] == seq_len, f"pos_ids length {pos_ids.shape[0]} != seq_len {seq_len}"
+
+    max_pos = pos_ids.torch_tensor().max().item() + 1
+
+    # Generate sin/cos tables
+    sin_table, cos_table = sin_cos_table_2d(max_pos, dh, device, 10000.0, dtype)
+
+    # Compute reference result
+    multimodal_rotary_embedding_2d(
+        y.torch_tensor(),
+        x.torch_tensor(),
+        pos_ids.torch_tensor(),
+        sin_table.torch_tensor(),
+        cos_table.torch_tensor(),
+        device,
+    )
+
+    # Create descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateMRoPE2DDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            pos_ids.descriptor,
+            sin_table.descriptor,
+            cos_table.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [y, x, pos_ids, sin_table, cos_table]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetMRoPE2DWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_mrope2d():
+        check_error(
+            LIBINFINIOP.infiniopMRoPE2D(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                pos_ids.data(),
+                sin_table.data(),
+                cos_table.data(),
+                None,
+            )
+        )
+
+    lib_mrope2d()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation(
+            "PyTorch",
+            lambda: multimodal_rotary_embedding_2d(
+                y.torch_tensor(),
+                x.torch_tensor(),
+                pos_ids.torch_tensor(),
+                sin_table.torch_tensor(),
+                cos_table.torch_tensor(),
+                device,
+            ),
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+        profile_operation(
+            "InfiniOP",
+            lib_mrope2d,
+            device,
+            NUM_PRERUN,
+            NUM_ITERATIONS,
+        )
+
+    check_error(LIBINFINIOP.infiniopDestroyMRoPE2DDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    print(f"2D MRoPE Test Configuration:")
+    print(f"  Image size: {H}x{W}, Patch size: {D_PATCH}")
+    print(f"  Calculated sequence length: {ACTUAL_SEQ_LEN}")
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92m2D MRoPE Test passed!\033[0m")
diff --git a/test/infiniop/3dmrope.py b/test/infiniop/3dmrope.py
new file mode 100644
index 000000000..db0630de3
--- /dev/null
+++ b/test/infiniop/3dmrope.py
@@ -0,0 +1,406 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceEnum,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration based on Qwen2-VL for 3D MRoPE
+# ==============================================================================
+
+# 3D parameters for video/temporal modeling
+T_DIM = 3
+H_IMG = 224
+W_IMG = 224
+D_PATCH = 14
+PRE_TEXT_LEN = 4
+POST_TEXT_LEN = 5
+
+
+def calculate_3d_seq_len(t, h, w, d_patch, pre_text_len, post_text_len):
+    """Calculate sequence length according to 3D pos_ids.rs implementation"""
+    spatial_merge_size = 2
+    t_len = t
+    h_len = h // d_patch // spatial_merge_size
+    w_len = w // d_patch // spatial_merge_size
+    vision_len = t_len * h_len * w_len
+    total_len = pre_text_len + vision_len + post_text_len
+    return total_len
+
+
+ACTUAL_SEQ_LEN = calculate_3d_seq_len(
+    T_DIM, H_IMG, W_IMG, D_PATCH, PRE_TEXT_LEN, POST_TEXT_LEN)
+
+# 注意：根据 Qwen2-VL，形状是 [nhead, seqlen, dhead]
+# 3D MRoPE: dhead = table_dim * 2
+_TEST_CASES_ = [
+    # (shape, x_strides, y_strides) - 形状：[nhead, seqlen, dhead]
+    # 3D MRoPE: dhead = table_dim * 2
+    # ((32, ACTUAL_SEQ_LEN, 128), None, None),   # 大规模测试: 32头, 128维
+    # ((16, ACTUAL_SEQ_LEN, 64), None, None),    # 中等规模: 16头, 64维
+    ((8, ACTUAL_SEQ_LEN, 32), None, None),     # 小规模: 8头, 32维
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+# Tolerance map for different data types (stricter for testing optimization)
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 5e-4, "rtol": 1e-5},
+    InfiniDtype.BF16: {"atol": 5e-4, "rtol": 1e-5},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-9},
+}
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    # Inplace.INPLACE_X,  # 先测试非原地操作
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 1
+NUM_ITERATIONS = 3
+
+
+def generate_3d_pos_ids(t, h, w, d_patch, pre_text_len, post_text_len, device, dtype=InfiniDtype.I32):
+    """Generate 3D position IDs according to pos_ids.rs implementation"""
+    spatial_merge_size = 2
+    t_len = t
+    h_len = h // d_patch // spatial_merge_size
+    w_len = w // d_patch // spatial_merge_size
+    vision_len = t_len * h_len * w_len
+    total_len = pre_text_len + vision_len + post_text_len
+
+    pos = []
+    idx = 0
+
+    # 图像前文本
+    for i in range(pre_text_len):
+        pos.append([i, i, i])
+        idx += 1
+
+    # 图像
+    img_start_pos = pre_text_len
+    for t_idx in range(t_len):
+        for h_idx in range(h_len):
+            for w_idx in range(w_len):
+                t_pos = img_start_pos + t_idx
+                h_pos = img_start_pos + h_idx
+                w_pos = img_start_pos + w_idx
+                pos.append([t_pos, h_pos, w_pos])
+                idx += 1
+
+    # 图像后文本
+    t_max_pos = img_start_pos + t_len - 1
+    h_max_pos = img_start_pos + h_len - 1
+    w_max_pos = img_start_pos + w_len - 1
+    image_max_pos = max(t_max_pos, h_max_pos, w_max_pos)
+    text_start_pos = image_max_pos + 1
+    for i in range(post_text_len):
+        pos_val = text_start_pos + i
+        pos.append([pos_val, pos_val, pos_val])
+        idx += 1
+
+    assert idx == total_len
+    pos_ids = torch.tensor(pos, dtype=torch.int32)
+    return TestTensor.from_torch(pos_ids, dtype, device)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input. (Qwen2-VL style)"""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb_vision_reference(tensor, freqs):
+    """
+    Reference implementation based on Qwen2-VL's apply_rotary_pos_emb_vision
+    tensor: [nhead, seqlen, dhead]
+    freqs: [seqlen, dhead//2] or similar
+    """
+    orig_dtype = tensor.dtype
+    tensor = tensor.float()
+
+    # freqs should contain the angles for rotation
+    cos = freqs.cos()  # [seqlen, dhead//2]
+    sin = freqs.sin()  # [seqlen, dhead//2]
+
+    # Expand to match tensor dimensions
+    # cos/sin: [seqlen, dhead//2] -> [seqlen, dhead]
+    cos = cos.repeat(1, 2)  # [seqlen, dhead]
+    sin = sin.repeat(1, 2)  # [seqlen, dhead]
+
+    # Add batch dimension for broadcasting with [nhead, seqlen, dhead]
+    cos = cos.unsqueeze(0)  # [1, seqlen, dhead]
+    sin = sin.unsqueeze(0)  # [1, seqlen, dhead]
+
+    output = (tensor * cos) + (rotate_half(tensor) * sin)
+    output = output.to(orig_dtype)
+    return output
+
+
+def multimodal_rotary_embedding_3d_reference(ans, t, pos_ids, sin_table, cos_table, rope_section, device):
+    """
+    3D MRoPE reference implementation based on Qwen2-VL style
+    t: [nhead, seqlen, dhead]
+    pos_ids: [seqlen, 3] - (t, h, w) positions
+    sin_table/cos_table: [max_pos, dhead//2] - table for all dimensions
+    rope_section: [3] - section boundaries for t, h, w dimensions
+    """
+    nhead, seqlen, dhead = t.shape
+    dt = t.dtype
+    assert dhead % 2 == 0, "Embedding dimension must be divisible by 2 for 3D MRoPE."
+
+    dhead_div_2 = dhead // 2
+
+    if device == InfiniDeviceEnum.CPU:
+        t = t.float()
+        sin_table = sin_table.float()
+        cos_table = cos_table.float()
+
+    # Process each sequence position and head directly (matching CUDA kernel logic)
+    for seq_idx in range(seqlen):
+        for head_idx in range(nhead):
+            for i in range(dhead_div_2):
+                # Find i in rope_section (matching CUDA kernel logic)
+                thw = 0
+                for j in range(3):
+                    if i < rope_section[j].item():
+                        thw = j
+                        break
+
+                # Get position index
+                pos = pos_ids[seq_idx, thw].item()
+
+                sin_val = sin_table[pos, i].item()
+                cos_val = cos_table[pos, i].item()
+
+                a = t[head_idx, seq_idx, i].item()
+                b = t[head_idx, seq_idx, i + dhead_div_2].item()
+
+                ans[head_idx, seq_idx, i] = a * cos_val - b * sin_val
+                ans[head_idx, seq_idx, i + dhead_div_2] = a * \
+                    sin_val + b * cos_val
+
+
+def sin_cos_table_3d(max_pos, dim, device, theta, dtype):
+    """Generate sin/cos table for 3D MRoPE"""
+    assert dim % 2 == 0, "Embedding dimension must be divisible by 2 for 3D MRoPE."
+    dh_div_2 = dim // 2
+
+    # Create frequency for all dimensions (unified table)
+    freqs = 1.0 / (theta ** (torch.arange(0, dh_div_2, 1).float() / dh_div_2))
+    pos = torch.arange(0, max_pos, dtype=torch.float32)
+    angles = torch.outer(pos, freqs)
+
+    return (
+        TestTensor.from_torch(torch.sin(angles), InfiniDtype.F32, device),
+        TestTensor.from_torch(torch.cos(angles), InfiniDtype.F32, device),
+    )
+
+
+def test(
+    handle,
+    device,
+    shape,
+    x_strides=None,
+    y_strides=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float32,
+    sync=None,
+):
+    nhead, seqlen, dhead = shape
+
+    # For 3D MRoPE, dh must be divisible by 2
+    if dhead % 2 != 0:
+        return
+
+    print(
+        f"Testing 3D MRoPE (Qwen2-VL style) on {InfiniDeviceNames[device]} with shape:[{nhead}, {seqlen}, {dhead}] dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(shape, x_strides, dtype, device)
+    if inplace == Inplace.INPLACE_X:
+        if x_strides != y_strides:
+            return
+        y = x
+    else:
+        y = TestTensor(shape, y_strides, dtype, device)
+
+    # Generate 3D position IDs using real parameters
+    t_dim, h_dim, w_dim, d_patch = T_DIM, H_IMG, W_IMG, D_PATCH
+    pre_text_len, post_text_len = PRE_TEXT_LEN, POST_TEXT_LEN
+
+    pos_ids = generate_3d_pos_ids(
+        t_dim, h_dim, w_dim, d_patch, pre_text_len, post_text_len, device)
+
+    # Verify the sequence length matches
+    assert pos_ids.shape[0] == seqlen, f"pos_ids length {pos_ids.shape[0]} != seqlen {seqlen}"
+
+    # Calculate max_pos_val for sin/cos table
+    spatial_merge_size = 2
+    t_len = t_dim
+    h_len = h_dim // d_patch // spatial_merge_size
+    w_len = w_dim // d_patch // spatial_merge_size
+    img_start_pos = pre_text_len
+    t_max_pos = img_start_pos + t_len - 1
+    h_max_pos = img_start_pos + h_len - 1
+    w_max_pos = img_start_pos + w_len - 1
+    image_max_pos = max(t_max_pos, h_max_pos, w_max_pos)
+    text_start_pos = image_max_pos + 1
+    max_pos_val = text_start_pos + post_text_len
+
+    # Generate sin/cos tables
+    sin_table, cos_table = sin_cos_table_3d(
+        max_pos_val, dhead, device, 10000.0, dtype)
+
+    # rope_section represents the accumulated dimensions for t, h, w
+    # For example, if dhead=32, we might split as [10, 11, 11] -> accumulated [10, 21, 32]
+    dhead_div_2 = dhead // 2
+    section_t = dhead_div_2 // 3
+    section_h = dhead_div_2 // 3
+    section_w = dhead_div_2 - section_t - section_h  # remainder
+    rope_section = TestTensor.from_torch(
+        torch.tensor([section_t, section_t + section_h,
+                     dhead_div_2], dtype=torch.int32),
+        InfiniDtype.I32,
+        device
+    )
+
+    # Compute reference result using Qwen2-VL style
+    multimodal_rotary_embedding_3d_reference(
+        y.torch_tensor(),
+        x.torch_tensor(),
+        pos_ids.torch_tensor(),
+        sin_table.torch_tensor(),
+        cos_table.torch_tensor(),
+        rope_section.torch_tensor(),
+        device,
+    )
+
+    # Create descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateMRoPE3DDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            pos_ids.descriptor,
+            sin_table.descriptor,
+            cos_table.descriptor,
+            rope_section.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [y, x, pos_ids, sin_table, cos_table, rope_section]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetMRoPE3DWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_mrope3d():
+        check_error(
+            LIBINFINIOP.infiniopMRoPE3D(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                pos_ids.data(),
+                sin_table.data(),
+                cos_table.data(),
+                rope_section.data(),
+                None,
+            )
+        )
+
+    lib_mrope3d()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        print(f"Expected shape: {y.torch_tensor().shape}")
+        print(f"Actual shape: {y.actual_tensor().shape}")
+        print(f"pos_ids shape: {pos_ids.torch_tensor().shape}")
+        print(f"sin_table shape: {sin_table.torch_tensor().shape}")
+        print(f"rope_section shape: {rope_section.torch_tensor().shape}")
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    success = torch.allclose(
+        y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+    if success:
+        print("✅ Test PASSED!")
+    else:
+        print("❌ Test FAILED!")
+        if not DEBUG:
+            print("Run with --debug to see detailed comparison")
+            # Show a brief comparison
+            diff = torch.abs(y.actual_tensor() - y.torch_tensor())
+            print(f"Max absolute difference: {diff.max().item():.6f}")
+            print(f"Mean absolute difference: {diff.mean().item():.6f}")
+
+    assert success
+
+    check_error(LIBINFINIOP.infiniopDestroyMRoPE3DDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    print(f"3D MRoPE Test Configuration (Qwen2-VL style):")
+    print(
+        f"  Video/Image size: {T_DIM}x{H_IMG}x{W_IMG}, Patch size: {D_PATCH}")
+    print(
+        f"  Pre-text length: {PRE_TEXT_LEN}, Post-text length: {POST_TEXT_LEN}")
+    print(f"  Calculated sequence length: {ACTUAL_SEQ_LEN}")
+    print(f"  Tensor shape format: [nhead, seqlen, dhead]")
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92m3D MRoPE Test (Qwen2-VL style) passed!\033[0m")
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 283bdb1cd..fa8044629 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -458,6 +458,82 @@ def rope_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+@OpRegister.operator
+def mrope2d_(lib):
+    lib.infiniopCreateMRoPE2DDescriptor.restype = c_int32
+    lib.infiniopCreateMRoPE2DDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetMRoPE2DWorkspaceSize.restype = c_int32
+    lib.infiniopGetMRoPE2DWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopMRoPE2D.restype = c_int32
+    lib.infiniopMRoPE2D.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyMRoPE2DDescriptor.restype = c_int32
+    lib.infiniopDestroyMRoPE2DDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
+@OpRegister.operator
+def mrope3d_(lib):
+    lib.infiniopCreateMRoPE3DDescriptor.restype = c_int32
+    lib.infiniopCreateMRoPE3DDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetMRoPE3DWorkspaceSize.restype = c_int32
+    lib.infiniopGetMRoPE3DWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopMRoPE3D.restype = c_int32
+    lib.infiniopMRoPE3D.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyMRoPE3DDescriptor.restype = c_int32
+    lib.infiniopDestroyMRoPE3DDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
+
 
 @OpRegister.operator
 def sub_(lib):