From 3376bcdb4d81416ecce97fe5628b6adf112ebed2 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Mon, 5 Jan 2026 15:28:13 +0000 Subject: [PATCH 1/3] Issue/887 - Add pow,div,mod,min,max operator with CPU and NVIDIA implementations. --- include/infiniop.h | 3 + include/infiniop/ops/div.h | 26 +++ include/infiniop/ops/max.h | 26 +++ include/infiniop/ops/min.h | 26 +++ include/infiniop/ops/mod.h | 26 +++ include/infiniop/ops/pow.h | 26 +++ src/infiniop/ops/div/cpu/div_cpu.cc | 50 +++++ src/infiniop/ops/div/cpu/div_cpu.h | 19 ++ src/infiniop/ops/div/cuda/kernel.cuh | 23 +++ src/infiniop/ops/div/nvidia/div_nvidia.cu | 57 ++++++ src/infiniop/ops/div/nvidia/div_nvidia.cuh | 8 + src/infiniop/ops/div/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/max/cpu/max_cpu.cc | 50 +++++ src/infiniop/ops/max/cpu/max_cpu.h | 20 ++ src/infiniop/ops/max/cuda/kernel.cuh | 23 +++ src/infiniop/ops/max/nvidia/max_nvidia.cu | 57 ++++++ src/infiniop/ops/max/nvidia/max_nvidia.cuh | 8 + src/infiniop/ops/max/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/min/cpu/min_cpu.cc | 50 +++++ src/infiniop/ops/min/cpu/min_cpu.h | 20 ++ src/infiniop/ops/min/cuda/kernel.cuh | 23 +++ src/infiniop/ops/min/nvidia/min_nvidia.cu | 57 ++++++ src/infiniop/ops/min/nvidia/min_nvidia.cuh | 8 + src/infiniop/ops/min/operator.cc | 202 +++++++++++++++++++++ src/infiniop/ops/mod/cpu/mod_cpu.cc | 49 +++++ src/infiniop/ops/mod/cpu/mod_cpu.h | 23 +++ src/infiniop/ops/mod/cuda/kernel.cuh | 30 +++ src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 57 ++++++ src/infiniop/ops/mod/nvidia/mod_nvidia.cuh | 8 + src/infiniop/ops/mod/operator.cc | 142 +++++++++++++++ src/infiniop/ops/pow/cpu/pow_cpu.cc | 49 +++++ src/infiniop/ops/pow/cpu/pow_cpu.h | 19 ++ src/infiniop/ops/pow/cuda/kernel.cuh | 40 ++++ src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 57 ++++++ src/infiniop/ops/pow/nvidia/pow_nvidia.cuh | 8 + src/infiniop/ops/pow/operator.cc | 142 +++++++++++++++ test/infiniop/div.py | 192 ++++++++++++++++++++ test/infiniop/libinfiniop/op_register.py | 170 +++++++++++++++++ test/infiniop/max.py | 189 +++++++++++++++++++ test/infiniop/min.py | 189 +++++++++++++++++++ test/infiniop/mod.py | 190 +++++++++++++++++++ test/infiniop/pow.py | 190 +++++++++++++++++++ 42 files changed, 2956 insertions(+) create mode 100644 include/infiniop/ops/div.h create mode 100644 include/infiniop/ops/max.h create mode 100644 include/infiniop/ops/min.h create mode 100644 include/infiniop/ops/mod.h create mode 100644 include/infiniop/ops/pow.h create mode 100644 src/infiniop/ops/div/cpu/div_cpu.cc create mode 100644 src/infiniop/ops/div/cpu/div_cpu.h create mode 100644 src/infiniop/ops/div/cuda/kernel.cuh create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cu create mode 100644 src/infiniop/ops/div/nvidia/div_nvidia.cuh create mode 100644 src/infiniop/ops/div/operator.cc create mode 100644 src/infiniop/ops/max/cpu/max_cpu.cc create mode 100644 src/infiniop/ops/max/cpu/max_cpu.h create mode 100644 src/infiniop/ops/max/cuda/kernel.cuh create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cu create mode 100644 src/infiniop/ops/max/nvidia/max_nvidia.cuh create mode 100644 src/infiniop/ops/max/operator.cc create mode 100644 src/infiniop/ops/min/cpu/min_cpu.cc create mode 100644 src/infiniop/ops/min/cpu/min_cpu.h create mode 100644 src/infiniop/ops/min/cuda/kernel.cuh create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cu create mode 100644 src/infiniop/ops/min/nvidia/min_nvidia.cuh create mode 100644 src/infiniop/ops/min/operator.cc create mode 100644 
src/infiniop/ops/mod/cpu/mod_cpu.cc create mode 100644 src/infiniop/ops/mod/cpu/mod_cpu.h create mode 100644 src/infiniop/ops/mod/cuda/kernel.cuh create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cu create mode 100644 src/infiniop/ops/mod/nvidia/mod_nvidia.cuh create mode 100644 src/infiniop/ops/mod/operator.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.cc create mode 100644 src/infiniop/ops/pow/cpu/pow_cpu.h create mode 100644 src/infiniop/ops/pow/cuda/kernel.cuh create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cu create mode 100644 src/infiniop/ops/pow/nvidia/pow_nvidia.cuh create mode 100644 src/infiniop/ops/pow/operator.cc create mode 100644 test/infiniop/div.py create mode 100644 test/infiniop/max.py create mode 100644 test/infiniop/min.py create mode 100644 test/infiniop/mod.py create mode 100644 test/infiniop/pow.py diff --git a/include/infiniop.h b/include/infiniop.h index c0a09fcb4..cf1688868 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -9,11 +9,14 @@ #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" #include "infiniop/ops/dequantize_awq.h" +#include "infiniop/ops/div.h" #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" +#include "infiniop/ops/max.h" +#include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h new file mode 100644 index 000000000..e539b440c --- /dev/null +++ b/include/infiniop/ops/div.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h new file mode 100644 index 000000000..e6f2f5d4c --- /dev/null +++ b/include/infiniop/ops/max.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MAX_API_H__ +#define __INFINIOP_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h new file mode 100644 index 000000000..f72f0c4db --- /dev/null +++ b/include/infiniop/ops/min.h @@ -0,0 
+1,26 @@ +#ifndef __INFINIOP_MIN_API_H__ +#define __INFINIOP_MIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h new file mode 100644 index 000000000..5a6cd5bbf --- /dev/null +++ b/include/infiniop/ops/mod.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_MOD_API_H__ +#define __INFINIOP_MOD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopModDescriptor_t; + +__C __export infiniStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h new file mode 100644 index 000000000..6449d8622 --- /dev/null +++ b/include/infiniop/ops/pow.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_POW_API_H__ +#define __INFINIOP_POW_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; + +__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif
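All five public headers follow the same create / get-workspace / run / destroy lifecycle as the existing binary operators such as mul. A minimal host-side sketch of that lifecycle for div, assuming the infiniop handle and F32 tensor descriptors were created elsewhere; CHECK() is a hypothetical status-checking helper, not part of this patch:

    // Sketch only: handle, c_desc/a_desc/b_desc, c/a/b buffers and stream
    // are assumed to exist; CHECK() is illustrative.
    infiniopDivDescriptor_t div_desc;
    CHECK(infiniopCreateDivDescriptor(handle, &div_desc, c_desc, a_desc, b_desc));

    size_t workspace_size = 0;
    CHECK(infiniopGetDivWorkspaceSize(div_desc, &workspace_size));
    void *workspace = /* allocate workspace_size bytes on the target device */ nullptr;

    // c = a / b, element by element; stream may be NULL for the CPU backend.
    CHECK(infiniopDiv(div_desc, workspace, workspace_size, c, a, b, stream));
    CHECK(infiniopDestroyDivDescriptor(div_desc));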
diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..19e222031 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,50 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<DivOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..a67993da5 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + return __h2div(a, b); + } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) { + return a / b; + } else if constexpr (std::is_same_v<T, float>) { + return __fdividef(a, b); + } else { + return a / b; + } + } +} DivOp; +} // namespace op::div::cuda + +#endif // __DIV_CUDA_H__
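Each backend hands these functors to the shared elementwise framework, which relies only on the num_inputs constant and the templated call operator. A self-contained host-side illustration of that contract, independent of the framework; apply_binary is an illustrative stand-in for the real strided iteration, not code from this patch:

    #include <cstddef>
    #include <vector>

    struct DivOp {
        static constexpr size_t num_inputs = 2;
        template <typename T>
        T operator()(const T &a, const T &b) const { return a / b; }
    };

    // Stand-in for the framework's contiguous fast path: c[i] = op(a[i], b[i]).
    template <typename Op, typename T>
    void apply_binary(const Op &op, T *c, const T *a, const T *b, size_t n) {
        static_assert(Op::num_inputs == 2, "binary functor expected");
        for (size_t i = 0; i < n; ++i) {
            c[i] = op(a[i], b[i]);
        }
    }

    int main() {
        std::vector<float> a{1.f, 2.f, 3.f}, b{2.f, 4.f, 8.f}, c(3);
        apply_binary(DivOp{}, c.data(), a.data(), b.data(), a.size());
        // c == {0.5f, 0.5f, 0.375f}
    }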
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu new file mode 100644 index 000000000..1abffe816 --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "div_nvidia.cuh" + +namespace op::div::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..84021a1af --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/div_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/div_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/div_moore.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::div::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::div::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::div::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::div::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
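For one enabled backend the dispatch macro expands to an ordinary switch case; for example, CREATE(INFINI_DEVICE_NVIDIA, nvidia) inside infiniopCreateDivDescriptor is equivalent to:

    case INFINI_DEVICE_NVIDIA:
        return op::div::nvidia::Descriptor::create(
            handle,
            reinterpret_cast<op::div::nvidia::Descriptor **>(desc_ptr),
            c_desc,
            {a_desc, b_desc});

The max, min, mod and pow operator.cc files below repeat this pattern verbatim with their own namespaces.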
diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc new file mode 100644 index 000000000..1b30fa4e4 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -0,0 +1,50 @@ +#include "max_cpu.h" + +namespace op::max::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<MaxOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<MaxOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::cpu diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h new file mode 100644 index 000000000..4d085ed39 --- /dev/null +++ b/src/infiniop/ops/max/cpu/max_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MAX_CPU_H__ +#define __MAX_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <algorithm> + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__
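One behavioral detail worth noting for max and min: std::max(a, b) returns b only when a < b, and the CUDA kernel below uses the analogous a > b ? a : b, so which operand a NaN reaches determines whether it propagates (all comparisons with NaN are false). fmaxf, by contrast, is defined to prefer the non-NaN operand. A small self-contained demonstration:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        float nan = std::nanf("");
        // Comparison-based selection: the result depends on operand order.
        std::printf("std::max(NaN, 1) = %f\n", std::max(nan, 1.0f)); // nan
        std::printf("std::max(1, NaN) = %f\n", std::max(1.0f, nan)); // 1.0
        // IEEE fmax returns the non-NaN operand regardless of order.
        std::printf("fmaxf(NaN, 1)    = %f\n", fmaxf(nan, 1.0f));    // 1.0
    }

Since these kernels take the comparison-based path for F16 (and fmaxf/fminf only for scalar F32), NaN handling is not identical across dtypes; the tests sidestep this with finite random inputs.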
"../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(max, cpu) + +namespace op::max::cpu { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return std::max(a, b); + } +} MaxOp; +} // namespace op::max::cpu + +#endif // __MAX_CPU_H__ diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh new file mode 100644 index 000000000..bf3977a31 --- /dev/null +++ b/src/infiniop/ops/max/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MAX_CUDA_H__ +#define __MAX_CUDA_H__ + +namespace op::max::cuda { +typedef struct MaxOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __hmax2(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return a > b ? a : b; + } else if constexpr (std::is_same_v) { + return fmaxf(a, b); + } else { + return a > b ? a : b; + } + } +} MaxOp; +} // namespace op::max::cuda + +#endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu new file mode 100644 index 000000000..5e9fb13f4 --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "max_nvidia.cuh" + +namespace op::max::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cuh b/src/infiniop/ops/max/nvidia/max_nvidia.cuh new file mode 100644 index 000000000..b3b60dd2a --- /dev/null +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_CUDA_API_H__ +#define __MAX_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(max, nvidia) + +#endif // __MAX_CUDA_API_H__ diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc new file mode 100644 index 000000000..e04368533 --- /dev/null +++ b/src/infiniop/ops/max/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" 
+#include "../../handle.h" +#include "infiniop/ops/max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/max_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/max_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/max_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/max_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMaxDescriptor( + infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMax( + infiniopMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::max::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc new file mode 100644 index 000000000..dc30ee57f --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -0,0 +1,50 @@ +#include "min_cpu.h" + +namespace op::min::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<MinOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<MinOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h new file mode 100644 index 000000000..1c84d4fca --- /dev/null +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -0,0 +1,20 @@ +#ifndef __MIN_CPU_H__ +#define __MIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include <algorithm> + +ELEMENTWISE_DESCRIPTOR(min, cpu) + +namespace op::min::cpu { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::min(a, b); + } +} MinOp; +} // namespace op::min::cpu + +#endif // __MIN_CPU_H__
diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh new file mode 100644 index 000000000..aac14a0e8 --- /dev/null +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __MIN_CUDA_H__ +#define __MIN_CUDA_H__ + +namespace op::min::cuda { +typedef struct MinOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + return __hmin2(a, b); + } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) { + return a < b ? a : b; + } else if constexpr (std::is_same_v<T, float>) { + return fminf(a, b); + } else { + return a < b ? a : b; + } + } +} MinOp; +} // namespace op::min::cuda + +#endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu new file mode 100644 index 000000000..419655e29 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "min_nvidia.cuh" + +namespace op::min::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cuh b/src/infiniop/ops/min/nvidia/min_nvidia.cuh new file mode 100644 index 000000000..ada9a3545 --- /dev/null +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MIN_CUDA_API_H__ +#define __MIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(min, nvidia) + +#endif // __MIN_CUDA_API_H__
diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc new file mode 100644 index 000000000..8479feab4 --- /dev/null +++ b/src/infiniop/ops/min/operator.cc @@ -0,0 +1,202 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/min.h" + +#ifdef ENABLE_CPU_API +#include "cpu/min_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/min_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/min_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/min_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/min_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/min_moore.h" +#endif + +__C infiniStatus_t infiniopCreateMinDescriptor( + infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::min::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::min::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::min::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopMin( + infiniopMinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::min::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::min::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc new file mode 100644 index 000000000..907d05166 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -0,0 +1,49 @@ +#include "mod_cpu.h" + +namespace op::mod::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<ModOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<ModOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h new file mode 100644 index 000000000..9e78adca6 --- /dev/null +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -0,0 +1,23 @@ +#ifndef __MOD_CPU_H__ +#define __MOD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(mod, cpu) + +namespace op::mod::cpu { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + if constexpr (std::is_floating_point_v<T>) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cpu + +#endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh new file mode 100644 index 000000000..0dcb54136 --- /dev/null +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -0,0 +1,30 @@ +#ifndef __MOD_CUDA_H__ +#define __MOD_CUDA_H__ + +#include <cuda_fp16.h> +#include <cmath> + +namespace op::mod::cuda { +typedef struct ModOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v<T, half>) { + float a_ = __half2float(a); + float b_ = __half2float(b); + return __float2half(std::fmod(a_, b_)); + } else if constexpr (std::is_floating_point_v<T>) { + return std::fmod(a, b); + } else { + return a % b; + } + } +} ModOp; +} // namespace op::mod::cuda + +#endif // __MOD_CUDA_H__
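Both mod backends implement C-style fmod semantics for floating-point input: the result takes the sign of the dividend (truncated division). That matches torch.fmod, but differs from Python's % operator and torch.remainder, which use floored division. A quick self-contained check of the distinction:

    #include <cmath>
    #include <cstdio>

    int main() {
        // Truncated remainder (fmod): the sign follows the dividend.
        std::printf("fmod(-7, 3) = %f\n", std::fmod(-7.0f, 3.0f)); // -1.0
        std::printf("fmod(7, -3) = %f\n", std::fmod(7.0f, -3.0f)); //  1.0
        // A floored remainder (Python %, torch.remainder) would give 2 and -2.
    }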
"../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "mod_nvidia.cuh" + +namespace op::mod::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh new file mode 100644 index 000000000..31788cfd2 --- /dev/null +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MOD_CUDA_API_H__ +#define __MOD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(mod, nvidia) + +#endif // __MOD_CUDA_API_H__ diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc new file mode 100644 index 000000000..85810e794 --- /dev/null +++ b/src/infiniop/ops/mod/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/mod.h" + +#ifdef ENABLE_CPU_API +#include "cpu/mod_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/mod_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateModDescriptor( + infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::mod::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + 
diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc new file mode 100644 index 000000000..0c6fda0f7 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -0,0 +1,49 @@ +#include "pow_cpu.h" + +namespace op::pow::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &out_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<PowOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<PowOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h new file mode 100644 index 000000000..21d9bb897 --- /dev/null +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -0,0 +1,19 @@ +#ifndef __POW_CPU_H__ +#define __POW_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +
+ELEMENTWISE_DESCRIPTOR(pow, cpu) + +namespace op::pow::cpu { +typedef struct PowOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + return std::pow(a, b); + } +} PowOp; +} // namespace op::pow::cpu + +#endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh new file mode 100644 index 000000000..e8b5324a0 --- /dev/null +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -0,0 +1,40 @@ +#ifndef __POW_CUDA_H__ +#define __POW_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> +#include <cmath> + +namespace op::pow::cuda { +typedef struct PowOp { + static constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v<T, half2>) { + float2 a_f2 = __half22float2(a); + float2 b_f2 = __half22float2(b); + return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); + } else if constexpr (std::is_same_v<T, half>) { + float a_ = __half2float(a); + float b_ = __half2float(b); + float ans_f = __powf(a_, b_); + return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f); + } else if constexpr (std::is_same_v<T, cuda_bfloat162>) { + float2 a_f2 = __bfloat1622float2(a); + float2 b_f2 = __bfloat1622float2(b); + return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + float a_ = __bfloat162float(a); + float b_ = __bfloat162float(b); + return __float2bfloat16_rn(__powf(a_, b_)); + } else if constexpr (std::is_same_v<T, float>) { + return __powf(a, b); + } else { + return std::pow(a, b); + } + } +} PowOp; + +} // namespace op::pow::cuda + +#endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu new file mode 100644 index 000000000..3cfd0cd2f --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -0,0 +1,57 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "pow_nvidia.cuh" + +namespace op::pow::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::pow::nvidia
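The half path in the kernel above falls back to std::pow whenever __powf returns NaN. __powf is CUDA's fast-math intrinsic, documented as __exp2f(b * __log2f(a)), so it yields NaN for negative bases even when the exact result is well defined, e.g. (-2)^2. A host-side analogue of that guard, using exp2/log2 in place of the device-only intrinsic purely for illustration:

    #include <cmath>
    #include <cstdio>

    // Mirrors the kernel's guard: try a fast approximate pow first, and fall
    // back to the exact library routine when it produces NaN.
    static float pow_with_fallback(float a, float b) {
        float fast = std::exp2(b * std::log2(a)); // NaN whenever a < 0
        return std::isnan(fast) ? std::pow(a, b) : fast;
    }

    int main() {
        std::printf("%f\n", pow_with_fallback(2.0f, 3.0f));  // 8.0 via the fast path
        std::printf("%f\n", pow_with_fallback(-2.0f, 2.0f)); // 4.0 via the std::pow fallback
    }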
diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh new file mode 100644 index 000000000..5bbb2fb8c --- /dev/null +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __POW_CUDA_API_H__ +#define __POW_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(pow, nvidia) + +#endif // __POW_CUDA_API_H__ diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc new file mode 100644 index 000000000..e90639f67 --- /dev/null +++ b/src/infiniop/ops/pow/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/pow.h" + +#ifdef ENABLE_CPU_API +#include "cpu/pow_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/pow_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreatePowDescriptor( + infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::pow::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::pow::NAMESPACE::Descriptor **>(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::pow::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopPow( + infiniopPowDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::pow::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::pow::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API +
DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..17b22b2e5 --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,192 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + # For division, ensure b doesn't contain zeros to avoid division by zero + # Similar to old test: b = torch.rand(...) 
* 2, which gives range [0, 2) + # Use scale=2 to ensure values are in [0, 2) range, then add small bias to avoid zero + b = TestTensor(shape, b_stride, dtype, device, scale=2, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 618be2b05..a61cea018 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -269,6 +269,176 @@ def mul_(lib): ] +@OpRegister.operator +def pow_(lib): + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetPowWorkspaceSize.restype = c_int32 + lib.infiniopGetPowWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): 
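+    # Same create/workspace/run/destroy quartet as pow_ above; the argtypes +    # mirror the C prototypes declared in include/infiniop/ops/div.h.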
+ lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def mod_(lib): + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetModWorkspaceSize.restype = c_int32 + lib.infiniopGetModWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_(lib): + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def min_(lib): + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetMinWorkspaceSize.restype = c_int32 + lib.infiniopGetMinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def random_sample_(lib): lib.infiniopCreateRandomSampleDescriptor.restype = c_int32 diff --git a/test/infiniop/max.py b/test/infiniop/max.py new file mode 100644 index 000000000..e4221cf3e --- /dev/null +++ b/test/infiniop/max.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + 
InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def max(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.maximum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Max on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + 
LIBINFINIOP.infiniopGetMaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_max(): + check_error( + LIBINFINIOP.infiniopMax( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_max() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: max(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_max(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/min.py b/test/infiniop/min.py new file mode 100644 index 000000000..19f52a334 --- /dev/null +++ b/test/infiniop/min.py @@ -0,0 +1,189 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: F32 tolerance is relaxed compared to theoretical precision due to: +# - Old operators library uses vectorized operations (pack_size=4) with vecN +# - InfiniCore uses elementwise operations, which can cause 1 ULP differences +# - This is acceptable 
as it's within floating-point precision limits +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, # Relaxed from 1e-7 to accommodate vectorization differences +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def min(c, a, b): + # Only support F16 and F32 (matching old operators library) + torch.minimum(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Min on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_min(): + check_error( + LIBINFINIOP.infiniopMin( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_min() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: min(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_min(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyMinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/mod.py b/test/infiniop/mod.py new file mode 100644 index 000000000..298f3137f --- /dev/null +++ b/test/infiniop/mod.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((13, 16, 2), (128, 4, 1), (0, 2, 1), (64, 4, 1)), + ((13, 16, 2), (128, 4, 1), (2, 0, 1), (64, 4, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: mod operation uses fmod for floating point, which should be exact +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def mod_op(c, a, b): + torch.fmod(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + # Generate test tensors with values in a reasonable range for mod operation + # Use scale=10 to get values in [0, 10) range, similar to old test + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=10.0) + # Ensure b doesn't contain zeros to avoid division by zero in mod + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=10.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Mod on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetModWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_mod(): + check_error( + 
LIBINFINIOP.infiniopMod( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_mod() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: mod_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_mod(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyModDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/pow.py b/test/infiniop/pow.py new file mode 100644 index 000000000..f437c4229 --- /dev/null +++ b/test/infiniop/pow.py @@ -0,0 +1,190 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +# Note: Only F16 and F32 are supported, matching the old repository's binary operator +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Note: pow operation may have larger numerical errors, especially for F16 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def pow_op(c, a, b): + torch.pow(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + 
# Generate test tensors with values in a reasonable range for pow operation + # Avoid negative bases and very large exponents to prevent numerical issues + a = TestTensor(shape, a_stride, dtype, device, mode="random", scale=5.0, bias=0.1) + b = TestTensor(shape, b_stride, dtype, device, mode="random", scale=3.0, bias=0.1) + + if inplace == Inplace.INPLACE_A: + if c_stride is not None and c_stride != a_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride is not None and c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device) + + if c.is_broadcast(): + return + + print( + f"Testing Pow on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetPowWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_pow(): + check_error( + LIBINFINIOP.infiniopPow( + descriptor, + workspace.data(), + workspace_size.value, + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_pow() + + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + # Use equal_nan=True to handle NaN cases in pow operation + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: pow_op(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_pow(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyPowDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 547c2bc3a79aacc9c94222b9e26c967a4c2c6364 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Wed, 7 Jan 2026 02:34:11 +0000 Subject: [PATCH 2/3] Issue/887 - Add abs,acos,acosh,asin,asinh,atan,atanh,ceil,cos,cosh,erf,floor,log,neg,reciprocal,round,sign,sinh,sqrt,tan operator with CPU and NVIDIA implementations. 
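The twenty new unary operators reuse the same elementwise scaffolding as the binary operators earlier in this series: one C descriptor API per op, a CPU functor, a CUDA functor dispatched per device, a ctypes registration, and a PyTorch-backed test. For illustration only, a minimal sketch of driving one of the new entry points (Abs) through the existing test helpers; run_abs and its default arguments are hypothetical, while the library calls are the ones declared in include/infiniop/ops/abs.h and registered by this series' op_register.py changes:

    import ctypes
    from ctypes import c_uint64
    from libinfiniop import (
        LIBINFINIOP, TestTensor, TestWorkspace, check_error,
        infiniopOperatorDescriptor_t, InfiniDtype,
    )

    def run_abs(handle, device, shape=(13, 4), dtype=InfiniDtype.F16):
        # run_abs, shape, and dtype defaults are illustrative, not part of the patch.
        x = TestTensor(shape, None, dtype, device)
        y = TestTensor(shape, None, dtype, device)
        desc = infiniopOperatorDescriptor_t()
        # Same create -> workspace query -> execute -> destroy flow as the binary tests.
        check_error(LIBINFINIOP.infiniopCreateAbsDescriptor(
            handle, ctypes.byref(desc), y.descriptor, x.descriptor))
        size = c_uint64(0)
        check_error(LIBINFINIOP.infiniopGetAbsWorkspaceSize(desc, ctypes.byref(size)))
        workspace = TestWorkspace(size.value, device)
        check_error(LIBINFINIOP.infiniopAbs(
            desc, workspace.data(), size.value, y.data(), x.data(), None))
        check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(desc))
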
--- include/infiniop.h | 20 + include/infiniop/ops/abs.h | 24 + include/infiniop/ops/acos.h | 24 + include/infiniop/ops/acosh.h | 24 + include/infiniop/ops/asin.h | 24 + include/infiniop/ops/asinh.h | 24 + include/infiniop/ops/atan.h | 24 + include/infiniop/ops/atanh.h | 24 + include/infiniop/ops/ceil.h | 24 + include/infiniop/ops/cos.h | 24 + include/infiniop/ops/cosh.h | 24 + include/infiniop/ops/erf.h | 24 + include/infiniop/ops/floor.h | 24 + include/infiniop/ops/log.h | 24 + include/infiniop/ops/neg.h | 24 + include/infiniop/ops/reciprocal.h | 24 + include/infiniop/ops/round.h | 24 + include/infiniop/ops/sign.h | 24 + include/infiniop/ops/sinh.h | 24 + include/infiniop/ops/sqrt.h | 24 + include/infiniop/ops/tan.h | 24 + src/infiniop/ops/abs/cpu/abs_cpu.cc | 48 ++ src/infiniop/ops/abs/cpu/abs_cpu.h | 26 + src/infiniop/ops/abs/cuda/kernel.cuh | 26 + src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 54 ++ src/infiniop/ops/abs/nvidia/abs_nvidia.cuh | 8 + src/infiniop/ops/abs/operator.cc | 139 +++++ src/infiniop/ops/acos/cpu/acos_cpu.cc | 48 ++ src/infiniop/ops/acos/cpu/acos_cpu.h | 22 + src/infiniop/ops/acos/cuda/kernel.cuh | 32 + src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 54 ++ src/infiniop/ops/acos/nvidia/acos_nvidia.cuh | 8 + src/infiniop/ops/acos/operator.cc | 139 +++++ src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 48 ++ src/infiniop/ops/acosh/cpu/acosh_cpu.h | 22 + src/infiniop/ops/acosh/cuda/kernel.cuh | 32 + src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 54 ++ .../ops/acosh/nvidia/acosh_nvidia.cuh | 8 + src/infiniop/ops/acosh/operator.cc | 139 +++++ src/infiniop/ops/asin/cpu/asin_cpu.cc | 48 ++ src/infiniop/ops/asin/cpu/asin_cpu.h | 22 + src/infiniop/ops/asin/cuda/kernel.cuh | 32 + src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 54 ++ src/infiniop/ops/asin/nvidia/asin_nvidia.cuh | 8 + src/infiniop/ops/asin/operator.cc | 139 +++++ src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 48 ++ src/infiniop/ops/asinh/cpu/asinh_cpu.h | 22 + src/infiniop/ops/asinh/cuda/kernel.cuh | 32 + src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 54 ++ .../ops/asinh/nvidia/asinh_nvidia.cuh | 8 + src/infiniop/ops/asinh/operator.cc | 139 +++++ src/infiniop/ops/atan/cpu/atan_cpu.cc | 48 ++ src/infiniop/ops/atan/cpu/atan_cpu.h | 22 + src/infiniop/ops/atan/cuda/kernel.cuh | 32 + src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 54 ++ src/infiniop/ops/atan/nvidia/atan_nvidia.cuh | 8 + src/infiniop/ops/atan/operator.cc | 139 +++++ src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 48 ++ src/infiniop/ops/atanh/cpu/atanh_cpu.h | 22 + src/infiniop/ops/atanh/cuda/kernel.cuh | 32 + src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 54 ++ .../ops/atanh/nvidia/atanh_nvidia.cuh | 8 + src/infiniop/ops/atanh/operator.cc | 139 +++++ src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 48 ++ src/infiniop/ops/ceil/cpu/ceil_cpu.h | 26 + src/infiniop/ops/ceil/cuda/kernel.cuh | 34 + src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 54 ++ src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh | 8 + src/infiniop/ops/ceil/operator.cc | 139 +++++ src/infiniop/ops/cos/cpu/cos_cpu.cc | 48 ++ src/infiniop/ops/cos/cpu/cos_cpu.h | 22 + src/infiniop/ops/cos/cuda/kernel.cuh | 32 + src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 54 ++ src/infiniop/ops/cos/nvidia/cos_nvidia.cuh | 8 + src/infiniop/ops/cos/operator.cc | 139 +++++ src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 48 ++ src/infiniop/ops/cosh/cpu/cosh_cpu.h | 22 + src/infiniop/ops/cosh/cuda/kernel.cuh | 32 + src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 54 ++ src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh | 8 + src/infiniop/ops/cosh/operator.cc | 139 
+++++ src/infiniop/ops/erf/cpu/erf_cpu.cc | 48 ++ src/infiniop/ops/erf/cpu/erf_cpu.h | 22 + src/infiniop/ops/erf/cuda/kernel.cuh | 32 + src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 54 ++ src/infiniop/ops/erf/nvidia/erf_nvidia.cuh | 8 + src/infiniop/ops/erf/operator.cc | 139 +++++ src/infiniop/ops/floor/cpu/floor_cpu.cc | 48 ++ src/infiniop/ops/floor/cpu/floor_cpu.h | 26 + src/infiniop/ops/floor/cuda/kernel.cuh | 34 + src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 54 ++ .../ops/floor/nvidia/floor_nvidia.cuh | 8 + src/infiniop/ops/floor/operator.cc | 139 +++++ src/infiniop/ops/log/cpu/log_cpu.cc | 48 ++ src/infiniop/ops/log/cpu/log_cpu.h | 22 + src/infiniop/ops/log/cuda/kernel.cuh | 32 + src/infiniop/ops/log/nvidia/log_nvidia.cu | 54 ++ src/infiniop/ops/log/nvidia/log_nvidia.cuh | 8 + src/infiniop/ops/log/operator.cc | 139 +++++ src/infiniop/ops/neg/cpu/neg_cpu.cc | 48 ++ src/infiniop/ops/neg/cpu/neg_cpu.h | 20 + src/infiniop/ops/neg/cuda/kernel.cuh | 23 + src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 54 ++ src/infiniop/ops/neg/nvidia/neg_nvidia.cuh | 8 + src/infiniop/ops/neg/operator.cc | 139 +++++ src/infiniop/ops/pow/cuda/kernel.cuh | 2 +- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 48 ++ .../ops/reciprocal/cpu/reciprocal_cpu.h | 20 + src/infiniop/ops/reciprocal/cuda/kernel.cuh | 32 + .../reciprocal/nvidia/reciprocal_nvidia.cu | 54 ++ .../reciprocal/nvidia/reciprocal_nvidia.cuh | 8 + src/infiniop/ops/reciprocal/operator.cc | 139 +++++ src/infiniop/ops/round/cpu/round_cpu.cc | 48 ++ src/infiniop/ops/round/cpu/round_cpu.h | 25 + src/infiniop/ops/round/cuda/kernel.cuh | 34 + src/infiniop/ops/round/nvidia/round_nvidia.cu | 54 ++ .../ops/round/nvidia/round_nvidia.cuh | 8 + src/infiniop/ops/round/operator.cc | 139 +++++ src/infiniop/ops/sign/cpu/sign_cpu.cc | 48 ++ src/infiniop/ops/sign/cpu/sign_cpu.h | 20 + src/infiniop/ops/sign/cuda/kernel.cuh | 25 + src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 54 ++ src/infiniop/ops/sign/nvidia/sign_nvidia.cuh | 8 + src/infiniop/ops/sign/operator.cc | 139 +++++ src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 48 ++ src/infiniop/ops/sinh/cpu/sinh_cpu.h | 22 + src/infiniop/ops/sinh/cuda/kernel.cuh | 32 + src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 54 ++ src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh | 8 + src/infiniop/ops/sinh/operator.cc | 139 +++++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 48 ++ src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 22 + src/infiniop/ops/sqrt/cuda/kernel.cuh | 32 + src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 54 ++ src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh | 8 + src/infiniop/ops/sqrt/operator.cc | 139 +++++ src/infiniop/ops/tan/cpu/tan_cpu.cc | 48 ++ src/infiniop/ops/tan/cpu/tan_cpu.h | 22 + src/infiniop/ops/tan/cuda/kernel.cuh | 55 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 54 ++ src/infiniop/ops/tan/nvidia/tan_nvidia.cuh | 8 + src/infiniop/ops/tan/operator.cc | 139 +++++ test/infiniop/abs.py | 164 +++++ test/infiniop/acos.py | 165 +++++ test/infiniop/acosh.py | 165 +++++ test/infiniop/asin.py | 165 +++++ test/infiniop/asinh.py | 165 +++++ test/infiniop/atan.py | 164 +++++ test/infiniop/atanh.py | 165 +++++ test/infiniop/ceil.py | 165 +++++ test/infiniop/cos.py | 166 +++++ test/infiniop/cosh.py | 165 +++++ test/infiniop/erf.py | 165 +++++ test/infiniop/floor.py | 165 +++++ test/infiniop/libinfiniop/op_register.py | 583 ++++++++++++++++++ test/infiniop/log.py | 166 +++++ test/infiniop/neg.py | 165 +++++ test/infiniop/reciprocal.py | 168 +++++ test/infiniop/round.py | 165 +++++ test/infiniop/sign.py | 166 +++++ test/infiniop/sinh.py | 166 +++++ 
test/infiniop/sqrt.py | 166 +++++ test/infiniop/tan.py | 167 +++++ 163 files changed, 10468 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/abs.h create mode 100644 include/infiniop/ops/acos.h create mode 100644 include/infiniop/ops/acosh.h create mode 100644 include/infiniop/ops/asin.h create mode 100644 include/infiniop/ops/asinh.h create mode 100644 include/infiniop/ops/atan.h create mode 100644 include/infiniop/ops/atanh.h create mode 100644 include/infiniop/ops/ceil.h create mode 100644 include/infiniop/ops/cos.h create mode 100644 include/infiniop/ops/cosh.h create mode 100644 include/infiniop/ops/erf.h create mode 100644 include/infiniop/ops/floor.h create mode 100644 include/infiniop/ops/log.h create mode 100644 include/infiniop/ops/neg.h create mode 100644 include/infiniop/ops/reciprocal.h create mode 100644 include/infiniop/ops/round.h create mode 100644 include/infiniop/ops/sign.h create mode 100644 include/infiniop/ops/sinh.h create mode 100644 include/infiniop/ops/sqrt.h create mode 100644 include/infiniop/ops/tan.h create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.cc create mode 100644 src/infiniop/ops/abs/cpu/abs_cpu.h create mode 100644 src/infiniop/ops/abs/cuda/kernel.cuh create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cu create mode 100644 src/infiniop/ops/abs/nvidia/abs_nvidia.cuh create mode 100644 src/infiniop/ops/abs/operator.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.cc create mode 100644 src/infiniop/ops/acos/cpu/acos_cpu.h create mode 100644 src/infiniop/ops/acos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cu create mode 100644 src/infiniop/ops/acos/nvidia/acos_nvidia.cuh create mode 100644 src/infiniop/ops/acos/operator.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.cc create mode 100644 src/infiniop/ops/acosh/cpu/acosh_cpu.h create mode 100644 src/infiniop/ops/acosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu create mode 100644 src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh create mode 100644 src/infiniop/ops/acosh/operator.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.cc create mode 100644 src/infiniop/ops/asin/cpu/asin_cpu.h create mode 100644 src/infiniop/ops/asin/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cu create mode 100644 src/infiniop/ops/asin/nvidia/asin_nvidia.cuh create mode 100644 src/infiniop/ops/asin/operator.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.cc create mode 100644 src/infiniop/ops/asinh/cpu/asinh_cpu.h create mode 100644 src/infiniop/ops/asinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu create mode 100644 src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh create mode 100644 src/infiniop/ops/asinh/operator.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.cc create mode 100644 src/infiniop/ops/atan/cpu/atan_cpu.h create mode 100644 src/infiniop/ops/atan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cu create mode 100644 src/infiniop/ops/atan/nvidia/atan_nvidia.cuh create mode 100644 src/infiniop/ops/atan/operator.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.cc create mode 100644 src/infiniop/ops/atanh/cpu/atanh_cpu.h create mode 100644 src/infiniop/ops/atanh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu create mode 100644 src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh create mode 100644 src/infiniop/ops/atanh/operator.cc create mode 100644 
src/infiniop/ops/ceil/cpu/ceil_cpu.cc create mode 100644 src/infiniop/ops/ceil/cpu/ceil_cpu.h create mode 100644 src/infiniop/ops/ceil/cuda/kernel.cuh create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu create mode 100644 src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh create mode 100644 src/infiniop/ops/ceil/operator.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.cc create mode 100644 src/infiniop/ops/cos/cpu/cos_cpu.h create mode 100644 src/infiniop/ops/cos/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cu create mode 100644 src/infiniop/ops/cos/nvidia/cos_nvidia.cuh create mode 100644 src/infiniop/ops/cos/operator.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.cc create mode 100644 src/infiniop/ops/cosh/cpu/cosh_cpu.h create mode 100644 src/infiniop/ops/cosh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu create mode 100644 src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh create mode 100644 src/infiniop/ops/cosh/operator.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.cc create mode 100644 src/infiniop/ops/erf/cpu/erf_cpu.h create mode 100644 src/infiniop/ops/erf/cuda/kernel.cuh create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cu create mode 100644 src/infiniop/ops/erf/nvidia/erf_nvidia.cuh create mode 100644 src/infiniop/ops/erf/operator.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.cc create mode 100644 src/infiniop/ops/floor/cpu/floor_cpu.h create mode 100644 src/infiniop/ops/floor/cuda/kernel.cuh create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cu create mode 100644 src/infiniop/ops/floor/nvidia/floor_nvidia.cuh create mode 100644 src/infiniop/ops/floor/operator.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.cc create mode 100644 src/infiniop/ops/log/cpu/log_cpu.h create mode 100644 src/infiniop/ops/log/cuda/kernel.cuh create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cu create mode 100644 src/infiniop/ops/log/nvidia/log_nvidia.cuh create mode 100644 src/infiniop/ops/log/operator.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.cc create mode 100644 src/infiniop/ops/neg/cpu/neg_cpu.h create mode 100644 src/infiniop/ops/neg/cuda/kernel.cuh create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cu create mode 100644 src/infiniop/ops/neg/nvidia/neg_nvidia.cuh create mode 100644 src/infiniop/ops/neg/operator.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc create mode 100644 src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h create mode 100644 src/infiniop/ops/reciprocal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu create mode 100644 src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh create mode 100644 src/infiniop/ops/reciprocal/operator.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.cc create mode 100644 src/infiniop/ops/round/cpu/round_cpu.h create mode 100644 src/infiniop/ops/round/cuda/kernel.cuh create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cu create mode 100644 src/infiniop/ops/round/nvidia/round_nvidia.cuh create mode 100644 src/infiniop/ops/round/operator.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.cc create mode 100644 src/infiniop/ops/sign/cpu/sign_cpu.h create mode 100644 src/infiniop/ops/sign/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cu create mode 100644 src/infiniop/ops/sign/nvidia/sign_nvidia.cuh create mode 100644 src/infiniop/ops/sign/operator.cc create mode 100644 
src/infiniop/ops/sinh/cpu/sinh_cpu.cc create mode 100644 src/infiniop/ops/sinh/cpu/sinh_cpu.h create mode 100644 src/infiniop/ops/sinh/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu create mode 100644 src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh create mode 100644 src/infiniop/ops/sinh/operator.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc create mode 100644 src/infiniop/ops/sqrt/cpu/sqrt_cpu.h create mode 100644 src/infiniop/ops/sqrt/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu create mode 100644 src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh create mode 100644 src/infiniop/ops/sqrt/operator.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.cc create mode 100644 src/infiniop/ops/tan/cpu/tan_cpu.h create mode 100644 src/infiniop/ops/tan/cuda/kernel.cuh create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cu create mode 100644 src/infiniop/ops/tan/nvidia/tan_nvidia.cuh create mode 100644 src/infiniop/ops/tan/operator.cc create mode 100644 test/infiniop/abs.py create mode 100644 test/infiniop/acos.py create mode 100644 test/infiniop/acosh.py create mode 100644 test/infiniop/asin.py create mode 100644 test/infiniop/asinh.py create mode 100644 test/infiniop/atan.py create mode 100644 test/infiniop/atanh.py create mode 100644 test/infiniop/ceil.py create mode 100644 test/infiniop/cos.py create mode 100644 test/infiniop/cosh.py create mode 100644 test/infiniop/erf.py create mode 100644 test/infiniop/floor.py create mode 100644 test/infiniop/log.py create mode 100644 test/infiniop/neg.py create mode 100644 test/infiniop/reciprocal.py create mode 100644 test/infiniop/round.py create mode 100644 test/infiniop/sign.py create mode 100644 test/infiniop/sinh.py create mode 100644 test/infiniop/sqrt.py create mode 100644 test/infiniop/tan.py diff --git a/include/infiniop.h b/include/infiniop.h index cf1688868..4778fce90 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -2,9 +2,21 @@ #define __INFINIOP_API_H__ #include "infiniop/handle.h" +#include "infiniop/ops/abs.h" +#include "infiniop/ops/acos.h" +#include "infiniop/ops/acosh.h" #include "infiniop/ops/add.h" #include "infiniop/ops/add_rms_norm.h" +#include "infiniop/ops/asin.h" +#include "infiniop/ops/asinh.h" +#include "infiniop/ops/atan.h" +#include "infiniop/ops/atanh.h" #include "infiniop/ops/attention.h" +#include "infiniop/ops/ceil.h" +#include "infiniop/ops/cos.h" +#include "infiniop/ops/cosh.h" +#include "infiniop/ops/erf.h" +#include "infiniop/ops/floor.h" #include "infiniop/ops/causal_softmax.h" #include "infiniop/ops/clip.h" #include "infiniop/ops/conv.h" @@ -13,17 +25,24 @@ #include "infiniop/ops/gelu.h" #include "infiniop/ops/gemm.h" #include "infiniop/ops/layer_norm.h" +#include "infiniop/ops/log.h" #include "infiniop/ops/logsoftmax.h" #include "infiniop/ops/lp_norm.h" #include "infiniop/ops/max.h" #include "infiniop/ops/min.h" #include "infiniop/ops/mul.h" +#include "infiniop/ops/neg.h" #include "infiniop/ops/ones.h" #include "infiniop/ops/paged_attention.h" #include "infiniop/ops/paged_attention_prefill.h" #include "infiniop/ops/paged_caching.h" #include "infiniop/ops/random_sample.h" +#include "infiniop/ops/reciprocal.h" #include "infiniop/ops/rearrange.h" +#include "infiniop/ops/round.h" +#include "infiniop/ops/sign.h" +#include "infiniop/ops/sinh.h" +#include "infiniop/ops/sqrt.h" #include "infiniop/ops/relu.h" #include "infiniop/ops/rms_norm.h" #include "infiniop/ops/rope.h" @@ -33,6 +52,7 @@ #include "infiniop/ops/softplus.h" 
#include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" +#include "infiniop/ops/tan.h" #include "infiniop/ops/tanh.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/ops/topksoftmax.h" diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h new file mode 100644 index 000000000..7b5872657 --- /dev/null +++ b/include/infiniop/ops/abs.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ABS_API_H__ +#define __INFINIOP_ABS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h new file mode 100644 index 000000000..fe6af01ed --- /dev/null +++ b/include/infiniop/ops/acos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOS_API_H__ +#define __INFINIOP_ACOS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h new file mode 100644 index 000000000..be28918bb --- /dev/null +++ b/include/infiniop/ops/acosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ACOSH_API_H__ +#define __INFINIOP_ACOSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h new file mode 100644 index 000000000..2aac6d1e1 --- /dev/null +++ b/include/infiniop/ops/asin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASIN_API_H__ +#define __INFINIOP_ASIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t 
infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h new file mode 100644 index 000000000..d1385fc01 --- /dev/null +++ b/include/infiniop/ops/asinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ASINH_API_H__ +#define __INFINIOP_ASINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h new file mode 100644 index 000000000..3b1a5bde3 --- /dev/null +++ b/include/infiniop/ops/atan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATAN_API_H__ +#define __INFINIOP_ATAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanDescriptor(infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h new file mode 100644 index 000000000..800afd5d5 --- /dev/null +++ b/include/infiniop/ops/atanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ATANH_API_H__ +#define __INFINIOP_ATANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h new file mode 100644 index 000000000..4539d77fd --- /dev/null +++ b/include/infiniop/ops/ceil.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_CEIL_API_H__ +#define __INFINIOP_CEIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, + 
infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h new file mode 100644 index 000000000..8f0b6eeb7 --- /dev/null +++ b/include/infiniop/ops/cos.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COS_API_H__ +#define __INFINIOP_COS_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h new file mode 100644 index 000000000..3328151ad --- /dev/null +++ b/include/infiniop/ops/cosh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_COSH_API_H__ +#define __INFINIOP_COSH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; + +__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h new file mode 100644 index 000000000..8cbb8fb74 --- /dev/null +++ b/include/infiniop/ops/erf.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_ERF_API_H__ +#define __INFINIOP_ERF_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; + +__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h new file mode 100644 index 000000000..2f65f8f4a --- /dev/null +++ b/include/infiniop/ops/floor.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_FLOOR_API_H__ +#define __INFINIOP_FLOOR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; + +__C 
__export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h new file mode 100644 index 000000000..f5bec4382 --- /dev/null +++ b/include/infiniop/ops/log.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_LOG_API_H__ +#define __INFINIOP_LOG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h new file mode 100644 index 000000000..4d3b06e21 --- /dev/null +++ b/include/infiniop/ops/neg.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_NEG_API_H__ +#define __INFINIOP_NEG_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; + +__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h new file mode 100644 index 000000000..73836fea4 --- /dev/null +++ b/include/infiniop/ops/reciprocal.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_RECIPROCAL_API_H__ +#define __INFINIOP_RECIPROCAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h new file mode 100644 index 000000000..18c7fe44e --- /dev/null +++ b/include/infiniop/ops/round.h @@ -0,0 +1,24 @@ 
+#ifndef __INFINIOP_ROUND_API_H__ +#define __INFINIOP_ROUND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; + +__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h new file mode 100644 index 000000000..fe47c7190 --- /dev/null +++ b/include/infiniop/ops/sign.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGN_API_H__ +#define __INFINIOP_SIGN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, + infiniopSignDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h new file mode 100644 index 000000000..a5325fb81 --- /dev/null +++ b/include/infiniop/ops/sinh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SINH_API_H__ +#define __INFINIOP_SINH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h new file mode 100644 index 000000000..db04ec8bc --- /dev/null +++ b/include/infiniop/ops/sqrt.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SQRT_API_H__ +#define __INFINIOP_SQRT_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h 
new file mode 100644 index 000000000..69fc47bf1 --- /dev/null +++ b/include/infiniop/ops/tan.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TAN_API_H__ +#define __INFINIOP_TAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); + +#endif
diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc new file mode 100644 index 000000000..7d6e81d04 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -0,0 +1,48 @@ +#include "abs_cpu.h" + +namespace op::abs::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AbsOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AbsOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::cpu
diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h new file mode 100644 index 000000000..5b9773298 --- /dev/null +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -0,0 +1,26 @@ +#ifndef __ABS_CPU_H__ +#define __ABS_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(abs, cpu) + +namespace op::abs::cpu { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + if constexpr (std::is_floating_point_v<T>) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cpu + +#endif // __ABS_CPU_H__
diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh new file mode 100644 index 000000000..d7ff2db12 --- /dev/null +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __ABS_CUDA_H__ +#define __ABS_CUDA_H__ + +#include <cuda_fp16.h> +#include <type_traits> + +namespace op::abs::cuda { +typedef struct AbsOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __habs2(x); + } else if constexpr (std::is_same_v<T, half>) { + return __habs(x); + } else if constexpr (std::is_floating_point_v<T>) { + return std::fabs(x); + } else { + return std::abs(x); + } + } +} AbsOp; +} // namespace op::abs::cuda + +#endif // __ABS_CUDA_H__
diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu new file mode 100644 index 000000000..485f0406a --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "abs_nvidia.cuh" + +namespace op::abs::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::abs::nvidia
diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh new file mode 100644 index 000000000..db1751e26 --- /dev/null +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ABS_NVIDIA_API_H__ +#define __ABS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(abs, nvidia) + +#endif // __ABS_NVIDIA_API_H__
diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc new file mode 100644 index 000000000..b6820079d --- /dev/null +++ b/src/infiniop/ops/abs/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/abs.h" + +#ifdef ENABLE_CPU_API +#include "cpu/abs_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/abs_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAbsDescriptor( + infiniopHandle_t handle, + infiniopAbsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::abs::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::abs::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::abs::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAbs( + infiniopAbsDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::abs::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::abs::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +}
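A quick usage sketch of the new C API, taking abs as the example; every unary operator in this patch follows the same create/workspace/calculate/destroy sequence. The handle and tensor-descriptor creation calls below are assumptions based on the existing infiniop API surface and are not part of this patch; error checking is omitted for brevity.

    // Hypothetical host-side driver for the CPU backend (C++).
    #include <infiniop.h>
    #include <cstdlib>

    int run_abs(const float *x, float *y, size_t n) {
        infiniopHandle_t handle;
        infiniopCreateHandle(&handle);  // assumed creation entry point

        size_t shape[1] = {n};
        infiniopTensorDescriptor_t x_desc, y_desc;
        // assumed signature: (desc_ptr, ndim, shape, strides, dtype); NULL strides = contiguous
        infiniopCreateTensorDescriptor(&x_desc, 1, shape, NULL, INFINI_DTYPE_F32);
        infiniopCreateTensorDescriptor(&y_desc, 1, shape, NULL, INFINI_DTYPE_F32);

        infiniopAbsDescriptor_t op;
        infiniopCreateAbsDescriptor(handle, &op, y_desc, x_desc);

        size_t ws_size = 0;
        infiniopGetAbsWorkspaceSize(op, &ws_size);
        void *ws = ws_size ? std::malloc(ws_size) : nullptr;

        // NULL stream: synchronous CPU path
        infiniopAbs(op, ws, ws_size, y, x, nullptr);

        std::free(ws);
        infiniopDestroyAbsDescriptor(op);
        return 0;
    }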
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc new file mode 100644 index 000000000..1accb6752 --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -0,0 +1,48 @@ +#include "acos_cpu.h" + +namespace op::acos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AcosOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AcosOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::cpu
diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h new file mode 100644 index 000000000..14e74b75c --- /dev/null +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOS_CPU_H__ +#define __ACOS_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acos, cpu) + +namespace op::acos::cpu { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::acos(x); + } +} AcosOp; +} // namespace op::acos::cpu + +#endif // __ACOS_CPU_H__
diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh new file mode 100644 index 000000000..c3281c7e3 --- /dev/null +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOS_CUDA_H__ +#define __ACOS_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::acos::cuda { +typedef struct AcosOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acosf(x0), acosf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(acosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return acosf(x); + } else { + return std::acos(x); + } + } +} AcosOp; +} // namespace op::acos::cuda + +#endif // __ACOS_CUDA_H__
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu new file mode 100644 index 000000000..8480219bc --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acos_nvidia.cuh" + +namespace op::acos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acos::nvidia
diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh new file mode 100644 index 000000000..a7ac7e190 --- /dev/null +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOS_NVIDIA_API_H__ +#define __ACOS_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acos, nvidia) + +#endif // __ACOS_NVIDIA_API_H__
diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc new file mode 100644 index 000000000..e775a005a --- /dev/null +++ b/src/infiniop/ops/acos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcosDescriptor( + infiniopHandle_t handle, + infiniopAcosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::acos::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcos( + infiniopAcosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::acos::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::acos::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
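Since the operator.cc dispatch boilerplate repeats verbatim for every operator, it helps to see what one macro instance expands to. For the CPU case of acos, CREATE(INFINI_DEVICE_CPU, cpu) expands to:

    case INFINI_DEVICE_CPU:
        return op::acos::cpu::Descriptor::create(
            handle,
            reinterpret_cast<op::acos::cpu::Descriptor **>(desc_ptr),
            y_desc,
            {x_desc});

That is, the type-erased infiniopAcosDescriptor_t is simply a reinterpret_cast view of the backend-specific Descriptor, which is why the same opaque pointer can later be routed through GET, CALCULATE, and DELETE.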
diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc new file mode 100644 index 000000000..005463679 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -0,0 +1,48 @@ +#include "acosh_cpu.h" + +namespace op::acosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AcoshOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AcoshOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::cpu
diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h new file mode 100644 index 000000000..b4b710ed5 --- /dev/null +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ACOSH_CPU_H__ +#define __ACOSH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(acosh, cpu) + +namespace op::acosh::cpu { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::acosh(x); + } +} AcoshOp; +} // namespace op::acosh::cpu + +#endif // __ACOSH_CPU_H__
diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh new file mode 100644 index 000000000..fe444b1b4 --- /dev/null +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ACOSH_CUDA_H__ +#define __ACOSH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::acosh::cuda { +typedef struct AcoshOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return acoshf(x); + } else { + return std::acosh(x); + } + } +} AcoshOp; +} // namespace op::acosh::cuda + +#endif // __ACOSH_CUDA_H__
diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu new file mode 100644 index 000000000..fc06590a7 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "acosh_nvidia.cuh" + +namespace op::acosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AcoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::acosh::nvidia
diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh new file mode 100644 index 000000000..b13332431 --- /dev/null +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ACOSH_NVIDIA_API_H__ +#define __ACOSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(acosh, nvidia) + +#endif // __ACOSH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc new file mode 100644 index 000000000..9bba3389a --- /dev/null +++ b/src/infiniop/ops/acosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/acosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/acosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/acosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAcoshDescriptor( + infiniopHandle_t handle, + infiniopAcoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::acosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::acosh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::acosh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAcosh( + infiniopAcoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::acosh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::acosh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc new file mode 100644 index 000000000..e149044f1 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -0,0 +1,48 @@ +#include "asin_cpu.h" + +namespace op::asin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AsinOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AsinOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::cpu
diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h new file mode 100644 index 000000000..22bcba337 --- /dev/null +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASIN_CPU_H__ +#define __ASIN_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asin, cpu) + +namespace op::asin::cpu { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::asin(x); + } +} AsinOp; +} // namespace op::asin::cpu + +#endif // __ASIN_CPU_H__
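The CPU functors above (AcoshOp, AsinOp, ...) carry no state; the elementwise framework pulled in via elementwise_cpu.h is what walks the tensors and applies them. That framework is not part of this patch, but for a contiguous tensor its job conceptually reduces to the following sketch (hypothetical, simplified to a single dtype with no strides or broadcasting):

    #include <cstddef>

    // Op is any of the unary functor structs in this patch (Op::num_inputs == 1).
    template <typename Op, typename T>
    void apply_unary(T *out, const T *in, std::size_t n) {
        Op op;  // stateless functor
        for (std::size_t i = 0; i < n; ++i) {
            out[i] = op(in[i]);  // e.g. std::asin(in[i]) for AsinOp
        }
    }

The real implementation additionally handles strides/broadcasting and the fp16_t-to-float conversions selected by the calculate<Op, fp16_t>(...) call in each *_cpu.cc.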
diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh new file mode 100644 index 000000000..3e8d11a07 --- /dev/null +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASIN_CUDA_H__ +#define __ASIN_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::asin::cuda { +typedef struct AsinOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinf(x0), asinf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(asinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return asinf(x); + } else { + return std::asin(x); + } + } +} AsinOp; +} // namespace op::asin::cuda + +#endif // __ASIN_CUDA_H__
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu new file mode 100644 index 000000000..714d2b1b3 --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asin_nvidia.cuh" + +namespace op::asin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asin::nvidia
diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh new file mode 100644 index 000000000..46e168ede --- /dev/null +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASIN_NVIDIA_API_H__ +#define __ASIN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asin, nvidia) + +#endif // __ASIN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc new file mode 100644 index 000000000..c4973e9f5 --- /dev/null +++ b/src/infiniop/ops/asin/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asin_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinDescriptor( + infiniopHandle_t handle, + infiniopAsinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::asin::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::asin::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsin( + infiniopAsinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::asin::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::asin::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc new file mode 100644 index 000000000..e0d5b749a --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -0,0 +1,48 @@ +#include "asinh_cpu.h" + +namespace op::asinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AsinhOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AsinhOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::cpu
diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h new file mode 100644 index 000000000..0a999b63b --- /dev/null +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ASINH_CPU_H__ +#define __ASINH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(asinh, cpu) + +namespace op::asinh::cpu { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::asinh(x); + } +} AsinhOp; +} // namespace op::asinh::cpu + +#endif // __ASINH_CPU_H__
diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh new file mode 100644 index 000000000..7cb018c8a --- /dev/null +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ASINH_CUDA_H__ +#define __ASINH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::asinh::cuda { +typedef struct AsinhOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return asinhf(x); + } else { + return std::asinh(x); + } + } +} AsinhOp; +} // namespace op::asinh::cuda + +#endif // __ASINH_CUDA_H__
diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu new file mode 100644 index 000000000..203008b81 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "asinh_nvidia.cuh" + +namespace op::asinh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::asinh::nvidia
diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh new file mode 100644 index 000000000..d1dcb4287 --- /dev/null +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ASINH_NVIDIA_API_H__ +#define __ASINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(asinh, nvidia) + +#endif // __ASINH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc new file mode 100644 index 000000000..d9ff5beda --- /dev/null +++ b/src/infiniop/ops/asinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/asinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/asinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/asinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAsinhDescriptor( + infiniopHandle_t handle, + infiniopAsinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::asinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::asinh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAsinh( + infiniopAsinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::asinh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::asinh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
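On the GPU path the Descriptor rejects any call whose workspace is smaller than _workspace_size with INFINI_STATUS_INSUFFICIENT_WORKSPACE, so callers should query first. A minimal sketch for asinh, assuming device buffers d_x/d_y and a valid stream already exist (allocation via plain cudaMalloc here; any device allocator would do):

    size_t ws_size = 0;
    infiniopGetAsinhWorkspaceSize(desc, &ws_size);
    void *ws = nullptr;
    if (ws_size > 0) {
        cudaMalloc(&ws, ws_size);  // device-side scratch for the elementwise framework
    }
    infiniStatus_t st = infiniopAsinh(desc, ws, ws_size, d_y, d_x, stream);
    if (ws != nullptr) {
        cudaFree(ws);
    }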
diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc new file mode 100644 index 000000000..a8c613d1e --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -0,0 +1,48 @@ +#include "atan_cpu.h" + +namespace op::atan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AtanOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AtanOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::cpu
diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h new file mode 100644 index 000000000..ac2a1bc0c --- /dev/null +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATAN_CPU_H__ +#define __ATAN_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atan, cpu) + +namespace op::atan::cpu { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::atan(x); + } +} AtanOp; +} // namespace op::atan::cpu + +#endif // __ATAN_CPU_H__
diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh new file mode 100644 index 000000000..0c7745196 --- /dev/null +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATAN_CUDA_H__ +#define __ATAN_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::atan::cuda { +typedef struct AtanOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanf(x0), atanf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(atanf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return atanf(x); + } else { + return std::atan(x); + } + } +} AtanOp; +} // namespace op::atan::cuda + +#endif // __ATAN_CUDA_H__
diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu new file mode 100644 index 000000000..2c6cf53d4 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atan_nvidia.cuh" + +namespace op::atan::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atan::nvidia
diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh new file mode 100644 index 000000000..2aaee1ad9 --- /dev/null +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATAN_NVIDIA_API_H__ +#define __ATAN_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atan, nvidia) + +#endif // __ATAN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc new file mode 100644 index 000000000..c56e101d2 --- /dev/null +++ b/src/infiniop/ops/atan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanDescriptor( + infiniopHandle_t handle, + infiniopAtanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::atan::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::atan::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtan( + infiniopAtanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::atan::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::atan::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..66ef4b1df --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,48 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::cpu
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h new file mode 100644 index 000000000..8c2b04755 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ATANH_CPU_H__ +#define __ATANH_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(atanh, cpu) + +namespace op::atanh::cpu { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + return std::atanh(x); + } +} AtanhOp; +} // namespace op::atanh::cpu + +#endif // __ATANH_CPU_H__
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh new file mode 100644 index 000000000..5337d8243 --- /dev/null +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ATANH_CUDA_H__ +#define __ATANH_CUDA_H__ + +#include <cuda_bf16.h> +#include <cuda_fp16.h> + +namespace op::atanh::cuda { +typedef struct AtanhOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v<T, half>) { + return __float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return atanhf(x); + } else { + return std::atanh(x); + } + } +} AtanhOp; +} // namespace op::atanh::cuda + +#endif // __ATANH_CUDA_H__
diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..cb5a1ff03 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia
diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..da73cfa99 --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_NVIDIA_API_H__ +#define __ATANH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_NVIDIA_API_H__
diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..a73adcb23 --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::atanh::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast<op::atanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<const op::atanh::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::atanh::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE
+}
diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc new file mode 100644 index 000000000..17b3ec888 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -0,0 +1,48 @@ +#include "ceil_cpu.h" + +namespace op::ceil::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector<infiniopTensorDescriptor_t> input_desc_vec) { + + auto handle = reinterpret_cast<device::cpu::Handle *>(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector<const void *> inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<CeilOp, fp16_t>(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<CeilOp, float>(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::cpu
diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h new file mode 100644 index 000000000..c3ca8e441 --- /dev/null +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -0,0 +1,26 @@ +#ifndef __CEIL_CPU_H__ +#define __CEIL_CPU_H__ + +#include <cmath> + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(ceil, cpu) + +namespace op::ceil::cpu { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + + template <typename T> + T operator()(const T &x) const { + if constexpr (std::is_integral_v<T>) { + return x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cpu + +#endif // __CEIL_CPU_H__
diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh new file mode 100644 index 000000000..a2d2e7fb5 --- /dev/null +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __CEIL_CUDA_H__ +#define __CEIL_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include <cuda_bf16.h> + +namespace op::ceil::cuda { +typedef struct CeilOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v<T, half2>) { + return h2ceil(x); + } else if constexpr (std::is_same_v<T, half>) { + return hceil(x); + } else if constexpr (std::is_same_v<T, __nv_bfloat162>) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v<T, __nv_bfloat16>) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v<T, float>) { + return ceilf(x); + } else if constexpr (std::is_integral_v<T>) { + return
x; + } else { + return std::ceil(x); + } + } +} CeilOp; +} // namespace op::ceil::cuda + +#endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu new file mode 100644 index 000000000..c7ad2ee5b --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "ceil_nvidia.cuh" + +namespace op::ceil::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh new file mode 100644 index 000000000..9bada334d --- /dev/null +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CEIL_NVIDIA_API_H__ +#define __CEIL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(ceil, nvidia) + +#endif // __CEIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc new file mode 100644 index 000000000..4e5ee7800 --- /dev/null +++ b/src/infiniop/ops/ceil/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/ceil.h" + +#ifdef ENABLE_CPU_API +#include "cpu/ceil_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/ceil_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCeilDescriptor( + infiniopHandle_t handle, + infiniopCeilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::ceil::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCeil( + infiniopCeilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..9dc68d327 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,48 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..9b4236fc2 --- /dev/null +++ 
b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __COS_CPU_H__
+#define __COS_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(cos, cpu)
+
+namespace op::cos::cpu {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::cos(x);
+    }
+} CosOp;
+} // namespace op::cos::cpu
+
+#endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
new file mode 100644
index 000000000..b0dabb340
--- /dev/null
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __COS_CUDA_H__
+#define __COS_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::cos::cuda {
+typedef struct CosOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hcos(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __cosf(x);
+        } else {
+            return std::cos(x);
+        }
+    }
+} CosOp;
+} // namespace op::cos::cuda
+
+#endif // __COS_CUDA_H__
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
new file mode 100644
index 000000000..044c59ca0
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "cos_nvidia.cuh"
+
+namespace op::cos::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cos::nvidia
diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
new file mode 100644
index 000000000..a9866e4d2
--- /dev/null
+++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __COS_NVIDIA_API_H__
+#define __COS_NVIDIA_API_H__
+
+#include
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..5c464ad60 --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cos_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc new file mode 100644 index 000000000..9ed8e33da --- /dev/null +++ 
b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -0,0 +1,48 @@ +#include "cosh_cpu.h" + +namespace op::cosh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h new file mode 100644 index 000000000..aea359ef2 --- /dev/null +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -0,0 +1,22 @@ +#ifndef __COSH_CPU_H__ +#define __COSH_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cosh, cpu) + +namespace op::cosh::cpu { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::cosh(x); + } +} CoshOp; +} // namespace op::cosh::cpu + +#endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh new file mode 100644 index 000000000..ce6806433 --- /dev/null +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __COSH_CUDA_H__ +#define __COSH_CUDA_H__ + +#include +#include + +namespace op::cosh::cuda { +typedef struct CoshOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(coshf(x0), coshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(coshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } +} CoshOp; +} // namespace op::cosh::cuda + +#endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu new file mode 100644 index 000000000..a5e1442ce --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cosh_nvidia.cuh" + +namespace op::cosh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + 
infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh new file mode 100644 index 000000000..6a032b0bb --- /dev/null +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COSH_NVIDIA_API_H__ +#define __COSH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cosh, nvidia) + +#endif // __COSH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc new file mode 100644 index 000000000..75aac0c91 --- /dev/null +++ b/src/infiniop/ops/cosh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cosh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cosh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/cosh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateCoshDescriptor( + infiniopHandle_t handle, + infiniopCoshDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cosh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopCosh( + infiniopCoshDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc new file mode 100644 index 000000000..00b1897d1 --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -0,0 +1,48 @@ +#include "erf_cpu.h" + +namespace op::erf::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h new file mode 100644 index 000000000..c26f519cf --- /dev/null +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -0,0 +1,22 @@ +#ifndef __ERF_CPU_H__ +#define __ERF_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(erf, cpu) + +namespace op::erf::cpu { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::erf(x); + } +} ErfOp; +} // namespace op::erf::cpu + +#endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh new file mode 100644 index 000000000..820c10b19 --- /dev/null +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __ERF_CUDA_H__ +#define 
__ERF_CUDA_H__ + +#include +#include + +namespace op::erf::cuda { +typedef struct ErfOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(erff(x0), erff(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(erff(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } +} ErfOp; +} // namespace op::erf::cuda + +#endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu new file mode 100644 index 000000000..9080593de --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "erf_nvidia.cuh" + +namespace op::erf::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh new file mode 100644 index 000000000..0621150fa --- /dev/null +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ERF_NVIDIA_API_H__ +#define __ERF_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(erf, nvidia) + +#endif // __ERF_NVIDIA_API_H__ diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc new file mode 100644 index 000000000..1491cfa9a --- /dev/null +++ b/src/infiniop/ops/erf/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/erf.h" + +#ifdef ENABLE_CPU_API +#include "cpu/erf_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/erf_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateErfDescriptor( + 
infiniopHandle_t handle, + infiniopErfDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::erf::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopErf( + infiniopErfDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc b/src/infiniop/ops/floor/cpu/floor_cpu.cc new file mode 100644 index 000000000..e809a02e2 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -0,0 +1,48 @@ +#include "floor_cpu.h" + +namespace op::floor::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + 
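+    // Editorial note (hedged): CREATE_ELEMENTWISE_CPU_DESCRIPTOR is assumed
+    // to mirror the CUDA variant used by the nvidia backends in this patch:
+    // it validates the descriptors, packs the output/input shapes and strides
+    // into the op's _info, records _dtype, and allocates *desc_ptr. On CPU no
+    // workspace is reserved, which is presumably why calculate() performs no
+    // workspace-size check.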
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h new file mode 100644 index 000000000..91508a384 --- /dev/null +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -0,0 +1,26 @@ +#ifndef __FLOOR_CPU_H__ +#define __FLOOR_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(floor, cpu) + +namespace op::floor::cpu { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cpu + +#endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh new file mode 100644 index 000000000..c89ce34f4 --- /dev/null +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __FLOOR_CUDA_H__ +#define __FLOOR_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::floor::cuda { +typedef struct FloorOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } +} FloorOp; +} // namespace op::floor::cuda + +#endif // __FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu new file mode 100644 index 000000000..08305048a --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "floor_nvidia.cuh" + +namespace op::floor::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh new file mode 100644 index 000000000..7a3c2f5c7 --- /dev/null +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __FLOOR_NVIDIA_API_H__ +#define __FLOOR_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(floor, nvidia) + +#endif // __FLOOR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc new file mode 100644 index 000000000..4e4ed2b5a --- /dev/null +++ b/src/infiniop/ops/floor/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/floor.h" + +#ifdef ENABLE_CPU_API +#include "cpu/floor_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/floor_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateFloorDescriptor( + infiniopHandle_t handle, + infiniopFloorDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::floor::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopFloor( + infiniopFloorDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc new file mode 100644 index 000000000..e7314c319 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -0,0 +1,48 @@ +#include "log_cpu.h" + +namespace op::log::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h new file mode 100644 index 000000000..535e681d3 --- /dev/null +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -0,0 +1,22 @@ +#ifndef __LOG_CPU_H__ +#define __LOG_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(log, cpu) + +namespace op::log::cpu { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::log(x); + } +} LogOp; +} // namespace op::log::cpu + +#endif // __LOG_CPU_H__ diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh new file mode 100644 index 000000000..b1e46873c --- /dev/null +++ b/src/infiniop/ops/log/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LOG_CUDA_H__ +#define __LOG_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include + +namespace op::log::cuda { +typedef struct LogOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = 
__bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } +} LogOp; +} // namespace op::log::cuda + +#endif // __LOG_CUDA_H__ diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu new file mode 100644 index 000000000..9e7bcafc4 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu @@ -0,0 +1,54 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "log_nvidia.cuh" + +namespace op::log::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::log::nvidia diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cuh b/src/infiniop/ops/log/nvidia/log_nvidia.cuh new file mode 100644 index 000000000..c48841622 --- /dev/null +++ b/src/infiniop/ops/log/nvidia/log_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOG_NVIDIA_API_H__ +#define __LOG_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(log, nvidia) + +#endif // __LOG_NVIDIA_API_H__ diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc new file mode 100644 index 000000000..8f2add408 --- /dev/null +++ b/src/infiniop/ops/log/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/log.h" + +#ifdef ENABLE_CPU_API +#include "cpu/log_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/log_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateLogDescriptor( + infiniopHandle_t handle, + infiniopLogDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::log::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLog( + infiniopLogDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc new file mode 100644 index 000000000..5da2ae4c3 --- /dev/null +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -0,0 +1,48 @@ +#include "neg_cpu.h" + +namespace op::neg::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace 
op::neg::cpu
diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h
new file mode 100644
index 000000000..ea45989b3
--- /dev/null
+++ b/src/infiniop/ops/neg/cpu/neg_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __NEG_CPU_H__
+#define __NEG_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(neg, cpu)
+
+namespace op::neg::cpu {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return -x;
+    }
+} NegOp;
+} // namespace op::neg::cpu
+
+#endif // __NEG_CPU_H__
diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh b/src/infiniop/ops/neg/cuda/kernel.cuh
new file mode 100644
index 000000000..57904b3df
--- /dev/null
+++ b/src/infiniop/ops/neg/cuda/kernel.cuh
@@ -0,0 +1,23 @@
+#ifndef __NEG_CUDA_H__
+#define __NEG_CUDA_H__
+
+#include <cuda_fp16.h>
+
+namespace op::neg::cuda {
+typedef struct NegOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __hneg2(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hneg(x);
+        } else {
+            return -x;
+        }
+    }
+} NegOp;
+} // namespace op::neg::cuda
+
+#endif // __NEG_CUDA_H__
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
new file mode 100644
index 000000000..d18b8bf25
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "neg_nvidia.cuh"
+
+namespace op::neg::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::neg::nvidia
diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
new file mode 100644
index 000000000..1265cd3df
--- /dev/null
+++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __NEG_NVIDIA_API_H__
+#define __NEG_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(neg, nvidia)
+
+#endif // __NEG_NVIDIA_API_H__
diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc
new file mode 100644
index 000000000..d4134df3e
--- /dev/null
+++ b/src/infiniop/ops/neg/operator.cc
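// Editorial sketch (hedged): the operator.cc that follows uses the same
// macro-generated dispatch as every other operator in this patch. For one
// backend, the CREATE macro expands to roughly:
//
//     case INFINI_DEVICE_CPU:
//         return op::neg::cpu::Descriptor::create(
//             handle,
//             reinterpret_cast<op::neg::cpu::Descriptor **>(desc_ptr),
//             y_desc,
//             {x_desc});
//
// so supporting a new device is one CREATE/GET/CALCULATE/DELETE line in each
// switch, guarded by the matching ENABLE_*_API define.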
@@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/neg.h" + +#ifdef ENABLE_CPU_API +#include "cpu/neg_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/neg_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateNegDescriptor( + infiniopHandle_t handle, + infiniopNegDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::neg::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopNeg( + infiniopNegDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index e8b5324a0..3786e7a52 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -2,8 +2,8 @@ #define __POW_CUDA_H__ #include -#include #include +#include namespace op::pow::cuda { typedef struct PowOp { diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc new file mode 100644 index 000000000..52874c8b3 --- 
/dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
@@ -0,0 +1,48 @@
+#include "reciprocal_cpu.h"
+
+namespace op::reciprocal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReciprocalOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReciprocalOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reciprocal::cpu
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
new file mode 100644
index 000000000..0a0f223f0
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __RECIPROCAL_CPU_H__
+#define __RECIPROCAL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, cpu)
+
+namespace op::reciprocal::cpu {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return T(1) / x;
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cpu
+
+#endif // __RECIPROCAL_CPU_H__
diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
new file mode 100644
index 000000000..94c71de90
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __RECIPROCAL_CUDA_H__
+#define __RECIPROCAL_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::reciprocal::cuda {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rn(x);
+        } else {
+            return T(1) / x;
+        }
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cuda
+
+#endif // __RECIPROCAL_CUDA_H__
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
new file mode 100644
index 000000000..45b74e25e
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "reciprocal_nvidia.cuh"
+
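The functors above are the entire per-operator surface area: the shared elementwise framework only requires a `num_inputs` constant and a templated call operator. The following minimal host-side sketch illustrates that contract; `apply_elementwise` is a hypothetical stand-in for the repository's actual dispatch machinery, shown only to make the functor protocol concrete.

#include <cstddef>
#include <vector>

// Hypothetical stand-in for the real elementwise dispatch: applies any
// functor following the NegOp/ReciprocalOp contract (a num_inputs constant
// plus a templated operator()) element by element on the host.
template <typename Op, typename T>
void apply_elementwise(const Op &op, const T *x, T *y, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = op(x[i]); // unary case: Op::num_inputs == 1
    }
}

int main() {
    struct ReciprocalOp { // same shape as the CPU functor above
        static constexpr std::size_t num_inputs = 1;
        template <typename T>
        T operator()(const T &x) const { return T(1) / x; }
    };
    std::vector<float> x{1.0f, 2.0f, 4.0f}, y(3);
    apply_elementwise(ReciprocalOp{}, x.data(), y.data(), x.size());
    // y now holds {1.0f, 0.5f, 0.25f}
    return 0;
}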
+namespace op::reciprocal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh new file mode 100644 index 000000000..d98c8f4c2 --- /dev/null +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RECIPROCAL_NVIDIA_API_H__ +#define __RECIPROCAL_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia) + +#endif // __RECIPROCAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc new file mode 100644 index 000000000..033286024 --- /dev/null +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reciprocal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reciprocal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/reciprocal_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateReciprocalDescriptor( + infiniopHandle_t handle, + infiniopReciprocalDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reciprocal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + 
GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReciprocal( + infiniopReciprocalDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc new file mode 100644 index 000000000..0b0cea7b7 --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.cc @@ -0,0 +1,48 @@ +#include "round_cpu.h" + +namespace op::round::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::round::cpu diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h new file mode 100644 index 000000000..eccd6df0f --- /dev/null +++ b/src/infiniop/ops/round/cpu/round_cpu.h @@ -0,0 +1,25 @@ +#ifndef __ROUND_CPU_H__ +#define __ROUND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(round, cpu) + +namespace op::round::cpu { +typedef struct RoundOp { +public: + static constexpr size_t 
num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        if constexpr (std::is_integral_v<T>) {
+            return x;
+        } else {
+            return std::nearbyint(x);
+        }
+    }
+} RoundOp;
+} // namespace op::round::cpu
+
+#endif // __ROUND_CPU_H__
diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh
new file mode 100644
index 000000000..c52a10716
--- /dev/null
+++ b/src/infiniop/ops/round/cuda/kernel.cuh
@@ -0,0 +1,34 @@
+#ifndef __ROUND_CUDA_H__
+#define __ROUND_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::round::cuda {
+typedef struct RoundOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rint(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrint(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(rintf(x0), rintf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(rintf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return rintf(x);
+        } else if constexpr (std::is_integral_v<T>) {
+            return x;
+        } else {
+            return std::nearbyint(x);
+        }
+    }
+} RoundOp;
+} // namespace op::round::cuda
+
+#endif // __ROUND_CUDA_H__
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu
new file mode 100644
index 000000000..c1fabc885
--- /dev/null
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "round_nvidia.cuh"
+
+namespace op::round::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::round::nvidia
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cuh b/src/infiniop/ops/round/nvidia/round_nvidia.cuh
new file mode 100644
index 000000000..65bb38566
--- /dev/null
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __ROUND_NVIDIA_API_H__
+#define __ROUND_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(round, nvidia)
+
+#endif //
__ROUND_NVIDIA_API_H__ diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc new file mode 100644 index 000000000..9468803c8 --- /dev/null +++ b/src/infiniop/ops/round/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/round.h" + +#ifdef ENABLE_CPU_API +#include "cpu/round_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/round_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateRoundDescriptor( + infiniopHandle_t handle, + infiniopRoundDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::round::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopRound( + infiniopRoundDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc new file mode 100644 index 000000000..1f3430e73 --- /dev/null +++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc @@ -0,0 +1,48 @@ +#include "sign_cpu.h" + +namespace op::sign::cpu { + 
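One behavioral note on the RoundOp added above: `std::nearbyint` on the CPU and the `hrint`/`h2rint`/`rintf` intrinsics in the CUDA kernel all follow the current floating-point rounding mode, which defaults to round-half-to-even, the same convention as `torch.round`. Plain `std::round` would instead round halves away from zero. A small standalone check of the difference:

#include <cmath>
#include <cstdio>

int main() {
    // nearbyint() uses the current rounding mode (round-to-nearest-even by
    // default), which is what RoundOp and torch.round implement; round()
    // always rounds halves away from zero.
    for (double v : {0.5, 1.5, 2.5, -0.5, -1.5}) {
        std::printf("v=%+.1f  nearbyint=%+.1f  round=%+.1f\n",
                    v, std::nearbyint(v), std::round(v));
    }
    // nearbyint: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, -0.5 -> -0, -1.5 -> -2
    // round:     0.5 -> 1, 1.5 -> 2, 2.5 -> 3, -0.5 -> -1, -1.5 -> -2
    return 0;
}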
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SignOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SignOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sign::cpu
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h
new file mode 100644
index 000000000..505194c85
--- /dev/null
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.h
@@ -0,0 +1,20 @@
+#ifndef __SIGN_CPU_H__
+#define __SIGN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sign, cpu)
+
+namespace op::sign::cpu {
+typedef struct SignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
+    }
+} SignOp;
+} // namespace op::sign::cpu
+
+#endif // __SIGN_CPU_H__
diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh
new file mode 100644
index 000000000..3737282b0
--- /dev/null
+++ b/src/infiniop/ops/sign/cuda/kernel.cuh
@@ -0,0 +1,27 @@
+#ifndef __SIGN_CUDA_H__
+#define __SIGN_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_fp16.h>
+
+namespace op::sign::cuda {
+typedef struct SignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // sign(x) = (x > 0) - (x < 0); the comparison intrinsics return
+            // 1.0 or 0.0 per lane, so sign(0) == 0 as in the scalar branches
+            const auto zero = __floats2half2_rn(0.0f, 0.0f);
+            return __hsub2(__hgt2(x, zero), __hlt2(x, zero));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1));
+        } else {
+            return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
+        }
+    }
+} SignOp;
+} // namespace op::sign::cuda
+
+#endif // __SIGN_CUDA_H__
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
new file mode 100644
index 000000000..6a3152e41
--- /dev/null
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sign_nvidia.cuh"
+
+namespace op::sign::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sign::nvidia
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh
new file mode 100644
index 000000000..d5f2540a3
--- /dev/null
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SIGN_NVIDIA_API_H__
+#define __SIGN_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(sign, nvidia)
+
+#endif // __SIGN_NVIDIA_API_H__
diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc
new file mode 100644
index 000000000..8f658a9b3
--- /dev/null
+++ b/src/infiniop/ops/sign/operator.cc
@@ -0,0 +1,139 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/sign.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/sign_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/sign_nvidia.cuh"
+#endif
+
+__C infiniStatus_t infiniopCreateSignDescriptor(
+    infiniopHandle_t handle,
+    infiniopSignDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                              \
+    case CASE:                                                               \
+        return op::sign::NAMESPACE::Descriptor::create(                      \
+            handle,                                                          \
+            reinterpret_cast<op::sign::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                          \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t
infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSign( + infiniopSignDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.cc b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc new file mode 100644 index 000000000..40685847d --- /dev/null +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.cc @@ -0,0 +1,48 @@ +#include "sinh_cpu.h" + +namespace op::sinh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h new file mode 100644 index 000000000..dbc8f3c7e --- /dev/null +++ 
b/src/infiniop/ops/sinh/cpu/sinh_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __SINH_CPU_H__
+#define __SINH_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sinh, cpu)
+
+namespace op::sinh::cpu {
+typedef struct SinhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::sinh(x);
+    }
+} SinhOp;
+} // namespace op::sinh::cpu
+
+#endif // __SINH_CPU_H__
diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh
new file mode 100644
index 000000000..c09150666
--- /dev/null
+++ b/src/infiniop/ops/sinh/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __SINH_CUDA_H__
+#define __SINH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::sinh::cuda {
+typedef struct SinhOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x))));
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __float2half(sinhf(__half2float(x)));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(sinhf(x0), sinhf(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(sinhf(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return sinhf(x);
+        } else {
+            return std::sinh(x);
+        }
+    }
+} SinhOp;
+} // namespace op::sinh::cuda
+
+#endif // __SINH_CUDA_H__
diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu
new file mode 100644
index 000000000..d4c3fd165
--- /dev/null
+++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sinh_nvidia.cuh"
+
+namespace op::sinh::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sinh::nvidia
diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh
new file mode 100644
index 000000000..66e3e3e67
--- /dev/null
+++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SINH_NVIDIA_API_H__ +#define __SINH_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sinh, nvidia) + +#endif // __SINH_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc new file mode 100644 index 000000000..1636ce2c8 --- /dev/null +++ b/src/infiniop/ops/sinh/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sinh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sinh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sinh_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSinhDescriptor( + infiniopHandle_t handle, + infiniopSinhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sinh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSinh( + infiniopSinhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc new file 
mode 100644
index 000000000..99e723126
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
@@ -0,0 +1,48 @@
+#include "sqrt_cpu.h"
+
+namespace op::sqrt::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SqrtOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SqrtOp, float>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::sqrt::cpu
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
new file mode 100644
index 000000000..3d026cf63
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
@@ -0,0 +1,22 @@
+#ifndef __SQRT_CPU_H__
+#define __SQRT_CPU_H__
+
+#include <cmath>
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(sqrt, cpu)
+
+namespace op::sqrt::cpu {
+typedef struct SqrtOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &x) const {
+        return std::sqrt(x);
+    }
+} SqrtOp;
+} // namespace op::sqrt::cpu
+
+#endif // __SQRT_CPU_H__
diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh
new file mode 100644
index 000000000..c82cd7dd5
--- /dev/null
+++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh
@@ -0,0 +1,32 @@
+#ifndef __SQRT_CUDA_H__
+#define __SQRT_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+
+namespace op::sqrt::cuda {
+typedef struct SqrtOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2sqrt(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hsqrt(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fsqrt_rn(x);
+        } else {
+            return std::sqrt(x);
+        }
+    }
+} SqrtOp;
+} // namespace op::sqrt::cuda
+
+#endif // __SQRT_CUDA_H__
diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
new file mode 100644
index 000000000..519d06e89
--- /dev/null
+++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "sqrt_nvidia.cuh"
+
+namespace op::sqrt::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor
**desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sqrt::nvidia diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh new file mode 100644 index 000000000..6cd98c814 --- /dev/null +++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SQRT_NVIDIA_API_H__ +#define __SQRT_NVIDIA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sqrt, nvidia) + +#endif // __SQRT_NVIDIA_API_H__ diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc new file mode 100644 index 000000000..b11c8a4b5 --- /dev/null +++ b/src/infiniop/ops/sqrt/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sqrt.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sqrt_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/sqrt_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSqrtDescriptor( + infiniopHandle_t handle, + infiniopSqrtDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sqrt::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
infiniStatus_t infiniopSqrt( + infiniopSqrtDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc new file mode 100644 index 000000000..2947dfc5e --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc @@ -0,0 +1,48 @@ +#include "tan_cpu.h" + +namespace op::tan::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tan::cpu diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h new file mode 100644 index 000000000..c3a22456c --- /dev/null +++ b/src/infiniop/ops/tan/cpu/tan_cpu.h @@ -0,0 +1,22 @@ +#ifndef __TAN_CPU_H__ +#define __TAN_CPU_H__ + +#include + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(tan, cpu) + +namespace op::tan::cpu { +typedef struct TanOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return std::tan(x); + } +} TanOp; +} // namespace op::tan::cpu + +#endif // __TAN_CPU_H__ diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh new file mode 100644 index 000000000..bbd8facaa --- /dev/null +++ b/src/infiniop/ops/tan/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef 
__TAN_CUDA_H__
+#define __TAN_CUDA_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#define TAN_THRESHOLD 15000
+
+namespace op::tan::cuda {
+typedef struct TanOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2sin(x) / h2cos(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            float tan_f = __tanf(__half2float(x));
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return __float2half(tanf(__half2float(x)));
+            }
+            return __float2half(tan_f);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float x0 = __bfloat162float(__low2bfloat16(x));
+            float x1 = __bfloat162float(__high2bfloat16(x));
+            float tan_f0 = __tanf(x0);
+            float tan_f1 = __tanf(x1);
+            if (std::fabs(tan_f0) > TAN_THRESHOLD) {
+                tan_f0 = tanf(x0);
+            }
+            if (std::fabs(tan_f1) > TAN_THRESHOLD) {
+                tan_f1 = tanf(x1);
+            }
+            return __floats2bfloat162_rn(tan_f0, tan_f1);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float tan_f = __tanf(__bfloat162float(x));
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return __float2bfloat16_rn(tanf(__bfloat162float(x)));
+            }
+            return __float2bfloat16_rn(tan_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // __tanf is a fast approximate intrinsic; fall back to the
+            // accurate tanf near the poles, where its error blows up
+            float tan_f = __tanf(x);
+            if (std::fabs(tan_f) > TAN_THRESHOLD) {
+                return tanf(x);
+            }
+            return tan_f;
+        } else {
+            return std::tan(x);
+        }
+    }
+} TanOp;
+} // namespace op::tan::cuda
+
+#endif // __TAN_CUDA_H__
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
new file mode 100644
index 000000000..b4c24e2fe
--- /dev/null
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
@@ -0,0 +1,54 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "tan_nvidia.cuh"
+
+namespace op::tan::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::tan::nvidia
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh
new file mode 100644
index 000000000..ec620cbeb
--- /dev/null
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __TAN_NVIDIA_API_H__
+#define __TAN_NVIDIA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(tan, nvidia)
+
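Each operator in this patch exposes the same four C entry points: create descriptor, query workspace size, launch, destroy descriptor. The sketch below shows that call sequence for tan; it assumes the handle, tensor descriptors, device buffers, and the `CHECK`/`device_malloc`/`device_free` helpers are provided by the caller (none of those are part of this patch).

#include <infiniop.h>

// Hedged sketch of the shared four-call lifecycle, using the tan entry
// points declared in tan/operator.cc below. CHECK is a hypothetical
// error-handling macro; device_malloc/device_free are hypothetical
// allocators standing in for whatever the caller uses.
infiniStatus_t run_tan(infiniopHandle_t handle,
                       infiniopTensorDescriptor_t y_desc,
                       infiniopTensorDescriptor_t x_desc,
                       void *y_data, const void *x_data, void *stream) {
    infiniopTanDescriptor_t desc;
    CHECK(infiniopCreateTanDescriptor(handle, &desc, y_desc, x_desc));

    size_t workspace_size = 0;
    CHECK(infiniopGetTanWorkspaceSize(desc, &workspace_size));
    void *workspace = device_malloc(workspace_size);

    CHECK(infiniopTan(desc, workspace, workspace_size, y_data, x_data, stream));

    device_free(workspace);
    return infiniopDestroyTanDescriptor(desc);
}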
+#endif // __TAN_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc new file mode 100644 index 000000000..48ae8d48e --- /dev/null +++ b/src/infiniop/ops/tan/operator.cc @@ -0,0 +1,139 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tan.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tan_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/tan_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTanDescriptor( + infiniopHandle_t handle, + infiniopTanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tan::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTan( + infiniopTanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/abs.py b/test/infiniop/abs.py new file mode 100644 index 000000000..df8748a97 --- /dev/null +++ b/test/infiniop/abs.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + 
LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def abs_op(x): + return torch.abs(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for abs operation + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Abs on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = abs_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAbsDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAbsWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_abs(): + check_error( + LIBINFINIOP.infiniopAbs( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_abs() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: abs_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_abs(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAbsDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = 
args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acos.py b/test/infiniop/acos.py new file mode 100644 index 000000000..d39e966c4 --- /dev/null +++ b/test/infiniop/acos.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acos_op(x): + return torch.acos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for acos operation + # acos domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcosDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acos(): + check_error( + LIBINFINIOP.infiniopAcos( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acos() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if 
DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/acosh.py b/test/infiniop/acosh.py new file mode 100644 index 000000000..c6777998b --- /dev/null +++ b/test/infiniop/acosh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def acosh_op(x): + return torch.acosh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [1, 101) for acosh operation + # acosh domain is [1, +∞), so we use range [1, 101) + x_torch_tensor = torch.rand(shape) * 100 + 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Acosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = acosh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAcoshDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the 
shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAcoshWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_acosh(): + check_error( + LIBINFINIOP.infiniopAcosh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_acosh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: acosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_acosh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAcoshDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asin.py b/test/infiniop/asin.py new file mode 100644 index 000000000..18cf0ec8e --- /dev/null +++ b/test/infiniop/asin.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asin_op(x): + return torch.asin(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-1, 1) for asin operation + # asin domain is [-1, 1], so we use range [-1, 1) + x_torch_tensor = torch.rand(shape) * 2 - 1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + 
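+        # mode="manual" with set_tensor seeds the tensor from x_torch_tensor
+        # above instead of the harness's default random init (as TestTensor is
+        # used throughout this test suite)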
device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asin on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asin_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asin(): + check_error( + LIBINFINIOP.infiniopAsin( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asin() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asin_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/asinh.py b/test/infiniop/asinh.py new file mode 100644 index 000000000..d051d486e --- /dev/null +++ b/test/infiniop/asinh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types 
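+# get_tolerance(_TOLERANCE_MAP, dtype) below unpacks an entry into the
+# (atol, rtol) pair passed to torch.allclose when validating the result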
+_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def asinh_op(x): + return torch.asinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for asinh operation + # asinh domain is (-∞, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Asinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = asinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAsinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAsinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_asinh(): + check_error( + LIBINFINIOP.infiniopAsinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_asinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: asinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_asinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAsinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atan.py b/test/infiniop/atan.py new file mode 100644 index 000000000..01fceff5b --- /dev/null +++ b/test/infiniop/atan.py @@ -0,0 +1,164 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3, 13, 9, 17),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), +] 
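+# Each entry above is a 1-tuple holding only the shape; the comprehension
+# below pairs it with every Inplace option, yielding tuples such as
+# ((1, 3), Inplace.OUT_OF_PLACE) that test() receives as its shape and
+# inplace arguments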
+ + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def atan_op(x): + return torch.atan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for atan operation + # atan domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Atan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = atan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAtanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAtanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_atan(): + check_error( + LIBINFINIOP.infiniopAtan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_atan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: atan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_atan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyAtanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py new file mode 100644 index 000000000..74073a6f2 --- /dev/null +++ b/test/infiniop/atanh.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, 
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def atanh_op(x):
+    return torch.atanh(x).to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None
+):
+    # Generate test tensors with values in range [-1, 1) for atanh operation
+    # atanh domain is the open interval (-1, 1); x in [-1, 1) can hit -1 exactly, where atanh is -inf (allclose matches like-signed infinities)
+    x_torch_tensor = torch.rand(shape) * 2 - 1
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = atanh_op(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAtanhDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_atanh():
+        check_error(
+            LIBINFINIOP.infiniopAtanh(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_atanh()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: atanh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG =
args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/ceil.py b/test/infiniop/ceil.py new file mode 100644 index 000000000..afc1993c1 --- /dev/null +++ b/test/infiniop/ceil.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def ceil_op(x): + return torch.ceil(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for ceil operation + # ceil domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Ceil on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = ceil_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCeilDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCeilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_ceil(): + check_error( + LIBINFINIOP.infiniopCeil( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_ceil() + if sync is not None: + sync() + + atol, rtol = 
get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: ceil_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_ceil(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCeilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..972f17b7b --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-4, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-4, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos_op(x): + return torch.cos(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for cos operation + # cos domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = cos_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( 
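+            # ctypes.byref(descriptor) hands the C API a pointer it fills in
+            # with the new descriptor handle; check_error validates the
+            # returned infiniStatus_t (harness helper)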
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCosWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_cos():
+        check_error(
+            LIBINFINIOP.infiniopCos(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_cos()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: cos_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/cosh.py b/test/infiniop/cosh.py
new file mode 100644
index 000000000..ee7994531
--- /dev/null
+++ b/test/infiniop/cosh.py
@@ -0,0 +1,165 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def cosh_op(x):
+    return torch.cosh(x).to(x.dtype)
+
+
+def test(
+    handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None
+):
+    # Generate test tensors with values in range [-5, 5) for cosh operation
+    # cosh grows like e^|x|/2, so inputs near 100 overflow F16/F32 and the check would only compare inf == inf; [-5, 5) keeps results finite
+    x_torch_tensor = torch.rand(shape) * 10 - 5
+
+    x = TestTensor(
+        shape,
+        x_torch_tensor.stride(),
+        dtype,
+        device,
+        mode="manual",
+        set_tensor=x_torch_tensor,
+    )
+
+    if inplace == Inplace.INPLACE_X:
+        y = x
+    else:
+        y = TestTensor(shape, None, dtype, device)
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Cosh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}"
+    )
+
+    ans = cosh_op(x.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateCoshDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [x, y]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCoshWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_cosh():
+        check_error(
+            LIBINFINIOP.infiniopCosh(
+                descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None
+            )
+        )
+
+    lib_cosh()
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: cosh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation(" lib", lambda: lib_cosh(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyCoshDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/erf.py b/test/infiniop/erf.py
new file mode 100644
index 000000000..f5f9c4cd9
--- /dev/null
+++ b/test/infiniop/erf.py
@@ -0,0 +1,165 @@
+import ctypes
+from ctypes import c_uint64
+from enum import Enum, auto
+
+import torch
+from libinfiniop import (
+    LIBINFINIOP,
+    InfiniDeviceNames,
+    InfiniDtype,
+    InfiniDtypeNames,
+    TestTensor,
+    TestWorkspace,
+    check_error,
+    debug,
+    get_args,
+    get_test_devices,
+    get_tolerance,
+    infiniopOperatorDescriptor_t,
+    profile_operation,
+    test_operator,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # tensor_shape, inplace
+    ((1, 3),),
+    ((3, 3),),
+    ((32, 20, 512),),
+    ((33, 333, 333),),
+    ((32, 256, 112, 112),),
+    ((3, 3, 13, 9, 17),),
+]
+
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_X = auto()
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_X,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing (matching old operators library: only F16 and F32)
+_TENSOR_DTYPES
= [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def erf_op(x): + return torch.erf(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-3, 3) for erf operation + # erf domain is (-∞, +∞), so we use range [-3, 3) + x_torch_tensor = torch.rand(shape) * 6 - 3 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Erf on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = erf_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateErfDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetErfWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_erf(): + check_error( + LIBINFINIOP.infiniopErf( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_erf() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: erf_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_erf(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyErfDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/floor.py b/test/infiniop/floor.py new file mode 100644 index 000000000..b981da809 --- /dev/null +++ b/test/infiniop/floor.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + 
((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def floor_op(x): + return torch.floor(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for floor operation + # floor domain is (-∞, +∞), so we use range [-20, -10) + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Floor on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = floor_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateFloorDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetFloorWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_floor(): + check_error( + LIBINFINIOP.infiniopFloor( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_floor() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: floor_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_floor(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyFloorDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index a61cea018..20a9188d6 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -496,6 +496,589 @@ def rearrange_(lib): 
lib.infiniopDestroyRearrangeDescriptor.argtypes = [infiniopOperatorDescriptor_t] +@OpRegister.operator +def abs_(lib): + lib.infiniopCreateAbsDescriptor.restype = c_int32 + lib.infiniopCreateAbsDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAbsWorkspaceSize.restype = c_int32 + lib.infiniopGetAbsWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAbs.restype = c_int32 + lib.infiniopAbs.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAbsDescriptor.restype = c_int32 + lib.infiniopDestroyAbsDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acos_(lib): + lib.infiniopCreateAcosDescriptor.restype = c_int32 + lib.infiniopCreateAcosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcosWorkspaceSize.restype = c_int32 + lib.infiniopGetAcosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcos.restype = c_int32 + lib.infiniopAcos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcosDescriptor.restype = c_int32 + lib.infiniopDestroyAcosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def acosh_(lib): + lib.infiniopCreateAcoshDescriptor.restype = c_int32 + lib.infiniopCreateAcoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAcoshWorkspaceSize.restype = c_int32 + lib.infiniopGetAcoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAcosh.restype = c_int32 + lib.infiniopAcosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAcoshDescriptor.restype = c_int32 + lib.infiniopDestroyAcoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asin_(lib): + lib.infiniopCreateAsinDescriptor.restype = c_int32 + lib.infiniopCreateAsinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsin.restype = c_int32 + lib.infiniopAsin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinDescriptor.restype = c_int32 + lib.infiniopDestroyAsinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def asinh_(lib): + lib.infiniopCreateAsinhDescriptor.restype = c_int32 + lib.infiniopCreateAsinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAsinhWorkspaceSize.restype = c_int32 + lib.infiniopGetAsinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAsinh.restype = c_int32 + lib.infiniopAsinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, 
+ c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAsinhDescriptor.restype = c_int32 + lib.infiniopDestroyAsinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atan_(lib): + lib.infiniopCreateAtanDescriptor.restype = c_int32 + lib.infiniopCreateAtanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtan.restype = c_int32 + lib.infiniopAtan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanDescriptor.restype = c_int32 + lib.infiniopDestroyAtanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def atanh_(lib): + lib.infiniopCreateAtanhDescriptor.restype = c_int32 + lib.infiniopCreateAtanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetAtanhWorkspaceSize.restype = c_int32 + lib.infiniopGetAtanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopAtanh.restype = c_int32 + lib.infiniopAtanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyAtanhDescriptor.restype = c_int32 + lib.infiniopDestroyAtanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def ceil_(lib): + lib.infiniopCreateCeilDescriptor.restype = c_int32 + lib.infiniopCreateCeilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCeilWorkspaceSize.restype = c_int32 + lib.infiniopGetCeilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCeil.restype = c_int32 + lib.infiniopCeil.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCeilDescriptor.restype = c_int32 + lib.infiniopDestroyCeilDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cosh_(lib): + lib.infiniopCreateCoshDescriptor.restype = c_int32 + lib.infiniopCreateCoshDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetCoshWorkspaceSize.restype = c_int32 + lib.infiniopGetCoshWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + 
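+    # The argtypes below mirror the C call used by the tests:
+    # infiniopCosh(descriptor, workspace, workspace_size, y, x, stream)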
lib.infiniopCosh.restype = c_int32 + lib.infiniopCosh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyCoshDescriptor.restype = c_int32 + lib.infiniopDestroyCoshDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sinh_(lib): + lib.infiniopCreateSinhDescriptor.restype = c_int32 + lib.infiniopCreateSinhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSinhWorkspaceSize.restype = c_int32 + lib.infiniopGetSinhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSinh.restype = c_int32 + lib.infiniopSinh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySinhDescriptor.restype = c_int32 + lib.infiniopDestroySinhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def erf_(lib): + lib.infiniopCreateErfDescriptor.restype = c_int32 + lib.infiniopCreateErfDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetErfWorkspaceSize.restype = c_int32 + lib.infiniopGetErfWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopErf.restype = c_int32 + lib.infiniopErf.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyErfDescriptor.restype = c_int32 + lib.infiniopDestroyErfDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def floor_(lib): + lib.infiniopCreateFloorDescriptor.restype = c_int32 + lib.infiniopCreateFloorDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetFloorWorkspaceSize.restype = c_int32 + lib.infiniopGetFloorWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopFloor.restype = c_int32 + lib.infiniopFloor.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyFloorDescriptor.restype = c_int32 + lib.infiniopDestroyFloorDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def neg_(lib): + lib.infiniopCreateNegDescriptor.restype = c_int32 + lib.infiniopCreateNegDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetNegWorkspaceSize.restype = c_int32 + lib.infiniopGetNegWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopNeg.restype = c_int32 + lib.infiniopNeg.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyNegDescriptor.restype = c_int32 + lib.infiniopDestroyNegDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reciprocal_(lib): + lib.infiniopCreateReciprocalDescriptor.restype = c_int32 + lib.infiniopCreateReciprocalDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetReciprocalWorkspaceSize.restype = 
c_int32 + lib.infiniopGetReciprocalWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopReciprocal.restype = c_int32 + lib.infiniopReciprocal.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReciprocalDescriptor.restype = c_int32 + lib.infiniopDestroyReciprocalDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def round_(lib): + lib.infiniopCreateRoundDescriptor.restype = c_int32 + lib.infiniopCreateRoundDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetRoundWorkspaceSize.restype = c_int32 + lib.infiniopGetRoundWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopRound.restype = c_int32 + lib.infiniopRound.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyRoundDescriptor.restype = c_int32 + lib.infiniopDestroyRoundDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sign_(lib): + lib.infiniopCreateSignDescriptor.restype = c_int32 + lib.infiniopCreateSignDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSignWorkspaceSize.restype = c_int32 + lib.infiniopGetSignWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSign.restype = c_int32 + lib.infiniopSign.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySignDescriptor.restype = c_int32 + lib.infiniopDestroySignDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def sqrt_(lib): + lib.infiniopCreateSqrtDescriptor.restype = c_int32 + lib.infiniopCreateSqrtDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSqrtWorkspaceSize.restype = c_int32 + lib.infiniopGetSqrtWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSqrt.restype = c_int32 + lib.infiniopSqrt.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySqrtDescriptor.restype = c_int32 + lib.infiniopDestroySqrtDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def log_(lib): + lib.infiniopCreateLogDescriptor.restype = c_int32 + lib.infiniopCreateLogDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLogWorkspaceSize.restype = c_int32 + lib.infiniopGetLogWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLog.restype = c_int32 + lib.infiniopLog.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLogDescriptor.restype = c_int32 + lib.infiniopDestroyLogDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def tan_(lib): + lib.infiniopCreateTanDescriptor.restype = c_int32 + lib.infiniopCreateTanDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTanWorkspaceSize.restype = c_int32 + lib.infiniopGetTanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTan.restype = c_int32 + lib.infiniopTan.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTanDescriptor.restype = c_int32 + lib.infiniopDestroyTanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def relu_(lib): lib.infiniopCreateReluDescriptor.restype = c_int32 diff --git a/test/infiniop/log.py b/test/infiniop/log.py new file mode 100644 index 000000000..4f97de374 --- /dev/null +++ b/test/infiniop/log.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-7, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-7, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def log_op(x): + return torch.log(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0.1, 1.1) for log operation + # log domain is (0, +∞), so we use range [0.1, 1.1) + x_torch_tensor = torch.rand(shape) + 0.1 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Log on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = log_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_log(): + check_error( + LIBINFINIOP.infiniopLog( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_log() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: log_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_log(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyLogDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/neg.py b/test/infiniop/neg.py new file mode 100644 index 000000000..62607bce0 --- /dev/null +++ b/test/infiniop/neg.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def neg_op(x): + return torch.neg(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for neg operation + # This matches the original test case: * 100 - 200 + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, 
None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Neg on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = neg_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateNegDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetNegWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_neg(): + check_error( + LIBINFINIOP.infiniopNeg( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_neg() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: neg_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_neg(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyNegDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py new file mode 100644 index 000000000..4e816481c --- /dev/null +++ b/test/infiniop/reciprocal.py @@ -0,0 +1,168 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-7}, +} + +DEBUG = False 
+PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def reciprocal_op(x): + return torch.reciprocal(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-10, 10) for reciprocal operation + # This matches the original test case: * 20 - 10 + # Note: Avoid values too close to zero to prevent division by zero issues + x_torch_tensor = torch.rand(shape) * 20 - 10 + # Ensure no zero values + x_torch_tensor = torch.where(x_torch_tensor == 0, torch.ones_like(x_torch_tensor), x_torch_tensor) + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Reciprocal on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = reciprocal_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_reciprocal() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: reciprocal_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/round.py b/test/infiniop/round.py new file mode 100644 index 000000000..d6053f676 --- /dev/null +++ b/test/infiniop/round.py @@ -0,0 +1,165 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ 
+ # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def round_op(x): + return torch.round(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-20, -10) for round operation + # This matches the original test case: * 10 - 20 + x_torch_tensor = torch.rand(shape) * 10 - 20 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Round on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = round_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateRoundDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetRoundWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_round(): + check_error( + LIBINFINIOP.infiniopRound( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_round() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: round_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_round(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyRoundDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sign.py b/test/infiniop/sign.py new file mode 100644 index 000000000..f0eb5b5f8 --- /dev/null +++ b/test/infiniop/sign.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import 
c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sign_op(x): + return torch.sign(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sign operation + # sign domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sign on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sign_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSignDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSignWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sign(): + check_error( + LIBINFINIOP.infiniopSign( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sign() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sign_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sign(), device, NUM_PRERUN, 
NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySignDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sinh.py b/test/infiniop/sinh.py new file mode 100644 index 000000000..99bc02c58 --- /dev/null +++ b/test/infiniop/sinh.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=0 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sinh_op(x): + return torch.sinh(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-200, -100) for sinh operation + # sinh domain is (-∞, +∞), so we use range [-200, -100) + x_torch_tensor = torch.rand(shape) * 100 - 200 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sinh on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sinh_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinhDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, y.device) + + def lib_sinh(): + check_error( + LIBINFINIOP.infiniopSinh( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sinh() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sinh_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sinh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySinhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sqrt.py b/test/infiniop/sqrt.py new file mode 100644 index 000000000..6e1419971 --- /dev/null +++ b/test/infiniop/sqrt.py @@ -0,0 +1,166 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=0, rtol=1e-3 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sqrt_op(x): + return torch.sqrt(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [0, 100) for sqrt operation + # sqrt domain is [0, +∞), so we use range [0, 100) + x_torch_tensor = torch.rand(shape) * 100 + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Sqrt on {InfiniDeviceNames[device]} with 
shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = sqrt_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSqrtDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSqrtWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sqrt(): + check_error( + LIBINFINIOP.infiniopSqrt( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_sqrt() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sqrt_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sqrt(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySqrtDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tan.py b/test/infiniop/tan.py new file mode 100644 index 000000000..877f5dd58 --- /dev/null +++ b/test/infiniop/tan.py @@ -0,0 +1,167 @@ +import ctypes +import math +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # tensor_shape, inplace + ((1, 3),), + ((3, 3),), + ((32, 20, 512),), + ((33, 333, 333),), + ((32, 256, 112, 112),), + ((3, 3, 13, 9, 17),), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing (matching old operators library: only F16 and F32) +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32] + +# Tolerance map for different data types +# Copied from old operators library: atol=1e-6, rtol=1e-2 +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-6, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + 
+ +def tan_op(x): + return torch.tan(x).to(x.dtype) + + +def test( + handle, device, shape, inplace=Inplace.OUT_OF_PLACE, dtype=InfiniDtype.F16, sync=None +): + # Generate test tensors with values in range [-2π, 2π) for tan operation + # tan domain is (-∞, +∞), so we use range [-2π, 2π) + x_torch_tensor = torch.rand(shape) * 4 * math.pi - 2 * math.pi + + x = TestTensor( + shape, + x_torch_tensor.stride(), + dtype, + device, + mode="manual", + set_tensor=x_torch_tensor, + ) + + if inplace == Inplace.INPLACE_X: + y = x + else: + y = TestTensor(shape, None, dtype, device) + + if y.is_broadcast(): + return + + print( + f"Testing Tan on {InfiniDeviceNames[device]} with shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace: {inplace}" + ) + + ans = tan_op(x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanDescriptor( + handle, ctypes.byref(descriptor), y.descriptor, x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_tan(): + check_error( + LIBINFINIOP.infiniopTan( + descriptor, workspace.data(), workspace_size.value, y.data(), x.data(), None + ) + ) + + lib_tan() + if sync is not None: + sync() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), ans, atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol, equal_nan=True) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tan_op(x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tan(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyTanDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") From 8ca4b3248f4d5b7b0bd4de4184957a8e9d93dbb7 Mon Sep 17 00:00:00 2001 From: gongchensu Date: Thu, 15 Jan 2026 02:33:13 +0000 Subject: [PATCH 3/3] Issue/887 - Refactor binary and unary operators to reduce code duplication. 
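Note for review: this refactor replaces each hand-written operator API header with a single macro invocation; e.g. include/infiniop/ops/div.h reduces to BINARY_OP_API_DECLARE(div, Div). Expanding that macro by hand (shown below purely for reviewer convenience; it is not part of the patch) yields exactly the declarations the old header spelled out:

    /* Illustrative hand-expansion of BINARY_OP_API_DECLARE(div, Div);
       not part of the patch itself. */
    typedef struct InfiniopDescriptor *infiniopDivDescriptor_t;

    __C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle,
                                                            infiniopDivDescriptor_t *desc_ptr,
                                                            infiniopTensorDescriptor_t c,
                                                            infiniopTensorDescriptor_t a,
                                                            infiniopTensorDescriptor_t b);

    __C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size);

    __C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc,
                                            void *workspace,
                                            size_t workspace_size,
                                            void *c,
                                            const void *a,
                                            const void *b,
                                            void *stream);

    __C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc);

UNARY_OP_API_DECLARE in include/infiniop/ops/unary_op_api.h follows the same scheme for single-input operators, generating (y, x) signatures in place of (c, a, b). Because the macros paste together the same identifiers the old headers declared, the exported symbol names are unchanged, so existing callers and the ctypes bindings in test/infiniop/libinfiniop/op_register.py continue to work without modification.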
--- include/infiniop/ops/abs.h | 20 +- include/infiniop/ops/acos.h | 20 +- include/infiniop/ops/acosh.h | 20 +- include/infiniop/ops/add.h | 22 +- include/infiniop/ops/asin.h | 20 +- include/infiniop/ops/asinh.h | 20 +- include/infiniop/ops/atan.h | 20 +- include/infiniop/ops/atanh.h | 20 +- include/infiniop/ops/binary_op_api.h | 50 ++ include/infiniop/ops/ceil.h | 20 +- include/infiniop/ops/cos.h | 20 +- include/infiniop/ops/cosh.h | 20 +- include/infiniop/ops/div.h | 22 +- include/infiniop/ops/erf.h | 20 +- include/infiniop/ops/floor.h | 20 +- include/infiniop/ops/log.h | 20 +- include/infiniop/ops/max.h | 22 +- include/infiniop/ops/min.h | 22 +- include/infiniop/ops/mod.h | 22 +- include/infiniop/ops/mul.h | 22 +- include/infiniop/ops/neg.h | 20 +- include/infiniop/ops/pow.h | 22 +- include/infiniop/ops/reciprocal.h | 20 +- include/infiniop/ops/round.h | 20 +- include/infiniop/ops/sign.h | 20 +- include/infiniop/ops/sinh.h | 20 +- include/infiniop/ops/sqrt.h | 20 +- include/infiniop/ops/sub.h | 22 +- include/infiniop/ops/tan.h | 20 +- include/infiniop/ops/unary_op_api.h | 48 ++ scripts/test_binary_unary.py | 143 +++++ src/infiniop/elementwise/binary.h | 261 +++++++++ .../elementwise/cpu/elementwise_cpu_impl.h | 130 +++++ .../nvidia/elementwise_nvidia_impl.cuh | 134 +++++ src/infiniop/elementwise/unary.h | 524 ++++++++++++++++++ src/infiniop/operator_impl.h | 288 ++++++++++ src/infiniop/ops/abs/cpu/abs_cpu.cc | 44 +- src/infiniop/ops/abs/cpu/abs_cpu.h | 21 +- src/infiniop/ops/abs/cuda/kernel.cuh | 20 +- src/infiniop/ops/abs/nvidia/abs_nvidia.cu | 48 +- src/infiniop/ops/abs/operator.cc | 132 +---- src/infiniop/ops/acos/cpu/acos_cpu.cc | 44 +- src/infiniop/ops/acos/cpu/acos_cpu.h | 17 +- src/infiniop/ops/acos/cuda/kernel.cuh | 26 +- src/infiniop/ops/acos/nvidia/acos_nvidia.cu | 48 +- src/infiniop/ops/acos/operator.cc | 132 +---- src/infiniop/ops/acosh/cpu/acosh_cpu.cc | 44 +- src/infiniop/ops/acosh/cpu/acosh_cpu.h | 17 +- src/infiniop/ops/acosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu | 48 +- src/infiniop/ops/acosh/operator.cc | 132 +---- src/infiniop/ops/asin/cpu/asin_cpu.cc | 44 +- src/infiniop/ops/asin/cpu/asin_cpu.h | 17 +- src/infiniop/ops/asin/cuda/kernel.cuh | 26 +- src/infiniop/ops/asin/nvidia/asin_nvidia.cu | 48 +- src/infiniop/ops/asin/operator.cc | 132 +---- src/infiniop/ops/asinh/cpu/asinh_cpu.cc | 44 +- src/infiniop/ops/asinh/cpu/asinh_cpu.h | 17 +- src/infiniop/ops/asinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu | 48 +- src/infiniop/ops/asinh/operator.cc | 132 +---- src/infiniop/ops/atan/cpu/atan_cpu.cc | 44 +- src/infiniop/ops/atan/cpu/atan_cpu.h | 17 +- src/infiniop/ops/atan/cuda/kernel.cuh | 26 +- src/infiniop/ops/atan/nvidia/atan_nvidia.cu | 48 +- src/infiniop/ops/atan/operator.cc | 132 +---- src/infiniop/ops/atanh/cpu/atanh_cpu.cc | 44 +- src/infiniop/ops/atanh/cpu/atanh_cpu.h | 17 +- src/infiniop/ops/atanh/cuda/kernel.cuh | 26 +- src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu | 48 +- src/infiniop/ops/atanh/operator.cc | 132 +---- src/infiniop/ops/ceil/cpu/ceil_cpu.cc | 44 +- src/infiniop/ops/ceil/cpu/ceil_cpu.h | 21 +- src/infiniop/ops/ceil/cuda/kernel.cuh | 28 +- src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu | 48 +- src/infiniop/ops/ceil/operator.cc | 132 +---- src/infiniop/ops/cos/cpu/cos_cpu.cc | 44 +- src/infiniop/ops/cos/cpu/cos_cpu.h | 17 +- src/infiniop/ops/cos/cuda/kernel.cuh | 26 +- src/infiniop/ops/cos/nvidia/cos_nvidia.cu | 48 +- src/infiniop/ops/cos/operator.cc | 132 +---- 
src/infiniop/ops/cosh/cpu/cosh_cpu.cc | 44 +- src/infiniop/ops/cosh/cpu/cosh_cpu.h | 17 +- src/infiniop/ops/cosh/cuda/kernel.cuh | 26 +- src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu | 48 +- src/infiniop/ops/cosh/operator.cc | 132 +---- src/infiniop/ops/div/cpu/div_cpu.cc | 46 +- src/infiniop/ops/div/cpu/div_cpu.h | 14 +- src/infiniop/ops/div/cuda/kernel.cuh | 19 +- src/infiniop/ops/div/nvidia/div_nvidia.cu | 51 +- src/infiniop/ops/div/operator.cc | 195 +------ src/infiniop/ops/erf/cpu/erf_cpu.cc | 44 +- src/infiniop/ops/erf/cpu/erf_cpu.h | 17 +- src/infiniop/ops/erf/cuda/kernel.cuh | 26 +- src/infiniop/ops/erf/nvidia/erf_nvidia.cu | 48 +- src/infiniop/ops/erf/operator.cc | 132 +---- src/infiniop/ops/floor/cpu/floor_cpu.cc | 44 +- src/infiniop/ops/floor/cpu/floor_cpu.h | 21 +- src/infiniop/ops/floor/cuda/kernel.cuh | 28 +- src/infiniop/ops/floor/nvidia/floor_nvidia.cu | 48 +- src/infiniop/ops/floor/operator.cc | 132 +---- src/infiniop/ops/log/cpu/log_cpu.cc | 44 +- src/infiniop/ops/log/cpu/log_cpu.h | 17 +- src/infiniop/ops/log/cuda/kernel.cuh | 26 +- src/infiniop/ops/log/nvidia/log_nvidia.cu | 48 +- src/infiniop/ops/log/operator.cc | 132 +---- src/infiniop/ops/max/cpu/max_cpu.cc | 46 +- src/infiniop/ops/max/cpu/max_cpu.h | 15 +- src/infiniop/ops/max/cuda/kernel.cuh | 19 +- src/infiniop/ops/max/nvidia/max_nvidia.cu | 51 +- src/infiniop/ops/max/operator.cc | 195 +------ src/infiniop/ops/min/cpu/min_cpu.cc | 46 +- src/infiniop/ops/min/cpu/min_cpu.h | 15 +- src/infiniop/ops/min/cuda/kernel.cuh | 19 +- src/infiniop/ops/min/nvidia/min_nvidia.cu | 51 +- src/infiniop/ops/min/operator.cc | 195 +------ src/infiniop/ops/mod/cpu/mod_cpu.cc | 45 +- src/infiniop/ops/mod/cpu/mod_cpu.h | 18 +- src/infiniop/ops/mod/cuda/kernel.cuh | 24 +- src/infiniop/ops/mod/nvidia/mod_nvidia.cu | 51 +- src/infiniop/ops/mod/operator.cc | 135 +---- src/infiniop/ops/neg/cpu/neg_cpu.cc | 44 +- src/infiniop/ops/neg/cpu/neg_cpu.h | 15 +- src/infiniop/ops/neg/cuda/kernel.cuh | 17 +- src/infiniop/ops/neg/nvidia/neg_nvidia.cu | 48 +- src/infiniop/ops/neg/operator.cc | 132 +---- src/infiniop/ops/pow/cpu/pow_cpu.cc | 45 +- src/infiniop/ops/pow/cpu/pow_cpu.h | 14 +- src/infiniop/ops/pow/cuda/kernel.cuh | 34 +- src/infiniop/ops/pow/nvidia/pow_nvidia.cu | 51 +- src/infiniop/ops/pow/operator.cc | 135 +---- .../ops/reciprocal/cpu/reciprocal_cpu.cc | 44 +- .../ops/reciprocal/cpu/reciprocal_cpu.h | 15 +- src/infiniop/ops/reciprocal/cuda/kernel.cuh | 26 +- .../reciprocal/nvidia/reciprocal_nvidia.cu | 48 +- src/infiniop/ops/reciprocal/operator.cc | 132 +---- src/infiniop/ops/round/cpu/round_cpu.cc | 44 +- src/infiniop/ops/round/cpu/round_cpu.h | 20 +- src/infiniop/ops/round/cuda/kernel.cuh | 28 +- src/infiniop/ops/round/nvidia/round_nvidia.cu | 48 +- src/infiniop/ops/round/operator.cc | 132 +---- src/infiniop/ops/sign/cpu/sign_cpu.cc | 44 +- src/infiniop/ops/sign/cpu/sign_cpu.h | 15 +- src/infiniop/ops/sign/cuda/kernel.cuh | 19 +- src/infiniop/ops/sign/nvidia/sign_nvidia.cu | 48 +- src/infiniop/ops/sign/operator.cc | 132 +---- src/infiniop/ops/sinh/cpu/sinh_cpu.cc | 44 +- src/infiniop/ops/sinh/cpu/sinh_cpu.h | 17 +- src/infiniop/ops/sinh/cuda/kernel.cuh | 26 +- src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu | 48 +- src/infiniop/ops/sinh/operator.cc | 132 +---- src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc | 44 +- src/infiniop/ops/sqrt/cpu/sqrt_cpu.h | 17 +- src/infiniop/ops/sqrt/cuda/kernel.cuh | 26 +- src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu | 48 +- src/infiniop/ops/sqrt/operator.cc | 132 +---- src/infiniop/ops/tan/cpu/tan_cpu.cc | 44 +- 
src/infiniop/ops/tan/cpu/tan_cpu.h | 17 +- src/infiniop/ops/tan/cuda/kernel.cuh | 49 +- src/infiniop/ops/tan/nvidia/tan_nvidia.cu | 48 +- src/infiniop/ops/tan/operator.cc | 132 +---- src/infiniop/ops/tanh/cuda/kernel.cuh | 38 +- src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu | 53 +- 163 files changed, 1891 insertions(+), 7239 deletions(-) create mode 100644 include/infiniop/ops/binary_op_api.h create mode 100644 include/infiniop/ops/unary_op_api.h create mode 100755 scripts/test_binary_unary.py create mode 100644 src/infiniop/elementwise/binary.h create mode 100644 src/infiniop/elementwise/cpu/elementwise_cpu_impl.h create mode 100644 src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh create mode 100644 src/infiniop/elementwise/unary.h create mode 100644 src/infiniop/operator_impl.h diff --git a/include/infiniop/ops/abs.h b/include/infiniop/ops/abs.h index 7b5872657..1d1f1cbd1 100644 --- a/include/infiniop/ops/abs.h +++ b/include/infiniop/ops/abs.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ABS_API_H__ #define __INFINIOP_ABS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAbsDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAbsDescriptor(infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAbs(infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc); +UNARY_OP_API_DECLARE(abs, Abs) #endif diff --git a/include/infiniop/ops/acos.h b/include/infiniop/ops/acos.h index fe6af01ed..c2f4de837 100644 --- a/include/infiniop/ops/acos.h +++ b/include/infiniop/ops/acos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOS_API_H__ #define __INFINIOP_ACOS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcosDescriptor(infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcos(infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc); +UNARY_OP_API_DECLARE(acos, Acos) #endif diff --git a/include/infiniop/ops/acosh.h b/include/infiniop/ops/acosh.h index be28918bb..e8630b7d5 100644 --- a/include/infiniop/ops/acosh.h +++ b/include/infiniop/ops/acosh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ACOSH_API_H__ #define __INFINIOP_ACOSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAcoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAcoshDescriptor(infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAcosh(infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const 
void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc); +UNARY_OP_API_DECLARE(acosh, Acosh) #endif diff --git a/include/infiniop/ops/add.h b/include/infiniop/ops/add.h index 02f6225fb..abedb7f9d 100644 --- a/include/infiniop/ops/add.h +++ b/include/infiniop/ops/add.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_ADD_API_H__ #define __INFINIOP_ADD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAddDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc); +BINARY_OP_API_DECLARE(add, Add) #endif diff --git a/include/infiniop/ops/asin.h b/include/infiniop/ops/asin.h index 2aac6d1e1..1a8bdd7b8 100644 --- a/include/infiniop/ops/asin.h +++ b/include/infiniop/ops/asin.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASIN_API_H__ #define __INFINIOP_ASIN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinDescriptor(infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsin(infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc); +UNARY_OP_API_DECLARE(asin, Asin) #endif diff --git a/include/infiniop/ops/asinh.h b/include/infiniop/ops/asinh.h index d1385fc01..2a3aebf5a 100644 --- a/include/infiniop/ops/asinh.h +++ b/include/infiniop/ops/asinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ASINH_API_H__ #define __INFINIOP_ASINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAsinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAsinhDescriptor(infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAsinh(infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc); +UNARY_OP_API_DECLARE(asinh, Asinh) #endif diff --git a/include/infiniop/ops/atan.h b/include/infiniop/ops/atan.h index 3b1a5bde3..18eed316f 100644 --- a/include/infiniop/ops/atan.h +++ b/include/infiniop/ops/atan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATAN_API_H__ #define __INFINIOP_ATAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanDescriptor_t; - -__C __export infiniStatus_t 
infiniopCreateAtanDescriptor(infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtan(infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc); +UNARY_OP_API_DECLARE(atan, Atan) #endif diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h index 800afd5d5..e7db5b53c 100644 --- a/include/infiniop/ops/atanh.h +++ b/include/infiniop/ops/atanh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ATANH_API_H__ #define __INFINIOP_ATANH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc); +UNARY_OP_API_DECLARE(atanh, Atanh) #endif diff --git a/include/infiniop/ops/binary_op_api.h b/include/infiniop/ops/binary_op_api.h new file mode 100644 index 000000000..4ab2401b9 --- /dev/null +++ b/include/infiniop/ops/binary_op_api.h @@ -0,0 +1,50 @@ +#ifndef __INFINIOP_BINARY_OP_API_H__ +#define __INFINIOP_BINARY_OP_API_H__ + +#include "../operator_descriptor.h" + +/** + * @brief Macro to generate the C API header for a binary operator. 
+ * + * This macro generates all the necessary declarations for a binary operator: + * - Descriptor type definition + * - Create descriptor function + * - Get workspace size function + * - Execute operator function + * - Destroy descriptor function + * + * Usage: + * BINARY_OP_API_DECLARE(div, Div) + * BINARY_OP_API_DECLARE(pow, Pow) + * + * @param OP_NAME Lowercase operator name (e.g., div, pow, mod) + * @param OP_NAME_UPPER Capitalized operator name (e.g., Div, Pow, Mod) + */ +#define BINARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER) \ + \ + typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \ + \ + __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c, \ + infiniopTensorDescriptor_t a, \ + infiniopTensorDescriptor_t b); \ + \ + __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size); \ + \ + __C __export infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream); \ + \ + __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc); + +#endif // __INFINIOP_BINARY_OP_API_H__ diff --git a/include/infiniop/ops/ceil.h b/include/infiniop/ops/ceil.h index 4539d77fd..8fca73b2e 100644 --- a/include/infiniop/ops/ceil.h +++ b/include/infiniop/ops/ceil.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_CEIL_API_H__ #define __INFINIOP_CEIL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCeilDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCeilDescriptor(infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCeil(infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc); +UNARY_OP_API_DECLARE(ceil, Ceil) #endif diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h index 8f0b6eeb7..ed33b0a0e 100644 --- a/include/infiniop/ops/cos.h +++ b/include/infiniop/ops/cos.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_COS_API_H__ #define __INFINIOP_COS_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCosDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc); +UNARY_OP_API_DECLARE(cos, Cos) #endif diff --git a/include/infiniop/ops/cosh.h b/include/infiniop/ops/cosh.h index 3328151ad..b607b8fd1 100644 --- a/include/infiniop/ops/cosh.h +++ b/include/infiniop/ops/cosh.h @@ -1,24 +1,8 @@ #ifndef
__INFINIOP_COSH_API_H__ #define __INFINIOP_COSH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopCoshDescriptor_t; - -__C __export infiniStatus_t infiniopCreateCoshDescriptor(infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopCosh(infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc); +UNARY_OP_API_DECLARE(cosh, Cosh) #endif diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h index e539b440c..6f146bf4c 100644 --- a/include/infiniop/ops/div.h +++ b/include/infiniop/ops/div.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_DIV_API_H__ #define __INFINIOP_DIV_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; - -__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); +BINARY_OP_API_DECLARE(div, Div) #endif diff --git a/include/infiniop/ops/erf.h b/include/infiniop/ops/erf.h index 8cbb8fb74..0dcc149da 100644 --- a/include/infiniop/ops/erf.h +++ b/include/infiniop/ops/erf.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ERF_API_H__ #define __INFINIOP_ERF_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopErfDescriptor_t; - -__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc); +UNARY_OP_API_DECLARE(erf, Erf) #endif diff --git a/include/infiniop/ops/floor.h b/include/infiniop/ops/floor.h index 2f65f8f4a..02efc6761 100644 --- a/include/infiniop/ops/floor.h +++ b/include/infiniop/ops/floor.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_FLOOR_API_H__ #define __INFINIOP_FLOOR_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopFloorDescriptor_t; - -__C __export infiniStatus_t infiniopCreateFloorDescriptor(infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopFloor(infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - 
const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc); +UNARY_OP_API_DECLARE(floor, Floor) #endif diff --git a/include/infiniop/ops/log.h b/include/infiniop/ops/log.h index f5bec4382..3892ccb6e 100644 --- a/include/infiniop/ops/log.h +++ b/include/infiniop/ops/log.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_LOG_API_H__ #define __INFINIOP_LOG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopLogDescriptor_t; - -__C __export infiniStatus_t infiniopCreateLogDescriptor(infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopLog(infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc); +UNARY_OP_API_DECLARE(log, Log) #endif diff --git a/include/infiniop/ops/max.h b/include/infiniop/ops/max.h index e6f2f5d4c..4b91e5c83 100644 --- a/include/infiniop/ops/max.h +++ b/include/infiniop/ops/max.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MAX_API_H__ #define __INFINIOP_MAX_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMaxDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMax(infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); +BINARY_OP_API_DECLARE(max, Max) #endif diff --git a/include/infiniop/ops/min.h b/include/infiniop/ops/min.h index f72f0c4db..1496806df 100644 --- a/include/infiniop/ops/min.h +++ b/include/infiniop/ops/min.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MIN_API_H__ #define __INFINIOP_MIN_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMinDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMin(infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); +BINARY_OP_API_DECLARE(min, Min) #endif diff --git a/include/infiniop/ops/mod.h b/include/infiniop/ops/mod.h index 5a6cd5bbf..e4fcd571e 100644 --- a/include/infiniop/ops/mod.h +++ b/include/infiniop/ops/mod.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MOD_API_H__ #define __INFINIOP_MOD_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopModDescriptor_t; - -__C __export infiniStatus_t 
infiniopCreateModDescriptor(infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMod(infiniopModDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); +BINARY_OP_API_DECLARE(mod, Mod) #endif diff --git a/include/infiniop/ops/mul.h b/include/infiniop/ops/mul.h index 06200b55b..2dfd92ef4 100644 --- a/include/infiniop/ops/mul.h +++ b/include/infiniop/ops/mul.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_MUL_API_H__ #define __INFINIOP_MUL_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopMulDescriptor_t; - -__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, - infiniopMulDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); +BINARY_OP_API_DECLARE(mul, Mul) #endif diff --git a/include/infiniop/ops/neg.h b/include/infiniop/ops/neg.h index 4d3b06e21..0d18bbd5c 100644 --- a/include/infiniop/ops/neg.h +++ b/include/infiniop/ops/neg.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_NEG_API_H__ #define __INFINIOP_NEG_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopNegDescriptor_t; - -__C __export infiniStatus_t infiniopCreateNegDescriptor(infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopNeg(infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc); +UNARY_OP_API_DECLARE(neg, Neg) #endif diff --git a/include/infiniop/ops/pow.h b/include/infiniop/ops/pow.h index 6449d8622..f4e263a58 100644 --- a/include/infiniop/ops/pow.h +++ b/include/infiniop/ops/pow.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_POW_API_H__ #define __INFINIOP_POW_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopPowDescriptor_t; - -__C __export infiniStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopPow(infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); 
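+// The five boilerplate declarations removed above collapse into this single
+// macro call (BINARY_OP_API_DECLARE is defined in binary_op_api.h).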
+BINARY_OP_API_DECLARE(pow, Pow) #endif diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h index 73836fea4..7d5626176 100644 --- a/include/infiniop/ops/reciprocal.h +++ b/include/infiniop/ops/reciprocal.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_RECIPROCAL_API_H__ #define __INFINIOP_RECIPROCAL_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t; - -__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc); +UNARY_OP_API_DECLARE(reciprocal, Reciprocal) #endif diff --git a/include/infiniop/ops/round.h b/include/infiniop/ops/round.h index 18c7fe44e..1bf4377ff 100644 --- a/include/infiniop/ops/round.h +++ b/include/infiniop/ops/round.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_ROUND_API_H__ #define __INFINIOP_ROUND_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopRoundDescriptor_t; - -__C __export infiniStatus_t infiniopCreateRoundDescriptor(infiniopHandle_t handle, - infiniopRoundDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopRound(infiniopRoundDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc); +UNARY_OP_API_DECLARE(round, Round) #endif diff --git a/include/infiniop/ops/sign.h b/include/infiniop/ops/sign.h index fe47c7190..ef7854de8 100644 --- a/include/infiniop/ops/sign.h +++ b/include/infiniop/ops/sign.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SIGN_API_H__ #define __INFINIOP_SIGN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSignDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSignDescriptor(infiniopHandle_t handle, - infiniopSignDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSign(infiniopSignDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc); +UNARY_OP_API_DECLARE(sign, Sign) #endif diff --git a/include/infiniop/ops/sinh.h b/include/infiniop/ops/sinh.h index a5325fb81..ea8511a2b 100644 --- a/include/infiniop/ops/sinh.h +++ b/include/infiniop/ops/sinh.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SINH_API_H__ #define __INFINIOP_SINH_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle, - 
infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc); +UNARY_OP_API_DECLARE(sinh, Sinh) #endif diff --git a/include/infiniop/ops/sqrt.h b/include/infiniop/ops/sqrt.h index db04ec8bc..6df6fe89c 100644 --- a/include/infiniop/ops/sqrt.h +++ b/include/infiniop/ops/sqrt.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_SQRT_API_H__ #define __INFINIOP_SQRT_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSqrtDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSqrtDescriptor(infiniopHandle_t handle, - infiniopSqrtDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSqrt(infiniopSqrtDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc); +UNARY_OP_API_DECLARE(sqrt, Sqrt) #endif diff --git a/include/infiniop/ops/sub.h b/include/infiniop/ops/sub.h index da2aa8568..9b5fa397b 100644 --- a/include/infiniop/ops/sub.h +++ b/include/infiniop/ops/sub.h @@ -1,26 +1,8 @@ #ifndef __INFINIOP_SUB_API_H__ #define __INFINIOP_SUB_API_H__ -#include "../operator_descriptor.h" +#include "binary_op_api.h" -typedef struct InfiniopDescriptor *infiniopSubDescriptor_t; - -__C __export infiniStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, - infiniopSubDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -__C __export infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopSub(infiniopSubDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream); - -__C __export infiniStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); +BINARY_OP_API_DECLARE(sub, Sub) #endif diff --git a/include/infiniop/ops/tan.h b/include/infiniop/ops/tan.h index 69fc47bf1..d4a2f0bf2 100644 --- a/include/infiniop/ops/tan.h +++ b/include/infiniop/ops/tan.h @@ -1,24 +1,8 @@ #ifndef __INFINIOP_TAN_API_H__ #define __INFINIOP_TAN_API_H__ -#include "../operator_descriptor.h" +#include "unary_op_api.h" -typedef struct InfiniopDescriptor *infiniopTanDescriptor_t; - -__C __export infiniStatus_t infiniopCreateTanDescriptor(infiniopHandle_t handle, - infiniopTanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y, - infiniopTensorDescriptor_t x); - -__C __export infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopTan(infiniopTanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream); - -__C __export infiniStatus_t infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc); +UNARY_OP_API_DECLARE(tan, Tan) #endif diff --git a/include/infiniop/ops/unary_op_api.h b/include/infiniop/ops/unary_op_api.h new file mode 100644 index 
000000000..eefe3c3a4
--- /dev/null
+++ b/include/infiniop/ops/unary_op_api.h
@@ -0,0 +1,48 @@
+#ifndef __INFINIOP_UNARY_OP_API_H__
+#define __INFINIOP_UNARY_OP_API_H__
+
+#include "../operator_descriptor.h"
+
+/**
+ * @brief Macro to generate the C API header for a unary operator.
+ *
+ * This macro generates all the necessary declarations for a unary operator:
+ * - Descriptor type definition
+ * - Create descriptor function
+ * - Get workspace size function
+ * - Execute operator function
+ * - Destroy descriptor function
+ *
+ * Usage:
+ *   UNARY_OP_API_DECLARE(abs, Abs)
+ *   UNARY_OP_API_DECLARE(log, Log)
+ *
+ * @param OP_NAME Lowercase operator name (e.g., abs, log, sin)
+ * @param OP_NAME_UPPER Capitalized operator name (e.g., Abs, Log, Sin)
+ */
+#define UNARY_OP_API_DECLARE(OP_NAME, OP_NAME_UPPER)                          \
+                                                                              \
+    typedef struct InfiniopDescriptor *infiniop##OP_NAME_UPPER##Descriptor_t; \
+                                                                              \
+    __C __export infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor(    \
+        infiniopHandle_t handle,                                              \
+        infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr,                      \
+        infiniopTensorDescriptor_t y,                                         \
+        infiniopTensorDescriptor_t x);                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize(    \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc,                           \
+        size_t *size);                                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniop##OP_NAME_UPPER(                      \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc,                           \
+        void *workspace,                                                      \
+        size_t workspace_size,                                                \
+        void *y,                                                              \
+        const void *x,                                                        \
+        void *stream);                                                        \
+                                                                              \
+    __C __export infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor(   \
+        infiniop##OP_NAME_UPPER##Descriptor_t desc);
+
+#endif // __INFINIOP_UNARY_OP_API_H__
diff --git a/scripts/test_binary_unary.py b/scripts/test_binary_unary.py
new file mode 100755
index 000000000..8dbbfbf53
--- /dev/null
+++ b/scripts/test_binary_unary.py
@@ -0,0 +1,143 @@
+import os
+import subprocess
+import sys
+from set_env import set_env
+
+PROJECT_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
+)
+os.chdir(PROJECT_DIR)
+
+
+def run_tests(args):
+    failed = []
+
+    # Binary operators (refactored)
+    binary_tests = [
+        "div.py",
+        "pow.py",
+        "mod.py",
+        "min.py",
+        "max.py",
+    ]
+
+    # Unary operators (refactored)
+    unary_tests = [
+        "abs.py",
+        "log.py",
+        "cos.py",
+        "sqrt.py",
+        "neg.py",
+        "sign.py",
+        "reciprocal.py",
+        "round.py",
+        "floor.py",
+        "ceil.py",
+        "erf.py",
+        "cosh.py",
+        "sinh.py",
+        "tan.py",
+        "acos.py",
+        "acosh.py",
+        "asin.py",
+        "asinh.py",
+        "atan.py",
+        "atanh.py",
+    ]
+
+    all_tests = binary_tests + unary_tests
+
+    print("\033[94m" + "=" * 60 + "\033[0m")
+    print("\033[94mTesting Binary and Unary Operators (Refactored)\033[0m")
+    print("\033[94m" + "=" * 60 + "\033[0m")
+    print(f"\033[94mTotal tests: {len(all_tests)}\033[0m")
+    print(f"\033[94m  - Binary operators: {len(binary_tests)}\033[0m")
+    print(f"\033[94m  - Unary operators: {len(unary_tests)}\033[0m")
+    print()
+
+    for test in all_tests:
+        if not os.path.exists(test):
+            print(f"\033[93m[SKIP] {test} - test file not found\033[0m")
+            continue
+
+        print(f"\033[96m[RUN] {test}\033[0m", end=" ... ", flush=True)
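+        # Each test file runs in its own process so a crash in one operator
+        # cannot abort the whole sweep; output is captured and replayed
+        # only when the test fails.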
", flush=True) + result = subprocess.run( + f"python3 {test} {args}", + text=True, + encoding="utf-8", + shell=True, + capture_output=True + ) + + if result.returncode != 0: + print(f"\033[91m[FAIL]\033[0m") + print(f"\033[91mError output:\033[0m") + print(result.stderr) + failed.append(test) + else: + print(f"\033[92m[PASS]\033[0m") + + return failed + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Test refactored binary and unary operators", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Test on CPU only (default) + python3 scripts/test_binary_unary.py --cpu + + # Test on NVIDIA GPU only + python3 scripts/test_binary_unary.py --nvidia + + # Test on both CPU and NVIDIA + python3 scripts/test_binary_unary.py --cpu --nvidia + + # Test with debug mode + python3 scripts/test_binary_unary.py --cpu --debug + + # Test with profiling + python3 scripts/test_binary_unary.py --nvidia --profile + """ + ) + + # Device selection arguments (same as test files) + parser.add_argument("--cpu", action="store_true", help="Run CPU tests") + parser.add_argument("--nvidia", action="store_true", help="Run NVIDIA GPU tests") + parser.add_argument("--debug", action="store_true", help="Enable debug mode") + parser.add_argument("--profile", action="store_true", help="Enable profiling") + + args, unknown = parser.parse_known_args() + + # Build command line arguments to pass to test files + test_args = [] + if args.cpu: + test_args.append("--cpu") + if args.nvidia: + test_args.append("--nvidia") + if args.debug: + test_args.append("--debug") + if args.profile: + test_args.append("--profile") + + # Add any unknown arguments (for compatibility) + test_args.extend(unknown) + + set_env() + failed = run_tests(" ".join(test_args)) + + print() + print("\033[94m" + "=" * 60 + "\033[0m") + if len(failed) == 0: + print("\033[92m✓ All tests passed!\033[0m") + else: + print(f"\033[91m✗ {len(failed)} test(s) failed:\033[0m") + for test in failed: + print(f"\033[91m - {test}\033[0m") + print("\033[94m" + "=" * 60 + "\033[0m") + + exit(len(failed)) diff --git a/src/infiniop/elementwise/binary.h b/src/infiniop/elementwise/binary.h new file mode 100644 index 000000000..1823fac3f --- /dev/null +++ b/src/infiniop/elementwise/binary.h @@ -0,0 +1,261 @@ +#ifndef __INFINIOP_ELEMENTWISE_BINARY_H__ +#define __INFINIOP_ELEMENTWISE_BINARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include +#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::binary { + +/** + * @brief Represents all the currently defined binary operations. + * + * This enum is used to specify which binary operation to perform + * in the generic BinaryOp template. + */ +enum class BinaryMode { + // Arithmetic operations: + Add, + Subtract, + Multiply, + Divide, + Pow, + Mod, + Max, + Min, + // Logical operations (for future use): + // And, Or, Xor, Less, LessOrEqual, Equal, Greater, GreaterOrEqual +}; + +/** + * @brief Generic binary operation template that performs different operations + * based on the specified BinaryMode. + * + * This template allows multiple binary operators (pow, div, mod, min, max, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. 
+ *
+ * @tparam Mode The binary operation mode (from BinaryMode enum)
+ */
+template <BinaryMode Mode>
+struct BinaryOp {
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    T operator()(const T &a, const T &b) const {
+        if constexpr (Mode == BinaryMode::Add) {
+            return a + b;
+        } else if constexpr (Mode == BinaryMode::Subtract) {
+            return a - b;
+        } else if constexpr (Mode == BinaryMode::Multiply) {
+            return a * b;
+        } else if constexpr (Mode == BinaryMode::Divide) {
+            return a / b;
+        } else if constexpr (Mode == BinaryMode::Pow) {
+            return std::pow(a, b);
+        } else if constexpr (Mode == BinaryMode::Mod) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmod(a, b);
+            } else {
+                return a % b;
+            }
+        } else if constexpr (Mode == BinaryMode::Max) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmax(a, b);
+            } else {
+                return std::max(a, b);
+            }
+        } else if constexpr (Mode == BinaryMode::Min) {
+            if constexpr (std::is_floating_point_v<T>) {
+                return std::fmin(a, b);
+            } else {
+                return std::min(a, b);
+            }
+        } else {
+            static_assert(Mode != Mode, "Unsupported binary operation mode");
+            return a;
+        }
+    }
+};
+
+#ifdef __CUDACC__
+/**
+ * @brief CUDA-specific binary operation template that performs different operations
+ * based on the specified BinaryMode, using CUDA-optimized functions.
+ *
+ * This template provides CUDA device functions optimized for GPU execution,
+ * using intrinsics like __powf, __h2div, __hmin2, __hmax2, etc.
+ *
+ * @tparam Mode The binary operation mode (from BinaryMode enum)
+ */
+namespace cuda {
+template <BinaryMode Mode>
+struct BinaryOp {
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        if constexpr (Mode == BinaryMode::Add) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hadd2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hadd(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fadd_rn(a, b);
+            } else {
+                return a + b;
+            }
+        } else if constexpr (Mode == BinaryMode::Subtract) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hsub2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hsub(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fsub_rn(a, b);
+            } else {
+                return a - b;
+            }
+        } else if constexpr (Mode == BinaryMode::Multiply) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmul2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return __hmul(a, b);
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fmul_rd(a, b);
+            } else {
+                return a * b;
+            }
+        } else if constexpr (Mode == BinaryMode::Divide) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __h2div(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a / b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __fdividef(a, b);
+            } else {
+                return a / b;
+            }
+        } else if constexpr (Mode == BinaryMode::Pow) {
+            if constexpr (std::is_same_v<T, half2>) {
+                float2 a_f2 = __half22float2(a);
+                float2 b_f2 = __half22float2(b);
+                return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)));
+            } else if constexpr (std::is_same_v<T, half>) {
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                float ans_f = __powf(a_, b_);
+                // __powf is a fast approximation that yields NaN for negative
+                // bases; fall back to the accurate pow in that case.
+                return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f);
+            } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+                float2 a_f2 = __bfloat1622float2(a);
+                float2 b_f2 = __bfloat1622float2(b);
+                return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y));
+            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+                float a_ = __bfloat162float(a);
+                float b_ = __bfloat162float(b);
+                return __float2bfloat16_rn(__powf(a_, b_));
+            } else if constexpr (std::is_same_v<T, float>) {
+                return __powf(a, b);
+            } else {
+                return std::pow(a, b);
+            }
+        } else if constexpr (Mode == BinaryMode::Mod) {
+            if constexpr (std::is_same_v<T, half2>) {
+                float2 a_f2 = __half22float2(a);
+                float2 b_f2 = __half22float2(b);
+                return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y)));
+            } else if constexpr (std::is_same_v<T, half>) {
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                return __float2half(std::fmod(a_, b_));
+            } else if constexpr (std::is_floating_point_v<T>) {
+                return std::fmod(a, b);
+            } else {
+                return a % b;
+            }
+        } else if constexpr (Mode == BinaryMode::Max) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmax2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a > b ? a : b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return fmaxf(a, b);
+            } else {
+                return a > b ? a : b;
+            }
+        } else if constexpr (Mode == BinaryMode::Min) {
+            if constexpr (std::is_same_v<T, half2>) {
+                return __hmin2(a, b);
+            } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
+                return a < b ? a : b;
+            } else if constexpr (std::is_same_v<T, float>) {
+                return fminf(a, b);
+            } else {
+                return a < b ? a : b;
+            }
+        } else {
+            static_assert(Mode != Mode, "Unsupported binary operation mode");
+            return a;
+        }
+    }
+};
+} // namespace cuda
+#endif // __CUDACC__
+
+/**
+ * @brief Macro to define a binary elementwise descriptor for a specific operation.
+ *
+ * This macro simplifies the definition of binary operators (pow, div, mod, min, max, etc.)
+ * by automatically generating the Descriptor class and operation struct using the
+ * ELEMENTWISE_DESCRIPTOR macro and the BinaryOp template.
+ *
+ * Usage:
+ *   BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, BinaryMode::Pow)
+ *   BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, BinaryMode::Divide)
+ *
+ * @param OP The operator name (e.g., pow, div, mod)
+ * @param NAMESPACE The device namespace (e.g., cpu, nvidia)
+ * @param MODE The BinaryMode enum value for this operation
+ */
+#define BINARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE)    \
+                                                              \
+    ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                     \
+                                                              \
+    namespace op::OP::NAMESPACE {                             \
+    using Op = op::elementwise::binary::BinaryOp<MODE>;       \
+    }
+
+/**
+ * @brief Macro to define a binary elementwise descriptor for the CUDA/NVIDIA backend.
+ *
+ * This macro is similar to BINARY_ELEMENTWISE_DESCRIPTOR but uses the CUDA-specific
+ * BinaryOp implementation for better GPU performance.
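+ *
+ * Sketch of the resulting lookup (assuming ELEMENTWISE_DESCRIPTOR supplies the
+ * Descriptor class itself): the nvidia backend finds the intrinsic-based
+ * functor under op::<OP>::cuda, e.g. for div:
+ *   namespace op::div::cuda {
+ *   using Op = op::elementwise::binary::cuda::BinaryOp<BinaryMode::Divide>;
+ *   }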
+ *
+ * Usage:
+ *   BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(pow, nvidia, BinaryMode::Pow)
+ *   BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(div, nvidia, BinaryMode::Divide)
+ *
+ * @param OP The operator name (e.g., pow, div, mod)
+ * @param NAMESPACE The device namespace (e.g., nvidia)
+ * @param MODE The BinaryMode enum value for this operation
+ */
+#ifdef __CUDACC__
+#define BINARY_ELEMENTWISE_DESCRIPTOR_CUDA(OP, NAMESPACE, MODE)    \
+                                                                   \
+    ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                          \
+                                                                   \
+    namespace op::OP::cuda {                                       \
+    using Op = op::elementwise::binary::cuda::BinaryOp<MODE>;      \
+    }
+#endif // __CUDACC__
+
+} // namespace op::elementwise::binary
+
+#endif // __INFINIOP_ELEMENTWISE_BINARY_H__
diff --git a/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h
new file mode 100644
index 000000000..030f4d87e
--- /dev/null
+++ b/src/infiniop/elementwise/cpu/elementwise_cpu_impl.h
@@ -0,0 +1,130 @@
+#ifndef __INFINIOP_ELEMENTWISE_CPU_IMPL_H__
+#define __INFINIOP_ELEMENTWISE_CPU_IMPL_H__
+
+#include "../../../utils/check.h"
+#include "../../../utils/result.hpp"
+#include "../../devices/cpu/common_cpu.h"
+#include "elementwise_cpu.h"
+
+/**
+ * @brief Generic implementation for elementwise CPU operators.
+ *
+ * This file provides a generic implementation template that can be used
+ * by all binary and unary operators to reduce code duplication.
+ *
+ * Usage:
+ *   #include "elementwise_cpu_impl.h"
+ *   namespace op::pow::cpu {
+ *   using Op = op::elementwise::binary::BinaryOp<BinaryMode::Pow>;
+ *   ELEMENTWISE_CPU_IMPL_BINARY(pow)
+ *   }
+ *
+ *   namespace op::sqrt::cpu {
+ *   using Op = op::elementwise::unary::UnaryOp<UnaryMode::Sqrt>;
+ *   ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
+ *   }
+ */
+
+/**
+ * @brief Macro to generate binary operator implementation.
+ *
+ * This macro generates the Descriptor destructor, create, and calculate methods
+ * for binary operators, using the generic implementation.
+ *
+ * Usage:
+ *   namespace op::pow::cpu {
+ *   using Op = op::elementwise::binary::BinaryOp<BinaryMode::Pow>;
+ *   ELEMENTWISE_CPU_IMPL_BINARY(pow)
+ *   }
+ */
+#define ELEMENTWISE_CPU_IMPL_BINARY(OP)                                             \
+                                                                                    \
+    Descriptor::~Descriptor() = default;                                            \
+                                                                                    \
+    infiniStatus_t Descriptor::create(                                              \
+        infiniopHandle_t handle_,                                                   \
+        Descriptor **desc_ptr,                                                      \
+        infiniopTensorDescriptor_t out_desc,                                        \
+        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {                   \
+        auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);             \
+        auto dtype = out_desc->dtype();                                             \
+        const auto &a_desc = input_desc_vec.at(0);                                  \
+        const auto &b_desc = input_desc_vec.at(1);                                  \
+        const auto &out_shape = out_desc->shape();                                  \
+        const auto &a_shape = a_desc->shape();                                      \
+        const auto &b_shape = b_desc->shape();                                      \
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);                     \
+        CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);                              \
+        CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \
+        return INFINI_STATUS_SUCCESS;                                               \
+    }                                                                               \
+                                                                                    \
+    infiniStatus_t Descriptor::calculate(                                           \
+        void *workspace,                                                            \
+        size_t workspace_size,                                                      \
+        void *output,                                                               \
+        std::vector<const void *> inputs,                                           \
+        void *stream) const {                                                       \
+        switch (_dtype) {                                                           \
+        case INFINI_DTYPE_F16:                                                      \
+            return _device_info->template calculate<Op, fp16_t>(                    \
+                _info, output, inputs, stream);                                     \
+        case INFINI_DTYPE_F32:                                                      \
+            return _device_info->template calculate<Op, float>(                     \
+                _info, output, inputs, stream);                                     \
+        default:                                                                    \
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;                                  \
+        }                                                                           \
+    }
+
+/**
+ * @brief Macro to generate unary operator implementation.
+ *
+ * This macro generates the Descriptor destructor, create, and calculate methods
+ * for unary operators, using the generic implementation.
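+ *
+ * With this macro a complete CPU operator source file shrinks to a few lines;
+ * an illustrative sketch for sqrt (mirroring the abs refactor in this patch):
+ *   #include "sqrt_cpu.h"
+ *   #include "../../../elementwise/cpu/elementwise_cpu_impl.h"
+ *   namespace op::sqrt::cpu {
+ *   ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
+ *   }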
+ * + * Usage: + * namespace op::sqrt::cpu { + * using Op = op::elementwise::unary::UnaryOp; + * ELEMENTWISE_CPU_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_CPU_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->template calculate( \ + _info, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_CPU_IMPL_H__ diff --git a/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh new file mode 100644 index 000000000..39b78884a --- /dev/null +++ b/src/infiniop/elementwise/nvidia/elementwise_nvidia_impl.cuh @@ -0,0 +1,134 @@ +#ifndef __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ +#define __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ + +#include "../../../utils/check.h" +#include "../../../utils/result.hpp" +#include "../../devices/nvidia/nvidia_common.cuh" +#include "elementwise_nvidia.cuh" +#include +#include + +/** + * @brief Generic implementation for elementwise NVIDIA/CUDA operators. + * + * This file provides a generic implementation template that can be used + * by all binary and unary operators to reduce code duplication. + * + * Usage: + * #include "elementwise_nvidia_impl.cuh" + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + * + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ + +/** + * @brief Macro to generate binary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for binary operators, using the generic implementation. 
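+ *
+ * The matching NVIDIA operator source reduces the same way (sketch; the .cu
+ * translation unit must be compiled by nvcc so that cuda::Op and the
+ * calculate<256, cuda::Op, T> instantiations are available):
+ *   #include "pow_nvidia.cuh"
+ *   #include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
+ *   namespace op::pow::nvidia {
+ *   ELEMENTWISE_NVIDIA_IMPL_BINARY(pow)
+ *   }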
+ * + * Usage: + * namespace op::pow::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_BINARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &a_desc = input_desc_vec.at(0); \ + const auto &b_desc = input_desc_vec.at(1); \ + const auto &c_shape = out_desc->shape(); \ + const auto &a_shape = a_desc->shape(); \ + const auto &b_shape = b_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +/** + * @brief Macro to generate unary operator implementation for NVIDIA/CUDA. + * + * This macro generates the Descriptor destructor, create, and calculate methods + * for unary operators, using the generic implementation. + * + * Usage: + * namespace op::sqrt::nvidia { + * ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt) + * } + */ +#define ELEMENTWISE_NVIDIA_IMPL_UNARY(OP) \ + \ + Descriptor::~Descriptor() = default; \ + \ + infiniStatus_t Descriptor::create( \ + infiniopHandle_t handle_, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + std::vector input_desc_vec) { \ + auto handle = reinterpret_cast(handle_); \ + auto dtype = out_desc->dtype(); \ + const auto &x_desc = input_desc_vec.at(0); \ + const auto &y_shape = out_desc->shape(); \ + const auto &x_shape = x_desc->shape(); \ + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); \ + CHECK_SAME_SHAPE(y_shape, x_shape); \ + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); \ + return INFINI_STATUS_SUCCESS; \ + } \ + \ + infiniStatus_t Descriptor::calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const { \ + if (workspace_size < _workspace_size) { \ + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; \ + } \ + switch (_dtype) { \ + case INFINI_DTYPE_F16: \ + return _device_info->calculate<256, cuda::Op, half>( \ + _info, workspace, output, inputs, stream); \ + case INFINI_DTYPE_F32: \ + return _device_info->calculate<256, cuda::Op, float>( \ + _info, workspace, output, inputs, stream); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } \ + } + +#endif // __INFINIOP_ELEMENTWISE_NVIDIA_IMPL_CUH__ diff --git a/src/infiniop/elementwise/unary.h b/src/infiniop/elementwise/unary.h new file mode 100644 index 000000000..9f41dedb2 --- /dev/null +++ b/src/infiniop/elementwise/unary.h @@ -0,0 +1,524 @@ +#ifndef __INFINIOP_ELEMENTWISE_UNARY_H__ +#define __INFINIOP_ELEMENTWISE_UNARY_H__ + +#include +#include +#include + +#ifdef __CUDACC__ +#include 
+#include +#include +// Include device-specific type aliases for cuda_bfloat16 +#include "../devices/nvidia/nvidia_kernel_common.cuh" +#endif + +namespace op::elementwise::unary { + +/** + * @brief Represents all the currently defined unary operations. + * + * This enum is used to specify which unary operation to perform + * in the generic UnaryOp template. + */ +enum class UnaryMode { + // Math operations: + Abs, + Exp, + Log, + Reciprocal, + Sqrt, + Neg, + Ceil, + Floor, + Round, + Sin, + Cos, + Tan, + Asin, + Acos, + Atan, + Sinh, + Cosh, + Tanh, + Asinh, + Acosh, + Atanh, + Relu, + Sigmoid, + Sign, + Erf, +}; + +/** + * @brief Generic unary operation template that performs different operations + * based on the specified UnaryMode. + * + * This template allows multiple unary operators (abs, log, sin, cos, etc.) + * to share the same implementation infrastructure while only differing in the + * operation mode. + * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + return std::exp(x); + } else if constexpr (Mode == UnaryMode::Log) { + return std::log(x); + } else if constexpr (Mode == UnaryMode::Reciprocal) { + return T(1) / x; + } else if constexpr (Mode == UnaryMode::Sqrt) { + return std::sqrt(x); + } else if constexpr (Mode == UnaryMode::Neg) { + return -x; + } else if constexpr (Mode == UnaryMode::Ceil) { + return std::ceil(x); + } else if constexpr (Mode == UnaryMode::Floor) { + return std::floor(x); + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + return std::sin(x); + } else if constexpr (Mode == UnaryMode::Cos) { + return std::cos(x); + } else if constexpr (Mode == UnaryMode::Tan) { + return std::tan(x); + } else if constexpr (Mode == UnaryMode::Asin) { + return std::asin(x); + } else if constexpr (Mode == UnaryMode::Acos) { + return std::acos(x); + } else if constexpr (Mode == UnaryMode::Atan) { + return std::atan(x); + } else if constexpr (Mode == UnaryMode::Sinh) { + return std::sinh(x); + } else if constexpr (Mode == UnaryMode::Cosh) { + return std::cosh(x); + } else if constexpr (Mode == UnaryMode::Tanh) { + return std::tanh(x); + } else if constexpr (Mode == UnaryMode::Asinh) { + return std::asinh(x); + } else if constexpr (Mode == UnaryMode::Acosh) { + return std::acosh(x); + } else if constexpr (Mode == UnaryMode::Atanh) { + return std::atanh(x); + } else if constexpr (Mode == UnaryMode::Relu) { + return x > T(0) ? x : T(0); + } else if constexpr (Mode == UnaryMode::Sigmoid) { + return T(1) / (T(1) + std::exp(-x)); + } else if constexpr (Mode == UnaryMode::Sign) { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } else if constexpr (Mode == UnaryMode::Erf) { + return std::erf(x); + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; + +#ifdef __CUDACC__ +/** + * @brief CUDA-specific unary operation template that performs different operations + * based on the specified UnaryMode, using CUDA-optimized functions. + * + * This template provides CUDA device functions optimized for GPU execution, + * using intrinsics like __habs2, __logf, __sinf, etc. 
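+ *
+ * Example (illustrative): for f16 tensors the elementwise framework can pack
+ * two neighboring values into one half2, so UnaryOp<UnaryMode::Abs> costs a
+ * single __habs2 per pair rather than two scalar __habs calls.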
+ * + * @tparam Mode The unary operation mode (from UnaryMode enum) + */ +namespace cuda { +template +struct UnaryOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (Mode == UnaryMode::Abs) { + if constexpr (std::is_same_v) { + return __habs2(x); + } else if constexpr (std::is_same_v) { + return __habs(x); + } else if constexpr (std::is_floating_point_v) { + return std::fabs(x); + } else { + return std::abs(x); + } + } else if constexpr (Mode == UnaryMode::Exp) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__expf(x_f2.x), __expf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__expf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float2 x_f2 = __bfloat1622float2(x); + return __floats2bfloat162_rn(__expf(x_f2.x), __expf(x_f2.y)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__expf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __expf(x); + } else { + return std::exp(x); + } + } else if constexpr (Mode == UnaryMode::Log) { + if constexpr (std::is_same_v) { + return h2log(x); + } else if constexpr (std::is_same_v) { + return __float2half(__logf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(logf(x0), logf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(logf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __logf(x); + } else { + return std::log(x); + } + } else if constexpr (Mode == UnaryMode::Reciprocal) { + if constexpr (std::is_same_v) { + return h2rcp(x); + } else if constexpr (std::is_same_v) { + return hrcp(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __frcp_rn(x); + } else { + return T(1) / x; + } + } else if constexpr (Mode == UnaryMode::Sqrt) { + if constexpr (std::is_same_v) { + return h2sqrt(x); + } else if constexpr (std::is_same_v) { + return hsqrt(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sqrtf(x0), sqrtf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sqrtf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __fsqrt_rn(x); + } else { + return std::sqrt(x); + } + } else if constexpr (Mode == UnaryMode::Neg) { + if constexpr (std::is_same_v) { + return __hneg2(x); + } else if constexpr (std::is_same_v) { + return __hneg(x); + } else { + return -x; + } + } else if constexpr (Mode == UnaryMode::Ceil) { + if constexpr (std::is_same_v) { + return h2ceil(x); + } else if constexpr (std::is_same_v) { + return hceil(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(ceilf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return ceilf(x); 
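+                // Integral values are already whole numbers, so Ceil (and
+                // Floor/Round below) is the identity for integer types.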
+ } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::ceil(x); + } + } else if constexpr (Mode == UnaryMode::Floor) { + if constexpr (std::is_same_v) { + return h2floor(x); + } else if constexpr (std::is_same_v) { + return hfloor(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(floorf(x0), floorf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(floorf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return floorf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::floor(x); + } + } else if constexpr (Mode == UnaryMode::Round) { + if constexpr (std::is_same_v) { + return h2rint(x); + } else if constexpr (std::is_same_v) { + return hrint(x); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(rintf(x0), rintf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(rintf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return rintf(x); + } else if constexpr (std::is_integral_v) { + return x; + } else { + return std::nearbyint(x); + } + } else if constexpr (Mode == UnaryMode::Sin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__sinf(x_f2.x), __sinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__sinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(sinf(x0), sinf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(sinf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __sinf(x); + } else { + return std::sin(x); + } + } else if constexpr (Mode == UnaryMode::Cos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(__cosf(x_f2.x), __cosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(__cosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(cosf(x0), cosf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(cosf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return __cosf(x); + } else { + return std::cos(x); + } + } else if constexpr (Mode == UnaryMode::Tan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(tanf(x_f2.x), tanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(tanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return tanf(x); + } else { + return std::tan(x); + } + } else if constexpr (Mode == UnaryMode::Asin) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(asinf(x_f2.x), asinf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(asinf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return asinf(x); + } else { + return std::asin(x); + } + } else if constexpr (Mode == UnaryMode::Acos) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return 
__float22half2_rn(make_float2(acosf(x_f2.x), acosf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(acosf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return acosf(x); + } else { + return std::acos(x); + } + } else if constexpr (Mode == UnaryMode::Atan) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(atanf(x_f2.x), atanf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(atanf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return atanf(x); + } else { + return std::atan(x); + } + } else if constexpr (Mode == UnaryMode::Sinh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(sinhf(x_f2.x), sinhf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(sinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return sinhf(x); + } else { + return std::sinh(x); + } + } else if constexpr (Mode == UnaryMode::Cosh) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(coshf(x_f2.x), coshf(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(coshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + return coshf(x); + } else { + return std::cosh(x); + } + } else if constexpr (Mode == UnaryMode::Tanh) { + if constexpr (std::is_same_v) { + return __h2tanh(x); + } else if constexpr (std::is_same_v) { + return __float2half(tanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(tanhf(f0), tanhf(f1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(tanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return tanhf(x); + } else { + return std::tanh(x); + } + } else if constexpr (Mode == UnaryMode::Asinh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(asinhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(asinhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return asinhf(x); + } else { + return std::asinh(x); + } + } else if constexpr (Mode == UnaryMode::Acosh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return __float2half(acoshf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(acoshf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return acoshf(x); + } else { + return std::acosh(x); + } + } else if constexpr (Mode == UnaryMode::Atanh) { + if constexpr (std::is_same_v) { + return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); + } else if constexpr (std::is_same_v) { + return 
__float2half(atanhf(__half2float(x))); + } else if constexpr (std::is_same_v) { + float x0 = __bfloat162float(__low2bfloat16(x)); + float x1 = __bfloat162float(__high2bfloat16(x)); + return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(atanhf(__bfloat162float(x))); + } else if constexpr (std::is_same_v) { + return atanhf(x); + } else { + return std::atanh(x); + } + } else if constexpr (Mode == UnaryMode::Relu) { + if constexpr (std::is_same_v) { + return __hmax2(x, __floats2half2_rn(0.0f, 0.0f)); + } else { + return x > T(0) ? x : T(0); + } + } else if constexpr (Mode == UnaryMode::Sigmoid) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + float2 exp_neg_x = make_float2(__expf(-x_f2.x), __expf(-x_f2.y)); + return __float22half2_rn(make_float2(1.0f / (1.0f + exp_neg_x.x), 1.0f / (1.0f + exp_neg_x.y))); + } else if constexpr (std::is_same_v) { + float x_ = __half2float(x); + return __float2half(1.0f / (1.0f + __expf(-x_))); + } else if constexpr (std::is_same_v) { + return 1.0f / (1.0f + __expf(-x)); + } else { + return T(1) / (T(1) + std::exp(-x)); + } + } else if constexpr (Mode == UnaryMode::Sign) { + if constexpr (std::is_same_v) { + const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f)); + return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask)); + } else if constexpr (std::is_same_v) { + return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1)); + } else { + return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1)); + } + } else if constexpr (Mode == UnaryMode::Erf) { + if constexpr (std::is_same_v) { + float2 x_f2 = __half22float2(x); + return __float22half2_rn(make_float2(erff(x_f2.x), erff(x_f2.y))); + } else if constexpr (std::is_same_v) { + return __float2half(erff(__half2float(x))); + } else if constexpr (std::is_same_v) { + return erff(x); + } else { + return std::erf(x); + } + } else { + static_assert(Mode != Mode, "Unsupported unary operation mode"); + return x; + } + } +}; +} // namespace cuda +#endif // __CUDACC__ + +/** + * @brief Macro to define a unary elementwise descriptor for a specific operation. + * + * This macro simplifies the definition of unary operators (abs, log, sin, cos, etc.) + * by automatically generating the Descriptor class and operation struct using the + * ELEMENTWISE_DESCRIPTOR macro and UnaryOp template. + * + * Usage: + * UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, UnaryMode::Abs) + * UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, UnaryMode::Log) + * + * @param OP The operator name (e.g., abs, log, sin) + * @param NAMESPACE The device namespace (e.g., cpu, nvidia) + * @param MODE The UnaryMode enum value for this operation + */ +#define UNARY_ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, MODE) \ + \ + ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ + \ + namespace op::OP::NAMESPACE { \ + using Op = op::elementwise::unary::UnaryOp; \ + } + +} // namespace op::elementwise::unary + +#endif // __INFINIOP_ELEMENTWISE_UNARY_H__ diff --git a/src/infiniop/operator_impl.h b/src/infiniop/operator_impl.h new file mode 100644 index 000000000..3ff543f7e --- /dev/null +++ b/src/infiniop/operator_impl.h @@ -0,0 +1,288 @@ +#ifndef __INFINIOP_OPERATOR_IMPL_H__ +#define __INFINIOP_OPERATOR_IMPL_H__ + +#include "handle.h" +#include "operator.h" + +// Conditional compilation helpers +#ifdef ENABLE_CPU_API +#define IF_ENABLE_CPU_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CPU_API(...) +#endif + +#ifdef ENABLE_NVIDIA_API +#define IF_ENABLE_NVIDIA_API(...) 
__VA_ARGS__ +#else +#define IF_ENABLE_NVIDIA_API(...) +#endif + +#ifdef ENABLE_ILUVATAR_API +#define IF_ENABLE_ILUVATAR_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_ILUVATAR_API(...) +#endif + +#ifdef ENABLE_QY_API +#define IF_ENABLE_QY_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_QY_API(...) +#endif + +#ifdef ENABLE_METAX_API +#define IF_ENABLE_METAX_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_METAX_API(...) +#endif + +#ifdef ENABLE_KUNLUN_API +#define IF_ENABLE_KUNLUN_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_KUNLUN_API(...) +#endif + +#ifdef ENABLE_CAMBRICON_API +#define IF_ENABLE_CAMBRICON_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_CAMBRICON_API(...) +#endif + +#ifdef ENABLE_MOORE_API +#define IF_ENABLE_MOORE_API(...) __VA_ARGS__ +#else +#define IF_ENABLE_MOORE_API(...) +#endif + +/** + * Binary operator implementation macros + */ +#define BINARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, c_desc, a_desc, b_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, b_desc});) + +#define BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, METAX, metax, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, c_desc, a_desc, b_desc) \ + BINARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, c_desc, a_desc, b_desc) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, c, a, b) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream);) + +#define BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, c, a, b) \ + BINARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, c, a, b) + +#define BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete 
reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + BINARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define BINARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc) { \ + switch (handle->device) { \ + BINARY_OP_IMPL_DEVICE_CASES(OP_NAME, c_desc, a_desc, b_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_CALCULATE_CASES(OP_NAME, c, a, b) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + BINARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +/** + * Unary operator implementation macros + */ +#define UNARY_OP_IMPL_CASE(OP_NAME, DEVICE, NAMESPACE, y_desc, x_desc) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return op::OP_NAME::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc});) + +#define UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CPU, cpu, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, NVIDIA, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, ILUVATAR, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, QY, nvidia, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, METAX, metax, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, KUNLUN, kunlun, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, CAMBRICON, bang, y_desc, x_desc) \ + UNARY_OP_IMPL_CASE(OP_NAME, MOORE, moore, y_desc, x_desc) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, KUNLUN, 
kunlun) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_GET_WORKSPACE_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, DEVICE, NAMESPACE, y, x) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream);) + +#define UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CPU, cpu, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, NVIDIA, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, ILUVATAR, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, QY, nvidia, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, METAX, metax, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, KUNLUN, kunlun, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, CAMBRICON, bang, y, x) \ + UNARY_OP_IMPL_CALCULATE_CASE(OP_NAME, MOORE, moore, y, x) + +#define UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, DEVICE, NAMESPACE) \ + IF_ENABLE_##DEVICE##_API( \ + case INFINI_DEVICE_##DEVICE \ + : delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS;) + +#define UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CPU, cpu) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, NVIDIA, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, ILUVATAR, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, QY, nvidia) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, METAX, metax) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, KUNLUN, kunlun) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, CAMBRICON, bang) \ + UNARY_OP_IMPL_DESTROY_CASE(OP_NAME, MOORE, moore) + +#define UNARY_OP_IMPL(OP_NAME, OP_NAME_UPPER) \ + __C infiniStatus_t infiniopCreate##OP_NAME_UPPER##Descriptor( \ + infiniopHandle_t handle, \ + infiniop##OP_NAME_UPPER##Descriptor_t *desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc) { \ + switch (handle->device) { \ + UNARY_OP_IMPL_DEVICE_CASES(OP_NAME, y_desc, x_desc) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopGet##OP_NAME_UPPER##WorkspaceSize( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + size_t *size) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_GET_WORKSPACE_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + __C infiniStatus_t infiniop##OP_NAME_UPPER( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc, \ + void *workspace, \ + size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_CALCULATE_CASES(OP_NAME, y, x) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } \ + __C infiniStatus_t infiniopDestroy##OP_NAME_UPPER##Descriptor( \ + infiniop##OP_NAME_UPPER##Descriptor_t desc) { \ + switch (desc->device_type) { \ + UNARY_OP_IMPL_DESTROY_CASES(OP_NAME) \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + } + +#endif // __INFINIOP_OPERATOR_IMPL_H__ diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.cc b/src/infiniop/ops/abs/cpu/abs_cpu.cc index 7d6e81d04..d4b541ba7 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.cc +++ b/src/infiniop/ops/abs/cpu/abs_cpu.cc @@ -1,48 +1,8 @@ #include "abs_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::abs::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - 
infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::cpu diff --git a/src/infiniop/ops/abs/cpu/abs_cpu.h b/src/infiniop/ops/abs/cpu/abs_cpu.h index 5b9773298..cba8274e6 100644 --- a/src/infiniop/ops/abs/cpu/abs_cpu.h +++ b/src/infiniop/ops/abs/cpu/abs_cpu.h @@ -1,26 +1,9 @@ #ifndef __ABS_CPU_H__ #define __ABS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(abs, cpu) - -namespace op::abs::cpu { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; -} // namespace op::abs::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(abs, cpu, op::elementwise::unary::UnaryMode::Abs) #endif // __ABS_CPU_H__ diff --git a/src/infiniop/ops/abs/cuda/kernel.cuh b/src/infiniop/ops/abs/cuda/kernel.cuh index d7ff2db12..406aa423f 100644 --- a/src/infiniop/ops/abs/cuda/kernel.cuh +++ b/src/infiniop/ops/abs/cuda/kernel.cuh @@ -1,26 +1,10 @@ #ifndef __ABS_CUDA_H__ #define __ABS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::abs::cuda { -typedef struct AbsOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __habs2(x); - } else if constexpr (std::is_same_v) { - return __habs(x); - } else if constexpr (std::is_floating_point_v) { - return std::fabs(x); - } else { - return std::abs(x); - } - } -} AbsOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::abs::cuda #endif // __ABS_CUDA_H__ diff --git a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu index 485f0406a..b9687226a 100644 --- a/src/infiniop/ops/abs/nvidia/abs_nvidia.cu +++ b/src/infiniop/ops/abs/nvidia/abs_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "abs_nvidia.cuh" namespace op::abs::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(abs) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = 
x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AbsOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AbsOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::abs::nvidia diff --git a/src/infiniop/ops/abs/operator.cc b/src/infiniop/ops/abs/operator.cc index b6820079d..051b8711a 100644 --- a/src/infiniop/ops/abs/operator.cc +++ b/src/infiniop/ops/abs/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/abs.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/abs_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAbsDescriptor( - infiniopHandle_t handle, - infiniopAbsDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::abs::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAbsWorkspaceSize(infiniopAbsDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAbs( - infiniopAbsDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAbsDescriptor(infiniopAbsDescriptor_t desc) { - 
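An aside on the pattern at work in these deletions: every removed per-operator functor (AbsOp above, AcosOp below, and so on) collapses into the single UnaryOp template from elementwise/unary.h, which branches on a non-type UnaryMode parameter with `if constexpr`, so untaken modes are discarded at compile time and each instantiation compiles down to exactly one math routine. A minimal host-only sketch of the idea follows; the three-mode enum is illustrative, not the patch's full mode list, and the dependent-false static_assert mirrors the one in the patch.

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    enum class UnaryMode { Abs, Relu, Sigmoid };

    template <UnaryMode Mode>
    struct UnaryOp {
        static constexpr size_t num_inputs = 1;
        template <typename T>
        T operator()(const T &x) const {
            if constexpr (Mode == UnaryMode::Abs) {
                return std::fabs(x);
            } else if constexpr (Mode == UnaryMode::Relu) {
                return x > T(0) ? x : T(0);
            } else if constexpr (Mode == UnaryMode::Sigmoid) {
                return T(1) / (T(1) + std::exp(-x));
            } else {
                // Dependent-false assert: fires only if an unhandled
                // mode is actually instantiated.
                static_assert(Mode != Mode, "Unsupported unary operation mode");
                return x;
            }
        }
    };

    int main() {
        std::printf("%f\n", UnaryOp<UnaryMode::Relu>{}(-2.5f)); // 0.000000
        std::printf("%f\n", UnaryOp<UnaryMode::Abs>{}(-2.5f));  // 2.500000
    }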
-#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(abs, Abs) diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.cc b/src/infiniop/ops/acos/cpu/acos_cpu.cc index 1accb6752..9be4ca1fe 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.cc +++ b/src/infiniop/ops/acos/cpu/acos_cpu.cc @@ -1,48 +1,8 @@ #include "acos_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acos::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::cpu diff --git a/src/infiniop/ops/acos/cpu/acos_cpu.h b/src/infiniop/ops/acos/cpu/acos_cpu.h index 14e74b75c..50900e217 100644 --- a/src/infiniop/ops/acos/cpu/acos_cpu.h +++ b/src/infiniop/ops/acos/cpu/acos_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOS_CPU_H__ #define __ACOS_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acos, cpu) - -namespace op::acos::cpu { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acos(x); - } -} AcosOp; -} // namespace op::acos::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acos, cpu, op::elementwise::unary::UnaryMode::Acos) #endif // __ACOS_CPU_H__ diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh index c3281c7e3..b62bf1e88 100644 --- a/src/infiniop/ops/acos/cuda/kernel.cuh +++ b/src/infiniop/ops/acos/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOS_CUDA_H__ #define __ACOS_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acos::cuda { -typedef struct AcosOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acosf(__half2float(__low2half(x))), acosf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acosf(__half2float(x))); - } else if 
constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acosf(x0), acosf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acosf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acosf(x); - } else { - return std::acos(x); - } - } -} AcosOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acos::cuda #endif // __ACOS_CUDA_H__ diff --git a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu index 8480219bc..e7cf1feea 100644 --- a/src/infiniop/ops/acos/nvidia/acos_nvidia.cu +++ b/src/infiniop/ops/acos/nvidia/acos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acos_nvidia.cuh" namespace op::acos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acos::nvidia diff --git a/src/infiniop/ops/acos/operator.cc b/src/infiniop/ops/acos/operator.cc index e775a005a..15872b754 100644 --- a/src/infiniop/ops/acos/operator.cc +++ b/src/infiniop/ops/acos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcosDescriptor( - infiniopHandle_t handle, - infiniopAcosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t 
infiniopGetAcosWorkspaceSize(infiniopAcosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcos( - infiniopAcosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcosDescriptor(infiniopAcosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acos, Acos) diff --git a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc index 005463679..0cb424c00 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.cc +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.cc @@ -1,48 +1,8 @@ #include "acosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::acosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::cpu diff --git 
a/src/infiniop/ops/acosh/cpu/acosh_cpu.h b/src/infiniop/ops/acosh/cpu/acosh_cpu.h index b4b710ed5..bb05baf14 100644 --- a/src/infiniop/ops/acosh/cpu/acosh_cpu.h +++ b/src/infiniop/ops/acosh/cpu/acosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ACOSH_CPU_H__ #define __ACOSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(acosh, cpu) - -namespace op::acosh::cpu { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::acosh(x); - } -} AcoshOp; -} // namespace op::acosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(acosh, cpu, op::elementwise::unary::UnaryMode::Acosh) #endif // __ACOSH_CPU_H__ diff --git a/src/infiniop/ops/acosh/cuda/kernel.cuh b/src/infiniop/ops/acosh/cuda/kernel.cuh index fe444b1b4..9fbb54636 100644 --- a/src/infiniop/ops/acosh/cuda/kernel.cuh +++ b/src/infiniop/ops/acosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ACOSH_CUDA_H__ #define __ACOSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::acosh::cuda { -typedef struct AcoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(acoshf(__half2float(__low2half(x))), acoshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(acoshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(acoshf(x0), acoshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(acoshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return acoshf(x); - } else { - return std::acosh(x); - } - } -} AcoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::acosh::cuda #endif // __ACOSH_CUDA_H__ diff --git a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu index fc06590a7..5d065bdbc 100644 --- a/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu +++ b/src/infiniop/ops/acosh/nvidia/acosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "acosh_nvidia.cuh" namespace op::acosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(acosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AcoshOp, 
half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AcoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::acosh::nvidia diff --git a/src/infiniop/ops/acosh/operator.cc b/src/infiniop/ops/acosh/operator.cc index 9bba3389a..c1939a54c 100644 --- a/src/infiniop/ops/acosh/operator.cc +++ b/src/infiniop/ops/acosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/acosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/acosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAcoshDescriptor( - infiniopHandle_t handle, - infiniopAcoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::acosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAcoshWorkspaceSize(infiniopAcoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAcosh( - infiniopAcoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAcoshDescriptor(infiniopAcoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(acosh, Acosh) diff --git 
a/src/infiniop/ops/asin/cpu/asin_cpu.cc b/src/infiniop/ops/asin/cpu/asin_cpu.cc index e149044f1..de42639ff 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.cc +++ b/src/infiniop/ops/asin/cpu/asin_cpu.cc @@ -1,48 +1,8 @@ #include "asin_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asin::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::cpu diff --git a/src/infiniop/ops/asin/cpu/asin_cpu.h b/src/infiniop/ops/asin/cpu/asin_cpu.h index 22bcba337..8c6da5e20 100644 --- a/src/infiniop/ops/asin/cpu/asin_cpu.h +++ b/src/infiniop/ops/asin/cpu/asin_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASIN_CPU_H__ #define __ASIN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asin, cpu) - -namespace op::asin::cpu { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asin(x); - } -} AsinOp; -} // namespace op::asin::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asin, cpu, op::elementwise::unary::UnaryMode::Asin) #endif // __ASIN_CPU_H__ diff --git a/src/infiniop/ops/asin/cuda/kernel.cuh b/src/infiniop/ops/asin/cuda/kernel.cuh index 3e8d11a07..a7063f015 100644 --- a/src/infiniop/ops/asin/cuda/kernel.cuh +++ b/src/infiniop/ops/asin/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASIN_CUDA_H__ #define __ASIN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asin::cuda { -typedef struct AsinOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinf(__half2float(__low2half(x))), asinf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinf(x0), asinf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinf(x); - } else { - return std::asin(x); - } - } -} AsinOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asin::cuda #endif // __ASIN_CUDA_H__ diff --git a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu 
b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu index 714d2b1b3..262755d50 100644 --- a/src/infiniop/ops/asin/nvidia/asin_nvidia.cu +++ b/src/infiniop/ops/asin/nvidia/asin_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asin_nvidia.cuh" namespace op::asin::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asin) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asin::nvidia diff --git a/src/infiniop/ops/asin/operator.cc b/src/infiniop/ops/asin/operator.cc index c4973e9f5..edb8fa867 100644 --- a/src/infiniop/ops/asin/operator.cc +++ b/src/infiniop/ops/asin/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/asin.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asin_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinDescriptor( - infiniopHandle_t handle, - infiniopAsinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asin::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinWorkspaceSize(infiniopAsinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - 
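For orientation amid these deletions: the one-line UNARY_OP_IMPL(asin, Asin) that replaces this file regenerates the same four entry points from the macros in operator_impl.h. Hand-expanding just the create function, for a build with only ENABLE_CPU_API and ENABLE_NVIDIA_API defined, gives roughly the following (the other three entry points expand the same way):

    __C infiniStatus_t infiniopCreateAsinDescriptor(
        infiniopHandle_t handle,
        infiniopAsinDescriptor_t *desc_ptr,
        infiniopTensorDescriptor_t y_desc,
        infiniopTensorDescriptor_t x_desc) {
        switch (handle->device) {
        case INFINI_DEVICE_CPU:
            // Each enabled device contributes one case via IF_ENABLE_*_API.
            return op::asin::cpu::Descriptor::create(
                handle, reinterpret_cast<op::asin::cpu::Descriptor **>(desc_ptr),
                y_desc, {x_desc});
        case INFINI_DEVICE_NVIDIA:
            return op::asin::nvidia::Descriptor::create(
                handle, reinterpret_cast<op::asin::nvidia::Descriptor **>(desc_ptr),
                y_desc, {x_desc});
        default:
            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
        }
    }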
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsin( - infiniopAsinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinDescriptor(infiniopAsinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asin, Asin) diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc index e0d5b749a..8b18ab6f8 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.cc +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.cc @@ -1,48 +1,8 @@ #include "asinh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::asinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::cpu diff --git a/src/infiniop/ops/asinh/cpu/asinh_cpu.h b/src/infiniop/ops/asinh/cpu/asinh_cpu.h index 0a999b63b..4c3603752 100644 --- a/src/infiniop/ops/asinh/cpu/asinh_cpu.h +++ b/src/infiniop/ops/asinh/cpu/asinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ASINH_CPU_H__ #define __ASINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(asinh, cpu) - -namespace op::asinh::cpu { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::asinh(x); - } -} AsinhOp; -} 
// namespace op::asinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(asinh, cpu, op::elementwise::unary::UnaryMode::Asinh) #endif // __ASINH_CPU_H__ diff --git a/src/infiniop/ops/asinh/cuda/kernel.cuh b/src/infiniop/ops/asinh/cuda/kernel.cuh index 7cb018c8a..866ea147a 100644 --- a/src/infiniop/ops/asinh/cuda/kernel.cuh +++ b/src/infiniop/ops/asinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ASINH_CUDA_H__ #define __ASINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::asinh::cuda { -typedef struct AsinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(asinhf(__half2float(__low2half(x))), asinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(asinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(asinhf(x0), asinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(asinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return asinhf(x); - } else { - return std::asinh(x); - } - } -} AsinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::asinh::cuda #endif // __ASINH_CUDA_H__ diff --git a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu index 203008b81..37c44baf0 100644 --- a/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu +++ b/src/infiniop/ops/asinh/nvidia/asinh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "asinh_nvidia.cuh" namespace op::asinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(asinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AsinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AsinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::asinh::nvidia diff --git a/src/infiniop/ops/asinh/operator.cc b/src/infiniop/ops/asinh/operator.cc index d9ff5beda..7b519ec05 100644 --- a/src/infiniop/ops/asinh/operator.cc +++ b/src/infiniop/ops/asinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include 
"infiniop/ops/asinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/asinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAsinhDescriptor( - infiniopHandle_t handle, - infiniopAsinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::asinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAsinhWorkspaceSize(infiniopAsinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAsinh( - infiniopAsinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAsinhDescriptor(infiniopAsinhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(asinh, Asinh) diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.cc b/src/infiniop/ops/atan/cpu/atan_cpu.cc index a8c613d1e..075c7fd4e 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.cc +++ b/src/infiniop/ops/atan/cpu/atan_cpu.cc @@ -1,48 +1,8 @@ #include "atan_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atan::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); 
- - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::cpu diff --git a/src/infiniop/ops/atan/cpu/atan_cpu.h b/src/infiniop/ops/atan/cpu/atan_cpu.h index ac2a1bc0c..6b333cfb1 100644 --- a/src/infiniop/ops/atan/cpu/atan_cpu.h +++ b/src/infiniop/ops/atan/cpu/atan_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATAN_CPU_H__ #define __ATAN_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atan, cpu) - -namespace op::atan::cpu { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atan(x); - } -} AtanOp; -} // namespace op::atan::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atan, cpu, op::elementwise::unary::UnaryMode::Atan) #endif // __ATAN_CPU_H__ diff --git a/src/infiniop/ops/atan/cuda/kernel.cuh b/src/infiniop/ops/atan/cuda/kernel.cuh index 0c7745196..ce553c1c1 100644 --- a/src/infiniop/ops/atan/cuda/kernel.cuh +++ b/src/infiniop/ops/atan/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATAN_CUDA_H__ #define __ATAN_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atan::cuda { -typedef struct AtanOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanf(__half2float(__low2half(x))), atanf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanf(x0), atanf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanf(x); - } else { - return std::atan(x); - } - } -} AtanOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atan::cuda #endif // __ATAN_CUDA_H__ diff --git a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu index 2c6cf53d4..a05d65b79 100644 --- a/src/infiniop/ops/atan/nvidia/atan_nvidia.cu +++ b/src/infiniop/ops/atan/nvidia/atan_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atan_nvidia.cuh" namespace op::atan::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atan) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector 
input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atan::nvidia diff --git a/src/infiniop/ops/atan/operator.cc b/src/infiniop/ops/atan/operator.cc index c56e101d2..9025489c3 100644 --- a/src/infiniop/ops/atan/operator.cc +++ b/src/infiniop/ops/atan/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atan.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atan_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanDescriptor( - infiniopHandle_t handle, - infiniopAtanDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atan::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanWorkspaceSize(infiniopAtanDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtan( - infiniopAtanDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef 
ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanDescriptor(infiniopAtanDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atan, Atan) diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc index 66ef4b1df..d19c978e4 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -1,48 +1,8 @@ #include "atanh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::atanh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::cpu diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h index 8c2b04755..1a37453f0 100644 --- a/src/infiniop/ops/atanh/cpu/atanh_cpu.h +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h @@ -1,22 +1,9 @@ #ifndef __ATANH_CPU_H__ #define __ATANH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(atanh, cpu) - -namespace op::atanh::cpu { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::atanh(x); - } -} AtanhOp; -} // namespace op::atanh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(atanh, cpu, op::elementwise::unary::UnaryMode::Atanh) #endif // __ATANH_CPU_H__ diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh index 5337d8243..de0866ba5 100644 --- a/src/infiniop/ops/atanh/cuda/kernel.cuh +++ b/src/infiniop/ops/atanh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ATANH_CUDA_H__ #define __ATANH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::atanh::cuda { -typedef struct AtanhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ 
__forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(atanhf(__half2float(__low2half(x))), atanhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(atanhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(atanhf(x0), atanhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(atanhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return atanhf(x); - } else { - return std::atanh(x); - } - } -} AtanhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::atanh::cuda #endif // __ATANH_CUDA_H__ diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu index cb5a1ff03..55b435920 100644 --- a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "atanh_nvidia.cuh" namespace op::atanh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(atanh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::atanh::nvidia diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc index a73adcb23..cc9d6131e 100644 --- a/src/infiniop/ops/atanh/operator.cc +++ b/src/infiniop/ops/atanh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/atanh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/atanh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateAtanhDescriptor( - infiniopHandle_t handle, - infiniopAtanhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::atanh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - 
CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopAtanh( - infiniopAtanhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(atanh, Atanh) diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc index 17b3ec888..81ca2fe7a 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.cc +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.cc @@ -1,48 +1,8 @@ #include "ceil_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::ceil::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::cpu diff --git a/src/infiniop/ops/ceil/cpu/ceil_cpu.h b/src/infiniop/ops/ceil/cpu/ceil_cpu.h index c3ca8e441..423c784cc 100644 --- a/src/infiniop/ops/ceil/cpu/ceil_cpu.h +++ b/src/infiniop/ops/ceil/cpu/ceil_cpu.h @@ -1,26 +1,9 @@ #ifndef __CEIL_CPU_H__ #define __CEIL_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(ceil, cpu) - -namespace op::ceil::cpu { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; -} // namespace op::ceil::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(ceil, cpu, op::elementwise::unary::UnaryMode::Ceil) #endif // __CEIL_CPU_H__ diff --git a/src/infiniop/ops/ceil/cuda/kernel.cuh b/src/infiniop/ops/ceil/cuda/kernel.cuh index a2d2e7fb5..1d30a42eb 100644 --- a/src/infiniop/ops/ceil/cuda/kernel.cuh +++ b/src/infiniop/ops/ceil/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __CEIL_CUDA_H__ #define __CEIL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::ceil::cuda { -typedef struct CeilOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2ceil(x); - } else if constexpr (std::is_same_v) { - return hceil(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(ceilf(x0), ceilf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(ceilf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return ceilf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::ceil(x); - } - } -} CeilOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::ceil::cuda #endif // __CEIL_CUDA_H__ diff --git a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu index c7ad2ee5b..88ee35be8 100644 --- a/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu +++ b/src/infiniop/ops/ceil/nvidia/ceil_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "ceil_nvidia.cuh" namespace op::ceil::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(ceil) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void 
*workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CeilOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CeilOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::ceil::nvidia diff --git a/src/infiniop/ops/ceil/operator.cc b/src/infiniop/ops/ceil/operator.cc index 4e5ee7800..dbe591043 100644 --- a/src/infiniop/ops/ceil/operator.cc +++ b/src/infiniop/ops/ceil/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/ceil.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/ceil_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCeilDescriptor( - infiniopHandle_t handle, - infiniopCeilDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::ceil::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCeilWorkspaceSize(infiniopCeilDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCeil( - infiniopCeilDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCeilDescriptor(infiniopCeilDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef 
ENABLE_ILUVATAR_API
-    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-    DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(ceil, Ceil)
diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc
index 9dc68d327..19ef002cf 100644
--- a/src/infiniop/ops/cos/cpu/cos_cpu.cc
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc
@@ -1,48 +1,8 @@
 #include "cos_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::cos::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(cos)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<CosOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<CosOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::cos::cpu
diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h
index 9b4236fc2..d62aa91b8 100644
--- a/src/infiniop/ops/cos/cpu/cos_cpu.h
+++ b/src/infiniop/ops/cos/cpu/cos_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __COS_CPU_H__
 #define __COS_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(cos, cpu)
-
-namespace op::cos::cpu {
-typedef struct CosOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::cos(x);
-    }
-} CosOp;
-} // namespace op::cos::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(cos, cpu, op::elementwise::unary::UnaryMode::Cos)
 
 #endif // __COS_CPU_H__
diff --git a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh
index b0dabb340..57fe4f50e 100644
--- a/src/infiniop/ops/cos/cuda/kernel.cuh
+++ b/src/infiniop/ops/cos/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __COS_CUDA_H__
 #define __COS_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_bf16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::cos::cuda {
-typedef struct CosOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2cos(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hcos(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(cosf(x0), cosf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(cosf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __cosf(x);
-        } else {
-            return std::cos(x);
-        }
-    }
-} CosOp;
+using Op 
= op::elementwise::unary::cuda::UnaryOp; } // namespace op::cos::cuda #endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu index 044c59ca0..5da3c02e8 100644 --- a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cos_nvidia.cuh" namespace op::cos::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cos) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc index 5c464ad60..1531c6caa 100644 --- a/src/infiniop/ops/cos/operator.cc +++ b/src/infiniop/ops/cos/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cos.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cos_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCosDescriptor( - infiniopHandle_t handle, - infiniopCosDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cos::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef 
ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCos( - infiniopCosDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cos, Cos) diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc index 9ed8e33da..e7b2a6dad 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.cc +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.cc @@ -1,48 +1,8 @@ #include "cosh_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::cosh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::cpu diff --git a/src/infiniop/ops/cosh/cpu/cosh_cpu.h b/src/infiniop/ops/cosh/cpu/cosh_cpu.h index aea359ef2..c789d38ea 100644 --- a/src/infiniop/ops/cosh/cpu/cosh_cpu.h +++ b/src/infiniop/ops/cosh/cpu/cosh_cpu.h @@ -1,22 +1,9 @@ #ifndef __COSH_CPU_H__ #define __COSH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(cosh, cpu) - -namespace op::cosh::cpu { -typedef struct CoshOp { -public: - static constexpr size_t 
num_inputs = 1; - - template - T operator()(const T &x) const { - return std::cosh(x); - } -} CoshOp; -} // namespace op::cosh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(cosh, cpu, op::elementwise::unary::UnaryMode::Cosh) #endif // __COSH_CPU_H__ diff --git a/src/infiniop/ops/cosh/cuda/kernel.cuh b/src/infiniop/ops/cosh/cuda/kernel.cuh index ce6806433..934bfe12d 100644 --- a/src/infiniop/ops/cosh/cuda/kernel.cuh +++ b/src/infiniop/ops/cosh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __COSH_CUDA_H__ #define __COSH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::cosh::cuda { -typedef struct CoshOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(coshf(__half2float(__low2half(x))), coshf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(coshf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(coshf(x0), coshf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(coshf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return coshf(x); - } else { - return std::cosh(x); - } - } -} CoshOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::cosh::cuda #endif // __COSH_CUDA_H__ diff --git a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu index a5e1442ce..038b0373e 100644 --- a/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu +++ b/src/infiniop/ops/cosh/nvidia/cosh_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "cosh_nvidia.cuh" namespace op::cosh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(cosh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::CoshOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::CoshOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::cosh::nvidia diff --git a/src/infiniop/ops/cosh/operator.cc b/src/infiniop/ops/cosh/operator.cc index 75aac0c91..9b18b47ee 100644 --- a/src/infiniop/ops/cosh/operator.cc +++ b/src/infiniop/ops/cosh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" 
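[Note on the shared unary machinery: the per-operator functors removed throughout this patch collapse into one mode-parameterized functor shared by every unary elementwise operator. Each CPU header keeps only a UNARY_ELEMENTWISE_DESCRIPTOR(name, cpu, UnaryMode::X) binding, each backend source keeps only an ELEMENTWISE_*_IMPL_UNARY(name) expansion, and each CUDA kernel header keeps only a using Op alias. The shared header elementwise/unary.h is not part of this section, so the following is a minimal sketch of the shape it plausibly takes; UnaryMode and cuda::UnaryOp are taken from the hunks above, and everything else is illustrative.

// Sketch of src/infiniop/elementwise/unary.h (assumed layout, not the real header).
namespace op::elementwise::unary {

// One enumerator per unary operator; the descriptor macros bind an
// operator directory to one of these modes.
enum class UnaryMode { Atan, Atanh, Ceil, Cos, Cosh, Erf, Floor, Log /* ... */ };

namespace cuda {
// Mode-parameterized device functor; the real header presumably also
// branches on T (half2, cuda_bfloat162, ...) the way the removed
// per-operator functors did.
template <UnaryMode Mode>
struct UnaryOp {
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (Mode == UnaryMode::Cos) {
            return cosf(x);
        } else if constexpr (Mode == UnaryMode::Floor) {
            return floorf(x);
        } else {
            return x; // placeholder; the real header implements every mode
        }
    }
};
} // namespace cuda

} // namespace op::elementwise::unary

Adding an operator under this scheme means one new enumerator plus one dispatch branch, instead of a hand-written functor and four C entry points per backend.]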
-#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/cosh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/cosh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateCoshDescriptor( - infiniopHandle_t handle, - infiniopCoshDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::cosh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetCoshWorkspaceSize(infiniopCoshDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopCosh( - infiniopCoshDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyCoshDescriptor(infiniopCoshDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(cosh, Cosh) diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc index 19e222031..6d150070c 100644 --- a/src/infiniop/ops/div/cpu/div_cpu.cc +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -1,50 +1,8 @@ #include "div_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::div::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(div) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = 
reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &a_desc = input_desc_vec.at(0);
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<DivOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<DivOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::div::cpu
diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h
index 0373b766f..ad76e7ef1 100644
--- a/src/infiniop/ops/div/cpu/div_cpu.h
+++ b/src/infiniop/ops/div/cpu/div_cpu.h
@@ -1,19 +1,9 @@
 #ifndef __DIV_CPU_H__
 #define __DIV_CPU_H__
 
+#include "../../../elementwise/binary.h"
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(div, cpu)
-
-namespace op::div::cpu {
-typedef struct DivOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    T operator()(const T &a, const T &b) const {
-        return a / b;
-    }
-} DivOp;
-} // namespace op::div::cpu
+BINARY_ELEMENTWISE_DESCRIPTOR(div, cpu, op::elementwise::binary::BinaryMode::Divide)
 
 #endif // __DIV_CPU_H__
diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh
index a67993da5..f1ab13152 100644
--- a/src/infiniop/ops/div/cuda/kernel.cuh
+++ b/src/infiniop/ops/div/cuda/kernel.cuh
@@ -1,23 +1,10 @@
 #ifndef __DIV_CUDA_H__
 #define __DIV_CUDA_H__
 
+#include "../../../elementwise/binary.h"
+
 namespace op::div::cuda {
-typedef struct DivOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return __h2div(a, b);
-        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
-            return a / b;
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __fdividef(a, b);
-        } else {
-            return a / b;
-        }
-    }
-} DivOp;
+using Op = op::elementwise::binary::cuda::BinaryOp<op::elementwise::binary::BinaryMode::Divide>;
 } // namespace op::div::cuda
 
 #endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
index 1abffe816..8aaba09b4 100644
--- a/src/infiniop/ops/div/nvidia/div_nvidia.cu
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -1,57 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "div_nvidia.cuh"
 
 namespace op::div::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_BINARY(div)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &a_desc = input_desc_vec.at(0);
-    const auto &b_desc = input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const 
auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc index 84021a1af..af9d1929a 100644 --- a/src/infiniop/ops/div/operator.cc +++ b/src/infiniop/ops/div/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/div.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/div_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/div_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/div_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/div_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/div_moore.h" -#endif - -__C infiniStatus_t infiniopCreateDivDescriptor( - infiniopHandle_t handle, - infiniopDivDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::div::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef 
ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopDiv( - infiniopDivDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(div, Div) diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.cc b/src/infiniop/ops/erf/cpu/erf_cpu.cc index 00b1897d1..d9119c697 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.cc +++ b/src/infiniop/ops/erf/cpu/erf_cpu.cc @@ -1,48 +1,8 @@ #include "erf_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::erf::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - 
return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::cpu diff --git a/src/infiniop/ops/erf/cpu/erf_cpu.h b/src/infiniop/ops/erf/cpu/erf_cpu.h index c26f519cf..f50cd157d 100644 --- a/src/infiniop/ops/erf/cpu/erf_cpu.h +++ b/src/infiniop/ops/erf/cpu/erf_cpu.h @@ -1,22 +1,9 @@ #ifndef __ERF_CPU_H__ #define __ERF_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(erf, cpu) - -namespace op::erf::cpu { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::erf(x); - } -} ErfOp; -} // namespace op::erf::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(erf, cpu, op::elementwise::unary::UnaryMode::Erf) #endif // __ERF_CPU_H__ diff --git a/src/infiniop/ops/erf/cuda/kernel.cuh b/src/infiniop/ops/erf/cuda/kernel.cuh index 820c10b19..978890cff 100644 --- a/src/infiniop/ops/erf/cuda/kernel.cuh +++ b/src/infiniop/ops/erf/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __ERF_CUDA_H__ #define __ERF_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::erf::cuda { -typedef struct ErfOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(erff(__half2float(__low2half(x))), erff(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(erff(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(erff(x0), erff(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(erff(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return erff(x); - } else { - return std::erf(x); - } - } -} ErfOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::erf::cuda #endif // __ERF_CUDA_H__ diff --git a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu index 9080593de..0d743b538 100644 --- a/src/infiniop/ops/erf/nvidia/erf_nvidia.cu +++ b/src/infiniop/ops/erf/nvidia/erf_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "erf_nvidia.cuh" namespace op::erf::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(erf) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, 
cuda::ErfOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::erf::nvidia diff --git a/src/infiniop/ops/erf/operator.cc b/src/infiniop/ops/erf/operator.cc index 1491cfa9a..9304cf525 100644 --- a/src/infiniop/ops/erf/operator.cc +++ b/src/infiniop/ops/erf/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/erf.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/erf_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateErfDescriptor( - infiniopHandle_t handle, - infiniopErfDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::erf::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopErf( - infiniopErfDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(erf, Erf) diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.cc 
b/src/infiniop/ops/floor/cpu/floor_cpu.cc index e809a02e2..cc717ac11 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.cc +++ b/src/infiniop/ops/floor/cpu/floor_cpu.cc @@ -1,48 +1,8 @@ #include "floor_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::floor::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::cpu diff --git a/src/infiniop/ops/floor/cpu/floor_cpu.h b/src/infiniop/ops/floor/cpu/floor_cpu.h index 91508a384..a246309e8 100644 --- a/src/infiniop/ops/floor/cpu/floor_cpu.h +++ b/src/infiniop/ops/floor/cpu/floor_cpu.h @@ -1,26 +1,9 @@ #ifndef __FLOOR_CPU_H__ #define __FLOOR_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(floor, cpu) - -namespace op::floor::cpu { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; -} // namespace op::floor::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(floor, cpu, op::elementwise::unary::UnaryMode::Floor) #endif // __FLOOR_CPU_H__ diff --git a/src/infiniop/ops/floor/cuda/kernel.cuh b/src/infiniop/ops/floor/cuda/kernel.cuh index c89ce34f4..23a7a44e9 100644 --- a/src/infiniop/ops/floor/cuda/kernel.cuh +++ b/src/infiniop/ops/floor/cuda/kernel.cuh @@ -1,34 +1,10 @@ #ifndef __FLOOR_CUDA_H__ #define __FLOOR_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::floor::cuda { -typedef struct FloorOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2floor(x); - } else if constexpr (std::is_same_v) { - return hfloor(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(floorf(x0), floorf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(floorf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return floorf(x); - } else if constexpr (std::is_integral_v) { - return x; - } else { - return std::floor(x); - } - } -} FloorOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::floor::cuda #endif // 
__FLOOR_CUDA_H__ diff --git a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu index 08305048a..cec304a1c 100644 --- a/src/infiniop/ops/floor/nvidia/floor_nvidia.cu +++ b/src/infiniop/ops/floor/nvidia/floor_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "floor_nvidia.cuh" namespace op::floor::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(floor) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::FloorOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::FloorOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::floor::nvidia diff --git a/src/infiniop/ops/floor/operator.cc b/src/infiniop/ops/floor/operator.cc index 4e4ed2b5a..64e4a586b 100644 --- a/src/infiniop/ops/floor/operator.cc +++ b/src/infiniop/ops/floor/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/floor.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/floor_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateFloorDescriptor( - infiniopHandle_t handle, - infiniopFloorDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::floor::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetFloorWorkspaceSize(infiniopFloorDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, 
nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopFloor( - infiniopFloorDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyFloorDescriptor(infiniopFloorDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(floor, Floor) diff --git a/src/infiniop/ops/log/cpu/log_cpu.cc b/src/infiniop/ops/log/cpu/log_cpu.cc index e7314c319..734ad1617 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.cc +++ b/src/infiniop/ops/log/cpu/log_cpu.cc @@ -1,48 +1,8 @@ #include "log_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::log::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(log) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::log::cpu diff --git a/src/infiniop/ops/log/cpu/log_cpu.h b/src/infiniop/ops/log/cpu/log_cpu.h index 535e681d3..b13d01442 100644 --- a/src/infiniop/ops/log/cpu/log_cpu.h +++ b/src/infiniop/ops/log/cpu/log_cpu.h @@ -1,22 +1,9 @@ #ifndef __LOG_CPU_H__ #define __LOG_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(log, cpu) - -namespace op::log::cpu { -typedef struct LogOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T 
&x) const {
-        return std::log(x);
-    }
-} LogOp;
-} // namespace op::log::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(log, cpu, op::elementwise::unary::UnaryMode::Log)
 
 #endif // __LOG_CPU_H__
diff --git a/src/infiniop/ops/log/cuda/kernel.cuh b/src/infiniop/ops/log/cuda/kernel.cuh
index b1e46873c..80980ada1 100644
--- a/src/infiniop/ops/log/cuda/kernel.cuh
+++ b/src/infiniop/ops/log/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __LOG_CUDA_H__
 #define __LOG_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cmath>
+#include "../../../elementwise/unary.h"
 
 namespace op::log::cuda {
-typedef struct LogOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2log(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return __float2half(__logf(__half2float(x)));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(logf(x0), logf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(logf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __logf(x);
-        } else {
-            return std::log(x);
-        }
-    }
-} LogOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Log>;
 } // namespace op::log::cuda
 
 #endif // __LOG_CUDA_H__
diff --git a/src/infiniop/ops/log/nvidia/log_nvidia.cu b/src/infiniop/ops/log/nvidia/log_nvidia.cu
index 9e7bcafc4..87aaa0388 100644
--- a/src/infiniop/ops/log/nvidia/log_nvidia.cu
+++ b/src/infiniop/ops/log/nvidia/log_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "log_nvidia.cuh"
 
 namespace op::log::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(log)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::LogOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::LogOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::log::nvidia
diff --git a/src/infiniop/ops/log/operator.cc b/src/infiniop/ops/log/operator.cc
index 8f2add408..9614a0861 100644
--- a/src/infiniop/ops/log/operator.cc
+++ b/src/infiniop/ops/log/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/log.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131
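With this change each op's CUDA kernel header shrinks to a one-line alias of a shared, mode-dispatched functor. The template argument on UnaryOp is inferred from the CPU-side UNARY_ELEMENTWISE_DESCRIPTOR call; the patch does not show unary.h itself, so the following is a minimal sketch of how such a functor can reproduce the deleted per-op structs with no runtime cost (the mode is resolved by if constexpr at compile time). Names, enum values, and member layout are assumptions:

    #include <cmath>
    #include <cstddef>
    #include <cuda_fp16.h>
    #include <type_traits>

    namespace op::elementwise::unary {

    enum class UnaryMode { Log, Neg /* , Floor, Round, Reciprocal, ... */ };

    namespace cuda {
    // One functor replaces LogOp, NegOp, ...: the mode is a compile-time tag,
    // so each instantiation generates the same code as the hand-written struct.
    template <UnaryMode Mode>
    struct UnaryOp {
        static constexpr size_t num_inputs = 1;
        template <typename T>
        __device__ __forceinline__ T operator()(const T &x) const {
            if constexpr (Mode == UnaryMode::Log) {
                if constexpr (std::is_same_v<T, half2>) {
                    return h2log(x);  // vectorized half2 log, as in the old LogOp
                } else if constexpr (std::is_same_v<T, float>) {
                    return __logf(x); // fast-math float log
                } else {
                    return std::log(x);
                }
            } else if constexpr (Mode == UnaryMode::Neg) {
                return -x;            // half2 negation lowers to __hneg2 on sm_53+
            } else {
                static_assert(Mode == UnaryMode::Log || Mode == UnaryMode::Neg,
                              "sketch covers only two modes");
            }
        }
    };
    } // namespace cuda
    } // namespace op::elementwise::unary
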
+8,4 @@ #include "nvidia/log_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateLogDescriptor( - infiniopHandle_t handle, - infiniopLogDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::log::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetLogWorkspaceSize(infiniopLogDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopLog( - infiniopLogDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyLogDescriptor(infiniopLogDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(log, Log) diff --git a/src/infiniop/ops/max/cpu/max_cpu.cc b/src/infiniop/ops/max/cpu/max_cpu.cc index 1b30fa4e4..98e8a52a2 100644 --- a/src/infiniop/ops/max/cpu/max_cpu.cc +++ b/src/infiniop/ops/max/cpu/max_cpu.cc @@ -1,50 +1,8 @@ #include "max_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::max::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = 
input_desc_vec.at(1);
-    const auto &c_shape = out_desc->shape();
-    const auto &a_shape = a_desc->shape();
-    const auto &b_shape = b_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<MaxOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<MaxOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::max::cpu
diff --git a/src/infiniop/ops/max/cpu/max_cpu.h b/src/infiniop/ops/max/cpu/max_cpu.h
index 4d085ed39..2219994d5 100644
--- a/src/infiniop/ops/max/cpu/max_cpu.h
+++ b/src/infiniop/ops/max/cpu/max_cpu.h
@@ -1,20 +1,9 @@
 #ifndef __MAX_CPU_H__
 #define __MAX_CPU_H__
 
+#include "../../../elementwise/binary.h"
 #include "../../../elementwise/cpu/elementwise_cpu.h"
-#include <algorithm>
 
-ELEMENTWISE_DESCRIPTOR(max, cpu)
-
-namespace op::max::cpu {
-typedef struct MaxOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    T operator()(const T &a, const T &b) const {
-        return std::max(a, b);
-    }
-} MaxOp;
-} // namespace op::max::cpu
+BINARY_ELEMENTWISE_DESCRIPTOR(max, cpu, op::elementwise::binary::BinaryMode::Max)
 
 #endif // __MAX_CPU_H__
diff --git a/src/infiniop/ops/max/cuda/kernel.cuh b/src/infiniop/ops/max/cuda/kernel.cuh
index bf3977a31..68f634559 100644
--- a/src/infiniop/ops/max/cuda/kernel.cuh
+++ b/src/infiniop/ops/max/cuda/kernel.cuh
@@ -1,23 +1,10 @@
 #ifndef __MAX_CUDA_H__
 #define __MAX_CUDA_H__
 
+#include "../../../elementwise/binary.h"
+
 namespace op::max::cuda {
-typedef struct MaxOp {
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return __hmax2(a, b);
-        } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
-            return a > b ? a : b;
-        } else if constexpr (std::is_same_v<T, float>) {
-            return fmaxf(a, b);
-        } else {
-            return a > b ?
a : b; - } - } -} MaxOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::max::cuda #endif // __MAX_CUDA_H__ diff --git a/src/infiniop/ops/max/nvidia/max_nvidia.cu b/src/infiniop/ops/max/nvidia/max_nvidia.cu index 5e9fb13f4..ba4620f3b 100644 --- a/src/infiniop/ops/max/nvidia/max_nvidia.cu +++ b/src/infiniop/ops/max/nvidia/max_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "max_nvidia.cuh" namespace op::max::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(max) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MaxOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MaxOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::max::nvidia diff --git a/src/infiniop/ops/max/operator.cc b/src/infiniop/ops/max/operator.cc index e04368533..3e5299f52 100644 --- a/src/infiniop/ops/max/operator.cc +++ b/src/infiniop/ops/max/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/max.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/max_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/max_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/max_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/max_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/max_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMaxDescriptor( - infiniopHandle_t handle, - infiniopMaxDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
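
The one-line ELEMENTWISE_NVIDIA_IMPL_BINARY(max) above stands in for the boilerplate this hunk deletes. The macro definition itself is not part of the patch, so the following is a sketch of what it presumably expands to, reconstructed from the deleted body (same shape/dtype checks, same 256-thread dispatch, with the per-op functor replaced by the cuda::Op alias from the kernel header):

    Descriptor::~Descriptor() = default;

    infiniStatus_t Descriptor::create(
        infiniopHandle_t handle_, Descriptor **desc_ptr,
        infiniopTensorDescriptor_t out_desc,
        std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
        auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
        auto dtype = out_desc->dtype();
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
        CHECK_SAME_SHAPE(out_desc->shape(),
                         input_desc_vec.at(0)->shape(),
                         input_desc_vec.at(1)->shape());
        CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
        return INFINI_STATUS_SUCCESS;
    }

    infiniStatus_t Descriptor::calculate(
        void *workspace, size_t workspace_size, void *output,
        std::vector<const void *> inputs, void *stream) const {
        if (workspace_size < _workspace_size) {
            return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
        }
        switch (_dtype) {
        case INFINI_DTYPE_F16:
            return _device_info->calculate<256, cuda::Op, half>(_info, workspace, output, inputs, stream);
        case INFINI_DTYPE_F32:
            return _device_info->calculate<256, cuda::Op, float>(_info, workspace, output, inputs, stream);
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    }
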
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMaxWorkspaceSize(infiniopMaxDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMax( - infiniopMaxDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(max, Max) diff --git a/src/infiniop/ops/min/cpu/min_cpu.cc b/src/infiniop/ops/min/cpu/min_cpu.cc index dc30ee57f..1bac9ea61 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.cc +++ b/src/infiniop/ops/min/cpu/min_cpu.cc @@ -1,50 +1,8 @@ #include "min_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::min::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::cpu diff --git a/src/infiniop/ops/min/cpu/min_cpu.h b/src/infiniop/ops/min/cpu/min_cpu.h index 1c84d4fca..74042db50 100644 --- a/src/infiniop/ops/min/cpu/min_cpu.h +++ b/src/infiniop/ops/min/cpu/min_cpu.h @@ -1,20 +1,9 @@ #ifndef __MIN_CPU_H__ #define __MIN_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -#include -ELEMENTWISE_DESCRIPTOR(min, cpu) - -namespace op::min::cpu { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::min(a, b); - } -} MinOp; -} // namespace op::min::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(min, cpu, op::elementwise::binary::BinaryMode::Min) #endif // __MIN_CPU_H__ diff --git a/src/infiniop/ops/min/cuda/kernel.cuh b/src/infiniop/ops/min/cuda/kernel.cuh index aac14a0e8..75c6ab6b9 100644 --- a/src/infiniop/ops/min/cuda/kernel.cuh +++ b/src/infiniop/ops/min/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __MIN_CUDA_H__ #define __MIN_CUDA_H__ +#include "../../../elementwise/binary.h" + namespace op::min::cuda { -typedef struct MinOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - return __hmin2(a, b); - } else if constexpr (std::is_same_v || std::is_same_v) { - return a < b ? a : b; - } else if constexpr (std::is_same_v) { - return fminf(a, b); - } else { - return a < b ? 
a : b; - } - } -} MinOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::min::cuda #endif // __MIN_CUDA_H__ diff --git a/src/infiniop/ops/min/nvidia/min_nvidia.cu b/src/infiniop/ops/min/nvidia/min_nvidia.cu index 419655e29..0708cbcaf 100644 --- a/src/infiniop/ops/min/nvidia/min_nvidia.cu +++ b/src/infiniop/ops/min/nvidia/min_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "min_nvidia.cuh" namespace op::min::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(min) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::MinOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::MinOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::min::nvidia diff --git a/src/infiniop/ops/min/operator.cc b/src/infiniop/ops/min/operator.cc index 8479feab4..6f67ecf87 100644 --- a/src/infiniop/ops/min/operator.cc +++ b/src/infiniop/ops/min/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/min.h" #ifdef ENABLE_CPU_API @@ -8,195 +7,5 @@ #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) #include "nvidia/min_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/min_metax.h" -#endif -#ifdef ENABLE_KUNLUN_API -#include "kunlun/min_kunlun.h" -#endif -#ifdef ENABLE_CAMBRICON_API -#include "bang/min_bang.h" -#endif -#ifdef ENABLE_MOORE_API -#include "moore/min_moore.h" -#endif - -__C infiniStatus_t infiniopCreateMinDescriptor( - infiniopHandle_t handle, - infiniopMinDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::min::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - 
CREATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CREATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetMinWorkspaceSize(infiniopMinDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - GET(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, moore); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMin( - infiniopMinDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - CALCULATE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif -#ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); -#endif -#ifdef ENABLE_KUNLUN_API - DELETE(INFINI_DEVICE_KUNLUN, kunlun); -#endif -#ifdef ENABLE_CAMBRICON_API - DELETE(INFINI_DEVICE_CAMBRICON, bang); -#endif -#ifdef ENABLE_MOORE_API - DELETE(INFINI_DEVICE_MOORE, moore); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef DELETE -} +BINARY_OP_IMPL(min, Min) diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.cc b/src/infiniop/ops/mod/cpu/mod_cpu.cc index 907d05166..609c2e76e 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.cc +++ b/src/infiniop/ops/mod/cpu/mod_cpu.cc @@ -1,49 +1,8 @@ #include "mod_cpu.h" +#include 
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::mod::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::cpu diff --git a/src/infiniop/ops/mod/cpu/mod_cpu.h b/src/infiniop/ops/mod/cpu/mod_cpu.h index 9e78adca6..72ea7dede 100644 --- a/src/infiniop/ops/mod/cpu/mod_cpu.h +++ b/src/infiniop/ops/mod/cpu/mod_cpu.h @@ -1,23 +1,9 @@ #ifndef __MOD_CPU_H__ #define __MOD_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(mod, cpu) - -namespace op::mod::cpu { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; -} // namespace op::mod::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(mod, cpu, op::elementwise::binary::BinaryMode::Mod) #endif // __MOD_CPU_H__ diff --git a/src/infiniop/ops/mod/cuda/kernel.cuh b/src/infiniop/ops/mod/cuda/kernel.cuh index 0dcb54136..164784081 100644 --- a/src/infiniop/ops/mod/cuda/kernel.cuh +++ b/src/infiniop/ops/mod/cuda/kernel.cuh @@ -1,30 +1,10 @@ #ifndef __MOD_CUDA_H__ #define __MOD_CUDA_H__ -#include -#include +#include "../../../elementwise/binary.h" namespace op::mod::cuda { -typedef struct ModOp { -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(std::fmod(a_f2.x, b_f2.x), std::fmod(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - return __float2half(std::fmod(a_, b_)); - } else if constexpr (std::is_floating_point_v) { - return std::fmod(a, b); - } else { - return a % b; - } - } -} ModOp; +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::mod::cuda #endif // __MOD_CUDA_H__ diff --git a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu index 64326d441..68b78ee70 100644 --- a/src/infiniop/ops/mod/nvidia/mod_nvidia.cu +++ b/src/infiniop/ops/mod/nvidia/mod_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" 
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "mod_nvidia.cuh" namespace op::mod::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(mod) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ModOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ModOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::mod::nvidia diff --git a/src/infiniop/ops/mod/operator.cc b/src/infiniop/ops/mod/operator.cc index 85810e794..aef892ce1 100644 --- a/src/infiniop/ops/mod/operator.cc +++ b/src/infiniop/ops/mod/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/mod.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/mod_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateModDescriptor( - infiniopHandle_t handle, - infiniopModDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::mod::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetModWorkspaceSize(infiniopModDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopMod( - infiniopModDescriptor_t desc, - void 
*workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(mod, Mod) diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.cc b/src/infiniop/ops/neg/cpu/neg_cpu.cc index 5da2ae4c3..47f4d2b2e 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.cc +++ b/src/infiniop/ops/neg/cpu/neg_cpu.cc @@ -1,48 +1,8 @@ #include "neg_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::neg::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::cpu diff --git a/src/infiniop/ops/neg/cpu/neg_cpu.h b/src/infiniop/ops/neg/cpu/neg_cpu.h index ea45989b3..f6778a6d3 100644 --- a/src/infiniop/ops/neg/cpu/neg_cpu.h +++ b/src/infiniop/ops/neg/cpu/neg_cpu.h @@ -2,19 +2,8 @@ #define __NEG_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(neg, cpu) - -namespace op::neg::cpu { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return -x; - } -} NegOp; -} // namespace op::neg::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(neg, cpu, op::elementwise::unary::UnaryMode::Neg) #endif // __NEG_CPU_H__ diff --git a/src/infiniop/ops/neg/cuda/kernel.cuh 
b/src/infiniop/ops/neg/cuda/kernel.cuh index 57904b3df..f5cf5a449 100644 --- a/src/infiniop/ops/neg/cuda/kernel.cuh +++ b/src/infiniop/ops/neg/cuda/kernel.cuh @@ -1,23 +1,10 @@ #ifndef __NEG_CUDA_H__ #define __NEG_CUDA_H__ -#include +#include "../../../elementwise/unary.h" namespace op::neg::cuda { -typedef struct NegOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __hneg2(x); - } else if constexpr (std::is_same_v) { - return __hneg(x); - } else { - return -x; - } - } -} NegOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::neg::cuda #endif // __NEG_CUDA_H__ diff --git a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu index d18b8bf25..f568585f0 100644 --- a/src/infiniop/ops/neg/nvidia/neg_nvidia.cu +++ b/src/infiniop/ops/neg/nvidia/neg_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "neg_nvidia.cuh" namespace op::neg::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(neg) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::NegOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::NegOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::neg::nvidia diff --git a/src/infiniop/ops/neg/operator.cc b/src/infiniop/ops/neg/operator.cc index d4134df3e..c3945f4bb 100644 --- a/src/infiniop/ops/neg/operator.cc +++ b/src/infiniop/ops/neg/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/neg.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/neg_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateNegDescriptor( - infiniopHandle_t handle, - infiniopNegDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::neg::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - 
CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetNegWorkspaceSize(infiniopNegDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopNeg( - infiniopNegDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyNegDescriptor(infiniopNegDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +UNARY_OP_IMPL(neg, Neg) diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.cc b/src/infiniop/ops/pow/cpu/pow_cpu.cc index 0c6fda0f7..1134d8aae 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.cc +++ b/src/infiniop/ops/pow/cpu/pow_cpu.cc @@ -1,49 +1,8 @@ #include "pow_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::pow::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &out_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(out_shape, a_shape, b_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: 
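
On the CPU side, ELEMENTWISE_CPU_IMPL_BINARY(pow) presumably regenerates the same dtype switch being deleted here, parameterized by the shared functor instead of the per-op PowOp. A sketch under that assumption; the cpu::Op alias and the fp16_t host half type are names inferred from the surrounding code, not shown by this patch:

    infiniStatus_t Descriptor::calculate(
        void *workspace, size_t workspace_size, void *output,
        std::vector<const void *> inputs, void *stream) const {
        switch (_dtype) {
        case INFINI_DTYPE_F16:
            // the elementwise loop applies the functor to each (a[i], b[i]) pair
            return _device_info->calculate<cpu::Op, fp16_t>(_info, output, inputs, stream);
        case INFINI_DTYPE_F32:
            return _device_info->calculate<cpu::Op, float>(_info, output, inputs, stream);
        default:
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    }
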
- return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::cpu diff --git a/src/infiniop/ops/pow/cpu/pow_cpu.h b/src/infiniop/ops/pow/cpu/pow_cpu.h index 21d9bb897..9c8e8a368 100644 --- a/src/infiniop/ops/pow/cpu/pow_cpu.h +++ b/src/infiniop/ops/pow/cpu/pow_cpu.h @@ -1,19 +1,9 @@ #ifndef __POW_CPU_H__ #define __POW_CPU_H__ +#include "../../../elementwise/binary.h" #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(pow, cpu) - -namespace op::pow::cpu { -typedef struct PowOp { -public: - static constexpr size_t num_inputs = 2; - template - T operator()(const T &a, const T &b) const { - return std::pow(a, b); - } -} PowOp; -} // namespace op::pow::cpu +BINARY_ELEMENTWISE_DESCRIPTOR(pow, cpu, op::elementwise::binary::BinaryMode::Pow) #endif // __POW_CPU_H__ diff --git a/src/infiniop/ops/pow/cuda/kernel.cuh b/src/infiniop/ops/pow/cuda/kernel.cuh index 3786e7a52..0637240e8 100644 --- a/src/infiniop/ops/pow/cuda/kernel.cuh +++ b/src/infiniop/ops/pow/cuda/kernel.cuh @@ -1,40 +1,10 @@ #ifndef __POW_CUDA_H__ #define __POW_CUDA_H__ -#include -#include -#include +#include "../../../elementwise/binary.h" namespace op::pow::cuda { -typedef struct PowOp { - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &a, const T &b) const { - if constexpr (std::is_same_v) { - float2 a_f2 = __half22float2(a); - float2 b_f2 = __half22float2(b); - return __float22half2_rn(make_float2(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y))); - } else if constexpr (std::is_same_v) { - float a_ = __half2float(a); - float b_ = __half2float(b); - float ans_f = __powf(a_, b_); - return __float2half(isnan(ans_f) ? 
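
// ^ Worth preserving in the shared BinaryOp: the fp16 path deleted here falls
// back from the fast __powf to the accurate std::pow whenever __powf returns
// NaN (e.g. a negative base with an integral exponent, where __powf is
// undefined but std::pow is well-defined: std::pow(-2.0f, 2.0f) == 4.0f).
// Sketch of the equivalent guard, assuming BinaryMode::Pow keeps it:
//
//     float ans_f = __powf(a_, b_);
//     if (isnan(ans_f)) { ans_f = std::pow(a_, b_); }
//     return __float2half(ans_f);
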
std::pow(a_, b_) : ans_f); - } else if constexpr (std::is_same_v) { - float2 a_f2 = __bfloat1622float2(a); - float2 b_f2 = __bfloat1622float2(b); - return __floats2bfloat162_rn(__powf(a_f2.x, b_f2.x), __powf(a_f2.y, b_f2.y)); - } else if constexpr (std::is_same_v) { - float a_ = __bfloat162float(a); - float b_ = __bfloat162float(b); - return __float2bfloat16_rn(__powf(a_, b_)); - } else if constexpr (std::is_same_v) { - return __powf(a, b); - } else { - return std::pow(a, b); - } - } -} PowOp; - +using Op = op::elementwise::binary::cuda::BinaryOp; } // namespace op::pow::cuda #endif // __POW_CUDA_H__ diff --git a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu index 3cfd0cd2f..63a3d40a3 100644 --- a/src/infiniop/ops/pow/nvidia/pow_nvidia.cu +++ b/src/infiniop/ops/pow/nvidia/pow_nvidia.cu @@ -1,57 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "pow_nvidia.cuh" namespace op::pow::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_BINARY(pow) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &a_desc = input_desc_vec.at(0); - const auto &b_desc = input_desc_vec.at(1); - const auto &c_shape = out_desc->shape(); - const auto &a_shape = a_desc->shape(); - const auto &b_shape = b_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::PowOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::PowOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::pow::nvidia diff --git a/src/infiniop/ops/pow/operator.cc b/src/infiniop/ops/pow/operator.cc index e90639f67..b1ddbc9c1 100644 --- a/src/infiniop/ops/pow/operator.cc +++ b/src/infiniop/ops/pow/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/pow.h" #ifdef ENABLE_CPU_API @@ -9,134 +8,4 @@ #include "nvidia/pow_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreatePowDescriptor( - infiniopHandle_t handle, - infiniopPowDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::pow::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - {a_desc, \ - b_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - 
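
Because BINARY_OP_IMPL(pow, Pow) regenerates these exported entry points with the same names and signatures, callers of the C API are unaffected by the refactor. A hedged usage sketch of the unchanged call sequence (workspace_alloc is a hypothetical allocator; error checking omitted):

    infiniopPowDescriptor_t desc = nullptr;
    infiniopCreatePowDescriptor(handle, &desc, c_desc, a_desc, b_desc);

    size_t workspace_size = 0;
    infiniopGetPowWorkspaceSize(desc, &workspace_size);
    void *workspace = workspace_alloc(workspace_size); // hypothetical allocator

    // c[i] = a[i] ** b[i], elementwise over identically-shaped tensors
    infiniopPow(desc, workspace, workspace_size, c, a, b, stream);
    infiniopDestroyPowDescriptor(desc);
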
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetPowWorkspaceSize(infiniopPowDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia); -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopPow( - infiniopPowDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *c, - const void *a, - const void *b, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, c, {a, b}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - DELETE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - DELETE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - DELETE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef DELETE -} +BINARY_OP_IMPL(pow, Pow) diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc index 52874c8b3..0b66eca64 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc @@ -1,48 +1,8 @@ #include "reciprocal_cpu.h" +#include "../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::reciprocal::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return 
_device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::cpu diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h index 0a0f223f0..9af583ab7 100644 --- a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h +++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h @@ -2,19 +2,8 @@ #define __RECIPROCAL_CPU_H__ #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(reciprocal, cpu) - -namespace op::reciprocal::cpu { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return T(1) / x; - } -} ReciprocalOp; -} // namespace op::reciprocal::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(reciprocal, cpu, op::elementwise::unary::UnaryMode::Reciprocal) #endif // __RECIPROCAL_CPU_H__ diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh index 94c71de90..8c29a8e9e 100644 --- a/src/infiniop/ops/reciprocal/cuda/kernel.cuh +++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __RECIPROCAL_CUDA_H__ #define __RECIPROCAL_CUDA_H__ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" -#include +#include "../../../elementwise/unary.h" namespace op::reciprocal::cuda { -typedef struct ReciprocalOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return h2rcp(x); - } else if constexpr (std::is_same_v) { - return hrcp(x); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(__frcp_rn(x0), __frcp_rn(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(__frcp_rn(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return __frcp_rn(x); - } else { - return T(1) / x; - } - } -} ReciprocalOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::reciprocal::cuda #endif // __RECIPROCAL_CUDA_H__ diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu index 45b74e25e..39a41b583 100644 --- a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu +++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu @@ -1,54 +1,10 @@ -#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "reciprocal_nvidia.cuh" namespace op::reciprocal::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(reciprocal) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, 
input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::reciprocal::nvidia diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc index 033286024..966bd72d8 100644 --- a/src/infiniop/ops/reciprocal/operator.cc +++ b/src/infiniop/ops/reciprocal/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/reciprocal.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/reciprocal_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateReciprocalDescriptor( - infiniopHandle_t handle, - infiniopReciprocalDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reciprocal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopReciprocal( - infiniopReciprocalDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc) \ - ->calculate(workspace, workspace_size, y, {x}, stream) - - switch (desc->device_type) { - -#ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CALCULATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CALCULATE -} - -__C infiniStatus_t -infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) { - -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ - return 
diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc
index 033286024..966bd72d8 100644
--- a/src/infiniop/ops/reciprocal/operator.cc
+++ b/src/infiniop/ops/reciprocal/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/reciprocal.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/reciprocal_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateReciprocalDescriptor(
-    infiniopHandle_t handle,
-    infiniopReciprocalDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::reciprocal::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopReciprocal(
-    infiniopReciprocalDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::reciprocal::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(reciprocal, Reciprocal)
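The exported C surface is unchanged by this refactor: callers still go through the same four entry points the removed code spelled out, which `UNARY_OP_IMPL(reciprocal, Reciprocal)` must now generate. For orientation, a typical host-side call sequence looks roughly like the sketch below; the handle and tensor-descriptor creation signatures are assumed from the rest of the library, not from this patch, and error handling is elided:

    // Hypothetical usage sketch of the reciprocal C API (CPU path assumed).
    #include <infiniop.h>
    #include <vector>

    int main() {
        infiniopHandle_t handle;
        infiniopCreateHandle(&handle); // assumed signature

        size_t shape[2] = {4, 4};
        infiniopTensorDescriptor_t y, x;
        // assumed signature; nullptr strides taken to mean contiguous
        infiniopCreateTensorDescriptor(handle, &y, 2, shape, nullptr, INFINI_DTYPE_F32);
        infiniopCreateTensorDescriptor(handle, &x, 2, shape, nullptr, INFINI_DTYPE_F32);

        infiniopReciprocalDescriptor_t desc;
        infiniopCreateReciprocalDescriptor(handle, &desc, y, x);

        size_t workspace_size = 0;
        infiniopGetReciprocalWorkspaceSize(desc, &workspace_size);
        std::vector<char> workspace(workspace_size);

        std::vector<float> in(16, 2.0f), out(16);
        infiniopReciprocal(desc, workspace.data(), workspace_size,
                           out.data(), in.data(), /*stream=*/nullptr);

        infiniopDestroyReciprocalDescriptor(desc);
        return 0;
    }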
diff --git a/src/infiniop/ops/round/cpu/round_cpu.cc b/src/infiniop/ops/round/cpu/round_cpu.cc
index 0b0cea7b7..20ae304bd 100644
--- a/src/infiniop/ops/round/cpu/round_cpu.cc
+++ b/src/infiniop/ops/round/cpu/round_cpu.cc
@@ -1,48 +1,8 @@
 #include "round_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::round::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(round)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<RoundOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<RoundOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::round::cpu
diff --git a/src/infiniop/ops/round/cpu/round_cpu.h b/src/infiniop/ops/round/cpu/round_cpu.h
index eccd6df0f..1a755dbf8 100644
--- a/src/infiniop/ops/round/cpu/round_cpu.h
+++ b/src/infiniop/ops/round/cpu/round_cpu.h
@@ -2,24 +2,8 @@
 #define __ROUND_CPU_H__
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
-#include <cmath>
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(round, cpu)
-
-namespace op::round::cpu {
-typedef struct RoundOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        if constexpr (std::is_integral_v<T>) {
-            return x;
-        } else {
-            return std::nearbyint(x);
-        }
-    }
-} RoundOp;
-} // namespace op::round::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round)
 
 #endif // __ROUND_CPU_H__
diff --git a/src/infiniop/ops/round/cuda/kernel.cuh b/src/infiniop/ops/round/cuda/kernel.cuh
index c52a10716..f4de9c772 100644
--- a/src/infiniop/ops/round/cuda/kernel.cuh
+++ b/src/infiniop/ops/round/cuda/kernel.cuh
@@ -1,34 +1,10 @@
 #ifndef __ROUND_CUDA_H__
 #define __ROUND_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::round::cuda {
-typedef struct RoundOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2rint(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hrint(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(rintf(x0), rintf(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(rintf(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return rintf(x);
-        } else if constexpr (std::is_integral_v<T>) {
-            return x;
-        } else {
-            return std::nearbyint(x);
-        }
-    }
-} RoundOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Round>;
 } // namespace op::round::cuda
 
 #endif // __ROUND_CUDA_H__
diff --git a/src/infiniop/ops/round/nvidia/round_nvidia.cu b/src/infiniop/ops/round/nvidia/round_nvidia.cu
index c1fabc885..dc84388a3 100644
--- a/src/infiniop/ops/round/nvidia/round_nvidia.cu
+++ b/src/infiniop/ops/round/nvidia/round_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "round_nvidia.cuh"
 
 namespace op::round::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(round)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::RoundOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::RoundOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::round::nvidia
diff --git a/src/infiniop/ops/round/operator.cc b/src/infiniop/ops/round/operator.cc
index 9468803c8..a20fbcb17 100644
--- a/src/infiniop/ops/round/operator.cc
+++ b/src/infiniop/ops/round/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/round.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/round_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateRoundDescriptor(
-    infiniopHandle_t handle,
-    infiniopRoundDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::round::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::round::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetRoundWorkspaceSize(infiniopRoundDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::round::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopRound(
-    infiniopRoundDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::round::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyRoundDescriptor(infiniopRoundDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::round::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(round, Round)
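One subtlety worth preserving when the deleted RoundOp bodies are folded into the shared functor: they round with `nearbyint`/`hrint`/`rintf`, which under the default floating-point environment send ties to the nearest even value, whereas `round`/`roundf` always send halves away from zero. A small standalone illustration (plain C++, hypothetical file, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
        // nearbyint honors the current rounding mode (round-to-nearest-even
        // by default); round always sends halves away from zero.
        std::printf("%g %g\n", std::nearbyint(0.5), std::round(0.5)); // 0 1
        std::printf("%g %g\n", std::nearbyint(1.5), std::round(1.5)); // 2 2
        std::printf("%g %g\n", std::nearbyint(2.5), std::round(2.5)); // 2 3
        return 0;
    }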
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.cc b/src/infiniop/ops/sign/cpu/sign_cpu.cc
index 1f3430e73..c65868d09 100644
--- a/src/infiniop/ops/sign/cpu/sign_cpu.cc
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.cc
@@ -1,48 +1,8 @@
 #include "sign_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::sign::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(sign)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SignOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<SignOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sign::cpu
diff --git a/src/infiniop/ops/sign/cpu/sign_cpu.h b/src/infiniop/ops/sign/cpu/sign_cpu.h
index 505194c85..7ddeec543 100644
--- a/src/infiniop/ops/sign/cpu/sign_cpu.h
+++ b/src/infiniop/ops/sign/cpu/sign_cpu.h
@@ -2,19 +2,8 @@
 #define __SIGN_CPU_H__
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(sign, cpu)
-
-namespace op::sign::cpu {
-typedef struct SignOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
-    }
-} SignOp;
-} // namespace op::sign::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(sign, cpu, op::elementwise::unary::UnaryMode::Sign)
 
 #endif // __SIGN_CPU_H__
diff --git a/src/infiniop/ops/sign/cuda/kernel.cuh b/src/infiniop/ops/sign/cuda/kernel.cuh
index 3737282b0..a1216fb82 100644
--- a/src/infiniop/ops/sign/cuda/kernel.cuh
+++ b/src/infiniop/ops/sign/cuda/kernel.cuh
@@ -1,25 +1,10 @@
 #ifndef __SIGN_CUDA_H__
 #define __SIGN_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::sign::cuda {
-typedef struct SignOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            const auto lt_mask = __hlt2(x, __floats2half2_rn(0.0f, 0.0f));
-            return __hadd2(__hneg2(lt_mask), __hsub2(__floats2half2_rn(1.0f, 1.0f), lt_mask));
-        } else if constexpr (std::is_same_v<T, half>) {
-            return x > half(0) ? half(1) : (x == half(0) ? half(0) : half(-1));
-        } else {
-            return x > T(0) ? T(1) : (x == T(0) ? T(0) : T(-1));
-        }
-    }
-} SignOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Sign>;
 } // namespace op::sign::cuda
 
 #endif // __SIGN_CUDA_H__
diff --git a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
index 6a3152e41..2a11f9e23 100644
--- a/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
+++ b/src/infiniop/ops/sign/nvidia/sign_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "sign_nvidia.cuh"
 
 namespace op::sign::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sign)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SignOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SignOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sign::nvidia
diff --git a/src/infiniop/ops/sign/operator.cc b/src/infiniop/ops/sign/operator.cc
index 8f658a9b3..1a4599d5d 100644
--- a/src/infiniop/ops/sign/operator.cc
+++ b/src/infiniop/ops/sign/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sign.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sign_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSignDescriptor(
-    infiniopHandle_t handle,
-    infiniopSignDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sign::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sign::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSignWorkspaceSize(infiniopSignDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sign::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSign(
-    infiniopSignDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sign::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySignDescriptor(infiniopSignDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sign::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sign, Sign)
"../../../elementwise/cpu/elementwise_cpu_impl.h" namespace op::sinh::cpu { -Descriptor::~Descriptor() = default; +ELEMENTWISE_CPU_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CPU elementwise descriptor - CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate(_info, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate(_info, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::cpu diff --git a/src/infiniop/ops/sinh/cpu/sinh_cpu.h b/src/infiniop/ops/sinh/cpu/sinh_cpu.h index dbc8f3c7e..573027ee3 100644 --- a/src/infiniop/ops/sinh/cpu/sinh_cpu.h +++ b/src/infiniop/ops/sinh/cpu/sinh_cpu.h @@ -1,22 +1,9 @@ #ifndef __SINH_CPU_H__ #define __SINH_CPU_H__ -#include - #include "../../../elementwise/cpu/elementwise_cpu.h" +#include "../../../elementwise/unary.h" -ELEMENTWISE_DESCRIPTOR(sinh, cpu) - -namespace op::sinh::cpu { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - - template - T operator()(const T &x) const { - return std::sinh(x); - } -} SinhOp; -} // namespace op::sinh::cpu +UNARY_ELEMENTWISE_DESCRIPTOR(sinh, cpu, op::elementwise::unary::UnaryMode::Sinh) #endif // __SINH_CPU_H__ diff --git a/src/infiniop/ops/sinh/cuda/kernel.cuh b/src/infiniop/ops/sinh/cuda/kernel.cuh index c09150666..d5bb7491f 100644 --- a/src/infiniop/ops/sinh/cuda/kernel.cuh +++ b/src/infiniop/ops/sinh/cuda/kernel.cuh @@ -1,32 +1,10 @@ #ifndef __SINH_CUDA_H__ #define __SINH_CUDA_H__ -#include -#include +#include "../../../elementwise/unary.h" namespace op::sinh::cuda { -typedef struct SinhOp { -public: - static constexpr size_t num_inputs = 1; - template - __device__ __forceinline__ T operator()(const T &x) const { - if constexpr (std::is_same_v) { - return __floats2half2_rn(sinhf(__half2float(__low2half(x))), sinhf(__half2float(__high2half(x)))); - } else if constexpr (std::is_same_v) { - return __float2half(sinhf(__half2float(x))); - } else if constexpr (std::is_same_v) { - float x0 = __bfloat162float(__low2bfloat16(x)); - float x1 = __bfloat162float(__high2bfloat16(x)); - return __floats2bfloat162_rn(sinhf(x0), sinhf(x1)); - } else if constexpr (std::is_same_v) { - return __float2bfloat16_rn(sinhf(__bfloat162float(x))); - } else if constexpr (std::is_same_v) { - return sinhf(x); - } else { - return std::sinh(x); - } - } -} SinhOp; +using Op = op::elementwise::unary::cuda::UnaryOp; } // namespace op::sinh::cuda #endif // __SINH_CUDA_H__ diff --git a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu index d4c3fd165..3abfc2973 100644 --- a/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu +++ b/src/infiniop/ops/sinh/nvidia/sinh_nvidia.cu @@ -1,54 +1,10 @@ -#include 
"../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh" #include "../cuda/kernel.cuh" #include "sinh_nvidia.cuh" namespace op::sinh::nvidia { -Descriptor::~Descriptor() = default; +ELEMENTWISE_NVIDIA_IMPL_UNARY(sinh) -infiniStatus_t Descriptor::create( - infiniopHandle_t handle_, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t out_desc, - std::vector input_desc_vec) { - - auto handle = reinterpret_cast(handle_); - auto dtype = out_desc->dtype(); - - const auto &x_desc = input_desc_vec.at(0); - const auto &y_shape = out_desc->shape(); - const auto &x_shape = x_desc->shape(); - - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32); - - CHECK_SAME_SHAPE(y_shape, x_shape); - - // create CUDA elementwise descriptor - CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) - - return INFINI_STATUS_SUCCESS; -} - -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - void *output, - std::vector inputs, - void *stream) const { - if (workspace_size < _workspace_size) { - return INFINI_STATUS_INSUFFICIENT_WORKSPACE; - } - - switch (_dtype) { - case INFINI_DTYPE_F16: - return _device_info->calculate<256, cuda::SinhOp, half>(_info, workspace, output, inputs, stream); - case INFINI_DTYPE_F32: - return _device_info->calculate<256, cuda::SinhOp, float>(_info, workspace, output, inputs, stream); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } - - return INFINI_STATUS_SUCCESS; -} } // namespace op::sinh::nvidia diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc index 1636ce2c8..41940d235 100644 --- a/src/infiniop/ops/sinh/operator.cc +++ b/src/infiniop/ops/sinh/operator.cc @@ -1,5 +1,4 @@ -#include "../../operator.h" -#include "../../handle.h" +#include "../../operator_impl.h" #include "infiniop/ops/sinh.h" #ifdef ENABLE_CPU_API @@ -9,131 +8,4 @@ #include "nvidia/sinh_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateSinhDescriptor( - infiniopHandle_t handle, - infiniopSinhDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t y_desc, - infiniopTensorDescriptor_t x_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::sinh::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - y_desc, \ - {x_desc}) - - switch (handle->device) { - -#ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu); -#endif -#ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, nvidia); -#endif -#ifdef ENABLE_ILUVATAR_API - CREATE(INFINI_DEVICE_ILUVATAR, nvidia); -#endif -#ifdef ENABLE_QY_API - CREATE(INFINI_DEVICE_QY, nvidia); -#endif - - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } - -#undef CREATE -} - -__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) { - -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; - - switch (desc->device_type) { -#ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) -#endif -#ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, nvidia) -#endif -#ifdef ENABLE_ILUVATAR_API - GET(INFINI_DEVICE_ILUVATAR, nvidia) -#endif -#ifdef ENABLE_QY_API - GET(INFINI_DEVICE_QY, nvidia) -#endif - default: - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; - } -#undef GET - - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; -} - -__C infiniStatus_t infiniopSinh( - infiniopSinhDescriptor_t desc, - void *workspace, - size_t workspace_size, - void *y, - const void *x, - void *stream) { - 
diff --git a/src/infiniop/ops/sinh/operator.cc b/src/infiniop/ops/sinh/operator.cc
index 1636ce2c8..41940d235 100644
--- a/src/infiniop/ops/sinh/operator.cc
+++ b/src/infiniop/ops/sinh/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sinh.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sinh_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSinhDescriptor(
-    infiniopHandle_t handle,
-    infiniopSinhDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sinh::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sinh::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sinh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSinh(
-    infiniopSinhDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sinh::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sinh::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sinh, Sinh)
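The deleted SinhOp above (and SqrtOp below) illustrate the recurring f16/bf16 pattern the shared CUDA functor has to preserve: convert to float, evaluate the float intrinsic, convert back with a single final rounding. Isolated as helpers it is just this (a sketch; the helper names are mine, not from the patch):

    #include <cuda_bf16.h>
    #include <cuda_fp16.h>

    // Promote to f32, compute, round back once — exactly what the removed
    // per-op functors did inline for half and bfloat16 inputs.
    __device__ __forceinline__ __half half_sinh(__half x) {
        return __float2half(sinhf(__half2float(x)));
    }

    __device__ __forceinline__ __nv_bfloat16 bf16_sinh(__nv_bfloat16 x) {
        return __float2bfloat16_rn(sinhf(__bfloat162float(x)));
    }

Doing the math in f32 costs two conversions per element but avoids accumulating error in the narrow format, and it is the only option for functions like sinh that have no native half-precision intrinsic.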
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
index 99e723126..eb9ac4d66 100644
--- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.cc
@@ -1,48 +1,8 @@
 #include "sqrt_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::sqrt::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(sqrt)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SqrtOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<SqrtOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sqrt::cpu
diff --git a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
index 3d026cf63..ed6217e1f 100644
--- a/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
+++ b/src/infiniop/ops/sqrt/cpu/sqrt_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __SQRT_CPU_H__
 #define __SQRT_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(sqrt, cpu)
-
-namespace op::sqrt::cpu {
-typedef struct SqrtOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::sqrt(x);
-    }
-} SqrtOp;
-} // namespace op::sqrt::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(sqrt, cpu, op::elementwise::unary::UnaryMode::Sqrt)
 
 #endif // __SQRT_CPU_H__
diff --git a/src/infiniop/ops/sqrt/cuda/kernel.cuh b/src/infiniop/ops/sqrt/cuda/kernel.cuh
index c82cd7dd5..40ab9708f 100644
--- a/src/infiniop/ops/sqrt/cuda/kernel.cuh
+++ b/src/infiniop/ops/sqrt/cuda/kernel.cuh
@@ -1,32 +1,10 @@
 #ifndef __SQRT_CUDA_H__
 #define __SQRT_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::sqrt::cuda {
-typedef struct SqrtOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2sqrt(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hsqrt(x);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            return __floats2bfloat162_rn(__fsqrt_rn(x0), __fsqrt_rn(x1));
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            return __float2bfloat16_rn(__fsqrt_rn(__bfloat162float(x)));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __fsqrt_rn(x);
-        } else {
-            return std::sqrt(x);
-        }
-    }
-} SqrtOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Sqrt>;
 } // namespace op::sqrt::cuda
 
 #endif // __SQRT_CUDA_H__
diff --git a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
index 519d06e89..4d6c70d72 100644
--- a/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
+++ b/src/infiniop/ops/sqrt/nvidia/sqrt_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "sqrt_nvidia.cuh"
 
 namespace op::sqrt::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(sqrt)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SqrtOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SqrtOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::sqrt::nvidia
diff --git a/src/infiniop/ops/sqrt/operator.cc b/src/infiniop/ops/sqrt/operator.cc
index b11c8a4b5..fe999f58f 100644
--- a/src/infiniop/ops/sqrt/operator.cc
+++ b/src/infiniop/ops/sqrt/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/sqrt.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/sqrt_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateSqrtDescriptor(
-    infiniopHandle_t handle,
-    infiniopSqrtDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::sqrt::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::sqrt::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetSqrtWorkspaceSize(infiniopSqrtDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::sqrt::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopSqrt(
-    infiniopSqrtDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::sqrt::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroySqrtDescriptor(infiniopSqrtDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::sqrt::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(sqrt, Sqrt)
diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.cc b/src/infiniop/ops/tan/cpu/tan_cpu.cc
index 2947dfc5e..5166cf64f 100644
--- a/src/infiniop/ops/tan/cpu/tan_cpu.cc
+++ b/src/infiniop/ops/tan/cpu/tan_cpu.cc
@@ -1,48 +1,8 @@
 #include "tan_cpu.h"
+#include "../../../elementwise/cpu/elementwise_cpu_impl.h"
 
 namespace op::tan::cpu {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_CPU_IMPL_UNARY(tan)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<TanOp, fp16_t>(_info, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<TanOp, float>(_info, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tan::cpu
diff --git a/src/infiniop/ops/tan/cpu/tan_cpu.h b/src/infiniop/ops/tan/cpu/tan_cpu.h
index c3a22456c..6c697c311 100644
--- a/src/infiniop/ops/tan/cpu/tan_cpu.h
+++ b/src/infiniop/ops/tan/cpu/tan_cpu.h
@@ -1,22 +1,9 @@
 #ifndef __TAN_CPU_H__
 #define __TAN_CPU_H__
 
-#include <cmath>
-
 #include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../../../elementwise/unary.h"
 
-ELEMENTWISE_DESCRIPTOR(tan, cpu)
-
-namespace op::tan::cpu {
-typedef struct TanOp {
-public:
-    static constexpr size_t num_inputs = 1;
-
-    template <typename T>
-    T operator()(const T &x) const {
-        return std::tan(x);
-    }
-} TanOp;
-} // namespace op::tan::cpu
+UNARY_ELEMENTWISE_DESCRIPTOR(tan, cpu, op::elementwise::unary::UnaryMode::Tan)
 
 #endif // __TAN_CPU_H__
diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh
index bbd8facaa..c3cf45350 100644
--- a/src/infiniop/ops/tan/cuda/kernel.cuh
+++ b/src/infiniop/ops/tan/cuda/kernel.cuh
@@ -1,55 +1,10 @@
 #ifndef __TAN_CUDA_H__
 #define __TAN_CUDA_H__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
-#define TAN_THRESHOLD 15000
+#include "../../../elementwise/unary.h"
 
 namespace op::tan::cuda {
-typedef struct TanOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2sin(x) / h2cos(x);
-        } else if constexpr (std::is_same_v<T, half>) {
-            float tan_f = __tanf(__half2float(x));
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return __float2half(tanf(__half2float(x)));
-            }
-            return __float2half(tan_f);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float x0 = __bfloat162float(__low2bfloat16(x));
-            float x1 = __bfloat162float(__high2bfloat16(x));
-            float tan_f0 = __tanf(x0);
-            float tan_f1 = __tanf(x1);
-            if (std::fabs(tan_f0) > TAN_THRESHOLD) {
-                tan_f0 = tanf(x0);
-            }
-            if (std::fabs(tan_f1) > TAN_THRESHOLD) {
-                tan_f1 = tanf(x1);
-            }
-            return __floats2bfloat162_rn(tan_f0, tan_f1);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float tan_f = __tanf(__bfloat162float(x));
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return __float2bfloat16_rn(tanf(__bfloat162float(x)));
-            }
-            return __float2bfloat16_rn(tan_f);
-        } else if constexpr (std::is_same_v<T, float>) {
-            float tan_f = __tanf(x);
-            if (std::fabs(tan_f) > TAN_THRESHOLD) {
-                return tanf(x);
-            }
-            return tan_f;
-        } else {
-            return std::tan(x);
-        }
-    }
-} TanOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Tan>;
 } // namespace op::tan::cuda
 
 #endif // __TAN_CUDA_H__
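The deleted TanOp is the one functor in this series that carried real numerical logic rather than plain type plumbing: it evaluated the fast `__tanf` intrinsic and fell back to the slower, accurate `tanf` whenever the result magnitude exceeded `TAN_THRESHOLD` (15000), i.e. near the poles where `__tanf`'s error grows. If the shared `UnaryOp` is to be drop-in equivalent, it presumably keeps this two-tier evaluation; isolated, the pattern is:

    #define TAN_THRESHOLD 15000

    // Fast-path/accurate-path tangent, as in the removed TanOp: __tanf is
    // cheap but loses accuracy near odd multiples of pi/2, where |tan| is
    // huge, so large results are recomputed with the precise tanf.
    __device__ __forceinline__ float fast_tan(float x) {
        float t = __tanf(x);
        if (fabsf(t) > TAN_THRESHOLD) {
            t = tanf(x);
        }
        return t;
    }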
diff --git a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
index b4c24e2fe..5f56dcb6f 100644
--- a/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
+++ b/src/infiniop/ops/tan/nvidia/tan_nvidia.cu
@@ -1,54 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "tan_nvidia.cuh"
 
 namespace op::tan::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tan)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::TanOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::TanOp, float>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tan::nvidia
diff --git a/src/infiniop/ops/tan/operator.cc b/src/infiniop/ops/tan/operator.cc
index 48ae8d48e..ae506dcd8 100644
--- a/src/infiniop/ops/tan/operator.cc
+++ b/src/infiniop/ops/tan/operator.cc
@@ -1,5 +1,4 @@
-#include "../../operator.h"
-#include "../../handle.h"
+#include "../../operator_impl.h"
 #include "infiniop/ops/tan.h"
 
 #ifdef ENABLE_CPU_API
@@ -9,131 +8,4 @@
 #include "nvidia/tan_nvidia.cuh"
 #endif
 
-__C infiniStatus_t infiniopCreateTanDescriptor(
-    infiniopHandle_t handle,
-    infiniopTanDescriptor_t *desc_ptr,
-    infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
-
-#define CREATE(CASE, NAMESPACE) \
-    case CASE: \
-        return op::tan::NAMESPACE::Descriptor::create( \
-            handle, \
-            reinterpret_cast<op::tan::NAMESPACE::Descriptor **>(desc_ptr), \
-            y_desc, \
-            {x_desc})
-
-    switch (handle->device) {
-
-#ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CREATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CREATE
-}
-
-__C infiniStatus_t infiniopGetTanWorkspaceSize(infiniopTanDescriptor_t desc, size_t *size) {
-
-#define GET(CASE, NAMESPACE) \
-    case CASE: \
-        *size = reinterpret_cast<op::tan::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
-
-    switch (desc->device_type) {
-#ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
-#endif
-#ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, nvidia)
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        GET(INFINI_DEVICE_ILUVATAR, nvidia)
-#endif
-#ifdef ENABLE_QY_API
-        GET(INFINI_DEVICE_QY, nvidia)
-#endif
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-#undef GET
-
-    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-}
-
-__C infiniStatus_t infiniopTan(
-    infiniopTanDescriptor_t desc,
-    void *workspace,
-    size_t workspace_size,
-    void *y,
-    const void *x,
-    void *stream) {
-
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
-        return reinterpret_cast<const op::tan::NAMESPACE::Descriptor *>(desc) \
-            ->calculate(workspace, workspace_size, y, {x}, stream)
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        CALCULATE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef CALCULATE
-}
-
-__C infiniStatus_t
-infiniopDestroyTanDescriptor(infiniopTanDescriptor_t desc) {
-
-#define DELETE(CASE, NAMESPACE) \
-    case CASE: \
-        delete reinterpret_cast<const op::tan::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS
-
-    switch (desc->device_type) {
-
-#ifdef ENABLE_CPU_API
-        DELETE(INFINI_DEVICE_CPU, cpu);
-#endif
-#ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
-#endif
-#ifdef ENABLE_ILUVATAR_API
-        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
-#endif
-#ifdef ENABLE_QY_API
-        DELETE(INFINI_DEVICE_QY, nvidia);
-#endif
-
-    default:
-        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
-    }
-
-#undef DELETE
-}
+UNARY_OP_IMPL(tan, Tan)
diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh
index e336a4995..d987ac7c5 100644
--- a/src/infiniop/ops/tanh/cuda/kernel.cuh
+++ b/src/infiniop/ops/tanh/cuda/kernel.cuh
@@ -1,44 +1,10 @@
 #ifndef __TANH_CUDA_H__
 #define __TANH_CUDA_H__
 
-#include <cuda_fp16.h>
+#include "../../../elementwise/unary.h"
 
 namespace op::tanh::cuda {
-typedef struct TanhOp {
-    static constexpr size_t num_inputs = 1;
-
-    __device__ __forceinline__ float tanh_f32_func(float x) const {
-        return tanhf(x);
-    }
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &input) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            float2 vf = __half22float2(input);
-            float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y));
-            return __float22half2_rn(vr);
-        } else if constexpr (std::is_same_v<T, half>) {
-            float xf = __half2float(input);
-            float yf = tanh_f32_func(xf);
-            return __float2half_rn(yf);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
-            float f0 = __bfloat162float(__low2bfloat16(input));
-            float f1 = __bfloat162float(__high2bfloat16(input));
-            float r0 = tanh_f32_func(f0);
-            float r1 = tanh_f32_func(f1);
-            return __floats2bfloat162_rn(r0, r1);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            float xf = __bfloat162float(input);
-            float rf = tanh_f32_func(xf);
-            return __float2bfloat16_rn(rf);
-        } else if constexpr (std::is_same_v<T, float>) {
-            return tanh_f32_func(input);
-        } else if constexpr (std::is_same_v<T, double>) {
-            return std::tanh(input);
-        } else {
-            return std::tanh(input);
-        }
-    }
-} TanhOp;
+using Op = op::elementwise::unary::cuda::UnaryOp<op::elementwise::unary::UnaryMode::Tanh>;
 } // namespace op::tanh::cuda
 
 #endif // __TANH_CUDA_H__
diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
index a2c36551c..62f02da67 100644
--- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
@@ -1,59 +1,10 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+#include "../../../elementwise/nvidia/elementwise_nvidia_impl.cuh"
 #include "../cuda/kernel.cuh"
 #include "tanh_nvidia.cuh"
 
 namespace op::tanh::nvidia {
-Descriptor::~Descriptor() = default;
+ELEMENTWISE_NVIDIA_IMPL_UNARY(tanh)
 
-infiniStatus_t Descriptor::create(
-    infiniopHandle_t handle_,
-    Descriptor **desc_ptr,
-    infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
-
-    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &input_desc = input_desc_vec.at(0);
-    const auto &output_shape = out_desc->shape();
-    const auto &input_shape = input_desc->shape();
-
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(output_shape, input_shape);
-
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
-
-    return INFINI_STATUS_SUCCESS;
-}
-
-infiniStatus_t Descriptor::calculate(
-    void *workspace,
-    size_t workspace_size,
-    void *output,
-    std::vector<const void *> inputs,
-    void *stream) const {
-
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
-    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
-    default:
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
-
-    return INFINI_STATUS_SUCCESS;
-}
 } // namespace op::tanh::nvidia
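Net effect for each unary operator touched here: the four files that used to carry a few hundred lines of near-identical boilerplate now reduce to one macro or alias line apiece. Taking round as the representative (each line below is taken from the + side of the hunks above):

    // cpu/round_cpu.h
    UNARY_ELEMENTWISE_DESCRIPTOR(round, cpu, op::elementwise::unary::UnaryMode::Round)

    // cpu/round_cpu.cc
    ELEMENTWISE_CPU_IMPL_UNARY(round)

    // nvidia/round_nvidia.cu
    ELEMENTWISE_NVIDIA_IMPL_UNARY(round)

    // operator.cc
    UNARY_OP_IMPL(round, Round)

together with the `using Op = ...` alias in `cuda/kernel.cuh`. Adding the next unary operator should therefore only require a new `UnaryMode` entry plus these stamped-out invocations, rather than another copy of the descriptor, dispatch, and entry-point boilerplate.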